RE: [EXT] Re: [PATCH v9 1/9] hw/vfio/common: Remove error print on mmio region translation by viommu

2020-03-26 Thread Bharat Bhushan
Hi Alex, Eric,

> -Original Message-
> From: Alex Williamson 
> Sent: Thursday, March 26, 2020 11:23 PM
> To: Auger Eric 
> Cc: Bharat Bhushan ; peter.mayd...@linaro.org;
> pet...@redhat.com; eric.auger@gmail.com; kevin.t...@intel.com;
> m...@redhat.com; Tomasz Nowicki [C] ;
> drjo...@redhat.com; linuc.dec...@gmail.com; qemu-devel@nongnu.org; qemu-
> a...@nongnu.org; bharatb.li...@gmail.com; jean-phili...@linaro.org;
> yang.zh...@intel.com; David Gibson 
> Subject: [EXT] Re: [PATCH v9 1/9] hw/vfio/common: Remove error print on mmio
> region translation by viommu
> 
> External Email
> 
> --
> On Thu, 26 Mar 2020 18:35:48 +0100
> Auger Eric  wrote:
> 
> > Hi Alex,
> >
> > On 3/24/20 12:08 AM, Alex Williamson wrote:
> > > [Cc +dwg who originated this warning]
> > >
> > > On Mon, 23 Mar 2020 14:16:09 +0530
> > > Bharat Bhushan  wrote:
> > >
> > >> On ARM, the MSI doorbell is translated by the virtual IOMMU.
> > >> As such address_space_translate() returns the MSI controller MMIO
> > >> region and we get an "iommu map to non memory area"
> > >> message. Let's remove the latter.
> > >>
> > >> Signed-off-by: Eric Auger 
> > >> Signed-off-by: Bharat Bhushan 
> > >> ---
> > >>  hw/vfio/common.c | 2 --
> > >>  1 file changed, 2 deletions(-)
> > >>
> > >> diff --git a/hw/vfio/common.c b/hw/vfio/common.c index
> > >> 5ca11488d6..c586edf47a 100644
> > >> --- a/hw/vfio/common.c
> > >> +++ b/hw/vfio/common.c
> > >> @@ -426,8 +426,6 @@ static bool vfio_get_vaddr(IOMMUTLBEntry *iotlb,
> void **vaddr,
> > >>   , , writable,
> > >>   MEMTXATTRS_UNSPECIFIED);
> > >>  if (!memory_region_is_ram(mr)) {
> > >> -error_report("iommu map to non memory area %"HWADDR_PRIx"",
> > >> - xlat);
> > >>  return false;
> > >>  }
> > >>
> > >
> > > I'm a bit confused here, I think we need more justification beyond
> > > "we hit this warning and we don't want to because it's ok in this
> > > one special case, therefore remove it".  I assume the special case
> > > is that the device MSI address is managed via the SET_IRQS ioctl and
> > > therefore we won't actually get DMAs to this range.
> > Yes exactly. The guest creates a mapping between one giova and this
> > gpa (corresponding to the MSI controller doorbell) because MSIs are
> > mapped on ARM. But practically the physical device is programmed with
> > an host chosen iova that maps onto the physical MSI controller's
> > doorbell. so the device never performs DMA accesses to this range.
> >
> >   But I imagine the case that
> > > was in mind when adding this warning was general peer-to-peer
> > > between an assigned and an emulated device.
> > yes makes sense.
> >
> >   Maybe there's an argument to be made
> > > that such a p2p mapping might also be used in a non-vIOMMU case.  We
> > > skip creating those mappings and drivers continue to work, maybe
> > > because nobody attempts to do p2p DMA with the types of devices we
> > > emulate, maybe because p2p DMA is not absolutely reliable on bare
> > > metal and drivers test it before using it.
> > MSI doorbells are mapped using the IOMMU_MMIO flag (dma-iommu.c
> > iommu_dma_get_msi_page).
> > One idea could be to pass that flag through the IOMMU Notifier
> > mechanism into the iotlb->perm. Eventually when we get this in
> > vfio_get_vaddr() we would not print the warning. Could that make sense?
> 
> Yeah, if we can identify a valid case that doesn't need a warning, that's 
> fine by me.
> Thanks,

Will change as per above suggestion by Eric.

Thanks
-Bharat

> 
> Alex




RE: [EXT] Re: [PATCH v9 8/9] virtio-iommu: Implement probe request

2020-03-26 Thread Bharat Bhushan
Hi Eric,

> -Original Message-
> From: Auger Eric 
> Sent: Thursday, March 26, 2020 9:18 PM
> To: Bharat Bhushan ; peter.mayd...@linaro.org;
> pet...@redhat.com; eric.auger@gmail.com; alex.william...@redhat.com;
> kevin.t...@intel.com; m...@redhat.com; Tomasz Nowicki [C]
> ; drjo...@redhat.com; linuc.dec...@gmail.com; qemu-
> de...@nongnu.org; qemu-...@nongnu.org; bharatb.li...@gmail.com; jean-
> phili...@linaro.org; yang.zh...@intel.com
> Subject: [EXT] Re: [PATCH v9 8/9] virtio-iommu: Implement probe request
> 
> External Email
> 
> --
> Hi Bharat
> 
> On 3/23/20 9:46 AM, Bharat Bhushan wrote:
> > This patch implements the PROBE request. Currently supported page size
> > mask per endpoint is returned. Also append a NONE property at the end.
> >
> > Signed-off-by: Bharat Bhushan 
> > Signed-off-by: Eric Auger 
> > ---
> >  include/standard-headers/linux/virtio_iommu.h |   6 +
> Changes to virtio_iommu.h should be in a separate patch you should use
> ./scripts/update-linux-headers.sh See for instance:
> ddda37483d  linux-headers: update
> Until the uapi updates are upstream, you can link to your kernel branch and
> mention that this is a temporary linux header update, or a partial one if you
> just want to pick up the iommu.h changes.

yes, I am sorry.

> 
> >  hw/virtio/virtio-iommu.c  | 161 +-
> >  hw/virtio/trace-events|   2 +
> >  3 files changed, 166 insertions(+), 3 deletions(-)
> >
> > diff --git a/include/standard-headers/linux/virtio_iommu.h
> > b/include/standard-headers/linux/virtio_iommu.h
> > index b9443b83a1..8a0d47b907 100644
> > --- a/include/standard-headers/linux/virtio_iommu.h
> > +++ b/include/standard-headers/linux/virtio_iommu.h
> > @@ -111,6 +111,7 @@ struct virtio_iommu_req_unmap {
> >
> >  #define VIRTIO_IOMMU_PROBE_T_NONE  0
> >  #define VIRTIO_IOMMU_PROBE_T_RESV_MEM  1
> > +#define VIRTIO_IOMMU_PROBE_T_PAGE_SIZE_MASK2
> >
> >  #define VIRTIO_IOMMU_PROBE_T_MASK  0xfff
> >
> > @@ -130,6 +131,11 @@ struct virtio_iommu_probe_resv_mem {
> > uint64_tend;
> >  };
> >
> > +struct virtio_iommu_probe_pgsize_mask {
> > +   struct virtio_iommu_probe_property  head;
> > +   uint64_tpgsize_bitmap;
> > +};
> > +
> >  struct virtio_iommu_req_probe {
> > struct virtio_iommu_req_headhead;
> > uint32_tendpoint;
> > diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c index
> > 747e3cf1da..63fbacdcdc 100644
> > --- a/hw/virtio/virtio-iommu.c
> > +++ b/hw/virtio/virtio-iommu.c
> > @@ -38,6 +38,10 @@
> >
> >  /* Max size */
> >  #define VIOMMU_DEFAULT_QUEUE_SIZE 256
> > +#define VIOMMU_PROBE_SIZE 512
> > +
> > +#define SUPPORTED_PROBE_PROPERTIES (\
> > +1 << VIRTIO_IOMMU_PROBE_T_PAGE_SIZE_MASK)
> >
> >  typedef struct VirtIOIOMMUDomain {
> >  uint32_t id;
> > @@ -62,6 +66,13 @@ typedef struct VirtIOIOMMUMapping {
> >  uint32_t flags;
> >  } VirtIOIOMMUMapping;
> >
> > +typedef struct VirtIOIOMMUPropBuffer {
> > +VirtIOIOMMUEndpoint *endpoint;
> > +size_t filled;
> > +uint8_t *start;
> > +bool error;
> > +} VirtIOIOMMUPropBuffer;
> > +
> >  static inline uint16_t virtio_iommu_get_bdf(IOMMUDevice *dev)  {
> >  return PCI_BUILD_BDF(pci_bus_num(dev->bus), dev->devfn); @@
> > -490,6 +501,114 @@ static int virtio_iommu_unmap(VirtIOIOMMU *s,
> >  return ret;
> >  }
> >
> > +static int virtio_iommu_fill_none_prop(VirtIOIOMMUPropBuffer
> > +*bufstate) {
> > +struct virtio_iommu_probe_property *prop;
> > +
> > +prop = (struct virtio_iommu_probe_property *)
> > +(bufstate->start + bufstate->filled);
> > +prop->type = 0;
> > +prop->length = 0;
> > +bufstate->filled += sizeof(*prop);
> > +trace_virtio_iommu_fill_none_property(bufstate->endpoint->id);
> > +return 0;
> > +}
> > +
> > +static int virtio_iommu_fill_page_size_mask(VirtIOIOMMUPropBuffer
> > +*bufstate) {
> > +struct virtio_iommu_probe_pgsize_mask *page_size_mask;
> > +size_t prop_size = sizeof(*page_size_mask);
> > +VirtIOIOMMUEndpoint *ep = bufstate->endpoint;
> > +VirtIOIOMMU *s = ep->viommu;
> > +IOMMUDevice *sdev;
> > +
> > +if (bufstate->filled + prop_size >= VIOMMU_PROBE_SIZE) {
> > +bufstate->error = true;
> > +/* get the traversal stopped by returning true */
> > +return true;
> > +}
> > +
> > +page_size_mask = (struct virtio_iommu_probe_pgsize_mask *)
> > + (bufstate->start + bufstate->filled);
> > +
> > +page_size_mask->head.type = VIRTIO_IOMMU_PROBE_T_PAGE_SIZE_MASK;
> > +page_size_mask->head.length = prop_size;
> > +QLIST_FOREACH(sdev, >notifiers_list, next) {
> > +if (ep->id == sdev->devfn) {
> > +page_size_mask->pgsize_bitmap = 

RE: [EXT] Re: [PATCH v9 2/9] memory: Add interface to set iommu page size mask

2020-03-26 Thread Bharat Bhushan
Hi Eric,

> -Original Message-
> From: Auger Eric 
> Sent: Thursday, March 26, 2020 9:36 PM
> To: Bharat Bhushan ; peter.mayd...@linaro.org;
> pet...@redhat.com; eric.auger@gmail.com; alex.william...@redhat.com;
> kevin.t...@intel.com; m...@redhat.com; Tomasz Nowicki [C]
> ; drjo...@redhat.com; linuc.dec...@gmail.com; qemu-
> de...@nongnu.org; qemu-...@nongnu.org; bharatb.li...@gmail.com; jean-
> phili...@linaro.org; yang.zh...@intel.com
> Subject: [EXT] Re: [PATCH v9 2/9] memory: Add interface to set iommu page size
> mask
> 
> External Email
> 
> --
> Hi Bharat,
> On 3/23/20 9:46 AM, Bharat Bhushan wrote:
> > Allow setting the page size mask to be supported by the iommu.
> by iommu memory region. I mean this is not global to the IOMMU.

Yes.

> > This is required to expose page size mask compatible with host with
> > virtio-iommu.
> >
> > Signed-off-by: Bharat Bhushan 
> > ---
> >  include/exec/memory.h | 20 
> >  memory.c  | 10 ++
> >  2 files changed, 30 insertions(+)
> >
> > diff --git a/include/exec/memory.h b/include/exec/memory.h index
> > e85b7de99a..063c424854 100644
> > --- a/include/exec/memory.h
> > +++ b/include/exec/memory.h
> > @@ -355,6 +355,16 @@ typedef struct IOMMUMemoryRegionClass {
> >   * @iommu: the IOMMUMemoryRegion
> >   */
> >  int (*num_indexes)(IOMMUMemoryRegion *iommu);
> > +
> > +/*
> > + * Set supported IOMMU page size
> > + *
> > + * Optional method: if this is supported then set page size that
> > + * can be supported by IOMMU. This is called to set supported page
> > + * size as per host Linux.
> What about: If supported, allows to restrict the page size mask that can be
> supported with a given IOMMU memory region. For example, this allows to
> propagate host physical IOMMU page size mask limitations to the virtual IOMMU
> (vfio assignment with virtual iommu).

Much better 

> > + */
> > + void (*iommu_set_page_size_mask)(IOMMUMemoryRegion *iommu,
> > +  uint64_t page_size_mask);
> >  } IOMMUMemoryRegionClass;
> >
> >  typedef struct CoalescedMemoryRange CoalescedMemoryRange; @@ -1363,6
> > +1373,16 @@ int
> memory_region_iommu_attrs_to_index(IOMMUMemoryRegion *iommu_mr,
> >   */
> >  int memory_region_iommu_num_indexes(IOMMUMemoryRegion *iommu_mr);
> >
> > +/**
> > + * memory_region_iommu_set_page_size_mask: set the supported pages
> > + * size by iommu.
> supported page sizes for a given IOMMU memory region
> > + *
> > + * @iommu_mr: the memory region
> IOMMU memory region
> > + * @page_size_mask: supported page size mask  */ void
> > +memory_region_iommu_set_page_size_mask(IOMMUMemoryRegion
> *iommu_mr,
> > +uint64_t page_size_mask);
> > +
> >  /**
> >   * memory_region_name: get a memory region's name
> >   *
> > diff --git a/memory.c b/memory.c
> > index aeaa8dcc9e..14c8783084 100644
> > --- a/memory.c
> > +++ b/memory.c
> > @@ -1833,6 +1833,16 @@ static int
> memory_region_update_iommu_notify_flags(IOMMUMemoryRegion
> *iommu_mr,
> >  return ret;
> >  }
> >
> > +void memory_region_iommu_set_page_size_mask(IOMMUMemoryRegion
> *iommu_mr,
> > +uint64_t page_size_mask)
> > +{
> > +IOMMUMemoryRegionClass *imrc =
> > +IOMMU_MEMORY_REGION_GET_CLASS(iommu_mr);
> > +
> > +if (imrc->iommu_set_page_size_mask) {
> > +imrc->iommu_set_page_size_mask(iommu_mr, page_size_mask);
> Shouldn't it return an int in case the setting cannot be applied?

iommu_set_page_size_mask() is setting the page-size-mask for an endpoint. Below
is the function from the code:

static void virtio_iommu_set_page_size_mask(IOMMUMemoryRegion *mr,
  uint64_t page_size_mask)
{
IOMMUDevice *sdev = container_of(mr, IOMMUDevice, iommu_mr);

  sdev->page_size_mask = page_size_mask;
}

Do you see any reason it cannot be applied, am I missing something?

Thanks
-Bharat

> > +}
> > +}
> > +
> >  int memory_region_register_iommu_notifier(MemoryRegion *mr,
> >IOMMUNotifier *n, Error
> > **errp)  {
> >
> Thanks
> Eric




Re: [PATCH v16 Kernel 4/7] vfio iommu: Implementation of ioctl for dirty pages tracking.

2020-03-26 Thread Kirti Wankhede

Hit send button little early.

>
> I checked v12, it's not like what I said.
> In v12, bitmaps are generated per vfio_dma, and combination of the
> bitmaps are required in order to generate a big bitmap suiting for dirty
> query. It can cause problem when offset not aligning.
> But what I propose here is to generate an rb tree orthogonal to the tree
> of vfio_dma.
>
> as to CPU cycles saving, I don't think iterating/translating page by page
> would achieve that purpose.
>

Instead of creating one extra rb tree for dirty pages tracking, in v10 we
tried to use dma->pfn_list itself; we iterated on that approach in v10, v11
and v12. The latest version is an evolved one, with the best possible approach
arrived at after discussion. You may want to go through v11 as well.

https://patchwork.kernel.org/patch/11298335/

Thanks,
Kirti

On 3/27/2020 6:00 AM, Yan Zhao wrote:

On Fri, Mar 27, 2020 at 05:39:01AM +0800, Kirti Wankhede wrote:



On 3/25/2020 7:41 AM, Yan Zhao wrote:

On Wed, Mar 25, 2020 at 05:18:52AM +0800, Kirti Wankhede wrote:

VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
- Start dirty pages tracking while migration is active
- Stop dirty pages tracking.
- Get dirty pages bitmap. It is the user space application's responsibility to
copy the content of dirty pages from source to destination during migration.

To prevent DoS attack, memory for bitmap is allocated per vfio_dma
structure. Bitmap size is calculated considering smallest supported page
size. Bitmap is allocated for all vfio_dmas when dirty logging is enabled

Bitmap is populated for already pinned pages when bitmap is allocated for
a vfio_dma with the smallest supported page size. Update bitmap from
pinning functions when tracking is enabled. When user application queries
bitmap, check if requested page size is same as page size used to
populated bitmap. If it is equal, copy bitmap, but if not equal, return
error.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
   drivers/vfio/vfio_iommu_type1.c | 266 
+++-
   1 file changed, 260 insertions(+), 6 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 70aeab921d0f..874a1a7ae925 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -71,6 +71,7 @@ struct vfio_iommu {
unsigned intdma_avail;
boolv2;
boolnesting;
+   booldirty_page_tracking;
   };
   
   struct vfio_domain {

@@ -91,6 +92,7 @@ struct vfio_dma {
boollock_cap;   /* capable(CAP_IPC_LOCK) */
struct task_struct  *task;
struct rb_root  pfn_list;   /* Ex-user pinned pfn list */
+   unsigned long   *bitmap;
   };
   
   struct vfio_group {

@@ -125,7 +127,21 @@ struct vfio_regions {
   #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)  \
(!list_empty(>domain_list))
   
+#define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)

+
+/*
+ * Input argument of number of bits to bitmap_set() is unsigned integer, which
+ * further casts to signed integer for unaligned multi-bit operation,
+ * __bitmap_set().
+ * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte,
+ * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page
+ * system.
+ */
+#define DIRTY_BITMAP_PAGES_MAX (uint64_t)(INT_MAX - 1)
+#define DIRTY_BITMAP_SIZE_MAX   DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
+
   static int put_pfn(unsigned long pfn, int prot);
+static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu);
   
   /*

* This code handles mapping and unmapping of user data buffers
@@ -175,6 +191,77 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, 
struct vfio_dma *old)
rb_erase(>node, >dma_list);
   }
   
+

+static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, uint64_t pgsize)
+{
+   uint64_t npages = dma->size / pgsize;
+
+   if (npages > DIRTY_BITMAP_PAGES_MAX)
+   return -EINVAL;
+
+   dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages), GFP_KERNEL);
+   if (!dma->bitmap)
+   return -ENOMEM;
+
+   return 0;
+}
+
+static void vfio_dma_bitmap_free(struct vfio_dma *dma)
+{
+   kfree(dma->bitmap);
+   dma->bitmap = NULL;
+}
+
+static void vfio_dma_populate_bitmap(struct vfio_dma *dma, uint64_t pgsize)
+{
+   struct rb_node *p;
+
+   if (RB_EMPTY_ROOT(>pfn_list))
+   return;
+
+   for (p = rb_first(>pfn_list); p; p = rb_next(p)) {
+   struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node);
+
+   bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) / pgsize, 1);
+   }
+}
+
+static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, uint64_t pgsize)
+{
+   struct rb_node *n = rb_first(>dma_list);
+
+   for (; n; n = rb_next(n)) {
+   struct vfio_dma *dma = 

Re: [PATCH v8 00/74] per-CPU locks

2020-03-26 Thread Emilio G. Cota
(Apologies if I missed some Cc's; I was not Cc'ed in patch 0
 so I'm blindly crafting a reply.)

On Thu, Mar 26, 2020 at 15:30:43 -0400, Robert Foley wrote:
> This is a continuation of the series created by Emilio Cota.
> We are picking up this patch set with the goal to apply 
> any fixes or updates needed to get this accepted.

Thanks for picking this up!

> Listed below are the changes for this version of the patch, 
> aside from the merge related changes.
> 
> Changes for V8:
> - Fixed issue where in rr mode we could destroy the BQL twice.

I remember doing little to no testing in record-replay mode, so
there should be more bugs hiding in there :-)

> - Found/fixed bug that had been hit in testing previously during 
> the last consideration of this patch.
>  We reproduced the issue hit in the qtest: bios-tables-test.
>  The issue was introduced by dropping the BQL, and found us
>  (very rarely) missing the condition variable wakeup in
>  qemu_tcg_rr_cpu_thread_fn().

Aah, this one:
  https://patchwork.kernel.org/patch/10838149/#22516931
How did you identify the problem? Was it code inspection or using a tool
like rr? I remember this being hard to reproduce reliably.

On a related note, I've done some work to get QEMU-system to work
under thread sanitizer, since tsan now supports our longjmp-based
coroutines (hurrah!). My idea was to integrate tsan in QEMU (i.e.
bring tsan warnings to 0) before (re)trying to merge the
per-CPU lock patchset; this would minimize the potential for
regressions, which from my personal viewpoint seems like a reasonable
thing to do especially now that I have little time to work on QEMU.

If there's interest in doing the tsan work first, then I'd be
happy to send to the list as soon as this weekend the changes that
I have so far [1].

Thanks,
Emilio

[1] WIP branch: https://github.com/cota/qemu/commits/tsan



RE: [EXT] Re: [PATCH v9 4/9] virtio-iommu: set supported page size mask

2020-03-26 Thread Bharat Bhushan
Hi Eric,

> -Original Message-
> From: Auger Eric 
> Sent: Thursday, March 26, 2020 9:22 PM
> To: Bharat Bhushan ; peter.mayd...@linaro.org;
> pet...@redhat.com; eric.auger@gmail.com; alex.william...@redhat.com;
> kevin.t...@intel.com; m...@redhat.com; Tomasz Nowicki [C]
> ; drjo...@redhat.com; linuc.dec...@gmail.com; qemu-
> de...@nongnu.org; qemu-...@nongnu.org; bharatb.li...@gmail.com; jean-
> phili...@linaro.org; yang.zh...@intel.com
> Subject: [EXT] Re: [PATCH v9 4/9] virtio-iommu: set supported page size mask
> 
> External Email
> 
> --
> Hi Bharat,
> 
> On 3/23/20 9:46 AM, Bharat Bhushan wrote:
> > Add optional interface to set page size mask.
> > Currently this is set as a global configuration and not per endpoint.
> This allows to override the page size mask per end-point?

This patch adds per endpoint page-size-mask configuration in addition to global 
page-size-mask.
endpoint page-size-mask will override global page-size-mask configuration for 
that endpoint.

Thanks
-Bharat

> >
> > Signed-off-by: Bharat Bhushan 
> > ---
> >  include/hw/virtio/virtio-iommu.h | 1 +
> >  hw/virtio/virtio-iommu.c | 9 +
> >  2 files changed, 10 insertions(+)
> >
> > diff --git a/include/hw/virtio/virtio-iommu.h
> > b/include/hw/virtio/virtio-iommu.h
> > index 6f67f1020a..4efa09610a 100644
> > --- a/include/hw/virtio/virtio-iommu.h
> > +++ b/include/hw/virtio/virtio-iommu.h
> > @@ -35,6 +35,7 @@ typedef struct IOMMUDevice {
> >  void *viommu;
> >  PCIBus   *bus;
> >  int   devfn;
> > +uint64_t  page_size_mask;
> >  IOMMUMemoryRegion  iommu_mr;
> >  AddressSpace  as;
> >  } IOMMUDevice;
> > diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c index
> > 4cee8083bc..a28818202c 100644
> > --- a/hw/virtio/virtio-iommu.c
> > +++ b/hw/virtio/virtio-iommu.c
> > @@ -650,6 +650,14 @@ static gint int_cmp(gconstpointer a, gconstpointer b,
> gpointer user_data)
> >  return (ua > ub) - (ua < ub);
> >  }
> >
> > +static void virtio_iommu_set_page_size_mask(IOMMUMemoryRegion *mr,
> > +uint64_t page_size_mask)
> > +{
> > +IOMMUDevice *sdev = container_of(mr, IOMMUDevice, iommu_mr);
> > +
> > +sdev->page_size_mask = page_size_mask; }
> > +
> >  static void virtio_iommu_device_realize(DeviceState *dev, Error
> > **errp)  {
> >  VirtIODevice *vdev = VIRTIO_DEVICE(dev); @@ -865,6 +873,7 @@
> > static void virtio_iommu_memory_region_class_init(ObjectClass *klass,
> >  IOMMUMemoryRegionClass *imrc =
> IOMMU_MEMORY_REGION_CLASS(klass);
> >
> >  imrc->translate = virtio_iommu_translate;
> > +imrc->iommu_set_page_size_mask = virtio_iommu_set_page_size_mask;
> >  }
> >
> >  static const TypeInfo virtio_iommu_info = {
> >
> Thanks
> 
> Eric




Re: [PATCH v16 Kernel 4/7] vfio iommu: Implementation of ioctl for dirty pages tracking.

2020-03-26 Thread Kirti Wankhede




On 3/27/2020 6:00 AM, Yan Zhao wrote:

On Fri, Mar 27, 2020 at 05:39:01AM +0800, Kirti Wankhede wrote:



On 3/25/2020 7:41 AM, Yan Zhao wrote:

On Wed, Mar 25, 2020 at 05:18:52AM +0800, Kirti Wankhede wrote:

VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
- Start dirty pages tracking while migration is active
- Stop dirty pages tracking.
- Get dirty pages bitmap. It is the user space application's responsibility to
copy the content of dirty pages from source to destination during migration.

To prevent DoS attack, memory for bitmap is allocated per vfio_dma
structure. Bitmap size is calculated considering smallest supported page
size. Bitmap is allocated for all vfio_dmas when dirty logging is enabled

Bitmap is populated for already pinned pages when bitmap is allocated for
a vfio_dma with the smallest supported page size. Update bitmap from
pinning functions when tracking is enabled. When user application queries
bitmap, check if requested page size is same as page size used to
populated bitmap. If it is equal, copy bitmap, but if not equal, return
error.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
   drivers/vfio/vfio_iommu_type1.c | 266 
+++-
   1 file changed, 260 insertions(+), 6 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 70aeab921d0f..874a1a7ae925 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -71,6 +71,7 @@ struct vfio_iommu {
unsigned intdma_avail;
boolv2;
boolnesting;
+   booldirty_page_tracking;
   };
   
   struct vfio_domain {

@@ -91,6 +92,7 @@ struct vfio_dma {
boollock_cap;   /* capable(CAP_IPC_LOCK) */
struct task_struct  *task;
struct rb_root  pfn_list;   /* Ex-user pinned pfn list */
+   unsigned long   *bitmap;
   };
   
   struct vfio_group {

@@ -125,7 +127,21 @@ struct vfio_regions {
   #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)  \
(!list_empty(>domain_list))
   
+#define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)

+
+/*
+ * Input argument of number of bits to bitmap_set() is unsigned integer, which
+ * further casts to signed integer for unaligned multi-bit operation,
+ * __bitmap_set().
+ * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte,
+ * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page
+ * system.
+ */
+#define DIRTY_BITMAP_PAGES_MAX (uint64_t)(INT_MAX - 1)
+#define DIRTY_BITMAP_SIZE_MAX   DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
+
   static int put_pfn(unsigned long pfn, int prot);
+static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu);
   
   /*

* This code handles mapping and unmapping of user data buffers
@@ -175,6 +191,77 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, 
struct vfio_dma *old)
rb_erase(>node, >dma_list);
   }
   
+

+static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, uint64_t pgsize)
+{
+   uint64_t npages = dma->size / pgsize;
+
+   if (npages > DIRTY_BITMAP_PAGES_MAX)
+   return -EINVAL;
+
+   dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages), GFP_KERNEL);
+   if (!dma->bitmap)
+   return -ENOMEM;
+
+   return 0;
+}
+
+static void vfio_dma_bitmap_free(struct vfio_dma *dma)
+{
+   kfree(dma->bitmap);
+   dma->bitmap = NULL;
+}
+
+static void vfio_dma_populate_bitmap(struct vfio_dma *dma, uint64_t pgsize)
+{
+   struct rb_node *p;
+
+   if (RB_EMPTY_ROOT(>pfn_list))
+   return;
+
+   for (p = rb_first(>pfn_list); p; p = rb_next(p)) {
+   struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node);
+
+   bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) / pgsize, 1);
+   }
+}
+
+static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, uint64_t pgsize)
+{
+   struct rb_node *n = rb_first(>dma_list);
+
+   for (; n; n = rb_next(n)) {
+   struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
+   int ret;
+
+   ret = vfio_dma_bitmap_alloc(dma, pgsize);
+   if (ret) {
+   struct rb_node *p = rb_prev(n);
+
+   for (; p; p = rb_prev(p)) {
+   struct vfio_dma *dma = rb_entry(n,
+   struct vfio_dma, node);
+
+   vfio_dma_bitmap_free(dma);
+   }
+   return ret;
+   }
+   vfio_dma_populate_bitmap(dma, pgsize);
+   }
+   return 0;
+}
+
+static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu)
+{
+   struct rb_node *n = rb_first(>dma_list);
+
+   for (; n; n = rb_next(n)) {
+   

Re: [PATCH v16 Kernel 5/7] vfio iommu: Update UNMAP_DMA ioctl to get dirty bitmap before unmap

2020-03-26 Thread Kirti Wankhede




On 3/27/2020 5:34 AM, Yan Zhao wrote:

On Fri, Mar 27, 2020 at 05:39:44AM +0800, Kirti Wankhede wrote:



On 3/25/2020 7:48 AM, Yan Zhao wrote:

On Wed, Mar 25, 2020 at 03:32:37AM +0800, Kirti Wankhede wrote:

DMA mapped pages, including those pinned by mdev vendor drivers, might
get unpinned and unmapped while migration is active and device is still
running. For example, in pre-copy phase while guest driver could access
those pages, host device or vendor driver can dirty these mapped pages.
Such pages should be marked dirty so as to maintain memory consistency
for a user making use of dirty page tracking.

To get bitmap during unmap, user should allocate memory for bitmap, set
size of allocated memory, set page size to be considered for bitmap and
set flag VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
   drivers/vfio/vfio_iommu_type1.c | 54 
++---
   include/uapi/linux/vfio.h   | 10 
   2 files changed, 60 insertions(+), 4 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 27ed069c5053..b98a8d79e13a 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -982,7 +982,8 @@ static int verify_bitmap_size(uint64_t npages, uint64_t 
bitmap_size)
   }
   
   static int vfio_dma_do_unmap(struct vfio_iommu *iommu,

-struct vfio_iommu_type1_dma_unmap *unmap)
+struct vfio_iommu_type1_dma_unmap *unmap,
+struct vfio_bitmap *bitmap)
   {
uint64_t mask;
struct vfio_dma *dma, *dma_last = NULL;
@@ -1033,6 +1034,10 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
 * will be returned if these conditions are not met.  The v2 interface
 * will only return success and a size of zero if there were no
 * mappings within the range.
+*
+* When VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP flag is set, unmap request
+* must be for single mapping. Multiple mappings with this flag set is
+* not supported.
 */
if (iommu->v2) {
dma = vfio_find_dma(iommu, unmap->iova, 1);
@@ -1040,6 +1045,13 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
ret = -EINVAL;
goto unlock;
}
+
+   if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
+   (dma->iova != unmap->iova || dma->size != unmap->size)) {

potential NULL pointer!

And could you address the comments in v14?
How to handle DSI unmaps in vIOMMU
(https://lore.kernel.org/kvm/20200323011041.GB5456@joy-OptiPlex-7040/)



Sorry, I drafted reply to it, but I missed to send, it remained in my drafts

  >
  > it happens in vIOMMU Domain level invalidation of IOTLB
  > (domain-selective invalidation, see vtd_iotlb_domain_invalidate() in
qemu).
  > common in VTD lazy mode, and NOT just happening once at boot time.
  > rather than invalidate page by page, it batches the page invalidation.
  > so, when this invalidation takes place, even higher level page tables
  > have been invalid and therefore it has to invalidate a bigger
combined range.
  > That's why we see IOVAs are mapped in 4k pages, but are unmapped in 2M
  > pages.
  >
  > I think those UNMAPs should also have GET_DIRTY_BIMTAP flag on, right?


vtd_iotlb_domain_invalidate()
vtd_sync_shadow_page_table()
  vtd_sync_shadow_page_table_range(vtd_as, , 0, UINT64_MAX)
vtd_page_walk()
  vtd_page_walk_level() - walk over specific level for IOVA range
vtd_page_walk_one()
  memory_region_notify_iommu()
  ...
vfio_iommu_map_notify()

In the above trace, isn't page walk will take care of creating proper
IOTLB entry which should be same as created during mapping for that
IOTLB entry?


No. It does walk the page table, but as it's dsi (delay & batched unmap),
pages table entry for a whole 2M (the higher level, not last level for 4K)
range is invalid, so the iotlb->addr_mask what vfio_iommu_map_notify()
receives is (2M - 1), not the same as the size for map.



When do this happen? during my testing I never hit this case. How can I 
hit this case?


In this case, will adjacent whole vfio_dmas will be clubbed together or 
will there be any intersection of vfio_dmas?


Thanks,
Kirti



[PATCH v2 1/2] virtio-blk: delete vqs on the error path in realize()

2020-03-26 Thread Pan Nengyuan
The virtio vqs were not freed on the error path in realize(). Fix that.

The asan stack:
Direct leak of 14336 byte(s) in 1 object(s) allocated from:
#0 0x7f58b93fd970 in __interceptor_calloc (/lib64/libasan.so.5+0xef970)
#1 0x7f58b858249d in g_malloc0 (/lib64/libglib-2.0.so.0+0x5249d)
#2 0x5562cc627f49 in virtio_add_queue /mnt/sdb/qemu/hw/virtio/virtio.c:2413
#3 0x5562cc4b524a in virtio_blk_device_realize 
/mnt/sdb/qemu/hw/block/virtio-blk.c:1202
#4 0x5562cc613050 in virtio_device_realize 
/mnt/sdb/qemu/hw/virtio/virtio.c:3615
#5 0x5562ccb7a568 in device_set_realized /mnt/sdb/qemu/hw/core/qdev.c:891
#6 0x5562cd39cd45 in property_set_bool /mnt/sdb/qemu/qom/object.c:2238

Reported-by: Euler Robot 
Signed-off-by: Pan Nengyuan 
---
v2->v1:
- Fix incorrect free in v1, it will cause a uaf.
---
Cc: Stefan Hajnoczi 
Cc: Kevin Wolf 
Cc: Max Reitz 
Cc: qemu-bl...@nongnu.org
---
 hw/block/virtio-blk.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index 142863a3b2..97ba8a2187 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -1204,6 +1204,9 @@ static void virtio_blk_device_realize(DeviceState *dev, 
Error **errp)
 virtio_blk_data_plane_create(vdev, conf, >dataplane, );
 if (err != NULL) {
 error_propagate(errp, err);
+for (i = 0; i < conf->num_queues; i++) {
+virtio_del_queue(vdev, i);
+}
 virtio_cleanup(vdev);
 return;
 }
-- 
2.18.2




[PATCH v2 0/2] fix two virtio queues memleak

2020-03-26 Thread Pan Nengyuan
This series fix two vqs leak:
1. Do delete vqs on the error path in virtio_blk_device_realize().
2. Do delete vqs in virtio_iommu_device_unrealize() to fix another leaks.

v2->v1:
- Fix incorrect free in v1, it will cause a uaf.

Pan Nengyuan (2):
  virtio-blk: delete vqs on the error path in realize()
  virtio-iommu: delete vqs in unrealize to fix memleak

 hw/block/virtio-blk.c| 3 +++
 hw/virtio/virtio-iommu.c | 2 ++
 2 files changed, 5 insertions(+)

-- 
2.18.2




[PATCH v2 2/2] virtio-iommu: delete vqs in unrealize to fix memleak

2020-03-26 Thread Pan Nengyuan
req_vq/event_vq forgot to free in unrealize. Fix that.

Signed-off-by: Pan Nengyuan 
---
Cc: Eric Auger 
---
 hw/virtio/virtio-iommu.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c
index 4cee8083bc..9d2ff0693c 100644
--- a/hw/virtio/virtio-iommu.c
+++ b/hw/virtio/virtio-iommu.c
@@ -696,6 +696,8 @@ static void virtio_iommu_device_unrealize(DeviceState *dev, 
Error **errp)
 g_tree_destroy(s->domains);
 g_tree_destroy(s->endpoints);
 
+virtio_delete_queue(s->req_vq);
+virtio_delete_queue(s->event_vq);
 virtio_cleanup(vdev);
 }
 
-- 
2.18.2




Re: [PATCH 0/2] fix two virtio queues memleak

2020-03-26 Thread no-reply
Patchew URL: 
https://patchew.org/QEMU/20200327035650.2085-1-pannengy...@huawei.com/



Hi,

This series failed the asan build test. Please find the testing commands and
their output below. If you have Docker installed, you can probably reproduce it
locally.

=== TEST SCRIPT BEGIN ===
#!/bin/bash
export ARCH=x86_64
make docker-image-fedora V=1 NETWORK=1
time make docker-test-debug@fedora TARGET_LIST=x86_64-softmmu J=14 NETWORK=1
=== TEST SCRIPT END ===

PASS 1 fdc-test /x86_64/fdc/cmos
PASS 2 fdc-test /x86_64/fdc/no_media_on_start
PASS 3 fdc-test /x86_64/fdc/read_without_media
==6155==WARNING: ASan doesn't fully support makecontext/swapcontext functions 
and may produce false positives in some cases!
PASS 4 fdc-test /x86_64/fdc/media_change
PASS 5 fdc-test /x86_64/fdc/sense_interrupt
PASS 6 fdc-test /x86_64/fdc/relative_seek
---
PASS 32 test-opts-visitor /visitor/opts/range/beyond
PASS 33 test-opts-visitor /visitor/opts/dict/unvisited
MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}  
tests/test-coroutine -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl 
--test-name="test-coroutine" 
==6213==WARNING: ASan doesn't fully support makecontext/swapcontext functions 
and may produce false positives in some cases!
==6213==WARNING: ASan is ignoring requested __asan_handle_no_return: stack top: 
0x7ffdf7e26000; bottom 0x7fb5d76bf000; size: 0x004820767000 (309782278144)
False positive error reports may follow
For details see https://github.com/google/sanitizers/issues/189
PASS 1 test-coroutine /basic/no-dangling-access
---
PASS 12 fdc-test /x86_64/fdc/read_no_dma_19
PASS 13 fdc-test /x86_64/fdc/fuzz-registers
MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}  
QTEST_QEMU_BINARY=x86_64-softmmu/qemu-system-x86_64 QTEST_QEMU_IMG=qemu-img 
tests/qtest/ide-test -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl 
--test-name="ide-test" 
==6236==WARNING: ASan doesn't fully support makecontext/swapcontext functions 
and may produce false positives in some cases!
PASS 1 ide-test /x86_64/ide/identify
==6228==WARNING: ASan doesn't fully support makecontext/swapcontext functions 
and may produce false positives in some cases!
PASS 14 test-aio /aio/timer/schedule
PASS 15 test-aio /aio/coroutine/queue-chaining
PASS 16 test-aio /aio-gsource/flush
---
PASS 25 test-aio /aio-gsource/event/wait
PASS 26 test-aio /aio-gsource/event/flush
PASS 27 test-aio /aio-gsource/event/wait/no-flush-cb
==6242==WARNING: ASan doesn't fully support makecontext/swapcontext functions 
and may produce false positives in some cases!
PASS 2 ide-test /x86_64/ide/flush
==6248==WARNING: ASan doesn't fully support makecontext/swapcontext functions 
and may produce false positives in some cases!
PASS 3 ide-test /x86_64/ide/bmdma/simple_rw
PASS 28 test-aio /aio-gsource/timer/schedule
MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}  
tests/test-aio-multithread -m=quick -k --tap < /dev/null | 
./scripts/tap-driver.pl --test-name="test-aio-multithread" 
==6254==WARNING: ASan doesn't fully support makecontext/swapcontext functions 
and may produce false positives in some cases!
==6260==WARNING: ASan doesn't fully support makecontext/swapcontext functions 
and may produce false positives in some cases!
PASS 1 test-aio-multithread /aio/multi/lifecycle
PASS 4 ide-test /x86_64/ide/bmdma/trim
==6274==WARNING: ASan doesn't fully support makecontext/swapcontext functions 
and may produce false positives in some cases!
PASS 2 test-aio-multithread /aio/multi/schedule
PASS 3 test-aio-multithread /aio/multi/mutex/contended
PASS 4 test-aio-multithread /aio/multi/mutex/handoff
PASS 5 test-aio-multithread /aio/multi/mutex/mcs
==6295==WARNING: ASan doesn't fully support makecontext/swapcontext functions 
and may produce false positives in some cases!
PASS 6 test-aio-multithread /aio/multi/mutex/pthread
MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}  
tests/test-throttle -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl 
--test-name="test-throttle" 
==6307==WARNING: ASan doesn't fully support makecontext/swapcontext functions 
and may produce false positives in some cases!
PASS 1 test-throttle /throttle/leak_bucket
PASS 2 test-throttle /throttle/compute_wait
PASS 3 test-throttle /throttle/init
---
PASS 14 test-throttle /throttle/config/max
PASS 15 test-throttle /throttle/config/iops_size
MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}  
tests/test-thread-pool -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl 
--test-name="test-thread-pool" 
==6311==WARNING: ASan doesn't fully support makecontext/swapcontext functions 
and may produce false positives in some cases!
PASS 1 test-thread-pool /thread-pool/submit
PASS 2 test-thread-pool /thread-pool/submit-aio
PASS 3 test-thread-pool /thread-pool/submit-co
---
PASS 2 test-hbitmap /hbitmap/size/0
PASS 3 test-hbitmap /hbitmap/size/unaligned
PASS 4 test-hbitmap /hbitmap/iter/empty
==6378==WARNING: ASan doesn't fully support 

Re: [PATCH 0/2] fix two virtio queues memleak

2020-03-26 Thread no-reply
Patchew URL: 
https://patchew.org/QEMU/20200327035650.2085-1-pannengy...@huawei.com/



Hi,

This series failed the docker-quick@centos7 build test. Please find the testing 
commands and
their output below. If you have Docker installed, you can probably reproduce it
locally.

=== TEST SCRIPT BEGIN ===
#!/bin/bash
make docker-image-centos7 V=1 NETWORK=1
time make docker-test-quick@centos7 SHOW_ENV=1 J=14 NETWORK=1
=== TEST SCRIPT END ===

  TESTcheck-qtest-x86_64: tests/qtest/hd-geo-test
Broken pipe
/tmp/qemu-test/src/tests/qtest/libqtest.c:175: kill_qemu() detected QEMU death 
from signal 11 (Segmentation fault) (core dumped)
ERROR - too few tests run (expected 17, got 16)
make: *** [check-qtest-x86_64] Error 1
make: *** Waiting for unfinished jobs
  TESTcheck-unit: tests/test-bufferiszero
  TESTcheck-unit: tests/test-uuid
---
Not run: 259
Passed all 116 iotests
**
ERROR:/tmp/qemu-test/src/tests/qtest/acpi-utils.c:145:acpi_find_rsdp_address_uefi:
 code should not be reached
ERROR - Bail out! 
ERROR:/tmp/qemu-test/src/tests/qtest/acpi-utils.c:145:acpi_find_rsdp_address_uefi:
 code should not be reached
make: *** [check-qtest-aarch64] Error 1
Traceback (most recent call last):
  File "./tests/docker/docker.py", line 664, in 
sys.exit(main())
---
raise CalledProcessError(retcode, cmd)
subprocess.CalledProcessError: Command '['sudo', '-n', 'docker', 'run', 
'--label', 'com.qemu.instance.uuid=f25aa92521b549bcaedc739ec3ab9398', '-u', 
'1001', '--security-opt', 'seccomp=unconfined', '--rm', '-e', 'TARGET_LIST=', 
'-e', 'EXTRA_CONFIGURE_OPTS=', '-e', 'V=', '-e', 'J=14', '-e', 'DEBUG=', '-e', 
'SHOW_ENV=1', '-e', 'CCACHE_DIR=/var/tmp/ccache', '-v', 
'/home/patchew/.cache/qemu-docker-ccache:/var/tmp/ccache:z', '-v', 
'/var/tmp/patchew-tester-tmp-hwog52fu/src/docker-src.2020-03-26-23.55.41.774:/var/tmp/qemu:z,ro',
 'qemu:centos7', '/var/tmp/qemu/run', 'test-quick']' returned non-zero exit 
status 2.
filter=--filter=label=com.qemu.instance.uuid=f25aa92521b549bcaedc739ec3ab9398
make[1]: *** [docker-run] Error 1
make[1]: Leaving directory `/var/tmp/patchew-tester-tmp-hwog52fu/src'
make: *** [docker-run-test-quick@centos7] Error 2

real    18m39.674s
user    0m9.442s


The full log is available at
http://patchew.org/logs/20200327035650.2085-1-pannengy...@huawei.com/testing.docker-quick@centos7/?type=message.
---
Email generated automatically by Patchew [https://patchew.org/].
Please send your feedback to patchew-de...@redhat.com

[Bug 1868527] Re: alignment may overlap the TLB flags

2020-03-26 Thread Hansni Bu
** Changed in: qemu
   Status: Incomplete => Invalid

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1868527

Title:
  alignment may overlap the TLB flags

Status in QEMU:
  Invalid

Bug description:
  Hi,
  In QEMU-4.2.0, or git-9b26a610936deaf436af9b7e39e4b7f0a35e4409, alignment may 
overlap the TLB flags. 
  For example, the alignment: MO_ALIGN_32,
  MO_ALIGN_32 = 5 << MO_ASHIFT,
  and the TLB flag: TLB_DISCARD_WRITE
  #define TLB_DISCARD_WRITE   (1 << (TARGET_PAGE_BITS_MIN - 6))

  then, in the function "get_alignment_bits", the assert may fail:

  #if defined(CONFIG_SOFTMMU)
  /* The requested alignment cannot overlap the TLB flags.  */
  tcg_debug_assert((TLB_FLAGS_MASK & ((1 << a) - 1)) == 0);
  #endif

  However, the alignment of MO_ALIGN_32 is not used for now, so the
  assert cannot be triggered in current version. Anyway it seems like a
  potential conflict.

To manage notifications about this bug go to:
https://bugs.launchpad.net/qemu/+bug/1868527/+subscriptions



Re: [PATCH v6 1/2] net: tulip: check frame size and r/w data length

2020-03-26 Thread Li Qiang
Jason Wang  于2020年3月27日周五 上午10:53写道:

>
> On 2020/3/27 上午10:35, Li Qiang wrote:
> >
> >
> > Jason Wang mailto:jasow...@redhat.com>>
> > 于2020年3月27日周五 上午10:09写道:
> >
> >
> > On 2020/3/24 下午10:54, Li Qiang wrote:
> > >
> > >
> > > Jason Wang mailto:jasow...@redhat.com>
> > >>
> > > 于2020年3月24日周二 下午1:45写道:
> > >
> > >
> > > On 2020/3/24 上午9:29, Li Qiang wrote:
> > > >
> > > >
> > > > P J P mailto:ppan...@redhat.com>
> > >
> > > 
> >  > > 于2020年3月23日周一
> > > > 下午8:24写道:
> > > >
> > > > From: Prasad J Pandit  > 
> > > >
> > > >  >   >  > > >
> > > > Tulip network driver while copying tx/rx buffers does
> > not check
> > > > frame size against r/w data length. This may lead to
> > OOB buffer
> > > > access. Add check to avoid it.
> > > >
> > > > Limit iterations over descriptors to avoid potential
> > infinite
> > > > loop issue in tulip_xmit_list_update.
> > > >
> > > > Reported-by: Li Qiang  > 
> > > >
> > > >  >   >  > > > Reported-by: Ziming Zhang  > 
> > > >
> > > > 
> >  > > > Reported-by: Jason Wang  > 
> > > >
> > > >  >   >  > > > Signed-off-by: Prasad J Pandit  > 
> > > >
> > > >  >   >  > > >
> > > >
> > > >
> > > > Tested-by: Li Qiang  >   > >
> > > 
> >  > > > But I have a minor question
> > > >
> > > > ---
> > > >  hw/net/tulip.c | 36 +++-
> > > >  1 file changed, 27 insertions(+), 9 deletions(-)
> > > >
> > > > Update v3: return a value from tulip_copy_tx_buffers()
> > and avoid
> > > > infinite loop
> > > >   ->
> > > >
> > https://lists.gnu.org/archive/html/qemu-devel/2020-02/msg06275.html
> > > >
> > > > diff --git a/hw/net/tulip.c b/hw/net/tulip.c
> > > > index cfac2719d3..fbe40095da 100644
> > > > --- a/hw/net/tulip.c
> > > > +++ b/hw/net/tulip.c
> > > > @@ -170,6 +170,10 @@ static void
> > tulip_copy_rx_bytes(TULIPState
> > > > *s, struct tulip_descriptor *desc)
> > > >  } else {
> > > >  len = s->rx_frame_len;
> > > >  }
> > > > +
> > > > +if (s->rx_frame_len + len >=
> > sizeof(s->rx_frame)) {
> > > > +return;
> > > > +}
> > > >
> > > >
> > > >
> > > > Why here is '>=' instead of '>'.
> > > > IIUC the total sending length can reach to
> > sizeof(s->rx_frame).
> > > > Same in the other place in this patch.
> > >
> > >
> > > Yes, this need to be fixed.
> > >
> > >
> > > >
> > > > PS: I have planned to write a qtest case. But my personal
> > qemu dev
> > > > environment is broken.
> > > > I will try to write it tonight or tomorrow night.
> > >
> > >
> > > Cool, good to know this.
> > >
> > >
> > > Hi all,
> > > I have countered an interesting issue. Let's look at the
> > definition 

[PATCH 1/2] virtio-blk: delete vqs on the error path in realize()

2020-03-26 Thread Pan Nengyuan
virtio_vqs forgot to free on the error path in realize(). Fix that.

The asan stack:
Direct leak of 14336 byte(s) in 1 object(s) allocated from:
#0 0x7f58b93fd970 in __interceptor_calloc (/lib64/libasan.so.5+0xef970)
#1 0x7f58b858249d in g_malloc0 (/lib64/libglib-2.0.so.0+0x5249d)
#2 0x5562cc627f49 in virtio_add_queue /mnt/sdb/qemu/hw/virtio/virtio.c:2413
#3 0x5562cc4b524a in virtio_blk_device_realize 
/mnt/sdb/qemu/hw/block/virtio-blk.c:1202
#4 0x5562cc613050 in virtio_device_realize 
/mnt/sdb/qemu/hw/virtio/virtio.c:3615
#5 0x5562ccb7a568 in device_set_realized /mnt/sdb/qemu/hw/core/qdev.c:891
#6 0x5562cd39cd45 in property_set_bool /mnt/sdb/qemu/qom/object.c:2238

Reported-by: Euler Robot 
Signed-off-by: Pan Nengyuan 
---
Cc: Stefan Hajnoczi 
Cc: Kevin Wolf 
Cc: Max Reitz 
Cc: qemu-bl...@nongnu.org
---
 hw/block/virtio-blk.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index 142863a3b2..a6682c2ced 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -1204,8 +1204,7 @@ static void virtio_blk_device_realize(DeviceState *dev, 
Error **errp)
 virtio_blk_data_plane_create(vdev, conf, >dataplane, );
 if (err != NULL) {
 error_propagate(errp, err);
-virtio_cleanup(vdev);
-return;
+goto fail;
 }
 
 s->change = qemu_add_vm_change_state_handler(virtio_blk_dma_restart_cb, s);
@@ -1218,6 +1217,11 @@ static void virtio_blk_device_realize(DeviceState *dev, 
Error **errp)
  conf->conf.lcyls,
  conf->conf.lheads,
  conf->conf.lsecs);
+fail:
+for (i = 0; i < conf->num_queues; i++) {
+virtio_del_queue(vdev, i);
+}
+virtio_cleanup(vdev);
 }
 
 static void virtio_blk_device_unrealize(DeviceState *dev, Error **errp)
-- 
2.18.2




[PATCH 2/2] virtio-iommu: delete vqs in unrealize to fix memleaks

2020-03-26 Thread Pan Nengyuan
req_vq/event_vq forgot to free in unrealize(). Fix that.

Signed-off-by: Pan Nengyuan 
---
Cc: Eric Auger 
---
 hw/virtio/virtio-iommu.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c
index 4cee8083bc..9d2ff0693c 100644
--- a/hw/virtio/virtio-iommu.c
+++ b/hw/virtio/virtio-iommu.c
@@ -696,6 +696,8 @@ static void virtio_iommu_device_unrealize(DeviceState *dev, 
Error **errp)
 g_tree_destroy(s->domains);
 g_tree_destroy(s->endpoints);
 
+virtio_delete_queue(s->req_vq);
+virtio_delete_queue(s->event_vq);
 virtio_cleanup(vdev);
 }
 
-- 
2.18.2




[PATCH 0/2] fix two virtio queues memleak

2020-03-26 Thread Pan Nengyuan
This series fix two vqs leak:
1. Do delete vqs on the error path in virtio_blk_device_realize().
2. Do delete vqs in virtio_iommu_device_unrealize() to fix another leaks.

Pan Nengyuan (2):
  virtio-blk: delete vqs on the error path in realize()
  virtio-iommu: delete vqs in unrealize to fix memleaks

 hw/block/virtio-blk.c| 8 ++--
 hw/virtio/virtio-iommu.c | 2 ++
 2 files changed, 8 insertions(+), 2 deletions(-)

-- 
2.18.2




[PULL SUBSYSTEM qemu-pseries] pseries: Update SLOF firmware image

2020-03-26 Thread Alexey Kardashevskiy
The following changes since commit 736cf607e40674776d752acc201f565723e86045:

  Update version for v5.0.0-rc0 release (2020-03-24 17:50:00 +)

are available in the Git repository at:

  g...@github.com:aik/qemu.git tags/qemu-slof-20200327

for you to fetch changes up to 78b145a0330b9c44478f7404b97a710e692bfc96:

  pseries: Update SLOF firmware image (2020-03-27 13:58:00 +1100)


Alexey Kardashevskiy (1):
  pseries: Update SLOF firmware image

 pc-bios/README   |   2 +-
 pc-bios/slof.bin | Bin 965008 -> 965112 bytes
 roms/SLOF|   2 +-
 3 files changed, 2 insertions(+), 2 deletions(-)


*** Note: this is not for master, this is for pseries

This is a single regression fix for 5.0:

Greg Kurz (1):
  slof: Only close stdout for virtio-serial devices




Re: [PATCH v6 1/2] net: tulip: check frame size and r/w data length

2020-03-26 Thread Jason Wang



On 2020/3/27 上午10:35, Li Qiang wrote:



Jason Wang mailto:jasow...@redhat.com>> 
于2020年3月27日周五 上午10:09写道:



On 2020/3/24 下午10:54, Li Qiang wrote:
>
>
> Jason Wang mailto:jasow...@redhat.com>
>>
> 于2020年3月24日周二 下午1:45写道:
>
>
>     On 2020/3/24 上午9:29, Li Qiang wrote:
>     >
>     >
>     > P J P mailto:ppan...@redhat.com>
>
>     
     于2020年3月23日周一
>     > 下午8:24写道:
>     >
>     >     From: Prasad J Pandit mailto:p...@fedoraproject.org>
>     >
>     >           >
>     >     Tulip network driver while copying tx/rx buffers does
not check
>     >     frame size against r/w data length. This may lead to
OOB buffer
>     >     access. Add check to avoid it.
>     >
>     >     Limit iterations over descriptors to avoid potential
infinite
>     >     loop issue in tulip_xmit_list_update.
>     >
>     >     Reported-by: Li Qiang mailto:pangpei...@antfin.com>
>     >
>     >           >     Reported-by: Ziming Zhang mailto:ezrak...@gmail.com>
>     >
>     >     
     >     Reported-by: Jason Wang mailto:jasow...@redhat.com>
>     >
>     >           >     Signed-off-by: Prasad J Pandit mailto:p...@fedoraproject.org>
>     >
>     >           >
>     >
>     >
>     > Tested-by: Li Qiang mailto:liq...@gmail.com> >
>     
     > But I have a minor question
>     >
>     >     ---
>     >      hw/net/tulip.c | 36 +++-
>     >      1 file changed, 27 insertions(+), 9 deletions(-)
>     >
>     >     Update v3: return a value from tulip_copy_tx_buffers()
and avoid
>     >     infinite loop
>     >       ->
>     >
https://lists.gnu.org/archive/html/qemu-devel/2020-02/msg06275.html
>     >
>     >     diff --git a/hw/net/tulip.c b/hw/net/tulip.c
>     >     index cfac2719d3..fbe40095da 100644
>     >     --- a/hw/net/tulip.c
>     >     +++ b/hw/net/tulip.c
>     >     @@ -170,6 +170,10 @@ static void
tulip_copy_rx_bytes(TULIPState
>     >     *s, struct tulip_descriptor *desc)
>     >              } else {
>     >                  len = s->rx_frame_len;
>     >              }
>     >     +
>     >     +        if (s->rx_frame_len + len >=
sizeof(s->rx_frame)) {
>     >     +            return;
>     >     +        }
>     >
>     >
>     >
>     > Why here is '>=' instead of '>'.
>     > IIUC the total sending length can reach to
sizeof(s->rx_frame).
>     > Same in the other place in this patch.
>
>
>     Yes, this need to be fixed.
>
>
>     >
>     > PS: I have planned to write a qtest case. But my personal
qemu dev
>     > environment is broken.
>     > I will try to write it tonight or tomorrow night.
>
>
>     Cool, good to know this.
>
>
> Hi all,
> I have countered an interesting issue. Let's look at the
definition of
> TULIPState.
>
>   21 typedef struct TULIPState {
>   22     PCIDevice dev;
>   23     MemoryRegion io;
>   24     MemoryRegion memory;
>   25     NICConf c;
>   26     qemu_irq irq;
>   27     NICState *nic;
>   28     eeprom_t *eeprom;
>   29     uint32_t csr[16];
>   30
>   31     /* state for MII */
>   32     uint32_t old_csr9;
>   33     uint32_t mii_word;
>   34     uint32_t mii_bitcnt;
>   35
>   36     hwaddr current_rx_desc;
>   37     hwaddr current_tx_desc;
  

Re: [PATCH v6 1/2] net: tulip: check frame size and r/w data length

2020-03-26 Thread Li Qiang
Jason Wang  于2020年3月27日周五 上午10:09写道:

>
> On 2020/3/24 下午10:54, Li Qiang wrote:
> >
> >
> > Jason Wang mailto:jasow...@redhat.com>>
> > 于2020年3月24日周二 下午1:45写道:
> >
> >
> > On 2020/3/24 上午9:29, Li Qiang wrote:
> > >
> > >
> > > P J P mailto:ppan...@redhat.com>
> > >>
> > 于2020年3月23日周一
> > > 下午8:24写道:
> > >
> > > From: Prasad J Pandit  > 
> > > >>
> > >
> > > Tulip network driver while copying tx/rx buffers does not check
> > > frame size against r/w data length. This may lead to OOB buffer
> > > access. Add check to avoid it.
> > >
> > > Limit iterations over descriptors to avoid potential infinite
> > > loop issue in tulip_xmit_list_update.
> > >
> > > Reported-by: Li Qiang  > 
> > > >>
> > > Reported-by: Ziming Zhang  > 
> > > >>
> > > Reported-by: Jason Wang  > 
> > > >>
> > > Signed-off-by: Prasad J Pandit  > 
> > > >>
> > >
> > >
> > >
> > > Tested-by: Li Qiang mailto:liq...@gmail.com>
> > >>
> > > But I have a minor question
> > >
> > > ---
> > >  hw/net/tulip.c | 36 +++-
> > >  1 file changed, 27 insertions(+), 9 deletions(-)
> > >
> > > Update v3: return a value from tulip_copy_tx_buffers() and
> avoid
> > > infinite loop
> > >   ->
> > >
> https://lists.gnu.org/archive/html/qemu-devel/2020-02/msg06275.html
> > >
> > > diff --git a/hw/net/tulip.c b/hw/net/tulip.c
> > > index cfac2719d3..fbe40095da 100644
> > > --- a/hw/net/tulip.c
> > > +++ b/hw/net/tulip.c
> > > @@ -170,6 +170,10 @@ static void tulip_copy_rx_bytes(TULIPState
> > > *s, struct tulip_descriptor *desc)
> > >  } else {
> > >  len = s->rx_frame_len;
> > >  }
> > > +
> > > +if (s->rx_frame_len + len >= sizeof(s->rx_frame)) {
> > > +return;
> > > +}
> > >
> > >
> > >
> > > Why here is '>=' instead of '>'.
> > > IIUC the total sending length can reach to sizeof(s->rx_frame).
> > > Same in the other place in this patch.
> >
> >
> > Yes, this need to be fixed.
> >
> >
> > >
> > > PS: I have planned to write a qtest case. But my personal qemu dev
> > > environment is broken.
> > > I will try to write it tonight or tomorrow night.
> >
> >
> > Cool, good to know this.
> >
> >
> > Hi all,
> > I have countered an interesting issue. Let's look at the definition of
> > TULIPState.
> >
> >   21 typedef struct TULIPState {
> >   22 PCIDevice dev;
> >   23 MemoryRegion io;
> >   24 MemoryRegion memory;
> >   25 NICConf c;
> >   26 qemu_irq irq;
> >   27 NICState *nic;
> >   28 eeprom_t *eeprom;
> >   29 uint32_t csr[16];
> >   30
> >   31 /* state for MII */
> >   32 uint32_t old_csr9;
> >   33 uint32_t mii_word;
> >   34 uint32_t mii_bitcnt;
> >   35
> >   36 hwaddr current_rx_desc;
> >   37 hwaddr current_tx_desc;
> >   38
> >   39 uint8_t rx_frame[2048];
> >   40 uint8_t tx_frame[2048];
> >   41 uint16_t tx_frame_len;
> >   42 uint16_t rx_frame_len;
> >   43 uint16_t rx_frame_size;
> >   44
> >   45 uint32_t rx_status;
> >   46 uint8_t filter[16][6];
> >   47 } TULIPState;
> >
> > Here we can see the overflow is occured after 'tx_frame'.
> > In my quest, I have see the overflow(the s->tx_frame_len is big).
> > However here doesn't cause SEGV in qtest.
> > In real case, the qemu process will access the data after TULIPState
> > in heap and trigger segv.
> > However in qtest mode I don't know how to trigger this.
>
>
> If it's just the mangling of tx_frame_len, it won't hit SIGSEGV.
>
> I wonder if maybe, somehow, that large tx_frame_len is used for buffer
> copying or other stuff that can lead to the crash.
>

This is because in a real qemu process, the OOB copy corrupts the heap data
after the 'TULIPState' struct.
And maybe later (another thread) accesses the corrupted data, thus leading to a crash.
However in qtest mode, I don't remember the core code of qtest, but it seems
it's not really a VM — just an interface emulation.

In my case, it's backtrace is as this:
Program received signal SIGSEGV, Segmentation fault.
[Switching to Thread 0x7ffbdb7fe700 

[PATCH] hw/vfio: let readonly flag take effect for mmaped regions

2020-03-26 Thread yan . y . zhao
From: Yan Zhao 

currently, vfio regions without VFIO_REGION_INFO_FLAG_WRITE are only
read-only when VFIO_REGION_INFO_FLAG_MMAP is not set.

regions with flag VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_MMAP
are only read-only in host page table for qemu.

This patch sets corresponding ept page entries read-only for regions
with flag VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_MMAP.

accordingly, it ignores guest write when guest writes to the read-only
regions are trapped.

Signed-off-by: Yan Zhao 
Signed-off-by: Xin Zeng 
---
 hw/vfio/common.c | 4 
 memory.c | 3 +++
 2 files changed, 7 insertions(+)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 0b3593b3c0..e901621ca0 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -971,6 +971,10 @@ int vfio_region_mmap(VFIORegion *region)
   name, region->mmaps[i].size,
   region->mmaps[i].mmap);
 g_free(name);
+
+if (!(region->flags & VFIO_REGION_INFO_FLAG_WRITE)) {
+memory_region_set_readonly(>mmaps[i].mem, true);
+}
 memory_region_add_subregion(region->mem, region->mmaps[i].offset,
 >mmaps[i].mem);
 
diff --git a/memory.c b/memory.c
index 601b749906..4b1071dc74 100644
--- a/memory.c
+++ b/memory.c
@@ -1313,6 +1313,9 @@ static void memory_region_ram_device_write(void *opaque, 
hwaddr addr,
 MemoryRegion *mr = opaque;
 
 trace_memory_region_ram_device_write(get_cpu_index(), mr, addr, data, 
size);
+if (mr->readonly) {
+return;
+}
 
 switch (size) {
 case 1:
-- 
2.17.1




Re: [PATCH v6 0/7] reference implementation of RSS and hash report

2020-03-26 Thread Jason Wang



On 2020/3/26 下午9:32, Michael S. Tsirkin wrote:

On Fri, Mar 20, 2020 at 01:57:44PM +0200, Yuri Benditovich wrote:

Support for VIRTIO_NET_F_RSS and VIRTIO_NET_F_HASH_REPORT
features in QEMU for reference purpose.
Implements Toeplitz hash calculation for incoming
packets according to configuration provided by driver.
Uses calculated hash for decision on receive virtqueue
and/or reports the hash in the virtio header

Series:

Reviewed-by: Michael S. Tsirkin 



Queued for 5.1.

Thanks





Changes from v5:
RSS migration state moved to subsection and migrated
only if enabled (patch 7)
Updated sign off (patch 6)

Yuri Benditovich (7):
   virtio-net: introduce RSS and hash report features
   virtio-net: implement RSS configuration command
   virtio-net: implement RX RSS processing
   tap: allow extended virtio header with hash info
   virtio-net: reference implementation of hash report
   vmstate.h: provide VMSTATE_VARRAY_UINT16_ALLOC macro
   virtio-net: add migration support for RSS and hash report

  hw/net/trace-events|   3 +
  hw/net/virtio-net.c| 448 +++--
  include/hw/virtio/virtio-net.h |  16 ++
  include/migration/vmstate.h|  10 +
  net/tap.c  |  11 +-
  5 files changed, 460 insertions(+), 28 deletions(-)

--
2.17.1





Re: [PATCH] hw/net/i82596.c: Avoid reading off end of buffer in i82596_receive()

2020-03-26 Thread Jason Wang



On 2020/3/27 上午5:11, Peter Maydell wrote:

On Tue, 17 Mar 2020 at 06:13, Jason Wang  wrote:

On 2020/3/13 上午4:16, Peter Maydell wrote:

The i82596_receive() function attempts to pass the guest a buffer
which is effectively the concatenation of the data it is passed and a
4 byte CRC value.  However, rather than implementing this as "write
the data; then write the CRC" it instead bumps the length value of
the data by 4, and writes 4 extra bytes from beyond the end of the
buffer, which it then overwrites with the CRC.  It also assumed that
we could always fit all four bytes of the CRC into the final receive
buffer, which might not be true if the CRC needs to be split over two
receive buffers.

Applied.

Hi Jason -- this doesn't seem to have reached master yet.
Has it gotten lost somewhere along the line?

thanks
-- PMM



Nope, it's in my queue.

I will send a pull request shortly.

Thanks




Re: [PATCH v6 1/2] net: tulip: check frame size and r/w data length

2020-03-26 Thread Jason Wang



On 2020/3/24 下午10:54, Li Qiang wrote:



Jason Wang mailto:jasow...@redhat.com>> 
于2020年3月24日周二 下午1:45写道:



On 2020/3/24 上午9:29, Li Qiang wrote:
>
>
> P J P mailto:ppan...@redhat.com>
>>
于2020年3月23日周一
> 下午8:24写道:
>
>     From: Prasad J Pandit mailto:p...@fedoraproject.org>
>     >>
>
>     Tulip network driver while copying tx/rx buffers does not check
>     frame size against r/w data length. This may lead to OOB buffer
>     access. Add check to avoid it.
>
>     Limit iterations over descriptors to avoid potential infinite
>     loop issue in tulip_xmit_list_update.
>
>     Reported-by: Li Qiang mailto:pangpei...@antfin.com>
>     >>
>     Reported-by: Ziming Zhang mailto:ezrak...@gmail.com>
>     >>
>     Reported-by: Jason Wang mailto:jasow...@redhat.com>
>     >>
>     Signed-off-by: Prasad J Pandit mailto:p...@fedoraproject.org>
>     >>
>
>
>
> Tested-by: Li Qiang mailto:liq...@gmail.com>
>>
> But I have a minor question
>
>     ---
>      hw/net/tulip.c | 36 +++-
>      1 file changed, 27 insertions(+), 9 deletions(-)
>
>     Update v3: return a value from tulip_copy_tx_buffers() and avoid
>     infinite loop
>       ->
> https://lists.gnu.org/archive/html/qemu-devel/2020-02/msg06275.html
>
>     diff --git a/hw/net/tulip.c b/hw/net/tulip.c
>     index cfac2719d3..fbe40095da 100644
>     --- a/hw/net/tulip.c
>     +++ b/hw/net/tulip.c
>     @@ -170,6 +170,10 @@ static void tulip_copy_rx_bytes(TULIPState
>     *s, struct tulip_descriptor *desc)
>              } else {
>                  len = s->rx_frame_len;
>              }
>     +
>     +        if (s->rx_frame_len + len >= sizeof(s->rx_frame)) {
>     +            return;
>     +        }
>
>
>
> Why here is '>=' instead of '>'.
> IIUC the total sending length can reach to sizeof(s->rx_frame).
> Same in the other place in this patch.


Yes, this need to be fixed.


>
> PS: I have planned to write a qtest case. But my personal qemu dev
> environment is broken.
> I will try to write it tonight or tomorrow night.


Cool, good to know this.


Hi all,
I have countered an interesting issue. Let's look at the definition of 
TULIPState.


  21 typedef struct TULIPState {
  22     PCIDevice dev;
  23     MemoryRegion io;
  24     MemoryRegion memory;
  25     NICConf c;
  26     qemu_irq irq;
  27     NICState *nic;
  28     eeprom_t *eeprom;
  29     uint32_t csr[16];
  30
  31     /* state for MII */
  32     uint32_t old_csr9;
  33     uint32_t mii_word;
  34     uint32_t mii_bitcnt;
  35
  36     hwaddr current_rx_desc;
  37     hwaddr current_tx_desc;
  38
  39     uint8_t rx_frame[2048];
  40     uint8_t tx_frame[2048];
  41     uint16_t tx_frame_len;
  42     uint16_t rx_frame_len;
  43     uint16_t rx_frame_size;
  44
  45     uint32_t rx_status;
  46     uint8_t filter[16][6];
  47 } TULIPState;

Here we can see the overflow is occured after 'tx_frame'.
In my quest, I have see the overflow(the s->tx_frame_len is big).
However here doesn't cause SEGV in qtest.
In real case, the qemu process will access the data after TULIPState 
in heap and trigger segv.

However in qtest mode I don't know how to trigger this.



If it's just the mangling of tx_frame_len, it won't hit SIGSEGV.

I wonder if maybe, somehow, that large tx_frame_len is used for buffer 
copying or other stuff that can lead to the crash.


Thanks




The core code like this:

 qpci_device_enable(dev);
bar = qpci_iomap(dev, 0, NULL);
    context_pa = guest_alloc(alloc, sizeof(context));
    guest_pa = guest_alloc(alloc, 4096);
memset(guest_data, 'A', sizeof(guest_data));
    context[0].status = 1 << 31;
context[0].control = 0x7ff << 11 | 0x7ff;
context[0].buf_addr2 = context_pa + sizeof(struct tulip_descriptor);
context[0].buf_addr1 = guest_pa;
    for (i = 1; i < ARRAY_SIZE(context); ++i) {
        context_pa += sizeof(struct tulip_descriptor);
        context[i].status = 1 << 31;
context[i].control = 0x7ff << 11 | 0x7ff;
context[i].buf_addr2 = context_pa + sizeof(struct tulip_descriptor);
context[i].buf_addr1 = guest_pa;
}

qtest_memwrite(dev->bus->qts, context_pa, context, sizeof(context));
qtest_memwrite(dev->bus->qts, guest_pa, guest_data, sizeof(guest_data));
qpci_io_writel(dev, bar, 0x20, context_pa);
qpci_io_writel(dev, bar, 0x30, 1 << 13);

Paolo may give some hints?

Thanks,
Li Qiang


Re: [PATCH] hw/net/allwinner-sun8i-emac.c: Fix REG_ADDR_HIGH/LOW reads

2020-03-26 Thread Jason Wang



On 2020/3/25 上午5:21, Peter Maydell wrote:

Coverity points out (CID 1421926) that the read code for
REG_ADDR_HIGH reads off the end of the buffer, because it does a
32-bit read from byte 4 of a 6-byte buffer.

The code also has an endianness issue for both REG_ADDR_HIGH and
REG_ADDR_LOW, because it will do the wrong thing on a big-endian
host.

Rewrite the read code to use ldl_le_p() and lduw_le_p() to fix this;
the write code is not incorrect, but for consistency we make it use
stl_le_p() and stw_le_p().

Signed-off-by: Peter Maydell 
---
  hw/net/allwinner-sun8i-emac.c | 12 
  1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/hw/net/allwinner-sun8i-emac.c b/hw/net/allwinner-sun8i-emac.c
index 3fc5e346401..fc67a1be70a 100644
--- a/hw/net/allwinner-sun8i-emac.c
+++ b/hw/net/allwinner-sun8i-emac.c
@@ -611,10 +611,10 @@ static uint64_t allwinner_sun8i_emac_read(void *opaque, 
hwaddr offset,
  value = s->mii_data;
  break;
  case REG_ADDR_HIGH: /* MAC Address High */
-value = *(((uint32_t *) (s->conf.macaddr.a)) + 1);
+value = lduw_le_p(s->conf.macaddr.a + 4);
  break;
  case REG_ADDR_LOW:  /* MAC Address Low */
-value = *(uint32_t *) (s->conf.macaddr.a);
+value = ldl_le_p(s->conf.macaddr.a);
  break;
  case REG_TX_DMA_STA:/* Transmit DMA Status */
  break;
@@ -728,14 +728,10 @@ static void allwinner_sun8i_emac_write(void *opaque, 
hwaddr offset,
  s->mii_data = value;
  break;
  case REG_ADDR_HIGH: /* MAC Address High */
-s->conf.macaddr.a[4] = (value & 0xff);
-s->conf.macaddr.a[5] = (value & 0xff00) >> 8;
+stw_le_p(s->conf.macaddr.a + 4, value);
  break;
  case REG_ADDR_LOW:  /* MAC Address Low */
-s->conf.macaddr.a[0] = (value & 0xff);
-s->conf.macaddr.a[1] = (value & 0xff00) >> 8;
-s->conf.macaddr.a[2] = (value & 0xff) >> 16;
-s->conf.macaddr.a[3] = (value & 0xff00) >> 24;
+stl_le_p(s->conf.macaddr.a, value);
  break;
  case REG_TX_DMA_STA:/* Transmit DMA Status */
  case REG_TX_CUR_DESC:   /* Transmit Current Descriptor */



Applied.

Thanks





Re: [PATCH v7] net: tulip: check frame size and r/w data length

2020-03-26 Thread Jason Wang



On 2020/3/26 下午7:11, Li Qiang wrote:



P J P mailto:ppan...@redhat.com>> 于2020年3月25日周三 
上午1:31写道:


From: Prasad J Pandit mailto:p...@fedoraproject.org>>

Tulip network driver while copying tx/rx buffers does not check
frame size against r/w data length. This may lead to OOB buffer
access. Add check to avoid it.

Limit iterations over descriptors to avoid potential infinite
loop issue in tulip_xmit_list_update.

Reported-by: Li Qiang mailto:pangpei...@antfin.com>>
Reported-by: Ziming Zhang mailto:ezrak...@gmail.com>>
Reported-by: Jason Wang mailto:jasow...@redhat.com>>
Signed-off-by: Prasad J Pandit mailto:p...@fedoraproject.org>>



Tested-by: Li Qiang mailto:liq...@gmail.com>>
Reviewed-by: Li Qiang mailto:liq...@gmail.com>>

Thanks,
Li Qiang



Applied.

Thanks




---
 hw/net/tulip.c | 36 +++-
 1 file changed, 27 insertions(+), 9 deletions(-)

Update v7: fix length check expression to replace '>=' with '>'
  ->
https://lists.gnu.org/archive/html/qemu-devel/2020-03/msg07160.html

diff --git a/hw/net/tulip.c b/hw/net/tulip.c
index cfac2719d3..1295f51d07 100644
--- a/hw/net/tulip.c
+++ b/hw/net/tulip.c
@@ -170,6 +170,10 @@ static void tulip_copy_rx_bytes(TULIPState
*s, struct tulip_descriptor *desc)
         } else {
             len = s->rx_frame_len;
         }
+
+        if (s->rx_frame_len + len > sizeof(s->rx_frame)) {
+            return;
+        }
         pci_dma_write(&s->dev, desc->buf_addr1, s->rx_frame +
             (s->rx_frame_size - s->rx_frame_len), len);
         s->rx_frame_len -= len;
@@ -181,6 +185,10 @@ static void tulip_copy_rx_bytes(TULIPState
*s, struct tulip_descriptor *desc)
         } else {
             len = s->rx_frame_len;
         }
+
+        if (s->rx_frame_len + len > sizeof(s->rx_frame)) {
+            return;
+        }
         pci_dma_write(&s->dev, desc->buf_addr2, s->rx_frame +
             (s->rx_frame_size - s->rx_frame_len), len);
         s->rx_frame_len -= len;
@@ -227,7 +235,8 @@ static ssize_t tulip_receive(TULIPState *s,
const uint8_t *buf, size_t size)

     trace_tulip_receive(buf, size);

-    if (size < 14 || size > 2048 || s->rx_frame_len ||
tulip_rx_stopped(s)) {
+    if (size < 14 || size > sizeof(s->rx_frame) - 4
+        || s->rx_frame_len || tulip_rx_stopped(s)) {
         return 0;
     }

@@ -275,7 +284,6 @@ static ssize_t tulip_receive_nc(NetClientState
*nc,
     return tulip_receive(qemu_get_nic_opaque(nc), buf, size);
 }

-
 static NetClientInfo net_tulip_info = {
     .type = NET_CLIENT_DRIVER_NIC,
     .size = sizeof(NICState),
@@ -558,7 +566,7 @@ static void tulip_tx(TULIPState *s, struct
tulip_descriptor *desc)
         if ((s->csr[6] >> CSR6_OM_SHIFT) & CSR6_OM_MASK) {
             /* Internal or external Loopback */
             tulip_receive(s, s->tx_frame, s->tx_frame_len);
-        } else {
+        } else if (s->tx_frame_len <= sizeof(s->tx_frame)) {
             qemu_send_packet(qemu_get_queue(s->nic),
                 s->tx_frame, s->tx_frame_len);
         }
@@ -570,23 +578,31 @@ static void tulip_tx(TULIPState *s, struct
tulip_descriptor *desc)
     }
 }

-static void tulip_copy_tx_buffers(TULIPState *s, struct
tulip_descriptor *desc)
+static int tulip_copy_tx_buffers(TULIPState *s, struct
tulip_descriptor *desc)
 {
     int len1 = (desc->control >> TDES1_BUF1_SIZE_SHIFT) &
TDES1_BUF1_SIZE_MASK;
     int len2 = (desc->control >> TDES1_BUF2_SIZE_SHIFT) &
TDES1_BUF2_SIZE_MASK;

+    if (s->tx_frame_len + len1 > sizeof(s->tx_frame)) {
+        return -1;
+    }
     if (len1) {
         pci_dma_read(&s->dev, desc->buf_addr1,
             s->tx_frame + s->tx_frame_len, len1);
         s->tx_frame_len += len1;
     }

+    if (s->tx_frame_len + len2 > sizeof(s->tx_frame)) {
+        return -1;
+    }
     if (len2) {
         pci_dma_read(&s->dev, desc->buf_addr2,
             s->tx_frame + s->tx_frame_len, len2);
         s->tx_frame_len += len2;
     }
     desc->status = (len1 + len2) ? 0 : 0x7fff;
+
+    return 0;
 }

 static void tulip_setup_filter_addr(TULIPState *s, uint8_t *buf,
int n)
@@ -651,13 +667,15 @@ static uint32_t tulip_ts(TULIPState *s)

 static void tulip_xmit_list_update(TULIPState *s)
 {
+#define TULIP_DESC_MAX 128
+    uint8_t i = 0;
     struct tulip_descriptor desc;

     if (tulip_ts(s) != CSR5_TS_SUSPENDED) {
         return;
     }

-    for (;;) {
+    for (i = 0; i < TULIP_DESC_MAX; i++) {
         tulip_desc_read(s, s->current_tx_desc, &desc);
         tulip_dump_tx_descriptor(s, &desc);

@@ -675,10 +693,10 @@ 

RE: [PATCH v1 12/22] intel_iommu: add PASID cache management infrastructure

2020-03-26 Thread Liu, Yi L
> From: Peter Xu 
> Sent: Thursday, March 26, 2020 11:54 PM
> To: Liu, Yi L 
> Subject: Re: [PATCH v1 12/22] intel_iommu: add PASID cache management
> infrastructure
> 
> On Thu, Mar 26, 2020 at 01:57:10PM +, Liu, Yi L wrote:
> > > From: Liu, Yi L
> > > Sent: Thursday, March 26, 2020 2:15 PM
> > > To: 'Peter Xu' 
> > > Subject: RE: [PATCH v1 12/22] intel_iommu: add PASID cache management
> > > infrastructure
> > >
> > > > From: Peter Xu 
> > > > Sent: Wednesday, March 25, 2020 10:52 PM
> > > > To: Liu, Yi L 
> > > > Subject: Re: [PATCH v1 12/22] intel_iommu: add PASID cache management
> > > > infrastructure
> > > >
> > > > On Wed, Mar 25, 2020 at 12:20:21PM +, Liu, Yi L wrote:
> > > > > > From: Peter Xu 
> > > > > > Sent: Wednesday, March 25, 2020 1:32 AM
> > > > > > To: Liu, Yi L 
> > > > > > Subject: Re: [PATCH v1 12/22] intel_iommu: add PASID cache
> > > > > > management infrastructure
> > > > > >
> > > > > > On Sun, Mar 22, 2020 at 05:36:09AM -0700, Liu Yi L wrote:
> > > > > > > This patch adds a PASID cache management infrastructure based on
> > > > > > > new added structure VTDPASIDAddressSpace, which is used to track
> > > > > > > the PASID usage and future PASID tagged DMA address translation
> > > > > > > support in vIOMMU.
> > [...]
> > > > > >
> > > > > > 
> > > > > >
> > > > > > > +/*
> > > > > > > + * Step 2: loop all the exisitng vtd_dev_icx instances.
> > > > > > > + * Ideally, needs to loop all devices to find if there is 
> > > > > > > any new
> > > > > > > + * PASID binding regards to the PASID cache invalidation 
> > > > > > > request.
> > > > > > > + * But it is enough to loop the devices which are backed by 
> > > > > > > host
> > > > > > > + * IOMMU. For devices backed by vIOMMU (a.k.a emulated 
> > > > > > > devices),
> > > > > > > + * if new PASID happened on them, their vtd_pasid_as 
> > > > > > > instance could
> > > > > > > + * be created during future vIOMMU DMA translation.
> > > > > > > + */
> > > > > > > +QLIST_FOREACH(vtd_dev_icx, &s->vtd_dev_icx_list, next) {
> > > > > > > +VTDPASIDAddressSpace *vtd_pasid_as;
> > > > > > > +VTDPASIDCacheEntry *pc_entry;
> > > > > > > +VTDPASIDEntry pe;
> > > > > > > +VTDBus *vtd_bus = vtd_dev_icx->vtd_bus;
> > > > > > > +uint16_t devfn = vtd_dev_icx->devfn;
> > > > > > > +int bus_n = pci_bus_num(vtd_bus->bus);
> > > > > > > +
> > > > > > > +/* i) fetch vtd_pasid_as and check if it is valid */
> > > > > > > +vtd_pasid_as = vtd_add_find_pasid_as(s, vtd_bus,
> > > > > > > + devfn, pasid);
> > > > > >
> > > > > > I don't feel like it's correct here...
> > > > > >
> > > > > > Assuming we have two devices assigned D1, D2.  D1 uses PASID=1, D2
> > > > > > uses
> > > > PASID=2.
> > > > > > When invalidating against PASID=1, are you also going to create a
> > > > > > VTDPASIDAddressSpace also for D2 with PASID=1?
> > > > >
> > > > > Answer is no. Before going forward, let's see if the below flow looks 
> > > > > good to
> you.
> > > > >
> > > > > Let me add one more device besides D1 and D2. Say device D3 which
> > > > > also uses PASID=1. And assume it begins with no PASID usage in guest.
> > > > >
> > > > > Then the flow from scratch is:
> > > > >
> > > > > a) guest IOMMU driver setup PASID entry for D1 with PASID=1,
> > > > >then invalidates against PASID #1
> > > > > b) trap to QEMU, and comes to this function. Since there is
> > > > >no previous pasid cache invalidation, so the Step 1 of this
> > > > >function has nothing to do, then goes to Step 2 which is to
> > > > >loop all assigned devices and check if the guest pasid entry
> > > > >is present. In this loop, should find D1's pasid entry for
> > > > >PASID#1 is present. So create the VTDPASIDAddressSpace and
> > > > >initialize its pasid_cache_entry and pasid_cache_gen fields.
> > > > > c) guest IOMMU driver setup PASID entry for D2 with PASID=2,
> > > > >then invalidates against PASID #2
> > > > > d) same with b), only difference is the Step 1 of this function
> > > > >will loop the VTDPASIDAddressSpace created in b), but its
> > > > >pasid is 1 which is not the target of current pasid cache
> > > > >invalidation. Same with b), in Step 2, it will create a
> > > > >VTDPASIDAddressSpace for D2+PASID#2
> > > > > e) guest IOMMU driver setup PASID entry for D3 with PASID=1,
> > > > >then invalidates against PASID #1
> > > > > f) trap to QEMU and comes to this function. Step 1 loops two
> > > > >VTDPASIDAddressSpace created in b) and d), and it finds there
> > > > >is a VTDPASIDAddressSpace whose pasid is 1. vtd_flush_pasid()
> > > > >will check if the cached pasid entry is the same with the one
> > > > >in guest memory. In this flow, it should be the same, so
> > > > >vtd_flush_pasid() will do nothing for it. Then comes to Step 2,
> > > > >it loops D1, D2, D3.
> > > > 

Re: [RFC for-5.1 4/4] spapr: Don't allow unplug of NVLink2 devices

2020-03-26 Thread David Gibson
On Thu, Mar 26, 2020 at 01:27:40PM +0100, Greg Kurz wrote:
> On Thu, 26 Mar 2020 16:40:09 +1100
> David Gibson  wrote:
> 
> > Currently, we can't properly handle unplug of NVLink2 devices, because we
> > don't have code to tear down their special memory resources.  There's not
> > a lot of impetus to implement that. Since hardware NVLink2 devices can't
> > be hot unplugged, the guest side drivers don't usually support unplug
> > anyway.
> > 
> > Therefore, simply prevent unplug of NVLink2 devices.
> > 
> 
> This could maybe considered as a valid fix for 5.0 since this prevents
> guest crashes IIUC. But since this requires the two preliminary cleanup
> patches, I understand you may prefer to postpone that to 5.1.

Yeah, it's arguably a bug, but not a regression, so I'm inclined to
leave it to 5.1.

> 
> > Signed-off-by: David Gibson 
> > ---
> 
> Reviewed-by: Greg Kurz 
> 
> >  hw/ppc/spapr_pci.c | 5 +
> >  1 file changed, 5 insertions(+)
> > 
> > diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
> > index 55ca9dee1e..5c8262413a 100644
> > --- a/hw/ppc/spapr_pci.c
> > +++ b/hw/ppc/spapr_pci.c
> > @@ -1666,6 +1666,11 @@ static void spapr_pci_unplug_request(HotplugHandler 
> > *plug_handler,
> >  return;
> >  }
> >  
> > +if (spapr_phb_is_nvlink_dev(pdev, phb)) {
> > +error_setg(errp, "PCI: Cannot unplug NVLink2 devices");
> > +return;
> > +}
> > +
> >  /* ensure any other present functions are pending unplug */
> >  if (PCI_FUNC(pdev->devfn) == 0) {
> >  for (i = 1; i < 8; i++) {
> 

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature


Re: [RFC for-5.1 3/4] spapr: Fix failure path for attempting to hot unplug PCI bridges

2020-03-26 Thread David Gibson
On Thu, Mar 26, 2020 at 01:18:24PM +0100, Greg Kurz wrote:
> On Thu, 26 Mar 2020 16:40:08 +1100
> David Gibson  wrote:
> 
> > For various technical reasons we can't currently allow unplug a PCI to PCI
> > bridge on the pseries machine.  spapr_pci_unplug_request() correctly
> > generates an error message if that's attempted.
> > 
> > But.. if the given errp is not error_abort or error_fatal,
> 
> Which is the always case when trying to unplug a device AFAICT:
> 
> void qdev_unplug(DeviceState *dev, Error **errp)
> {
> DeviceClass *dc = DEVICE_GET_CLASS(dev);
> HotplugHandler *hotplug_ctrl;
> HotplugHandlerClass *hdc;
> Error *local_err = NULL;
> 
> [...]
> hdc = HOTPLUG_HANDLER_GET_CLASS(hotplug_ctrl);
> if (hdc->unplug_request) {
> hotplug_handler_unplug_request(hotplug_ctrl, dev, &local_err);
> 
> And anyway, spapr_pci_unplug_request() shouldn't rely on the caller
> passing _fatal or _abort to do the right thing. Calling
> error_setg() without returning right away is a dangerous practice
> since it would cause a subsequent call to error_setg() with the
> same errp to abort QEMU.
> 
> > it doesn't actually stop trying to unplug the bridge anyway.
> > 
> 
> This looks like a bug fix that could go to 5.0 IMHO.

Fair point.  I've added the tag and moved to ppc-for-5.0.

> 
> Maybe add this tag ?
> 
>Fixes: 14e714900f6b "spapr: Allow hot plug/unplug of PCI bridges and 
> devices under PCI bridges"
> 
> > Signed-off-by: David Gibson 
> > ---
> 
> Reviewed-by: Greg Kurz 
> 
> >  hw/ppc/spapr_pci.c | 1 +
> >  1 file changed, 1 insertion(+)
> > 
> > diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
> > index 709a52780d..55ca9dee1e 100644
> > --- a/hw/ppc/spapr_pci.c
> > +++ b/hw/ppc/spapr_pci.c
> > @@ -1663,6 +1663,7 @@ static void spapr_pci_unplug_request(HotplugHandler 
> > *plug_handler,
> >  
> >  if (pc->is_bridge) {
> >  error_setg(errp, "PCI: Hot unplug of PCI bridges not 
> > supported");
> > +return;
> >  }
> >  
> >  /* ensure any other present functions are pending unplug */
> 

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature


Re: [RFC for-5.1 1/4] spapr: Refactor locating NVLink2 devices for device tree creation

2020-03-26 Thread David Gibson
On Thu, Mar 26, 2020 at 12:57:38PM +0100, Greg Kurz wrote:
> On Thu, 26 Mar 2020 16:40:06 +1100
> David Gibson  wrote:
> 
> > Currently spapr_phb_nvgpu_populate_pcidev_dt() works a little cryptically.
> > It steps through all the NVLink2 GPUs and NPUs and if they match the device
> > we're called for, we generate the relevant device tree information.
> > 
> > Make this a little more obvious by introducing helpers to determine it a
> 
> ... to determine if a

Fixed, thanks.

> 
> > given PCI device is an NVLink2 GPU or NPU, returning the NVLink2 slot and
> > link number information as well.
> > 
> > Signed-off-by: David Gibson 
> > ---
> 
> LGTM
> 
> Reviewed-by: Greg Kurz 
> 
> >  hw/ppc/spapr_pci_nvlink2.c | 115 +
> >  1 file changed, 79 insertions(+), 36 deletions(-)
> > 
> > diff --git a/hw/ppc/spapr_pci_nvlink2.c b/hw/ppc/spapr_pci_nvlink2.c
> > index 8332d5694e..7d3a685421 100644
> > --- a/hw/ppc/spapr_pci_nvlink2.c
> > +++ b/hw/ppc/spapr_pci_nvlink2.c
> > @@ -390,13 +390,12 @@ void spapr_phb_nvgpu_ram_populate_dt(SpaprPhbState 
> > *sphb, void *fdt)
> >  
> >  }
> >  
> > -void spapr_phb_nvgpu_populate_pcidev_dt(PCIDevice *dev, void *fdt, int 
> > offset,
> > -SpaprPhbState *sphb)
> > +static bool is_nvgpu(PCIDevice *dev, SpaprPhbState *sphb, int *slot)
> >  {
> > -int i, j;
> > +int i;
> >  
> >  if (!sphb->nvgpus) {
> > -return;
> > +return false;
> >  }
> >  
> >  for (i = 0; i < sphb->nvgpus->num; ++i) {
> > @@ -406,47 +405,91 @@ void spapr_phb_nvgpu_populate_pcidev_dt(PCIDevice 
> > *dev, void *fdt, int offset,
> >  if (!nvslot->gpdev) {
> >  continue;
> >  }
> > +
> >  if (dev == nvslot->gpdev) {
> > -uint32_t npus[nvslot->linknum];
> > +if (slot) {
> > +*slot = i;
> > +}
> > +return true;
> > +}
> > +}
> >  
> > -for (j = 0; j < nvslot->linknum; ++j) {
> > -PCIDevice *npdev = nvslot->links[j].npdev;
> > +return false;
> > +}
> >  
> > -npus[j] = cpu_to_be32(PHANDLE_PCIDEV(sphb, npdev));
> > -}
> > -_FDT(fdt_setprop(fdt, offset, "ibm,npu", npus,
> > - j * sizeof(npus[0])));
> > -_FDT((fdt_setprop_cell(fdt, offset, "phandle",
> > -   PHANDLE_PCIDEV(sphb, dev;
> > +static bool is_nvnpu(PCIDevice *dev, SpaprPhbState *sphb, int *slot, int 
> > *link)
> > +{
> > +int i, j;
> > +
> > +if (!sphb->nvgpus) {
> > +return false;
> > +}
> > +
> > +for (i = 0; i < sphb->nvgpus->num; ++i) {
> > +SpaprPhbPciNvGpuSlot *nvslot = &sphb->nvgpus->slots[i];
> > +
> > +/* Skip "slot" without attached GPU */
> > +if (!nvslot->gpdev) {
> >  continue;
> >  }
> >  
> >  for (j = 0; j < nvslot->linknum; ++j) {
> > -if (dev != nvslot->links[j].npdev) {
> > -continue;
> > +if (dev == nvslot->links[j].npdev) {
> > +if (slot) {
> > +*slot = i;
> > +}
> > +if (link) {
> > +*link = j;
> > +}
> > +return true;
> >  }
> > +}
> > +}
> >  
> > -_FDT((fdt_setprop_cell(fdt, offset, "phandle",
> > -   PHANDLE_PCIDEV(sphb, dev;
> > -_FDT(fdt_setprop_cell(fdt, offset, "ibm,gpu",
> > -  PHANDLE_PCIDEV(sphb, nvslot->gpdev)));
> > -_FDT((fdt_setprop_cell(fdt, offset, "ibm,nvlink",
> > -   PHANDLE_NVLINK(sphb, i, j;
> > -/*
> > - * If we ever want to emulate GPU RAM at the same location as 
> > on
> > - * the host - here is the encoding GPA->TGT:
> > - *
> > - * gta  = ((sphb->nv2_gpa >> 42) & 0x1) << 42;
> > - * gta |= ((sphb->nv2_gpa >> 45) & 0x3) << 43;
> > - * gta |= ((sphb->nv2_gpa >> 49) & 0x3) << 45;
> > - * gta |= sphb->nv2_gpa & ((1UL << 43) - 1);
> > - */
> > -_FDT(fdt_setprop_cell(fdt, offset, "memory-region",
> > -  PHANDLE_GPURAM(sphb, i)));
> > -_FDT(fdt_setprop_u64(fdt, offset, "ibm,device-tgt-addr",
> > - nvslot->tgt));
> > -_FDT(fdt_setprop_cell(fdt, offset, "ibm,nvlink-speed",
> > -  nvslot->links[j].link_speed));
> > +return false;
> > +}
> > +
> > +void spapr_phb_nvgpu_populate_pcidev_dt(PCIDevice *dev, void *fdt, int 
> > offset,
> > +SpaprPhbState *sphb)
> > +{
> > +int slot, link;
> > +
> > +if (is_nvgpu(dev, sphb, &slot)) {
> > +SpaprPhbPciNvGpuSlot *nvslot = 

Re: [PATCH v16 Kernel 4/7] vfio iommu: Implementation of ioctl for dirty pages tracking.

2020-03-26 Thread Yan Zhao
On Fri, Mar 27, 2020 at 05:39:01AM +0800, Kirti Wankhede wrote:
> 
> 
> On 3/25/2020 7:41 AM, Yan Zhao wrote:
> > On Wed, Mar 25, 2020 at 05:18:52AM +0800, Kirti Wankhede wrote:
> >> VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
> >> - Start dirty pages tracking while migration is active
> >> - Stop dirty pages tracking.
> >> - Get dirty pages bitmap. Its user space application's responsibility to
> >>copy content of dirty pages from source to destination during migration.
> >>
> >> To prevent DoS attack, memory for bitmap is allocated per vfio_dma
> >> structure. Bitmap size is calculated considering smallest supported page
> >> size. Bitmap is allocated for all vfio_dmas when dirty logging is enabled
> >>
> >> Bitmap is populated for already pinned pages when bitmap is allocated for
> >> a vfio_dma with the smallest supported page size. Update bitmap from
> >> pinning functions when tracking is enabled. When user application queries
> >> bitmap, check if requested page size is same as page size used to
> >> populated bitmap. If it is equal, copy bitmap, but if not equal, return
> >> error.
> >>
> >> Signed-off-by: Kirti Wankhede 
> >> Reviewed-by: Neo Jia 
> >> ---
> >>   drivers/vfio/vfio_iommu_type1.c | 266 
> >> +++-
> >>   1 file changed, 260 insertions(+), 6 deletions(-)
> >>
> >> diff --git a/drivers/vfio/vfio_iommu_type1.c 
> >> b/drivers/vfio/vfio_iommu_type1.c
> >> index 70aeab921d0f..874a1a7ae925 100644
> >> --- a/drivers/vfio/vfio_iommu_type1.c
> >> +++ b/drivers/vfio/vfio_iommu_type1.c
> >> @@ -71,6 +71,7 @@ struct vfio_iommu {
> >>unsigned intdma_avail;
> >>boolv2;
> >>boolnesting;
> >> +  booldirty_page_tracking;
> >>   };
> >>   
> >>   struct vfio_domain {
> >> @@ -91,6 +92,7 @@ struct vfio_dma {
> >>boollock_cap;   /* capable(CAP_IPC_LOCK) */
> >>struct task_struct  *task;
> >>struct rb_root  pfn_list;   /* Ex-user pinned pfn list */
> >> +  unsigned long   *bitmap;
> >>   };
> >>   
> >>   struct vfio_group {
> >> @@ -125,7 +127,21 @@ struct vfio_regions {
> >>   #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)  \
> >>	(!list_empty(&iommu->domain_list))
> >>   
> >> +#define DIRTY_BITMAP_BYTES(n) (ALIGN(n, BITS_PER_TYPE(u64)) / 
> >> BITS_PER_BYTE)
> >> +
> >> +/*
> >> + * Input argument of number of bits to bitmap_set() is unsigned integer, 
> >> which
> >> + * further casts to signed integer for unaligned multi-bit operation,
> >> + * __bitmap_set().
> >> + * Then maximum bitmap size supported is 2^31 bits divided by 2^3 
> >> bits/byte,
> >> + * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page
> >> + * system.
> >> + */
> >> +#define DIRTY_BITMAP_PAGES_MAX(uint64_t)(INT_MAX - 1)
> >> +#define DIRTY_BITMAP_SIZE_MAX  
> >> DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
> >> +
> >>   static int put_pfn(unsigned long pfn, int prot);
> >> +static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu);
> >>   
> >>   /*
> >>* This code handles mapping and unmapping of user data buffers
> >> @@ -175,6 +191,77 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, 
> >> struct vfio_dma *old)
> >>	rb_erase(&old->node, &iommu->dma_list);
> >>   }
> >>   
> >> +
> >> +static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, uint64_t pgsize)
> >> +{
> >> +  uint64_t npages = dma->size / pgsize;
> >> +
> >> +  if (npages > DIRTY_BITMAP_PAGES_MAX)
> >> +  return -EINVAL;
> >> +
> >> +  dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages), GFP_KERNEL);
> >> +  if (!dma->bitmap)
> >> +  return -ENOMEM;
> >> +
> >> +  return 0;
> >> +}
> >> +
> >> +static void vfio_dma_bitmap_free(struct vfio_dma *dma)
> >> +{
> >> +  kfree(dma->bitmap);
> >> +  dma->bitmap = NULL;
> >> +}
> >> +
> >> +static void vfio_dma_populate_bitmap(struct vfio_dma *dma, uint64_t 
> >> pgsize)
> >> +{
> >> +  struct rb_node *p;
> >> +
> >> +  if (RB_EMPTY_ROOT(&dma->pfn_list))
> >> +  return;
> >> +
> >> +  for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
> >> +  struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node);
> >> +
> >> +  bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) / pgsize, 1);
> >> +  }
> >> +}
> >> +
> >> +static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, uint64_t 
> >> pgsize)
> >> +{
> >> +  struct rb_node *n = rb_first(&iommu->dma_list);
> >> +
> >> +  for (; n; n = rb_next(n)) {
> >> +  struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> >> +  int ret;
> >> +
> >> +  ret = vfio_dma_bitmap_alloc(dma, pgsize);
> >> +  if (ret) {
> >> +  struct rb_node *p = rb_prev(n);
> >> +
> >> +  for (; p; p = rb_prev(p)) {
> >> +  struct vfio_dma *dma = rb_entry(n,
> >> +  struct 

Re: [PATCH v16 Kernel 5/7] vfio iommu: Update UNMAP_DMA ioctl to get dirty bitmap before unmap

2020-03-26 Thread Yan Zhao
On Fri, Mar 27, 2020 at 05:39:44AM +0800, Kirti Wankhede wrote:
> 
> 
> On 3/25/2020 7:48 AM, Yan Zhao wrote:
> > On Wed, Mar 25, 2020 at 03:32:37AM +0800, Kirti Wankhede wrote:
> >> DMA mapped pages, including those pinned by mdev vendor drivers, might
> >> get unpinned and unmapped while migration is active and device is still
> >> running. For example, in pre-copy phase while guest driver could access
> >> those pages, host device or vendor driver can dirty these mapped pages.
> >> Such pages should be marked dirty so as to maintain memory consistency
> >> for a user making use of dirty page tracking.
> >>
> >> To get bitmap during unmap, user should allocate memory for bitmap, set
> >> size of allocated memory, set page size to be considered for bitmap and
> >> set flag VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP.
> >>
> >> Signed-off-by: Kirti Wankhede 
> >> Reviewed-by: Neo Jia 
> >> ---
> >>   drivers/vfio/vfio_iommu_type1.c | 54 
> >> ++---
> >>   include/uapi/linux/vfio.h   | 10 
> >>   2 files changed, 60 insertions(+), 4 deletions(-)
> >>
> >> diff --git a/drivers/vfio/vfio_iommu_type1.c 
> >> b/drivers/vfio/vfio_iommu_type1.c
> >> index 27ed069c5053..b98a8d79e13a 100644
> >> --- a/drivers/vfio/vfio_iommu_type1.c
> >> +++ b/drivers/vfio/vfio_iommu_type1.c
> >> @@ -982,7 +982,8 @@ static int verify_bitmap_size(uint64_t npages, 
> >> uint64_t bitmap_size)
> >>   }
> >>   
> >>   static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> >> -   struct vfio_iommu_type1_dma_unmap *unmap)
> >> +   struct vfio_iommu_type1_dma_unmap *unmap,
> >> +   struct vfio_bitmap *bitmap)
> >>   {
> >>uint64_t mask;
> >>struct vfio_dma *dma, *dma_last = NULL;
> >> @@ -1033,6 +1034,10 @@ static int vfio_dma_do_unmap(struct vfio_iommu 
> >> *iommu,
> >> * will be returned if these conditions are not met.  The v2 interface
> >> * will only return success and a size of zero if there were no
> >> * mappings within the range.
> >> +   *
> >> +   * When VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP flag is set, unmap request
> >> +   * must be for single mapping. Multiple mappings with this flag set is
> >> +   * not supported.
> >> */
> >>if (iommu->v2) {
> >>dma = vfio_find_dma(iommu, unmap->iova, 1);
> >> @@ -1040,6 +1045,13 @@ static int vfio_dma_do_unmap(struct vfio_iommu 
> >> *iommu,
> >>ret = -EINVAL;
> >>goto unlock;
> >>}
> >> +
> >> +  if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
> >> +  (dma->iova != unmap->iova || dma->size != unmap->size)) {
> > potential NULL pointer!
> > 
> > And could you address the comments in v14?
> > How to handle DSI unmaps in vIOMMU
> > (https://lore.kernel.org/kvm/20200323011041.GB5456@joy-OptiPlex-7040/)
> > 
> 
> Sorry, I drafted reply to it, but I missed to send, it remained in my drafts
> 
>  >
>  > it happens in vIOMMU Domain level invalidation of IOTLB
>  > (domain-selective invalidation, see vtd_iotlb_domain_invalidate() in 
> qemu).
>  > common in VTD lazy mode, and NOT just happening once at boot time.
>  > rather than invalidate page by page, it batches the page invalidation.
>  > so, when this invalidation takes place, even higher level page tables
>  > have been invalid and therefore it has to invalidate a bigger 
> combined range.
>  > That's why we see IOVAs are mapped in 4k pages, but are unmapped in 2M
>  > pages.
>  >
>  > I think those UNMAPs should also have GET_DIRTY_BIMTAP flag on, right?
> 
> 
> vtd_iotlb_domain_invalidate()
>vtd_sync_shadow_page_table()
>  vtd_sync_shadow_page_table_range(vtd_as, &ce, 0, UINT64_MAX)
>vtd_page_walk()
>  vtd_page_walk_level() - walk over specific level for IOVA range
>vtd_page_walk_one()
>  memory_region_notify_iommu()
>  ...
>vfio_iommu_map_notify()
> 
> In the above trace, isn't page walk will take care of creating proper 
> IOTLB entry which should be same as created during mapping for that 
> IOTLB entry?
>
No. It does walk the page table, but as it's dsi (delay & batched unmap),
pages table entry for a whole 2M (the higher level, not last level for 4K)
range is invalid, so the iotlb->addr_mask what vfio_iommu_map_notify()
receives is (2M - 1), not the same as the size for map.

> 
>  >>>
>  >>> Such unmap would callback vfio_iommu_map_notify() in QEMU. In
>  >>> vfio_iommu_map_notify(), unmap is called on same range <iotlb->iova,
>  >>> iotlb->addr_mask + 1> which was used for map. Secondly unmap with 
> bitmap
>  >>> will be called only when device state has _SAVING flag set.
>  >>
>  > in this case, iotlb->addr_mask in unmap is 0x20 -1.
>  > different than 0x1000 -1 used for map.
>  >> It might be helpful for Yan, and everyone else, to see the latest QEMU
>  >> patch series.  Thanks,
>  >>
>  > yes, please. also curious of log_sync part for 

Re: [PATCH for 5.0 v1 0/2] RISC-V: Fix Hypervisor guest user space

2020-03-26 Thread Palmer Dabbelt

On Thu, 26 Mar 2020 15:44:04 PDT (-0700), Alistair Francis wrote:

This series fixes two bugs in the RISC-V two stage lookup
implementation. This fixes the Hypervisor userspace failing to start.

Alistair Francis (2):
  riscv: Don't use stage-2 PTE lookup protection flags
  riscv: AND stage-1 and stage-2 protection flags

 target/riscv/cpu_helper.c | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)


Thanks, these are in the queue.



Re: [PATCH] i386/cpu: Expand MAX_FIXED_COUNTERS from 3 to 4 to for Icelake

2020-03-26 Thread Like Xu

On 2020/3/27 2:48, Paolo Bonzini wrote:

On 17/03/20 06:54, Like Xu wrote:

In the Intel SDM, "Table 18-2. Association of Fixed-Function
Performance Counters with Architectural Performance Events",
we may have a new fixed counter 'TOPDOWN.SLOTS' (since Icelake),
which counts the number of available slots for an unhalted
logical processor. Check commit 6017608936 in the kernel tree.

Signed-off-by: Like Xu 
---
  target/i386/cpu.h | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 576f309bbf..ec2b67d425 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -1185,7 +1185,7 @@ typedef struct {
  #define CPU_NB_REGS CPU_NB_REGS32
  #endif
  
-#define MAX_FIXED_COUNTERS 3

+#define MAX_FIXED_COUNTERS 4
  #define MAX_GP_COUNTERS(MSR_IA32_PERF_STATUS - MSR_P6_EVNTSEL0)
  
  #define TARGET_INSN_START_EXTRA_WORDS 1




Hi Like, the problem with this patch is that it breaks live migration;
the vmstate_msr_architectural_pmu record hardcodes MAX_FIXED_COUNTERS as
the number of registers.

So it's more complicated, you need to add a new subsection (following
vmstate_msr_architectural_pmu) and transmit it only if the 4th counter
is nonzero (instead of the more complicated check in pmu_enable_needed).
  Just to be safe, I'd make the new subsection hold 16 counters and bump
MAX_FIXED_COUNTERS to 16.


The new MAX_FIXED_COUNTERS looks good to me and
and let me follow up this live migration issue.

Thanks,
Like Xu



Thanks,

Paolo







Re: [PATCH for 5.0 v1 2/2] riscv: AND stage-1 and stage-2 protection flags

2020-03-26 Thread Alistair Francis
On Thu, Mar 26, 2020 at 4:32 PM Richard Henderson
 wrote:
>
> On 3/26/20 3:44 PM, Alistair Francis wrote:
> > Take the result of stage-1 and stage-2 page table walks and AND the two
> > protection flags together. This way we require both to set permissions
> > instead of just stage-2.
> >
> > Signed-off-by: Alistair Francis 
> > ---
> >  target/riscv/cpu_helper.c | 8 +---
> >  1 file changed, 5 insertions(+), 3 deletions(-)
> >
> > diff --git a/target/riscv/cpu_helper.c b/target/riscv/cpu_helper.c
> > index f36d184b7b..50e13a064f 100644
> > --- a/target/riscv/cpu_helper.c
> > +++ b/target/riscv/cpu_helper.c
> > @@ -707,7 +707,7 @@ bool riscv_cpu_tlb_fill(CPUState *cs, vaddr address, 
> > int size,
> >  #ifndef CONFIG_USER_ONLY
> >  vaddr im_address;
> >  hwaddr pa = 0;
> > -int prot;
> > +int prot, prot2;
> >  bool pmp_violation = false;
> >  bool m_mode_two_stage = false;
> >  bool hs_mode_two_stage = false;
> > @@ -757,13 +757,15 @@ bool riscv_cpu_tlb_fill(CPUState *cs, vaddr address, 
> > int size,
> >  /* Second stage lookup */
> >  im_address = pa;
> >
> > -ret = get_physical_address(env, &pa, &prot, im_address,
> > +ret = get_physical_address(env, &pa, &prot2, im_address,
> > access_type, mmu_idx, false, true);
> >
> >  qemu_log_mask(CPU_LOG_MMU,
> >  "%s 2nd-stage address=%" VADDR_PRIx " ret %d physical "
> >  TARGET_FMT_plx " prot %d\n",
> > -__func__, im_address, ret, pa, prot);
> > +__func__, im_address, ret, pa, prot2);
> > +
> > +prot &= prot2;
> >
> >  if (riscv_feature(env, RISCV_FEATURE_PMP) &&
> >  (ret == TRANSLATE_SUCCESS) &&
> >
>
> Whee!  Yes, I think this is what you've been looking for.

Yep!

I actually tried this ages ago, but it didn't work without the first
path so it never fixed the problem.

> Reviewed-by: Richard Henderson 

Thanks

Alistair

>
>
> r~



Re: [PATCH for 5.0 v1 1/2] riscv: Don't use stage-2 PTE lookup protection flags

2020-03-26 Thread Richard Henderson
On 3/26/20 3:44 PM, Alistair Francis wrote:
> When doing the fist of a two stage lookup (Hypervisor extensions) don't
> set the current protection flags from the second stage lookup of the
> base address PTE.
> 
> Signed-off-by: Alistair Francis 
> ---
>  target/riscv/cpu_helper.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/target/riscv/cpu_helper.c b/target/riscv/cpu_helper.c
> index d3ba9efb02..f36d184b7b 100644
> --- a/target/riscv/cpu_helper.c
> +++ b/target/riscv/cpu_helper.c
> @@ -452,10 +452,11 @@ restart:
>  hwaddr pte_addr;
>  
>  if (two_stage && first_stage) {
> +int vbase_prot;
>  hwaddr vbase;
>  
>  /* Do the second stage translation on the base PTE address. */
> -get_physical_address(env, &vbase, prot, base, access_type,
> +get_physical_address(env, &vbase, &vbase_prot, base, access_type,
>   mmu_idx, false, true);
>  
>  pte_addr = vbase + idx * ptesize;
> 

Certainly stage2 pte lookup has nothing to do with the original lookup, so
using a new variable for prot is correct.

So as far as this minimal patch,

Reviewed-by: Richard Henderson 

However, this bit of code doesn't look right:

(1) Similarly, what has the original access_type got to do with the PTE lookup?
 Seems like this should be MMU_DATA_LOAD always.

(2) Why is the get_physical_address return value ignored?  On failure, surely
this should be some sort of PTE lookup failure.

(3) Do we need to validate vbase_prot for write before updating the PTE for
Access or Dirty?  That seems like a loop-hole to allow silent modification of
hypervisor read-only memory.

I do wonder if it might be easier to manage all of this by using additional
TLBs to handle the stage2 and physical address spaces.  That's probably too
invasive for this stage of development though.


r~



Re: [PATCH for 5.0 v1 2/2] riscv: AND stage-1 and stage-2 protection flags

2020-03-26 Thread Richard Henderson
On 3/26/20 3:44 PM, Alistair Francis wrote:
> Take the result of stage-1 and stage-2 page table walks and AND the two
> protection flags together. This way we require both to set permissions
> instead of just stage-2.
> 
> Signed-off-by: Alistair Francis 
> ---
>  target/riscv/cpu_helper.c | 8 +---
>  1 file changed, 5 insertions(+), 3 deletions(-)
> 
> diff --git a/target/riscv/cpu_helper.c b/target/riscv/cpu_helper.c
> index f36d184b7b..50e13a064f 100644
> --- a/target/riscv/cpu_helper.c
> +++ b/target/riscv/cpu_helper.c
> @@ -707,7 +707,7 @@ bool riscv_cpu_tlb_fill(CPUState *cs, vaddr address, int 
> size,
>  #ifndef CONFIG_USER_ONLY
>  vaddr im_address;
>  hwaddr pa = 0;
> -int prot;
> +int prot, prot2;
>  bool pmp_violation = false;
>  bool m_mode_two_stage = false;
>  bool hs_mode_two_stage = false;
> @@ -757,13 +757,15 @@ bool riscv_cpu_tlb_fill(CPUState *cs, vaddr address, 
> int size,
>  /* Second stage lookup */
>  im_address = pa;
>  
> -ret = get_physical_address(env, &pa, &prot, im_address,
> +ret = get_physical_address(env, &pa, &prot2, im_address,
> access_type, mmu_idx, false, true);
>  
>  qemu_log_mask(CPU_LOG_MMU,
>  "%s 2nd-stage address=%" VADDR_PRIx " ret %d physical "
>  TARGET_FMT_plx " prot %d\n",
> -__func__, im_address, ret, pa, prot);
> +__func__, im_address, ret, pa, prot2);
> +
> +prot &= prot2;
>  
>  if (riscv_feature(env, RISCV_FEATURE_PMP) &&
>  (ret == TRANSLATE_SUCCESS) &&
> 

Whee!  Yes, I think this is what you've been looking for.
Reviewed-by: Richard Henderson 


r~



[PATCH 29/31] target/arm: Vectorize SABD/UABD

2020-03-26 Thread Richard Henderson
Include 64-bit element size in preparation for SVE.

Signed-off-by: Richard Henderson 
---
 target/arm/helper.h|  10 +++
 target/arm/translate.h |   5 ++
 target/arm/translate-a64.c |   8 ++-
 target/arm/translate.c | 133 -
 target/arm/vec_helper.c|  88 
 5 files changed, 240 insertions(+), 4 deletions(-)

diff --git a/target/arm/helper.h b/target/arm/helper.h
index 5ef7bb158f..97ccbd70c6 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -748,6 +748,16 @@ DEF_HELPER_FLAGS_3(gvec_sli_h, TCG_CALL_NO_RWG, void, ptr, 
ptr, i32)
 DEF_HELPER_FLAGS_3(gvec_sli_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
 DEF_HELPER_FLAGS_3(gvec_sli_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_4(gvec_sabd_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_sabd_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_sabd_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_sabd_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_uabd_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_uabd_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_uabd_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_uabd_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
 #ifdef TARGET_AARCH64
 #include "helper-a64.h"
 #include "helper-sve.h"
diff --git a/target/arm/translate.h b/target/arm/translate.h
index 843ecc1472..c453aa1c47 100644
--- a/target/arm/translate.h
+++ b/target/arm/translate.h
@@ -311,6 +311,11 @@ void arm_gen_gvec_sri(unsigned vece, uint32_t rd_ofs, 
uint32_t rm_ofs,
 void arm_gen_gvec_sli(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
   int64_t shift, uint32_t opr_sz, uint32_t max_sz);
 
+void arm_gen_gvec_sabd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz);
+void arm_gen_gvec_uabd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz);
+
 /*
  * Forward to the isar_feature_* tests given a DisasContext pointer.
  */
diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index fc156a217a..1791c26a39 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -12159,6 +12159,13 @@ static void disas_simd_3same_int(DisasContext *s, 
uint32_t insn)
 gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_smin, size);
 }
 return;
+case 0xe: /* SABD, UABD */
+if (u) {
+gen_gvec_fn3(s, is_q, rd, rn, rm, arm_gen_gvec_uabd, size);
+} else {
+gen_gvec_fn3(s, is_q, rd, rn, rm, arm_gen_gvec_sabd, size);
+}
+return;
 case 0x10: /* ADD, SUB */
 if (u) {
 gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_sub, size);
@@ -12291,7 +12298,6 @@ static void disas_simd_3same_int(DisasContext *s, 
uint32_t insn)
 genenvfn = fns[size][u];
 break;
 }
-case 0xe: /* SABD, UABD */
 case 0xf: /* SABA, UABA */
 {
 static NeonGenTwoOpFn * const fns[3][2] = {
diff --git a/target/arm/translate.c b/target/arm/translate.c
index bb6db53598..a29868976a 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -4849,6 +4849,126 @@ const GVecGen4 sqsub_op[4] = {
   .vece = MO_64 },
 };
 
+static void gen_sabd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
+{
+TCGv_i32 t = tcg_temp_new_i32();
+
+tcg_gen_sub_i32(t, a, b);
+tcg_gen_sub_i32(d, b, a);
+tcg_gen_movcond_i32(TCG_COND_LT, d, a, b, d, t);
+tcg_temp_free_i32(t);
+}
+
+static void gen_sabd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
+{
+TCGv_i64 t = tcg_temp_new_i64();
+
+tcg_gen_sub_i64(t, a, b);
+tcg_gen_sub_i64(d, b, a);
+tcg_gen_movcond_i64(TCG_COND_LT, d, a, b, d, t);
+tcg_temp_free_i64(t);
+}
+
+static void gen_sabd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
+{
+TCGv_vec t = tcg_temp_new_vec_matching(d);
+
+tcg_gen_smin_vec(vece, t, a, b);
+tcg_gen_smax_vec(vece, d, a, b);
+tcg_gen_sub_vec(vece, d, d, t);
+tcg_temp_free_vec(t);
+}
+
+void arm_gen_gvec_sabd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
+{
+static const TCGOpcode vecop_list[] = {
+INDEX_op_sub_vec, INDEX_op_smin_vec, INDEX_op_smax_vec, 0
+};
+static const GVecGen3 ops[4] = {
+{ .fniv = gen_sabd_vec,
+  .fno = gen_helper_gvec_sabd_b,
+  .opt_opc = vecop_list,
+  .vece = MO_8 },
+{ .fniv = gen_sabd_vec,
+  .fno = gen_helper_gvec_sabd_h,
+  .opt_opc = vecop_list,
+  .vece = MO_16 },
+{ .fni4 = gen_sabd_i32,
+  .fniv = gen_sabd_vec,
+  .fno = gen_helper_gvec_sabd_s,
+  

[PATCH 30/31] target/arm: Vectorize SABA/UABA

2020-03-26 Thread Richard Henderson
Include 64-bit element size in preparation for SVE.

Signed-off-by: Richard Henderson 
---
 target/arm/helper.h|  17 +++--
 target/arm/translate.h |   5 ++
 target/arm/neon_helper.c   |  10 ---
 target/arm/translate-a64.c |  17 ++---
 target/arm/translate.c | 134 +++--
 target/arm/vec_helper.c|  88 
 6 files changed, 238 insertions(+), 33 deletions(-)

diff --git a/target/arm/helper.h b/target/arm/helper.h
index 97ccbd70c6..5cf6a5b4a0 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -299,13 +299,6 @@ DEF_HELPER_2(neon_pmax_s8, i32, i32, i32)
 DEF_HELPER_2(neon_pmax_u16, i32, i32, i32)
 DEF_HELPER_2(neon_pmax_s16, i32, i32, i32)
 
-DEF_HELPER_2(neon_abd_u8, i32, i32, i32)
-DEF_HELPER_2(neon_abd_s8, i32, i32, i32)
-DEF_HELPER_2(neon_abd_u16, i32, i32, i32)
-DEF_HELPER_2(neon_abd_s16, i32, i32, i32)
-DEF_HELPER_2(neon_abd_u32, i32, i32, i32)
-DEF_HELPER_2(neon_abd_s32, i32, i32, i32)
-
 DEF_HELPER_2(neon_shl_u16, i32, i32, i32)
 DEF_HELPER_2(neon_shl_s16, i32, i32, i32)
 DEF_HELPER_2(neon_rshl_u8, i32, i32, i32)
@@ -758,6 +751,16 @@ DEF_HELPER_FLAGS_4(gvec_uabd_h, TCG_CALL_NO_RWG, void, 
ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_uabd_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_uabd_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_4(gvec_saba_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_saba_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_saba_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_saba_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_uaba_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_uaba_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_uaba_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_uaba_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
 #ifdef TARGET_AARCH64
 #include "helper-a64.h"
 #include "helper-sve.h"
diff --git a/target/arm/translate.h b/target/arm/translate.h
index c453aa1c47..0df7ce51b2 100644
--- a/target/arm/translate.h
+++ b/target/arm/translate.h
@@ -316,6 +316,11 @@ void arm_gen_gvec_sabd(unsigned vece, uint32_t rd_ofs, 
uint32_t rn_ofs,
 void arm_gen_gvec_uabd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz);
 
+void arm_gen_gvec_saba(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz);
+void arm_gen_gvec_uaba(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz);
+
 /*
  * Forward to the isar_feature_* tests given a DisasContext pointer.
  */
diff --git a/target/arm/neon_helper.c b/target/arm/neon_helper.c
index e6481a5764..4c1cf1e031 100644
--- a/target/arm/neon_helper.c
+++ b/target/arm/neon_helper.c
@@ -595,16 +595,6 @@ NEON_POP(pmax_s16, neon_s16, 2)
 NEON_POP(pmax_u16, neon_u16, 2)
 #undef NEON_FN
 
-#define NEON_FN(dest, src1, src2) \
-dest = (src1 > src2) ? (src1 - src2) : (src2 - src1)
-NEON_VOP(abd_s8, neon_s8, 4)
-NEON_VOP(abd_u8, neon_u8, 4)
-NEON_VOP(abd_s16, neon_s16, 2)
-NEON_VOP(abd_u16, neon_u16, 2)
-NEON_VOP(abd_s32, neon_s32, 1)
-NEON_VOP(abd_u32, neon_u32, 1)
-#undef NEON_FN
-
 #define NEON_FN(dest, src1, src2) \
 (dest = do_uqrshl_bhs(src1, src2, 16, false, NULL))
 NEON_VOP(shl_u16, neon_u16, 2)
diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index 1791c26a39..d830a58c3f 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -12166,6 +12166,13 @@ static void disas_simd_3same_int(DisasContext *s, 
uint32_t insn)
 gen_gvec_fn3(s, is_q, rd, rn, rm, arm_gen_gvec_sabd, size);
 }
 return;
+case 0xf: /* SABA, UABA */
+if (u) {
+gen_gvec_fn3(s, is_q, rd, rn, rm, arm_gen_gvec_uaba, size);
+} else {
+gen_gvec_fn3(s, is_q, rd, rn, rm, arm_gen_gvec_saba, size);
+}
+return;
 case 0x10: /* ADD, SUB */
 if (u) {
 gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_sub, size);
@@ -12298,16 +12305,6 @@ static void disas_simd_3same_int(DisasContext *s, 
uint32_t insn)
 genenvfn = fns[size][u];
 break;
 }
-case 0xf: /* SABA, UABA */
-{
-static NeonGenTwoOpFn * const fns[3][2] = {
-{ gen_helper_neon_abd_s8, gen_helper_neon_abd_u8 },
-{ gen_helper_neon_abd_s16, gen_helper_neon_abd_u16 },
-{ gen_helper_neon_abd_s32, gen_helper_neon_abd_u32 },
-};
-genfn = fns[size][u];
-break;
-}
 case 0x16: /* SQDMULH, SQRDMULH */
 {
 static NeonGenTwoOpEnvFn * const fns[2][2] = {
diff --git 

[PATCH 18/31] target/arm: Implement SVE2 bitwise exclusive-or interleaved

2020-03-26 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h|  5 +
 target/arm/sve.decode  |  5 +
 target/arm/sve_helper.c| 20 
 target/arm/translate-sve.c | 19 +++
 4 files changed, 49 insertions(+)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index 9c0c41ba80..9e894a2b55 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -2380,3 +2380,8 @@ DEF_HELPER_FLAGS_3(sve2_sshll_d, TCG_CALL_NO_RWG, void, 
ptr, ptr, i32)
 DEF_HELPER_FLAGS_3(sve2_ushll_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
 DEF_HELPER_FLAGS_3(sve2_ushll_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
 DEF_HELPER_FLAGS_3(sve2_ushll_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve2_eoril_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_eoril_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_eoril_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_eoril_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index 36ef9de563..8af35e48a5 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -1215,3 +1215,8 @@ SSHLLB  01000101 .. 0 . 1010 00 . .  
@rd_rn_tszimm_shl
 SSHLLT  01000101 .. 0 . 1010 01 . .  @rd_rn_tszimm_shl
 USHLLB  01000101 .. 0 . 1010 10 . .  @rd_rn_tszimm_shl
 USHLLT  01000101 .. 0 . 1010 11 . .  @rd_rn_tszimm_shl
+
+## SVE2 bitwise exclusive-or interleaved
+
+EORBT   01000101 .. 0 . 10010 0 . .  @rd_rn_rm
+EORTB   01000101 .. 0 . 10010 1 . .  @rd_rn_rm
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index e0a701c446..15ea1fd524 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -1196,6 +1196,26 @@ DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, DO_SUB)
 
 #undef DO_ZZZ_WTB
 
+#define DO_ZZZ_NTB(NAME, TYPE, H, OP)   \
+void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
+{   \
+intptr_t i, opr_sz = simd_oprsz(desc);  \
+intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \
+intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \
+for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {\
+TYPE nn = *(TYPE *)(vn + H(i + sel1));  \
+TYPE mm = *(TYPE *)(vm + H(i + sel2));  \
+*(TYPE *)(vd + H(i + sel1)) = OP(nn, mm);   \
+}   \
+}
+
+DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR)
+DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR)
+DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR)
+DO_ZZZ_NTB(sve2_eoril_d, uint64_t, , DO_EOR)
+
+#undef DO_ZZZ_NTB
+
 #define DO_ZZI_SHLL(NAME, TYPE, TYPEN, OP) \
 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)   \
 {  \
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 9873b83feb..3eaf9cbe51 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -6064,6 +6064,25 @@ DO_SVE2_ZZZ_TB(SMULLT_zzz, smull_zzz, true, true)
 DO_SVE2_ZZZ_TB(UMULLB_zzz, umull_zzz, false, false)
 DO_SVE2_ZZZ_TB(UMULLT_zzz, umull_zzz, true, true)
 
+static bool do_eor_tb(DisasContext *s, arg_rrr_esz *a, bool sel1)
+{
+static gen_helper_gvec_3 * const fns[4] = {
+gen_helper_sve2_eoril_b, gen_helper_sve2_eoril_h,
+gen_helper_sve2_eoril_s, gen_helper_sve2_eoril_d,
+};
+return do_sve2_zzw_ool(s, a, fns[a->esz], (!sel1 << 1) | sel1);
+}
+
+static bool trans_EORBT(DisasContext *s, arg_rrr_esz *a)
+{
+return do_eor_tb(s, a, false);
+}
+
+static bool trans_EORTB(DisasContext *s, arg_rrr_esz *a)
+{
+return do_eor_tb(s, a, true);
+}
+
 static bool do_trans_pmull(DisasContext *s, arg_rrr_esz *a, bool sel)
 {
 static gen_helper_gvec_3 * const fns[4] = {
-- 
2.20.1




[PATCH 25/31] target/arm: Implement SVE2 bitwise shift right and accumulate

2020-03-26 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/sve.decode  |  8 
 target/arm/translate-sve.c | 34 ++
 2 files changed, 42 insertions(+)

diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index 5d46e3ab45..756f939df1 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -1253,3 +1253,11 @@ UABALT  01000101 .. 0 . 1100 11 . .  
@rda_rn_rm
 # ADC and SBC decoded via size in helper dispatch.
 ADCLB   01000101 .. 0 . 11010 0 . .  @rda_rn_rm
 ADCLT   01000101 .. 0 . 11010 1 . .  @rda_rn_rm
+
+## SVE2 bitwise shift right and accumulate
+
+# TODO: Use @rda and %reg_movprfx here.
+SSRA01000101 .. 0 . 1110 00 . .  @rd_rn_tszimm_shr
+USRA01000101 .. 0 . 1110 01 . .  @rd_rn_tszimm_shr
+SRSRA   01000101 .. 0 . 1110 10 . .  @rd_rn_tszimm_shr
+URSRA   01000101 .. 0 . 1110 11 . .  @rd_rn_tszimm_shr
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index a80765a978..1d1f55dfdd 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -6317,3 +6317,37 @@ static bool trans_ADCLT(DisasContext *s, arg_rrrr_esz *a)
 {
 return do_adcl(s, a, true);
 }
+
+static bool do_sve2_fn2i(DisasContext *s, arg_rri_esz *a, GVecGen2iFn *fn)
+{
+if (!dc_isar_feature(aa64_sve2, s)) {
+return false;
+}
+if (sve_access_check(s)) {
+unsigned vsz = vec_full_reg_size(s);
+unsigned rd_ofs = vec_full_reg_offset(s, a->rd);
+unsigned rn_ofs = vec_full_reg_offset(s, a->rn);
+fn(a->esz, rd_ofs, rn_ofs, a->imm, vsz, vsz);
+}
+return true;
+}
+
+static bool trans_SSRA(DisasContext *s, arg_rri_esz *a)
+{
+return do_sve2_fn2i(s, a, arm_gen_gvec_ssra);
+}
+
+static bool trans_USRA(DisasContext *s, arg_rri_esz *a)
+{
+return do_sve2_fn2i(s, a, arm_gen_gvec_usra);
+}
+
+static bool trans_SRSRA(DisasContext *s, arg_rri_esz *a)
+{
+return do_sve2_fn2i(s, a, arm_gen_gvec_srsra);
+}
+
+static bool trans_URSRA(DisasContext *s, arg_rri_esz *a)
+{
+return do_sve2_fn2i(s, a, arm_gen_gvec_ursra);
+}
-- 
2.20.1




[PATCH 26/31] target/arm: Create arm_gen_gvec_{sri,sli}

2020-03-26 Thread Richard Henderson
The functions eliminate duplication of the special cases for
this operation.  They match up with the GVecGen2iFn typedef.

Add out-of-line helpers.  We got away with only having inline
expanders because the neon vector size is only 16 bytes, and
we know that the inline expansion will always succeed.
When we reuse this for SVE, tcg-gvec-op may decide to use an
out-of-line helper due to longer vector lengths.

Signed-off-by: Richard Henderson 
---
 target/arm/helper.h|  10 ++
 target/arm/translate.h |   7 +-
 target/arm/translate-a64.c |  20 +---
 target/arm/translate.c | 186 +
 target/arm/vec_helper.c|  38 
 5 files changed, 160 insertions(+), 101 deletions(-)

diff --git a/target/arm/helper.h b/target/arm/helper.h
index 1ffd840f1d..5ef7bb158f 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -738,6 +738,16 @@ DEF_HELPER_FLAGS_3(gvec_ursra_h, TCG_CALL_NO_RWG, void, 
ptr, ptr, i32)
 DEF_HELPER_FLAGS_3(gvec_ursra_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
 DEF_HELPER_FLAGS_3(gvec_ursra_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_3(gvec_sri_b, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_sri_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_sri_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_sri_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(gvec_sli_b, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_sli_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_sli_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_sli_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
 #ifdef TARGET_AARCH64
 #include "helper-a64.h"
 #include "helper-sve.h"
diff --git a/target/arm/translate.h b/target/arm/translate.h
index 1c5cdf13e3..843ecc1472 100644
--- a/target/arm/translate.h
+++ b/target/arm/translate.h
@@ -291,8 +291,6 @@ extern const GVecGen3 mls_op[4];
 extern const GVecGen3 cmtst_op[4];
 extern const GVecGen3 sshl_op[4];
 extern const GVecGen3 ushl_op[4];
-extern const GVecGen2i sri_op[4];
-extern const GVecGen2i sli_op[4];
 extern const GVecGen4 uqadd_op[4];
 extern const GVecGen4 sqadd_op[4];
 extern const GVecGen4 uqsub_op[4];
@@ -308,6 +306,11 @@ void arm_gen_gvec_ssra(unsigned vece, uint32_t rd_ofs, 
uint32_t rm_ofs,
 void arm_gen_gvec_usra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
int64_t shift, uint32_t opr_sz, uint32_t max_sz);
 
+void arm_gen_gvec_sri(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+  int64_t shift, uint32_t opr_sz, uint32_t max_sz);
+void arm_gen_gvec_sli(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+  int64_t shift, uint32_t opr_sz, uint32_t max_sz);
+
 /*
  * Forward to the isar_feature_* tests given a DisasContext pointer.
  */
diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index 37ee85f867..f7d492cce4 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -680,16 +680,6 @@ static void gen_gvec_fn4(DisasContext *s, bool is_q, int 
rd, int rn, int rm,
 is_q ? 16 : 8, vec_full_reg_size(s));
 }
 
-/* Expand a 2-operand + immediate AdvSIMD vector operation using
- * an op descriptor.
- */
-static void gen_gvec_op2i(DisasContext *s, bool is_q, int rd,
-  int rn, int64_t imm, const GVecGen2i *gvec_op)
-{
-tcg_gen_gvec_2i(vec_full_reg_offset(s, rd), vec_full_reg_offset(s, rn),
-is_q ? 16 : 8, vec_full_reg_size(s), imm, gvec_op);
-}
-
 /* Expand a 3-operand AdvSIMD vector operation using an op descriptor.  */
 static void gen_gvec_op3(DisasContext *s, bool is_q, int rd,
  int rn, int rm, const GVecGen3 *gvec_op)
@@ -11132,12 +11122,9 @@ static void handle_vec_simd_shri(DisasContext *s, bool 
is_q, bool is_u,
 gen_gvec_fn2i(s, is_q, rd, rn, shift,
   is_u ? arm_gen_gvec_usra : arm_gen_gvec_ssra, size);
 return;
+
 case 0x08: /* SRI */
-/* Shift count same as element size is valid but does nothing.  */
-if (shift == 8 << size) {
-goto done;
-}
-gen_gvec_op2i(s, is_q, rd, rn, shift, &sri_op[size]);
+gen_gvec_fn2i(s, is_q, rd, rn, shift, arm_gen_gvec_sri, size);
 return;
 
 case 0x00: /* SSHR / USHR */
@@ -11188,7 +11175,6 @@ static void handle_vec_simd_shri(DisasContext *s, bool 
is_q, bool is_u,
 }
 tcg_temp_free_i64(tcg_round);
 
- done:
 clear_vec_high(s, is_q, rd);
 }
 
@@ -11213,7 +11199,7 @@ static void handle_vec_simd_shli(DisasContext *s, bool 
is_q, bool insert,
 }
 
 if (insert) {
-gen_gvec_op2i(s, is_q, rd, rn, shift, &sli_op[size]);
+gen_gvec_fn2i(s, is_q, rd, rn, shift, arm_gen_gvec_sli, size);
 } else {
 gen_gvec_fn2i(s, is_q, rd, rn, shift, tcg_gen_gvec_shli, size);
 }
diff --git a/target/arm/translate.c b/target/arm/translate.c
index 

[PATCH 31/31] target/arm: Implement SVE2 integer absolute difference and accumulate

2020-03-26 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/sve.decode  |  6 ++
 target/arm/translate-sve.c | 25 +
 2 files changed, 31 insertions(+)

diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index 9bf66e8ad4..6d565912e3 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -1266,3 +1266,9 @@ URSRA   01000101 .. 0 . 1110 11 . .  
@rd_rn_tszimm_shr
 
 SRI 01000101 .. 0 . 0 0 . .  @rd_rn_tszimm_shr
 SLI 01000101 .. 0 . 0 1 . .  @rd_rn_tszimm_shl
+
+## SVE2 integer absolute difference and accumulate
+
+# TODO: Use @rda and %reg_movprfx here.
+SABA01000101 .. 0 . 1 0 . .  @rd_rn_rm
+UABA01000101 .. 0 . 1 1 . .  @rd_rn_rm
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 7556cecfb3..42ef031b77 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -6361,3 +6361,28 @@ static bool trans_SLI(DisasContext *s, arg_rri_esz *a)
 {
 return do_sve2_fn2i(s, a, arm_gen_gvec_sli);
 }
+
+static bool do_sve2_fn3(DisasContext *s, arg_rrr_esz *a, GVecGen3Fn *fn)
+{
+if (!dc_isar_feature(aa64_sve2, s)) {
+return false;
+}
+if (sve_access_check(s)) {
+unsigned vsz = vec_full_reg_size(s);
+unsigned rd_ofs = vec_full_reg_offset(s, a->rd);
+unsigned rn_ofs = vec_full_reg_offset(s, a->rn);
+unsigned rm_ofs = vec_full_reg_offset(s, a->rm);
+fn(a->esz, rd_ofs, rn_ofs, rm_ofs, vsz, vsz);
+}
+return true;
+}
+
+static bool trans_SABA(DisasContext *s, arg_rrr_esz *a)
+{
+return do_sve2_fn3(s, a, arm_gen_gvec_saba);
+}
+
+static bool trans_UABA(DisasContext *s, arg_rrr_esz *a)
+{
+return do_sve2_fn3(s, a, arm_gen_gvec_uaba);
+}
-- 
2.20.1




[PATCH 16/31] target/arm: Tidy SVE tszimm shift formats

2020-03-26 Thread Richard Henderson
Rather than require the user to fill in the immediate (shl or shr),
create full formats that include the immediate.

Signed-off-by: Richard Henderson 
---
 target/arm/sve.decode | 35 ---
 1 file changed, 16 insertions(+), 19 deletions(-)

diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index 04bf9e5ce8..440cff4597 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -151,13 +151,17 @@
 @rd_rn_i6    ... rn:5 . imm:s6 rd:5 
 
 # Two register operand, one immediate operand, with predicate,
-# element size encoded as TSZHL.  User must fill in imm.
-@rdn_pg_tszimm   .. ... ... ... pg:3 . rd:5 \
-&rpri_esz rn=%reg_movprfx esz=%tszimm_esz
+# element size encoded as TSZHL.
+@rdn_pg_tszimm_shl   .. ... ... ... pg:3 . rd:5 \
+&rpri_esz rn=%reg_movprfx esz=%tszimm_esz imm=%tszimm_shl
+@rdn_pg_tszimm_shr   .. ... ... ... pg:3 . rd:5 \
+&rpri_esz rn=%reg_movprfx esz=%tszimm_esz imm=%tszimm_shr

 # Similarly without predicate.
-@rd_rn_tszimm    .. ... ... .. rn:5 rd:5 \
-&rri_esz esz=%tszimm16_esz
+@rd_rn_tszimm_shl    .. ... ... .. rn:5 rd:5 \
+&rri_esz esz=%tszimm16_esz imm=%tszimm16_shl
+@rd_rn_tszimm_shr    .. ... ... .. rn:5 rd:5 \
+&rri_esz esz=%tszimm16_esz imm=%tszimm16_shr
 
 # Two register operand, one immediate operand, with 4-bit predicate.
 # User must fill in imm.
@@ -290,14 +294,10 @@ UMINV   0100 .. 001 011 001 ... . .   
  @rd_pg_rn
 ### SVE Shift by Immediate - Predicated Group
 
 # SVE bitwise shift by immediate (predicated)
-ASR_zpzi0100 .. 000 000 100 ... .. ... . \
-@rdn_pg_tszimm imm=%tszimm_shr
-LSR_zpzi0100 .. 000 001 100 ... .. ... . \
-@rdn_pg_tszimm imm=%tszimm_shr
-LSL_zpzi0100 .. 000 011 100 ... .. ... . \
-@rdn_pg_tszimm imm=%tszimm_shl
-ASRD0100 .. 000 100 100 ... .. ... . \
-@rdn_pg_tszimm imm=%tszimm_shr
+ASR_zpzi0100 .. 000 000 100 ... .. ... .  @rdn_pg_tszimm_shr
+LSR_zpzi0100 .. 000 001 100 ... .. ... .  @rdn_pg_tszimm_shr
+LSL_zpzi0100 .. 000 011 100 ... .. ... .  @rdn_pg_tszimm_shl
+ASRD0100 .. 000 100 100 ... .. ... .  @rdn_pg_tszimm_shr
 
 # SVE bitwise shift by vector (predicated)
 ASR_zpzz0100 .. 010 000 100 ... . .   @rdn_pg_rm
@@ -401,12 +401,9 @@ RDVL0100 101 1 01010 imm:s6 rd:5
 ### SVE Bitwise Shift - Unpredicated Group
 
 # SVE bitwise shift by immediate (unpredicated)
-ASR_zzi 0100 .. 1 . 1001 00 . . \
-@rd_rn_tszimm imm=%tszimm16_shr
-LSR_zzi 0100 .. 1 . 1001 01 . . \
-@rd_rn_tszimm imm=%tszimm16_shr
-LSL_zzi 0100 .. 1 . 1001 11 . . \
-@rd_rn_tszimm imm=%tszimm16_shl
+ASR_zzi 0100 .. 1 . 1001 00 . .  @rd_rn_tszimm_shr
+LSR_zzi 0100 .. 1 . 1001 01 . .  @rd_rn_tszimm_shr
+LSL_zzi 0100 .. 1 . 1001 11 . .  @rd_rn_tszimm_shl
 
 # SVE bitwise shift by wide elements (unpredicated)
 # Note esz != 3
-- 
2.20.1




[PATCH 21/31] target/arm: Implement SVE2 integer absolute difference and accumulate long

2020-03-26 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h| 14 ++
 target/arm/sve.decode  | 12 +
 target/arm/sve_helper.c| 24 +
 target/arm/translate-sve.c | 54 ++
 4 files changed, 104 insertions(+)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index 0e4b4c48da..b48a88135f 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -2410,3 +2410,17 @@ DEF_HELPER_FLAGS_4(sve2_sqcadd_b, TCG_CALL_NO_RWG, void, 
ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(sve2_sqcadd_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(sve2_sqcadd_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(sve2_sqcadd_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve2_sabal_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_sabal_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_sabal_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve2_uabal_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_uabal_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_uabal_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index 5fb4b5f977..f66a6c242f 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -70,6 +70,7 @@
&rpr_s  rd pg rn s
&rprr_s rd pg rn rm s
&rprr_esz   rd pg rn rm esz
+&rrrr_esz   rd ra rn rm esz
&rprrr_esz  rd pg rn rm ra esz
&rpri_esz   rd pg rn imm esz
&ptrue  rd esz pat s
@@ -120,6 +121,10 @@
 @rdn_i8s esz:2 .. ... imm:s8 rd:5 \
 _esz rn=%reg_movprfx
 
+# Four operand, vector element size
+@rda_rn_rm   esz:2 . rm:5 ... ... rn:5 rd:5 \
+&rrrr_esz ra=%reg_movprfx
+
 # Three operand with "memory" size, aka immediate left shift
 @rd_rn_msz_rm    ... rm:5  imm:2 rn:5 rd:5  
 
@@ -1235,3 +1240,10 @@ CADD_rot90  01000101 .. 0 0 11011 0 . .  
@rdn_rm
 CADD_rot270 01000101 .. 0 0 11011 1 . .  @rdn_rm
 SQCADD_rot9001000101 .. 0 1 11011 0 . .  @rdn_rm
 SQCADD_rot270   01000101 .. 0 1 11011 1 . .  @rdn_rm
+
+## SVE2 integer absolute difference and accumulate long
+
+SABALB  01000101 .. 0 . 1100 00 . .  @rda_rn_rm
+SABALT  01000101 .. 0 . 1100 01 . .  @rda_rn_rm
+UABALB  01000101 .. 0 . 1100 10 . .  @rda_rn_rm
+UABALT  01000101 .. 0 . 1100 11 . .  @rda_rn_rm
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index a3653007ac..a0995d95c7 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -1216,6 +1216,30 @@ DO_ZZZ_NTB(sve2_eoril_d, uint64_t, , DO_EOR)
 
 #undef DO_ZZZ_NTB
 
+#define DO_ABAL(NAME, TYPE, TYPEN) \
+void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, uint32_t desc) \
+{  \
+intptr_t i, opr_sz = simd_oprsz(desc); \
+int sel1 = (simd_data(desc) & 1) * sizeof(TYPE);   \
+int sel2 = (simd_data(desc) & 2) * (sizeof(TYPE) / 2); \
+for (i = 0; i < opr_sz; i += sizeof(TYPE)) {   \
+TYPE nn = (TYPEN)(*(TYPE *)(vn + i) >> sel1);  \
+TYPE mm = (TYPEN)(*(TYPE *)(vm + i) >> sel2);  \
+TYPE aa = *(TYPE *)(va + i);   \
+*(TYPE *)(vd + i) = DO_ABD(nn, mm) + aa;   \
+}  \
+}
+
+DO_ABAL(sve2_sabal_h, int16_t, int8_t)
+DO_ABAL(sve2_sabal_s, int32_t, int16_t)
+DO_ABAL(sve2_sabal_d, int64_t, int32_t)
+
+DO_ABAL(sve2_uabal_h, uint16_t, uint8_t)
+DO_ABAL(sve2_uabal_s, uint32_t, uint16_t)
+DO_ABAL(sve2_uabal_d, uint64_t, uint32_t)
+
+#undef DO_ABAL
+
 #define DO_BITPERM(NAME, TYPE, OP) \
 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
 {  \
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 3b0aa86e79..c6161d2ce2 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -6240,3 +6240,57 @@ static bool trans_SQCADD_rot270(DisasContext *s, 
arg_rrr_esz *a)
 {
 return do_cadd(s, a, true, true);
 }
+
+static bool do_sve2_zzzz_ool(DisasContext *s, arg_rrrr_esz *a,
+ gen_helper_gvec_4 *fn, int data)
+{
+if (fn == NULL || !dc_isar_feature(aa64_sve2, s)) {
+return false;
+}
+if (sve_access_check(s)) {
+unsigned vsz = vec_full_reg_size(s);
+tcg_gen_gvec_4_ool(vec_full_reg_offset(s, a->rd),
+   vec_full_reg_offset(s, a->ra),
+   

[PATCH 27/31] target/arm: Tidy handle_vec_simd_shri

2020-03-26 Thread Richard Henderson
Now that we've converted all cases to gvec, there is quite a bit
of dead code at the end of the function.  Remove it.

Sink the call to gen_gvec_fn2i to the end, loading a function
pointer within the switch statement.

Signed-off-by: Richard Henderson 
---
 target/arm/translate-a64.c | 56 ++
 1 file changed, 14 insertions(+), 42 deletions(-)

diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index f7d492cce4..fc156a217a 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -11096,16 +11096,7 @@ static void handle_vec_simd_shri(DisasContext *s, bool 
is_q, bool is_u,
 int size = 32 - clz32(immh) - 1;
 int immhb = immh << 3 | immb;
 int shift = 2 * (8 << size) - immhb;
-bool accumulate = false;
-int dsize = is_q ? 128 : 64;
-int esize = 8 << size;
-int elements = dsize/esize;
-MemOp memop = size | (is_u ? 0 : MO_SIGN);
-TCGv_i64 tcg_rn = new_tmp_a64(s);
-TCGv_i64 tcg_rd = new_tmp_a64(s);
-TCGv_i64 tcg_round;
-uint64_t round_const;
-int i;
+GVecGen2iFn *gvec_fn;
 
 if (extract32(immh, 3, 1) && !is_q) {
 unallocated_encoding(s);
@@ -9,13 +0,12 @@ static void handle_vec_simd_shri(DisasContext *s, 
bool is_q, bool is_u,
 
 switch (opcode) {
 case 0x02: /* SSRA / USRA (accumulate) */
-gen_gvec_fn2i(s, is_q, rd, rn, shift,
-  is_u ? arm_gen_gvec_usra : arm_gen_gvec_ssra, size);
-return;
+gvec_fn = is_u ? arm_gen_gvec_usra : arm_gen_gvec_ssra;
+break;
 
 case 0x08: /* SRI */
-gen_gvec_fn2i(s, is_q, rd, rn, shift, arm_gen_gvec_sri, size);
-return;
+gvec_fn = arm_gen_gvec_sri;
+break;
 
 case 0x00: /* SSHR / USHR */
 if (is_u) {
@@ -11133,49 +11123,31 @@ static void handle_vec_simd_shri(DisasContext *s, 
bool is_q, bool is_u,
 /* Shift count the same size as element size produces zero.  */
 tcg_gen_gvec_dup8i(vec_full_reg_offset(s, rd),
is_q ? 16 : 8, vec_full_reg_size(s), 0);
-} else {
-gen_gvec_fn2i(s, is_q, rd, rn, shift, tcg_gen_gvec_shri, size);
+return;
 }
+gvec_fn = tcg_gen_gvec_shri;
 } else {
 /* Shift count the same size as element size produces all sign.  */
 if (shift == 8 << size) {
 shift -= 1;
 }
-gen_gvec_fn2i(s, is_q, rd, rn, shift, tcg_gen_gvec_sari, size);
+gvec_fn = tcg_gen_gvec_sari;
 }
-return;
+break;
 
 case 0x04: /* SRSHR / URSHR (rounding) */
-gen_gvec_fn2i(s, is_q, rd, rn, shift,
-  is_u ? arm_gen_gvec_urshr : arm_gen_gvec_srshr, size);
-return;
+gvec_fn = is_u ? arm_gen_gvec_urshr : arm_gen_gvec_srshr;
+break;
 
 case 0x06: /* SRSRA / URSRA (accum + rounding) */
-gen_gvec_fn2i(s, is_q, rd, rn, shift,
-  is_u ? arm_gen_gvec_ursra : arm_gen_gvec_srsra, size);
-return;
+gvec_fn = is_u ? arm_gen_gvec_ursra : arm_gen_gvec_srsra;
+break;
 
 default:
 g_assert_not_reached();
 }
 
-round_const = 1ULL << (shift - 1);
-tcg_round = tcg_const_i64(round_const);
-
-for (i = 0; i < elements; i++) {
-read_vec_element(s, tcg_rn, rn, i, memop);
-if (accumulate) {
-read_vec_element(s, tcg_rd, rd, i, memop);
-}
-
-handle_shri_with_rndacc(tcg_rd, tcg_rn, tcg_round,
-accumulate, is_u, size, shift);
-
-write_vec_element(s, tcg_rd, rd, i, size);
-}
-tcg_temp_free_i64(tcg_round);
-
-clear_vec_high(s, is_q, rd);
+gen_gvec_fn2i(s, is_q, rd, rn, shift, gvec_fn, size);
 }
 
 /* SHL/SLI - Vector shift left */
-- 
2.20.1




[PATCH 28/31] target/arm: Implement SVE2 bitwise shift and insert

2020-03-26 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/sve.decode  |  5 +
 target/arm/translate-sve.c | 10 ++
 2 files changed, 15 insertions(+)

diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index 756f939df1..9bf66e8ad4 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -1261,3 +1261,8 @@ SSRA01000101 .. 0 . 1110 00 . .  
@rd_rn_tszimm_shr
 USRA01000101 .. 0 . 1110 01 . .  @rd_rn_tszimm_shr
 SRSRA   01000101 .. 0 . 1110 10 . .  @rd_rn_tszimm_shr
 URSRA   01000101 .. 0 . 1110 11 . .  @rd_rn_tszimm_shr
+
+## SVE2 bitwise shift and insert
+
+SRI 01000101 .. 0 . 0 0 . .  @rd_rn_tszimm_shr
+SLI 01000101 .. 0 . 0 1 . .  @rd_rn_tszimm_shl
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 1d1f55dfdd..7556cecfb3 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -6351,3 +6351,13 @@ static bool trans_URSRA(DisasContext *s, arg_rri_esz *a)
 {
 return do_sve2_fn2i(s, a, arm_gen_gvec_ursra);
 }
+
+static bool trans_SRI(DisasContext *s, arg_rri_esz *a)
+{
+return do_sve2_fn2i(s, a, arm_gen_gvec_sri);
+}
+
+static bool trans_SLI(DisasContext *s, arg_rri_esz *a)
+{
+return do_sve2_fn2i(s, a, arm_gen_gvec_sli);
+}
-- 
2.20.1




[PATCH 15/31] target/arm: Implement PMULLB and PMULLT

2020-03-26 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/cpu.h   | 10 ++
 target/arm/helper-sve.h|  1 +
 target/arm/sve.decode  |  2 ++
 target/arm/translate-sve.c | 22 ++
 target/arm/vec_helper.c| 26 ++
 5 files changed, 61 insertions(+)

diff --git a/target/arm/cpu.h b/target/arm/cpu.h
index 2314e3c18c..2e9d9f046d 100644
--- a/target/arm/cpu.h
+++ b/target/arm/cpu.h
@@ -3855,6 +3855,16 @@ static inline bool isar_feature_aa64_sve2(const 
ARMISARegisters *id)
 return FIELD_EX64(id->id_aa64zfr0, ID_AA64ZFR0, SVEVER) != 0;
 }
 
+static inline bool isar_feature_aa64_sve2_aes(const ARMISARegisters *id)
+{
+return FIELD_EX64(id->id_aa64zfr0, ID_AA64ZFR0, AES) != 0;
+}
+
+static inline bool isar_feature_aa64_sve2_pmull128(const ARMISARegisters *id)
+{
+return FIELD_EX64(id->id_aa64zfr0, ID_AA64ZFR0, AES) >= 2;
+}
+
 /*
  * Feature tests for "does this exist in either 32-bit or 64-bit?"
  */
diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index c4784919d2..943839e2dc 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -2371,3 +2371,4 @@ DEF_HELPER_FLAGS_4(sve2_umull_zzz_s, TCG_CALL_NO_RWG, 
void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(sve2_umull_zzz_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 
 DEF_HELPER_FLAGS_4(sve2_pmull_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_pmull_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index 2410dd85a1..04bf9e5ce8 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -1204,6 +1204,8 @@ USUBWT  01000101 .. 0 . 010 111 . .  
@rd_rn_rm
 
 SQDMULLB_zzz01000101 .. 0 . 011 000 . .  @rd_rn_rm
 SQDMULLT_zzz01000101 .. 0 . 011 001 . .  @rd_rn_rm
+PMULLB  01000101 .. 0 . 011 010 . .  @rd_rn_rm
+PMULLT  01000101 .. 0 . 011 011 . .  @rd_rn_rm
 SMULLB_zzz  01000101 .. 0 . 011 100 . .  @rd_rn_rm
 SMULLT_zzz  01000101 .. 0 . 011 101 . .  @rd_rn_rm
 UMULLB_zzz  01000101 .. 0 . 011 110 . .  @rd_rn_rm
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index c66ec9eb83..67416a25ce 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -6064,6 +6064,28 @@ DO_SVE2_ZZZ_TB(SMULLT_zzz, smull_zzz, true, true)
 DO_SVE2_ZZZ_TB(UMULLB_zzz, umull_zzz, false, false)
 DO_SVE2_ZZZ_TB(UMULLT_zzz, umull_zzz, true, true)
 
+static bool do_trans_pmull(DisasContext *s, arg_rrr_esz *a, bool sel)
+{
+static gen_helper_gvec_3 * const fns[4] = {
+gen_helper_gvec_pmull_q, gen_helper_sve2_pmull_h,
+NULL,gen_helper_sve2_pmull_d,
+};
+if (a->esz == 0 && !dc_isar_feature(aa64_sve2_pmull128, s)) {
+return false;
+}
+return do_sve2_zzw_ool(s, a, fns[a->esz], sel);
+}
+
+static bool trans_PMULLB(DisasContext *s, arg_rrr_esz *a)
+{
+return do_trans_pmull(s, a, false);
+}
+
+static bool trans_PMULLT(DisasContext *s, arg_rrr_esz *a)
+{
+return do_trans_pmull(s, a, true);
+}
+
 #define DO_SVE2_ZZZ_WTB(NAME, name, SEL2) \
 static bool trans_##NAME(DisasContext *s, arg_rrr_esz *a)   \
 {   \
diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
index 00dc38c9db..154d32518a 100644
--- a/target/arm/vec_helper.c
+++ b/target/arm/vec_helper.c
@@ -1256,6 +1256,32 @@ void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, 
uint32_t desc)
 d[i] = pmull_h(nn, mm);
 }
 }
+
+static uint64_t pmull_d(uint64_t op1, uint64_t op2)
+{
+uint64_t result = 0;
+int i;
+
+for (i = 0; i < 32; ++i) {
+uint64_t mask = -((op1 >> i) & 1);
+result ^= (op2 << i) & mask;
+}
+return result;
+}
+
+void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+int shift = simd_data(desc) * 32;
+intptr_t i, opr_sz = simd_oprsz(desc);
+uint64_t *d = vd, *n = vn, *m = vm;
+
+for (i = 0; i < opr_sz / 8; ++i) {
+uint64_t nn = (uint32_t)(n[i] >> shift);
+uint64_t mm = (uint32_t)(m[i] >> shift);
+
+d[i] = pmull_d(nn, mm);
+}
+}
 #endif
 
 /*
-- 
2.20.1




[PATCH 20/31] target/arm: Implement SVE2 complex integer add

2020-03-26 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h| 10 +
 target/arm/sve.decode  |  9 
 target/arm/sve_helper.c| 42 ++
 target/arm/translate-sve.c | 31 
 4 files changed, 92 insertions(+)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index 466b01986f..0e4b4c48da 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -2400,3 +2400,13 @@ DEF_HELPER_FLAGS_4(sve2_bgrp_b, TCG_CALL_NO_RWG, void, 
ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(sve2_bgrp_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(sve2_bgrp_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(sve2_bgrp_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve2_cadd_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_cadd_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_cadd_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_cadd_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve2_sqcadd_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_sqcadd_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_sqcadd_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_sqcadd_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index ca60e9f2ce..5fb4b5f977 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -1226,3 +1226,12 @@ EORTB   01000101 .. 0 . 10010 1 . .  
@rd_rn_rm
 BEXT01000101 .. 0 . 1011 00 . .  @rd_rn_rm
 BDEP01000101 .. 0 . 1011 01 . .  @rd_rn_rm
 BGRP01000101 .. 0 . 1011 10 . .  @rd_rn_rm
+
+### SVE2 Accumulate
+
+## SVE2 complex integer add
+
+CADD_rot90  01000101 .. 0 0 11011 0 . .  @rdn_rm
+CADD_rot270 01000101 .. 0 0 11011 1 . .  @rdn_rm
+SQCADD_rot9001000101 .. 0 1 11011 0 . .  @rdn_rm
+SQCADD_rot270   01000101 .. 0 1 11011 1 . .  @rdn_rm
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index b5afa34efe..a3653007ac 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -1289,6 +1289,48 @@ DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup)
 
 #undef DO_BITPERM
 
+#define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP)  \
+void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
+{   \
+intptr_t i, opr_sz = simd_oprsz(desc);  \
+int sub_r = simd_data(desc);\
+if (sub_r) {\
+for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {\
+TYPE acc_r = *(TYPE *)(vn + H(i));  \
+TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE)));   \
+TYPE el2_r = *(TYPE *)(vm + H(i));  \
+TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE)));   \
+acc_r = SUB_OP(acc_r, el2_i);   \
+acc_i = ADD_OP(acc_i, el2_r);   \
+*(TYPE *)(vd + H(i)) = acc_r;   \
+*(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i;\
+}   \
+} else {\
+for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {\
+TYPE acc_r = *(TYPE *)(vn + H(i));  \
+TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE)));   \
+TYPE el2_r = *(TYPE *)(vm + H(i));  \
+TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE)));   \
+acc_r = ADD_OP(acc_r, el2_i);   \
+acc_i = SUB_OP(acc_i, el2_r);   \
+*(TYPE *)(vd + H(i)) = acc_r;   \
+*(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i;\
+}   \
+}   \
+}
+
+DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB)
+DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB)
+DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB)
+DO_CADD(sve2_cadd_d, int64_t, , DO_ADD, DO_SUB)
+
+DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B)
+DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H)
+DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S)
+DO_CADD(sve2_sqcadd_d, int64_t, , do_sqadd_d, do_sqsub_d)
+
+#undef DO_CADD
+
 #define DO_ZZI_SHLL(NAME, TYPE, TYPEN, OP) \
 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)   \
 {  \
diff --git a/target/arm/translate-sve.c 

[PATCH 23/31] target/arm: Create arm_gen_gvec_[us]sra

2020-03-26 Thread Richard Henderson
The functions eliminate duplication of the special cases for
this operation.  They match up with the GVecGen2iFn typedef.

Add out-of-line helpers.  We got away with only having inline
expanders because the neon vector size is only 16 bytes, and
we know that the inline expansion will always succeed.
When we reuse this for SVE, tcg-gvec-op may decide to use an
out-of-line helper due to longer vector lengths.

Signed-off-by: Richard Henderson 
---
 target/arm/helper.h|  10 +++
 target/arm/translate.h |   7 +-
 target/arm/translate-a64.c |  15 +---
 target/arm/translate.c | 161 ++---
 target/arm/vec_helper.c|  25 ++
 5 files changed, 139 insertions(+), 79 deletions(-)

diff --git a/target/arm/helper.h b/target/arm/helper.h
index 938fdbc362..dc6a43dbd8 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -708,6 +708,16 @@ DEF_HELPER_FLAGS_4(gvec_pmull_q, TCG_CALL_NO_RWG, void, 
ptr, ptr, ptr, i32)
 
 DEF_HELPER_FLAGS_4(neon_pmull_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_3(gvec_ssra_b, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_ssra_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_ssra_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_ssra_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(gvec_usra_b, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_usra_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_usra_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_usra_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
 #ifdef TARGET_AARCH64
 #include "helper-a64.h"
 #include "helper-sve.h"
diff --git a/target/arm/translate.h b/target/arm/translate.h
index 5552ee5a94..1c5cdf13e3 100644
--- a/target/arm/translate.h
+++ b/target/arm/translate.h
@@ -291,8 +291,6 @@ extern const GVecGen3 mls_op[4];
 extern const GVecGen3 cmtst_op[4];
 extern const GVecGen3 sshl_op[4];
 extern const GVecGen3 ushl_op[4];
-extern const GVecGen2i ssra_op[4];
-extern const GVecGen2i usra_op[4];
 extern const GVecGen2i sri_op[4];
 extern const GVecGen2i sli_op[4];
 extern const GVecGen4 uqadd_op[4];
@@ -305,6 +303,11 @@ void gen_sshl_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b);
 void gen_ushl_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
 void gen_sshl_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
 
+void arm_gen_gvec_ssra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+   int64_t shift, uint32_t opr_sz, uint32_t max_sz);
+void arm_gen_gvec_usra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+   int64_t shift, uint32_t opr_sz, uint32_t max_sz);
+
 /*
  * Forward to the isar_feature_* tests given a DisasContext pointer.
  */
diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index 2bcf643069..d50207fcfb 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -10682,19 +10682,8 @@ static void handle_vec_simd_shri(DisasContext *s, bool 
is_q, bool is_u,
 
 switch (opcode) {
 case 0x02: /* SSRA / USRA (accumulate) */
-if (is_u) {
-/* Shift count same as element size produces zero to add.  */
-if (shift == 8 << size) {
-goto done;
-}
-gen_gvec_op2i(s, is_q, rd, rn, shift, &usra_op[size]);
-} else {
-/* Shift count same as element size produces all sign to add.  */
-if (shift == 8 << size) {
-shift -= 1;
-}
-gen_gvec_op2i(s, is_q, rd, rn, shift, &ssra_op[size]);
-}
+gen_gvec_fn2i(s, is_q, rd, rn, shift,
+  is_u ? arm_gen_gvec_usra : arm_gen_gvec_ssra, size);
 return;
 case 0x08: /* SRI */
 /* Shift count same as element size is valid but does nothing.  */
diff --git a/target/arm/translate.c b/target/arm/translate.c
index cba84987db..f5768014d1 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -3947,33 +3947,51 @@ static void gen_ssra_vec(unsigned vece, TCGv_vec d, 
TCGv_vec a, int64_t sh)
 tcg_gen_add_vec(vece, d, d, a);
 }
 
-static const TCGOpcode vecop_list_ssra[] = {
-INDEX_op_sari_vec, INDEX_op_add_vec, 0
-};
+void arm_gen_gvec_ssra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+   int64_t shift, uint32_t opr_sz, uint32_t max_sz)
+{
+static const TCGOpcode vecop_list[] = {
+INDEX_op_sari_vec, INDEX_op_add_vec, 0
+};
+static const GVecGen2i ops[4] = {
+{ .fni8 = gen_ssra8_i64,
+  .fniv = gen_ssra_vec,
+  .fno = gen_helper_gvec_ssra_b,
+  .load_dest = true,
+  .opt_opc = vecop_list,
+  .vece = MO_8 },
+{ .fni8 = gen_ssra16_i64,
+  .fniv = gen_ssra_vec,
+  .fno = gen_helper_gvec_ssra_h,
+  .load_dest = true,
+  .opt_opc = vecop_list,
+  .vece = MO_16 },
+{ .fni4 = gen_ssra32_i32,
+  .fniv = gen_ssra_vec,
+  

[PATCH 24/31] target/arm: Create arm_gen_gvec_{u,s}{rshr,rsra}

2020-03-26 Thread Richard Henderson
Create vectorized versions of handle_shri_with_rndacc
for shift+round and shift+round+accumulate.  Add out-of-line
helpers in preparation for longer vector lengths from SVE.

Signed-off-by: Richard Henderson 
---
 target/arm/helper.h|  20 ++
 target/arm/translate-a64.h |   9 +
 target/arm/translate-a64.c | 458 -
 target/arm/vec_helper.c|  50 
 4 files changed, 534 insertions(+), 3 deletions(-)

diff --git a/target/arm/helper.h b/target/arm/helper.h
index dc6a43dbd8..1ffd840f1d 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -718,6 +718,26 @@ DEF_HELPER_FLAGS_3(gvec_usra_h, TCG_CALL_NO_RWG, void, 
ptr, ptr, i32)
 DEF_HELPER_FLAGS_3(gvec_usra_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
 DEF_HELPER_FLAGS_3(gvec_usra_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_3(gvec_srshr_b, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_srshr_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_srshr_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_srshr_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(gvec_urshr_b, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_urshr_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_urshr_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_urshr_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(gvec_srsra_b, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_srsra_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_srsra_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_srsra_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(gvec_ursra_b, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_ursra_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_ursra_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_ursra_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
 #ifdef TARGET_AARCH64
 #include "helper-a64.h"
 #include "helper-sve.h"
diff --git a/target/arm/translate-a64.h b/target/arm/translate-a64.h
index 65c0280498..7846e91e51 100644
--- a/target/arm/translate-a64.h
+++ b/target/arm/translate-a64.h
@@ -129,4 +129,13 @@ typedef void GVecGen3Fn(unsigned, uint32_t, uint32_t,
 typedef void GVecGen4Fn(unsigned, uint32_t, uint32_t, uint32_t,
 uint32_t, uint32_t, uint32_t);
 
+void arm_gen_gvec_srshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+int64_t shift, uint32_t opr_sz, uint32_t max_sz);
+void arm_gen_gvec_urshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+int64_t shift, uint32_t opr_sz, uint32_t max_sz);
+void arm_gen_gvec_srsra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+int64_t shift, uint32_t opr_sz, uint32_t max_sz);
+void arm_gen_gvec_ursra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+int64_t shift, uint32_t opr_sz, uint32_t max_sz);
+
 #endif /* TARGET_ARM_TRANSLATE_A64_H */
diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index d50207fcfb..37ee85f867 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -8561,6 +8561,453 @@ static void handle_shri_with_rndacc(TCGv_i64 tcg_res, 
TCGv_i64 tcg_src,
 }
 }
 
+static void gen_srshr8_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
+{
+TCGv_i64 t = tcg_temp_new_i64();
+TCGv_i64 ones = tcg_const_i64(dup_const(MO_8, 1));
+
+/* Shift one less than the requested amount. */
+if (shift > 1) {
+tcg_gen_vec_sar8i_i64(a, a, shift - 1);
+}
+
+/* The low bit is the rounding bit.  Mask it off.  */
+tcg_gen_and_i64(t, a, ones);
+
+/* Finish the shift. */
+tcg_gen_vec_sar8i_i64(d, a, 1);
+
+/* Round. */
+tcg_gen_vec_add8_i64(d, d, t);
+
+tcg_temp_free_i64(t);
+tcg_temp_free_i64(ones);
+}
+
+static void gen_srshr16_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
+{
+TCGv_i64 t = tcg_temp_new_i64();
+TCGv_i64 ones = tcg_const_i64(dup_const(MO_16, 1));
+
+if (shift > 1) {
+tcg_gen_vec_sar16i_i64(a, a, shift - 1);
+}
+tcg_gen_and_i64(t, a, ones);
+tcg_gen_vec_sar16i_i64(d, a, 1);
+tcg_gen_vec_add16_i64(d, d, t);
+
+tcg_temp_free_i64(t);
+tcg_temp_free_i64(ones);
+}
+
+static void gen_srshr32_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
+{
+TCGv_i32 t = tcg_temp_new_i32();
+
+tcg_gen_sari_i32(a, a, shift - 1);
+tcg_gen_andi_i32(t, a, 1);
+tcg_gen_sari_i32(d, a, 1);
+tcg_gen_add_i32(d, d, t);
+
+tcg_temp_free_i32(t);
+}
+
+static void gen_srshr64_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
+{
+TCGv_i64 t = tcg_temp_new_i64();
+
+tcg_gen_sari_i64(a, a, shift - 1);
+tcg_gen_andi_i64(t, a, 1);
+tcg_gen_sari_i64(d, a, 1);
+tcg_gen_add_i64(d, d, t);
+
+tcg_temp_free_i64(t);
+}
+
+static void gen_srshr_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t shift)
+{
+

[PATCH 19/31] target/arm: Implement SVE2 bitwise permute

2020-03-26 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/cpu.h   |  5 +++
 target/arm/helper-sve.h| 15 
 target/arm/sve.decode  |  6 
 target/arm/sve_helper.c| 73 ++
 target/arm/translate-sve.c | 36 +++
 5 files changed, 135 insertions(+)

diff --git a/target/arm/cpu.h b/target/arm/cpu.h
index 2e9d9f046d..b7c7946771 100644
--- a/target/arm/cpu.h
+++ b/target/arm/cpu.h
@@ -3865,6 +3865,11 @@ static inline bool isar_feature_aa64_sve2_pmull128(const 
ARMISARegisters *id)
 return FIELD_EX64(id->id_aa64zfr0, ID_AA64ZFR0, AES) >= 2;
 }
 
+static inline bool isar_feature_aa64_sve2_bitperm(const ARMISARegisters *id)
+{
+return FIELD_EX64(id->id_aa64zfr0, ID_AA64ZFR0, BITPERM) != 0;
+}
+
 /*
  * Feature tests for "does this exist in either 32-bit or 64-bit?"
  */
diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index 9e894a2b55..466b01986f 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -2385,3 +2385,18 @@ DEF_HELPER_FLAGS_4(sve2_eoril_b, TCG_CALL_NO_RWG, void, 
ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(sve2_eoril_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(sve2_eoril_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(sve2_eoril_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve2_bext_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_bext_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_bext_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_bext_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve2_bdep_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_bdep_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_bdep_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_bdep_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve2_bgrp_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_bgrp_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_bgrp_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_bgrp_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index 8af35e48a5..ca60e9f2ce 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -1220,3 +1220,9 @@ USHLLT  01000101 .. 0 . 1010 11 . .  
@rd_rn_tszimm_shl
 
 EORBT   01000101 .. 0 . 10010 0 . .  @rd_rn_rm
 EORTB   01000101 .. 0 . 10010 1 . .  @rd_rn_rm
+
+## SVE2 bitwise permute
+
+BEXT01000101 .. 0 . 1011 00 . .  @rd_rn_rm
+BDEP01000101 .. 0 . 1011 01 . .  @rd_rn_rm
+BGRP01000101 .. 0 . 1011 10 . .  @rd_rn_rm
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index 15ea1fd524..b5afa34efe 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -1216,6 +1216,79 @@ DO_ZZZ_NTB(sve2_eoril_d, uint64_t, , DO_EOR)
 
 #undef DO_ZZZ_NTB
 
+#define DO_BITPERM(NAME, TYPE, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
+{  \
+intptr_t i, opr_sz = simd_oprsz(desc); \
+for (i = 0; i < opr_sz; i += sizeof(TYPE)) {   \
+TYPE nn = *(TYPE *)(vn + i);   \
+TYPE mm = *(TYPE *)(vm + i);   \
+*(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8);  \
+}  \
+}
+
+static uint64_t bitextract(uint64_t data, uint64_t mask, int n)
+{
+uint64_t res = 0;
+int db, rb = 0;
+
+for (db = 0; db < n; ++db) {
+if ((mask >> db) & 1) {
+res |= ((data >> db) & 1) << rb;
+++rb;
+}
+}
+return res;
+}
+
+DO_BITPERM(sve2_bext_b, uint8_t, bitextract)
+DO_BITPERM(sve2_bext_h, uint16_t, bitextract)
+DO_BITPERM(sve2_bext_s, uint32_t, bitextract)
+DO_BITPERM(sve2_bext_d, uint64_t, bitextract)
+
+static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n)
+{
+uint64_t res = 0;
+int rb, db = 0;
+
+for (rb = 0; rb < n; ++rb) {
+if ((mask >> rb) & 1) {
+res |= ((data >> db) & 1) << rb;
+++db;
+}
+}
+return res;
+}
+
+DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit)
+DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit)
+DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit)
+DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit)
+
+static uint64_t bitgroup(uint64_t data, uint64_t mask, int n)
+{
+uint64_t resm = 0, resu = 0;
+int db, rbm = 0, rbu = 0;
+
+for (db = 0; db < n; ++db) {
+uint64_t val = (data >> db) & 1;
+if ((mask >> db) & 1) {
+resm |= val << rbm++;
+} else {
+resu |= val << rbu++;
+}
+

[PATCH 14/31] target/arm: Implement SVE2 integer multiply long

2020-03-26 Thread Richard Henderson
Exclude PMULL from this category for the moment.

Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h| 15 +++
 target/arm/sve.decode  |  9 +
 target/arm/sve_helper.c| 31 +++
 target/arm/translate-sve.c |  9 +
 4 files changed, 64 insertions(+)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index 6a95c6085c..c4784919d2 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -2355,4 +2355,19 @@ DEF_HELPER_FLAGS_6(sve_stdd_le_zd_mte, TCG_CALL_NO_WG,
 DEF_HELPER_FLAGS_6(sve_stdd_be_zd_mte, TCG_CALL_NO_WG,
void, env, ptr, ptr, ptr, tl, i32)
 
+DEF_HELPER_FLAGS_4(sve2_sqdmull_zzz_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_sqdmull_zzz_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_sqdmull_zzz_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve2_smull_zzz_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_smull_zzz_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_smull_zzz_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve2_umull_zzz_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_umull_zzz_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_umull_zzz_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_4(sve2_pmull_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index 9994e1eb71..2410dd85a1 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -1199,3 +1199,12 @@ SSUBWB  01000101 .. 0 . 010 100 . .  
@rd_rn_rm
 SSUBWT  01000101 .. 0 . 010 101 . .  @rd_rn_rm
 USUBWB  01000101 .. 0 . 010 110 . .  @rd_rn_rm
 USUBWT  01000101 .. 0 . 010 111 . .  @rd_rn_rm
+
+## SVE2 integer multiply long
+
+SQDMULLB_zzz01000101 .. 0 . 011 000 . .  @rd_rn_rm
+SQDMULLT_zzz01000101 .. 0 . 011 001 . .  @rd_rn_rm
+SMULLB_zzz  01000101 .. 0 . 011 100 . .  @rd_rn_rm
+SMULLT_zzz  01000101 .. 0 . 011 101 . .  @rd_rn_rm
+UMULLB_zzz  01000101 .. 0 . 011 110 . .  @rd_rn_rm
+UMULLT_zzz  01000101 .. 0 . 011 111 . .  @rd_rn_rm
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index 44503626e4..130697f3d9 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -1129,6 +1129,37 @@ DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, DO_ABD)
 DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, DO_ABD)
 DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, DO_ABD)
 
+DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, DO_MUL)
+DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, DO_MUL)
+DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, DO_MUL)
+
+DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, DO_MUL)
+DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, DO_MUL)
+DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, DO_MUL)
+
+/* Note that the multiply cannot overflow, but the doubling can. */
+static inline int16_t do_sqdmull_h(int16_t n, int16_t m)
+{
+int16_t val = n * m;
+return DO_SQADD_H(val, val);
+}
+
+static inline int32_t do_sqdmull_s(int32_t n, int32_t m)
+{
+int32_t val = n * m;
+return DO_SQADD_S(val, val);
+}
+
+static inline int64_t do_sqdmull_d(int64_t n, int64_t m)
+{
+int64_t val = n * m;
+return do_sqadd_d(val, val);
+}
+
+DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, do_sqdmull_h)
+DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, do_sqdmull_s)
+DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, do_sqdmull_d)
+
 #undef DO_ZZZ_TB
 
 #define DO_ZZZ_WTB(NAME, TYPE, TYPEN, OP) \
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index fb214360bf..c66ec9eb83 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -6055,6 +6055,15 @@ DO_SVE2_ZZZ_TB(UABDLT, uabdl, true, true)
 DO_SVE2_ZZZ_TB(SADDLBT, saddl, false, true)
 DO_SVE2_ZZZ_TB(SSUBLBT, ssubl, false, true)
 
+DO_SVE2_ZZZ_TB(SQDMULLB_zzz, sqdmull_zzz, false, false)
+DO_SVE2_ZZZ_TB(SQDMULLT_zzz, sqdmull_zzz, true, true)
+
+DO_SVE2_ZZZ_TB(SMULLB_zzz, smull_zzz, false, false)
+DO_SVE2_ZZZ_TB(SMULLT_zzz, smull_zzz, true, true)
+
+DO_SVE2_ZZZ_TB(UMULLB_zzz, umull_zzz, false, false)
+DO_SVE2_ZZZ_TB(UMULLT_zzz, umull_zzz, true, true)
+
 #define DO_SVE2_ZZZ_WTB(NAME, name, SEL2) \
 static bool trans_##NAME(DisasContext *s, arg_rrr_esz *a)   \
 {   \
-- 
2.20.1




[PATCH 10/31] target/arm: Implement SVE2 saturating add/subtract (predicated)

2020-03-26 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h|  54 +++
 target/arm/sve.decode  |  11 +++
 target/arm/sve_helper.c| 182 +
 target/arm/translate-sve.c |   7 ++
 4 files changed, 198 insertions(+), 56 deletions(-)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index 028c3b85a8..368185944a 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -371,6 +371,60 @@ DEF_HELPER_FLAGS_5(sve2_uminp_zpzz_s, TCG_CALL_NO_RWG,
 DEF_HELPER_FLAGS_5(sve2_uminp_zpzz_d, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_5(sve2_sqadd_zpzz_b, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_sqadd_zpzz_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_sqadd_zpzz_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_sqadd_zpzz_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve2_uqadd_zpzz_b, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_uqadd_zpzz_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_uqadd_zpzz_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_uqadd_zpzz_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve2_sqsub_zpzz_b, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_sqsub_zpzz_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_sqsub_zpzz_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_sqsub_zpzz_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve2_uqsub_zpzz_b, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_uqsub_zpzz_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_uqsub_zpzz_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_uqsub_zpzz_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve2_suqadd_zpzz_b, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_suqadd_zpzz_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_suqadd_zpzz_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_suqadd_zpzz_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve2_usqadd_zpzz_b, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_usqadd_zpzz_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_usqadd_zpzz_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_usqadd_zpzz_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_5(sve_asr_zpzw_b, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(sve_asr_zpzw_h, TCG_CALL_NO_RWG,
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index 86a6bf7088..86aee38668 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -1150,3 +1150,14 @@ SMAXP   01000100 .. 010 100 101 ... . .  @rdn_pg_rm
 UMAXP   01000100 .. 010 101 101 ... . .  @rdn_pg_rm
 SMINP   01000100 .. 010 110 101 ... . .  @rdn_pg_rm
 UMINP   01000100 .. 010 111 101 ... . .  @rdn_pg_rm
+
+### SVE2 saturating add/subtract (predicated)
+
+SQADD_zpzz  01000100 .. 011 000 100 ... . .  @rdn_pg_rm
+UQADD_zpzz  01000100 .. 011 001 100 ... . .  @rdn_pg_rm
+SQSUB_zpzz  01000100 .. 011 010 100 ... . .  @rdn_pg_rm
+UQSUB_zpzz  01000100 .. 011 011 100 ... . .  @rdn_pg_rm
+SUQADD  01000100 .. 011 100 100 ... . .  @rdn_pg_rm
+USQADD  01000100 .. 011 101 100 ... . .  @rdn_pg_rm
+SQSUB_zpzz  01000100 .. 011 110 100 ... . .  @rdm_pg_rn # SQSUBR
+UQSUB_zpzz  01000100 .. 011 111 100 ... . .  @rdm_pg_rn # UQSUBR
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index d7c181ddb8..bee00eaa44 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -678,6 +678,123 @@ DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS)
 DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS)
 DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)
 
+static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max)
+{
+return val >= max ? max : val <= min ? min : val;
+}
+
+#define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, 

[PATCH 09/31] target/arm: Implement SVE2 integer pairwise arithmetic

2020-03-26 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h| 45 +
 target/arm/sve.decode  |  8 +
 target/arm/sve_helper.c| 67 ++
 target/arm/translate-sve.c |  6 
 4 files changed, 126 insertions(+)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index 149fff1fae..028c3b85a8 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -326,6 +326,51 @@ DEF_HELPER_FLAGS_5(sve_sel_zpzz_s, TCG_CALL_NO_RWG,
 DEF_HELPER_FLAGS_5(sve_sel_zpzz_d, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_5(sve2_addp_zpzz_b, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_addp_zpzz_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_addp_zpzz_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_addp_zpzz_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve2_smaxp_zpzz_b, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_smaxp_zpzz_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_smaxp_zpzz_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_smaxp_zpzz_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve2_umaxp_zpzz_b, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_umaxp_zpzz_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_umaxp_zpzz_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_umaxp_zpzz_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve2_sminp_zpzz_b, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_sminp_zpzz_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_sminp_zpzz_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_sminp_zpzz_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve2_uminp_zpzz_b, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_uminp_zpzz_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_uminp_zpzz_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_uminp_zpzz_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_5(sve_asr_zpzw_b, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(sve_asr_zpzw_h, TCG_CALL_NO_RWG,
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index 54076bb607..86a6bf7088 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -1142,3 +1142,11 @@ SRHADD  01000100 .. 010 100 100 ... . .  @rdn_pg_rm
 URHADD  01000100 .. 010 101 100 ... . .  @rdn_pg_rm
 SHSUB   01000100 .. 010 110 100 ... . .  @rdm_pg_rn # SHSUBR
 UHSUB   01000100 .. 010 111 100 ... . .  @rdm_pg_rn # UHSUBR
+
+### SVE2 integer pairwise arithmetic
+
+ADDP01000100 .. 010 001 101 ... . .  @rdn_pg_rm
+SMAXP   01000100 .. 010 100 101 ... . .  @rdn_pg_rm
+UMAXP   01000100 .. 010 101 101 ... . .  @rdn_pg_rm
+SMINP   01000100 .. 010 110 101 ... . .  @rdn_pg_rm
+UMINP   01000100 .. 010 111 101 ... . .  @rdn_pg_rm
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index 5d75aed7b7..d7c181ddb8 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -681,6 +681,73 @@ DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)
 #undef DO_ZPZZ
 #undef DO_ZPZZ_D
 
+/*
+ * Three operand expander, operating on element pairs.
+ * If the slot I is even, the elements are from VN {I, I+1}.
+ * If the slot I is odd, the elements are from VM {I-1, I}.
+ */
+#define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
+{   \
+intptr_t i, opr_sz = simd_oprsz(desc);  \
+for (i = 0; i < opr_sz; ) { \
+uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
+do {\
+if (pg & 1) {   \
+void *p = (i & 1 ? vm : vn);\
+TYPE nn = *(TYPE *)(p + H(i & ~1)); \
+

[PATCH 22/31] target/arm: Implement SVE2 integer add/subtract long with carry

2020-03-26 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h|  3 +++
 target/arm/sve.decode  |  6 ++
 target/arm/sve_helper.c| 33 +
 target/arm/translate-sve.c | 23 +++
 4 files changed, 65 insertions(+)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index b48a88135f..cfc1357613 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -2424,3 +2424,6 @@ DEF_HELPER_FLAGS_5(sve2_uabal_s, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(sve2_uabal_d, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve2_adcl_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_adcl_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index f66a6c242f..5d46e3ab45 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -1247,3 +1247,9 @@ SABALB  01000101 .. 0 . 1100 00 . .  @rda_rn_rm
 SABALT  01000101 .. 0 . 1100 01 . .  @rda_rn_rm
 UABALB  01000101 .. 0 . 1100 10 . .  @rda_rn_rm
 UABALT  01000101 .. 0 . 1100 11 . .  @rda_rn_rm
+
+## SVE2 integer add/subtract long with carry
+
+# ADC and SBC decoded via size in helper dispatch.
+ADCLB   01000101 .. 0 . 11010 0 . .  @rda_rn_rm
+ADCLT   01000101 .. 0 . 11010 1 . .  @rda_rn_rm
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index a0995d95c7..aa330f75c3 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -1240,6 +1240,39 @@ DO_ABAL(sve2_uabal_d, uint64_t, uint32_t)
 
 #undef DO_ABAL
 
+void HELPER(sve2_adcl_s)(void *vd, void *va, void *vn, void *vm, uint32_t desc)
+{
+intptr_t i, opr_sz = simd_oprsz(desc);
+int sel = extract32(desc, SIMD_DATA_SHIFT, 1) * 32;
+uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1);
+uint64_t *d = vd, *a = va, *n = vn, *m = vm;
+
+for (i = 0; i < opr_sz / 8; ++i) {
+uint32_t e1 = (uint32_t)a[i];
+uint32_t e2 = (uint32_t)(n[i] >> sel) ^ inv;
+uint64_t c = extract64(m[i], 32, 1);
+/* Compute and store the entire 33-bit result at once. */
+d[i] = c + e1 + e2;
+}
+}
+
+void HELPER(sve2_adcl_d)(void *vd, void *va, void *vn, void *vm, uint32_t desc)
+{
+intptr_t i, opr_sz = simd_oprsz(desc);
+int sel = extract32(desc, SIMD_DATA_SHIFT, 1) * 32;
+uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1);
+uint64_t *d = vd, *a = va, *n = vn, *m = vm;
+
+for (i = 0; i < opr_sz / 8; i += 2) {
+Int128 e1 = int128_make64(a[i]);
+Int128 e2 = int128_make64(n[i + sel] ^ inv);
+Int128 c = int128_make64(m[i + 1] & 1);
+Int128 r = int128_add(int128_add(e1, e2), c);
+d[i + 0] = int128_getlo(r);
+d[i + 1] = int128_gethi(r);
+}
+}
+
 #define DO_BITPERM(NAME, TYPE, OP) \
 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
 {  \
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index c6161d2ce2..a80765a978 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -6294,3 +6294,26 @@ static bool trans_UABALT(DisasContext *s, arg__esz *a)
 {
 return do_abal(s, a, true, true);
 }
+
+static bool do_adcl(DisasContext *s, arg__esz *a, bool sel)
+{
+static gen_helper_gvec_4 * const fns[2] = {
+gen_helper_sve2_adcl_s,
+gen_helper_sve2_adcl_d,
+};
+/*
+ * Note that in this case the ESZ field encodes both size and sign.
+ * Split out 'subtract' into bit 1 of the data field for the helper.
+ */
+return do_sve2__ool(s, a, fns[a->esz & 1], (a->esz & 2) | sel);
+}
+
+static bool trans_ADCLB(DisasContext *s, arg__esz *a)
+{
+return do_adcl(s, a, false);
+}
+
+static bool trans_ADCLT(DisasContext *s, arg__esz *a)
+{
+return do_adcl(s, a, true);
+}
-- 
2.20.1




[PATCH 13/31] target/arm: Implement SVE2 integer add/subtract wide

2020-03-26 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h| 16 
 target/arm/sve.decode  | 12 
 target/arm/sve_helper.c| 30 ++
 target/arm/translate-sve.c | 20 
 4 files changed, 78 insertions(+)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index 475fce7f3a..6a95c6085c 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -1396,6 +1396,22 @@ DEF_HELPER_FLAGS_4(sve2_uabdl_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(sve2_uabdl_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(sve2_uabdl_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_4(sve2_saddw_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_saddw_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_saddw_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve2_ssubw_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_ssubw_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_ssubw_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve2_uaddw_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_uaddw_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_uaddw_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve2_usubw_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_usubw_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_usubw_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_4(sve_ldr, TCG_CALL_NO_WG, void, env, ptr, tl, int)
 DEF_HELPER_FLAGS_4(sve_str, TCG_CALL_NO_WG, void, env, ptr, tl, int)
 
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index 8d5f31bcc4..9994e1eb71 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -1187,3 +1187,15 @@ UABDLT  01000101 .. 0 . 00  . .  @rd_rn_rm
 SADDLBT 01000101 .. 0 . 1000 00 . .  @rd_rn_rm
 SSUBLBT 01000101 .. 0 . 1000 10 . .  @rd_rn_rm
 SSUBLBT 01000101 .. 0 . 1000 11 . .  @rd_rm_rn # SSUBLTB
+
+## SVE2 integer add/subtract wide
+
+SADDWB  01000101 .. 0 . 010 000 . .  @rd_rn_rm
+SADDWT  01000101 .. 0 . 010 001 . .  @rd_rn_rm
+UADDWB  01000101 .. 0 . 010 010 . .  @rd_rn_rm
+UADDWT  01000101 .. 0 . 010 011 . .  @rd_rn_rm
+
+SSUBWB  01000101 .. 0 . 010 100 . .  @rd_rn_rm
+SSUBWT  01000101 .. 0 . 010 101 . .  @rd_rn_rm
+USUBWB  01000101 .. 0 . 010 110 . .  @rd_rn_rm
+USUBWT  01000101 .. 0 . 010 111 . .  @rd_rn_rm
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index 7d7a59f620..44503626e4 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -1131,6 +1131,36 @@ DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, DO_ABD)
 
 #undef DO_ZZZ_TB
 
+#define DO_ZZZ_WTB(NAME, TYPE, TYPEN, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
+{  \
+intptr_t i, opr_sz = simd_oprsz(desc); \
+int sel2 = (simd_data(desc) & 1) * sizeof(TYPE);   \
+for (i = 0; i < opr_sz; i += sizeof(TYPE)) {   \
+TYPE nn = *(TYPE *)(vn + i);   \
+TYPE mm = (TYPEN)(*(TYPE *)(vm + i) >> sel2);  \
+*(TYPE *)(vd + i) = OP(nn, mm);\
+}  \
+}
+
+DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, DO_ADD)
+DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, DO_ADD)
+DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, DO_ADD)
+
+DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, DO_SUB)
+DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, DO_SUB)
+DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, DO_SUB)
+
+DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, DO_ADD)
+DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, DO_ADD)
+DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, DO_ADD)
+
+DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, DO_SUB)
+DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, DO_SUB)
+DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, DO_SUB)
+
+#undef DO_ZZZ_WTB
+
 /* Two-operand reduction expander, controlled by a predicate.
  * The difference between TYPERED and TYPERET has to do with
  * sign-extension.  E.g. for SMAX, TYPERED must be signed,
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index accb74537b..fb214360bf 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -6054,3 +6054,23 @@ DO_SVE2_ZZZ_TB(UABDLT, uabdl, true, true)
 
 DO_SVE2_ZZZ_TB(SADDLBT, saddl, false, true)
 DO_SVE2_ZZZ_TB(SSUBLBT, ssubl, false, true)
+
+#define DO_SVE2_ZZZ_WTB(NAME, name, SEL2) \
+static bool trans_##NAME(DisasContext *s, arg_rrr_esz *a)  

[PATCH 06/31] target/arm: Split out saturating/rounding shifts from neon

2020-03-26 Thread Richard Henderson
Split these operations out into a header that can be shared
between neon and sve.  The "sat" pointer acts both as a boolean
for control of saturating behavior and controls the difference
in behavior between neon and sve -- QC bit or no QC bit.

Implement right-shift rounding as

tmp = src >> (shift - 1);
dst = (tmp >> 1) + (tmp & 1);

This is the same number of instructions as the current

tmp = 1 << (shift - 1);
dst = (src + tmp) >> shift;

without any possibility of intermediate overflow.

Signed-off-by: Richard Henderson 
---
 target/arm/vec_internal.h | 161 
 target/arm/neon_helper.c  | 507 +++---
 2 files changed, 244 insertions(+), 424 deletions(-)
 create mode 100644 target/arm/vec_internal.h

diff --git a/target/arm/vec_internal.h b/target/arm/vec_internal.h
new file mode 100644
index 00..0d1f9c86c8
--- /dev/null
+++ b/target/arm/vec_internal.h
@@ -0,0 +1,161 @@
+/*
+ * ARM AdvSIMD / SVE Vector Helpers
+ *
+ * Copyright (c) 2020 Linaro
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef TARGET_ARM_VEC_INTERNAL_H
+#define TARGET_ARM_VEC_INTERNAL_H
+
+static inline int32_t do_sqrshl_bhs(int32_t src, int8_t shift, int bits,
+bool round, uint32_t *sat)
+{
+if (shift <= -bits) {
+/* Rounding the sign bit always produces 0. */
+if (round) {
+return 0;
+}
+return src >> 31;
+} else if (shift < 0) {
+if (round) {
+src >>= -shift - 1;
+return (src >> 1) + (src & 1);
+}
+return src >> -shift;
+} else if (shift < bits) {
+int32_t val = src << shift;
+if (bits == 32) {
+if (!sat || val >> shift == src) {
+return val;
+}
+} else {
+int32_t extval = sextract32(val, 0, bits);
+if (!sat || val == extval) {
+return extval;
+}
+}
+} else if (!sat || src == 0) {
+return 0;
+}
+
+*sat = 1;
+return (1u << (bits - 1)) - (src >= 0);
+}
+
+static inline uint32_t do_uqrshl_bhs(uint32_t src, int8_t shift, int bits,
+ bool round, uint32_t *sat)
+{
+if (shift <= -(bits + round)) {
+return 0;
+} else if (shift < 0) {
+if (round) {
+src >>= -shift - 1;
+return (src >> 1) + (src & 1);
+}
+return src >> -shift;
+} else if (shift < bits) {
+uint32_t val = src << shift;
+if (bits == 32) {
+if (!sat || val >> shift == src) {
+return val;
+}
+} else {
+uint32_t extval = extract32(val, 0, bits);
+if (!sat || val == extval) {
+return extval;
+}
+}
+} else if (!sat || src == 0) {
+return 0;
+}
+
+*sat = 1;
+return MAKE_64BIT_MASK(0, bits);
+}
+
+static inline int32_t do_suqrshl_bhs(int32_t src, int8_t shift, int bits,
+ bool round, uint32_t *sat)
+{
+if (src < 0) {
+*sat = 1;
+return 0;
+}
+return do_uqrshl_bhs(src, shift, bits, round, sat);
+}
+
+static inline int64_t do_sqrshl_d(int64_t src, int8_t shift,
+  bool round, uint32_t *sat)
+{
+if (shift <= -64) {
+/* Rounding the sign bit always produces 0. */
+if (round) {
+return 0;
+}
+return src >> 63;
+} else if (shift < 0) {
+if (round) {
+src >>= -shift - 1;
+return (src >> 1) + (src & 1);
+}
+return src >> -shift;
+} else if (shift < 64) {
+int64_t val = src << shift;
+if (!sat || val >> shift == src) {
+return val;
+}
+} else if (!sat || src == 0) {
+return 0;
+}
+
+*sat = 1;
+return src < 0 ? INT64_MIN : INT64_MAX;
+}
+
+static inline uint64_t do_uqrshl_d(uint64_t src, int8_t shift,
+   bool round, uint32_t *sat)
+{
+if (shift <= -(64 + round)) {
+return 0;
+} else if (shift < 0) {
+if (round) {
+src >>= -shift - 1;
+return (src >> 1) + (src & 1);
+}
+return src >> -shift;
+} else if (shift < 

[PATCH 17/31] target/arm: Implement SVE2 bitwise shift left long

2020-03-26 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h|  8 +++
 target/arm/sve.decode  |  8 +++
 target/arm/sve_helper.c| 34 ++
 target/arm/translate-sve.c | 49 ++
 4 files changed, 99 insertions(+)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index 943839e2dc..9c0c41ba80 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -2372,3 +2372,11 @@ DEF_HELPER_FLAGS_4(sve2_umull_zzz_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 
 DEF_HELPER_FLAGS_4(sve2_pmull_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(sve2_pmull_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(sve2_sshll_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve2_sshll_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve2_sshll_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(sve2_ushll_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve2_ushll_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve2_ushll_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index 440cff4597..36ef9de563 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -1207,3 +1207,11 @@ SMULLB_zzz  01000101 .. 0 . 011 100 . .  @rd_rn_rm
 SMULLT_zzz  01000101 .. 0 . 011 101 . .  @rd_rn_rm
 UMULLB_zzz  01000101 .. 0 . 011 110 . .  @rd_rn_rm
 UMULLT_zzz  01000101 .. 0 . 011 111 . .  @rd_rn_rm
+
+## SVE2 bitwise shift left long
+
+# Note bit23 == 0 is handled by esz > 0 in do_sve2_shll_tb.
+SSHLLB  01000101 .. 0 . 1010 00 . .  @rd_rn_tszimm_shl
+SSHLLT  01000101 .. 0 . 1010 01 . .  @rd_rn_tszimm_shl
+USHLLB  01000101 .. 0 . 1010 10 . .  @rd_rn_tszimm_shl
+USHLLT  01000101 .. 0 . 1010 11 . .  @rd_rn_tszimm_shl
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index 130697f3d9..e0a701c446 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -625,6 +625,8 @@ DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h)
 DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s)
 DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d)
 
+#undef do_sqrshl_d
+
 #define do_uqrshl_b(n, m) \
({ uint32_t discard; do_uqrshl_bhs(n, m, 8, true, ); })
 #define do_uqrshl_h(n, m) \
@@ -639,6 +641,8 @@ DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h)
 DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s)
 DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d)
 
+#undef do_uqrshl_d
+
 #define DO_HADD_BHS(n, m)  (((int64_t)n + m) >> 1)
 #define DO_HADD_D(n, m)((n >> 1) + (m >> 1) + (n & m & 1))
 
@@ -1192,6 +1196,36 @@ DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, DO_SUB)
 
 #undef DO_ZZZ_WTB
 
+#define DO_ZZI_SHLL(NAME, TYPE, TYPEN, OP) \
+void HELPER(NAME)(void *vd, void *vn, uint32_t desc)   \
+{  \
+intptr_t i, opr_sz = simd_oprsz(desc); \
+int sel = (simd_data(desc) & 1) * sizeof(TYPE);\
+int shift = simd_data(desc) >> 1;  \
+for (i = 0; i < opr_sz; i += sizeof(TYPE)) {   \
+TYPE nn = (TYPEN)(*(TYPE *)(vn + i) >> sel);   \
+*(TYPE *)(vd + i) = OP(nn, shift); \
+}  \
+}
+
+#define DO_SSHLL_H(val, sh)  do_sqrshl_bhs(val, sh, 16, false, NULL)
+#define DO_SSHLL_S(val, sh)  do_sqrshl_bhs(val, sh, 32, false, NULL)
+#define DO_SSHLL_D(val, sh)  do_sqrshl_d(val, sh, false, NULL)
+
+DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, DO_SSHLL_H)
+DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, DO_SSHLL_S)
+DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, DO_SSHLL_D)
+
+#define DO_USHLL_H(val, sh)  do_uqrshl_bhs(val, sh, 16, false, NULL)
+#define DO_USHLL_S(val, sh)  do_uqrshl_bhs(val, sh, 32, false, NULL)
+#define DO_USHLL_D(val, sh)  do_uqrshl_d(val, sh, false, NULL)
+
+DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, DO_USHLL_H)
+DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, DO_USHLL_S)
+DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, DO_USHLL_D)
+
+#undef DO_ZZI_SHLL
+
 /* Two-operand reduction expander, controlled by a predicate.
  * The difference between TYPERED and TYPERET has to do with
  * sign-extension.  E.g. for SMAX, TYPERED must be signed,
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 67416a25ce..9873b83feb 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -6105,3 +6105,52 @@ DO_SVE2_ZZZ_WTB(UADDWB, uaddw, false)
 DO_SVE2_ZZZ_WTB(UADDWT, uaddw, true)
 DO_SVE2_ZZZ_WTB(USUBWB, usubw, false)
 DO_SVE2_ZZZ_WTB(USUBWT, usubw, true)
+
+static bool do_sve2_shll_tb(DisasContext *s, arg_rri_esz *a,
+bool sel, bool uns)
+{
+static 

[PATCH 08/31] target/arm: Implement SVE2 integer halving add/subtract (predicated)

2020-03-26 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h| 54 ++
 target/arm/sve.decode  | 11 
 target/arm/sve_helper.c| 39 +++
 target/arm/translate-sve.c |  8 ++
 4 files changed, 112 insertions(+)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index 0eecf33249..149fff1fae 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -226,6 +226,60 @@ DEF_HELPER_FLAGS_5(sve2_uqrshl_zpzz_s, TCG_CALL_NO_RWG,
 DEF_HELPER_FLAGS_5(sve2_uqrshl_zpzz_d, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_5(sve2_shadd_zpzz_b, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_shadd_zpzz_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_shadd_zpzz_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_shadd_zpzz_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve2_uhadd_zpzz_b, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_uhadd_zpzz_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_uhadd_zpzz_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_uhadd_zpzz_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve2_srhadd_zpzz_b, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_srhadd_zpzz_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_srhadd_zpzz_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_srhadd_zpzz_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve2_urhadd_zpzz_b, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_urhadd_zpzz_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_urhadd_zpzz_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_urhadd_zpzz_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve2_shsub_zpzz_b, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_shsub_zpzz_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_shsub_zpzz_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_shsub_zpzz_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve2_uhsub_zpzz_b, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_uhsub_zpzz_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_uhsub_zpzz_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_uhsub_zpzz_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_5(sve_sdiv_zpzz_s, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(sve_sdiv_zpzz_d, TCG_CALL_NO_RWG,
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index f0b6692e43..54076bb607 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -1131,3 +1131,14 @@ SQRSHL  01000100 .. 001 010 100 ... . .  @rdn_pg_rm
 UQRSHL  01000100 .. 001 011 100 ... . .  @rdn_pg_rm
 SQRSHL  01000100 .. 001 110 100 ... . .  @rdm_pg_rn # SQRSHLR
 UQRSHL  01000100 .. 001 111 100 ... . .  @rdm_pg_rn # UQRSHLR
+
+### SVE2 integer halving add/subtract (predicated)
+
+SHADD   01000100 .. 010 000 100 ... . .  @rdn_pg_rm
+UHADD   01000100 .. 010 001 100 ... . .  @rdn_pg_rm
+SHSUB   01000100 .. 010 010 100 ... . .  @rdn_pg_rm
+UHSUB   01000100 .. 010 011 100 ... . .  @rdn_pg_rm
+SRHADD  01000100 .. 010 100 100 ... . .  @rdn_pg_rm
+URHADD  01000100 .. 010 101 100 ... . .  @rdn_pg_rm
+SHSUB   01000100 .. 010 110 100 ... . .  @rdm_pg_rn # SHSUBR
+UHSUB   01000100 .. 010 111 100 ... . .  @rdm_pg_rn # UHSUBR
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index a7e9b8d341..5d75aed7b7 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -639,6 +639,45 @@ DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h)
 DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s)
 DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d)
 
+#define DO_HADD_BHS(n, m)  (((int64_t)n + m) >> 1)
+#define DO_HADD_D(n, m)((n >> 1) + (m >> 1) + (n & m & 1))
+
+DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1_2, 

[PATCH 05/31] target/arm: Implement SVE2 integer unary operations (predicated)

2020-03-26 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h| 13 +++
 target/arm/sve.decode  |  7 ++
 target/arm/sve_helper.c| 25 
 target/arm/translate-sve.c | 47 ++
 4 files changed, 88 insertions(+), 4 deletions(-)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index 854cd97fdf..d3b7c3bd12 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -507,6 +507,19 @@ DEF_HELPER_FLAGS_4(sve_rbit_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(sve_rbit_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(sve_rbit_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_4(sve2_sqabs_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_sqabs_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_sqabs_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_sqabs_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve2_sqneg_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_sqneg_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_sqneg_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_sqneg_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve2_urecpe_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_ursqrte_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_5(sve_splice, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 
 DEF_HELPER_FLAGS_5(sve_cmpeq_ppzz_b, TCG_CALL_NO_RWG,
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index 6691145854..95a9c65451 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -1107,3 +1107,10 @@ PMUL_zzz0100 00 1 . 0110 01 . .  
@rd_rn_rm_e0
 
 SADALP_zpzz 01000100 .. 000 100 101 ... . .  @rdm_pg_rn
 UADALP_zpzz 01000100 .. 000 101 101 ... . .  @rdm_pg_rn
+
+### SVE2 integer unary operations (predicated)
+
+URECPE  01000100 .. 000 000 101 ... . .  @rd_pg_rn
+URSQRTE 01000100 .. 000 001 101 ... . .  @rd_pg_rn
+SQABS   01000100 .. 001 000 101 ... . .  @rd_pg_rn
+SQNEG   01000100 .. 001 001 101 ... . .  @rd_pg_rn
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index 7dc17421e9..16606331fc 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -535,8 +535,8 @@ static inline uint64_t do_sadalp_d(uint64_t n, uint64_t m)
 return m + n1 + n2;
 }
 
-DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h)
-DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s)
+DO_ZPZZ(sve2_sadalp_zpzz_h, uint16_t, H1_2, do_sadalp_h)
+DO_ZPZZ(sve2_sadalp_zpzz_s, uint32_t, H1_4, do_sadalp_s)
 DO_ZPZZ_D(sve2_sadalp_zpzz_d, uint64_t, do_sadalp_d)
 
 static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
@@ -557,8 +557,8 @@ static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
 return m + n1 + n2;
 }
 
-DO_ZPZZ(sve2_uadalp_zpzz_h, int16_t, H1_2, do_uadalp_h)
-DO_ZPZZ(sve2_uadalp_zpzz_s, int32_t, H1_4, do_uadalp_s)
+DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h)
+DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s)
 DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d)
 
 #undef DO_ZPZZ
@@ -728,6 +728,23 @@ DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
 DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
 DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
 
+#define DO_SQABS(N)  (N == -N ? N - 1 : N < 0 ? -N : N)
+
+DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS)
+DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS)
+DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS)
+DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS)
+
+#define DO_SQNEG(N)  (N == -N ? N - 1 : -N)
+
+DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG)
+DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG)
+DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG)
+DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG)
+
+DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32)
+DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32)
+
 /* Three-operand expander, unpredicated, in which the third operand is "wide".
  */
 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP)   \
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index bc8321f7cd..938ec08673 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -5919,3 +5919,50 @@ static bool trans_UADALP_zpzz(DisasContext *s, arg_rprr_esz *a)
 }
 return do_sve2_zpzz_ool(s, a, fns[a->esz - 1]);
 }
+
+/*
+ * SVE2 integer unary operations (predicated)
+ */
+
+static bool do_sve2_zpz_ool(DisasContext *s, arg_rpr_esz *a,
+gen_helper_gvec_3 *fn)
+{
+if (!dc_isar_feature(aa64_sve2, s)) {
+return false;
+}
+return do_zpz_ool(s, a, fn);
+}
+
+static bool trans_URECPE(DisasContext *s, arg_rpr_esz *a)
+{
+if (a->esz != 2) {
+return false;
+}
+return 

[PATCH 07/31] target/arm: Implement SVE2 saturating/rounding bitwise shift left (predicated)

2020-03-26 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h| 54 ++
 target/arm/sve.decode  | 17 +
 target/arm/sve_helper.c| 78 ++
 target/arm/translate-sve.c | 18 +
 4 files changed, 167 insertions(+)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index d3b7c3bd12..0eecf33249 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -172,6 +172,60 @@ DEF_HELPER_FLAGS_5(sve2_uadalp_zpzz_s, TCG_CALL_NO_RWG,
 DEF_HELPER_FLAGS_5(sve2_uadalp_zpzz_d, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_5(sve2_srshl_zpzz_b, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_srshl_zpzz_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_srshl_zpzz_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_srshl_zpzz_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve2_urshl_zpzz_b, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_urshl_zpzz_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_urshl_zpzz_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_urshl_zpzz_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve2_sqshl_zpzz_b, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_sqshl_zpzz_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_sqshl_zpzz_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_sqshl_zpzz_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve2_uqshl_zpzz_b, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_uqshl_zpzz_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_uqshl_zpzz_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_uqshl_zpzz_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve2_sqrshl_zpzz_b, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_sqrshl_zpzz_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_sqrshl_zpzz_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_sqrshl_zpzz_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve2_uqrshl_zpzz_b, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_uqrshl_zpzz_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_uqrshl_zpzz_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_uqrshl_zpzz_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_5(sve_sdiv_zpzz_s, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(sve_sdiv_zpzz_d, TCG_CALL_NO_RWG,
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index 95a9c65451..f0b6692e43 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -1114,3 +1114,20 @@ URECPE  01000100 .. 000 000 101 ... . .  
@rd_pg_rn
 URSQRTE 01000100 .. 000 001 101 ... . .  @rd_pg_rn
 SQABS   01000100 .. 001 000 101 ... . .  @rd_pg_rn
 SQNEG   01000100 .. 001 001 101 ... . .  @rd_pg_rn
+
+### SVE2 saturating/rounding bitwise shift left (predicated)
+
+SRSHL   01000100 .. 000 010 100 ... . .  @rdn_pg_rm
+URSHL   01000100 .. 000 011 100 ... . .  @rdn_pg_rm
+SRSHL   01000100 .. 000 110 100 ... . .  @rdm_pg_rn # SRSHLR
+URSHL   01000100 .. 000 111 100 ... . .  @rdm_pg_rn # URSHLR
+
+SQSHL   01000100 .. 001 000 100 ... . .  @rdn_pg_rm
+UQSHL   01000100 .. 001 001 100 ... . .  @rdn_pg_rm
+SQSHL   01000100 .. 001 100 100 ... . .  @rdm_pg_rn # SQSHLR
+UQSHL   01000100 .. 001 101 100 ... . .  @rdm_pg_rn # UQSHLR
+
+SQRSHL  01000100 .. 001 010 100 ... . .  @rdn_pg_rm
+UQRSHL  01000100 .. 001 011 100 ... . .  @rdn_pg_rm
+SQRSHL  01000100 .. 001 110 100 ... . .  @rdm_pg_rn # SQRSHLR
+UQRSHL  01000100 .. 001 111 100 ... . .  @rdm_pg_rn # UQRSHLR
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index 16606331fc..a7e9b8d341 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -26,6 +26,7 @@
 #include "tcg/tcg-gvec-desc.h"
 

[PATCH 12/31] target/arm: Implement SVE2 integer add/subtract interleaved long

2020-03-26 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/sve.decode  | 7 +++
 target/arm/translate-sve.c | 3 +++
 2 files changed, 10 insertions(+)

diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index a239fd3479..8d5f31bcc4 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -109,6 +109,7 @@
 
 # Three operand, vector element size
 @rd_rn_rm    esz:2 . rm:5 ... ... rn:5 rd:5 _esz
+@rd_rm_rn    esz:2 . rn:5 ... ... rm:5 rd:5 _esz
 @pd_pn_pm    esz:2 .. rm:4 ... rn:4 . rd:4  _esz
 @rdn_rm  esz:2 .. .. rm:5 rd:5 \
 _esz rn=%reg_movprfx
@@ -1180,3 +1181,9 @@ SABDLB  01000101 .. 0 . 00 1100 . .  
@rd_rn_rm
 SABDLT  01000101 .. 0 . 00 1101 . .  @rd_rn_rm
 UABDLB  01000101 .. 0 . 00 1110 . .  @rd_rn_rm
 UABDLT  01000101 .. 0 . 00  . .  @rd_rn_rm
+
+## SVE2 integer add/subtract interleaved long
+
+SADDLBT 01000101 .. 0 . 1000 00 . .  @rd_rn_rm
+SSUBLBT 01000101 .. 0 . 1000 10 . .  @rd_rn_rm
+SSUBLBT 01000101 .. 0 . 1000 11 . .  @rd_rm_rn # SSUBLTB
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index ee8a6fd912..accb74537b 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -6051,3 +6051,6 @@ DO_SVE2_ZZZ_TB(SABDLT, sabdl, true, true)
 DO_SVE2_ZZZ_TB(UADDLT, uaddl, true, true)
 DO_SVE2_ZZZ_TB(USUBLT, usubl, true, true)
 DO_SVE2_ZZZ_TB(UABDLT, uabdl, true, true)
+
+DO_SVE2_ZZZ_TB(SADDLBT, saddl, false, true)
+DO_SVE2_ZZZ_TB(SSUBLBT, ssubl, false, true)
-- 
2.20.1




[PATCH 04/31] target/arm: Remove fp_status from helper_{recpe, rsqrte}_u32

2020-03-26 Thread Richard Henderson
These operations do not touch fp_status.

Signed-off-by: Richard Henderson 
---
 target/arm/helper.h|  4 ++--
 target/arm/translate-a64.c |  5 ++---
 target/arm/translate.c | 12 ++--
 target/arm/vfp_helper.c|  4 ++--
 4 files changed, 8 insertions(+), 17 deletions(-)

diff --git a/target/arm/helper.h b/target/arm/helper.h
index 80bc129763..938fdbc362 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -213,8 +213,8 @@ DEF_HELPER_FLAGS_2(recpe_f64, TCG_CALL_NO_RWG, f64, f64, 
ptr)
 DEF_HELPER_FLAGS_2(rsqrte_f16, TCG_CALL_NO_RWG, f16, f16, ptr)
 DEF_HELPER_FLAGS_2(rsqrte_f32, TCG_CALL_NO_RWG, f32, f32, ptr)
 DEF_HELPER_FLAGS_2(rsqrte_f64, TCG_CALL_NO_RWG, f64, f64, ptr)
-DEF_HELPER_2(recpe_u32, i32, i32, ptr)
-DEF_HELPER_FLAGS_2(rsqrte_u32, TCG_CALL_NO_RWG, i32, i32, ptr)
+DEF_HELPER_FLAGS_1(recpe_u32, TCG_CALL_NO_RWG, i32, i32)
+DEF_HELPER_FLAGS_1(rsqrte_u32, TCG_CALL_NO_RWG, i32, i32)
 DEF_HELPER_FLAGS_4(neon_tbl, TCG_CALL_NO_RWG, i32, i32, i32, ptr, i32)
 
 DEF_HELPER_3(shl_cc, i32, env, i32, i32)
diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index db41e3d72a..2bcf643069 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -10220,7 +10220,7 @@ static void handle_2misc_reciprocal(DisasContext *s, 
int opcode,
 
 switch (opcode) {
 case 0x3c: /* URECPE */
-gen_helper_recpe_u32(tcg_res, tcg_op, fpst);
+gen_helper_recpe_u32(tcg_res, tcg_op);
 break;
 case 0x3d: /* FRECPE */
 gen_helper_recpe_f32(tcg_res, tcg_op, fpst);
@@ -12802,7 +12802,6 @@ static void disas_simd_two_reg_misc(DisasContext *s, 
uint32_t insn)
 unallocated_encoding(s);
 return;
 }
-need_fpstatus = true;
 break;
 case 0x1e: /* FRINT32Z */
 case 0x1f: /* FRINT64Z */
@@ -12970,7 +12969,7 @@ static void disas_simd_two_reg_misc(DisasContext *s, 
uint32_t insn)
 gen_helper_rints_exact(tcg_res, tcg_op, tcg_fpstatus);
 break;
 case 0x7c: /* URSQRTE */
-gen_helper_rsqrte_u32(tcg_res, tcg_op, tcg_fpstatus);
+gen_helper_rsqrte_u32(tcg_res, tcg_op);
 break;
 case 0x1e: /* FRINT32Z */
 case 0x5e: /* FRINT32X */
diff --git a/target/arm/translate.c b/target/arm/translate.c
index b38af6149a..cba84987db 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -6711,19 +6711,11 @@ static int disas_neon_data_insn(DisasContext *s, 
uint32_t insn)
 break;
 }
 case NEON_2RM_VRECPE:
-{
-TCGv_ptr fpstatus = get_fpstatus_ptr(1);
-gen_helper_recpe_u32(tmp, tmp, fpstatus);
-tcg_temp_free_ptr(fpstatus);
+gen_helper_recpe_u32(tmp, tmp);
 break;
-}
 case NEON_2RM_VRSQRTE:
-{
-TCGv_ptr fpstatus = get_fpstatus_ptr(1);
-gen_helper_rsqrte_u32(tmp, tmp, fpstatus);
-tcg_temp_free_ptr(fpstatus);
+gen_helper_rsqrte_u32(tmp, tmp);
 break;
-}
 case NEON_2RM_VRECPE_F:
 {
 TCGv_ptr fpstatus = get_fpstatus_ptr(1);
diff --git a/target/arm/vfp_helper.c b/target/arm/vfp_helper.c
index 930d6e747f..a792661166 100644
--- a/target/arm/vfp_helper.c
+++ b/target/arm/vfp_helper.c
@@ -1023,7 +1023,7 @@ float64 HELPER(rsqrte_f64)(float64 input, void *fpstp)
 return make_float64(val);
 }
 
-uint32_t HELPER(recpe_u32)(uint32_t a, void *fpstp)
+uint32_t HELPER(recpe_u32)(uint32_t a)
 {
 /* float_status *s = fpstp; */
 int input, estimate;
@@ -1038,7 +1038,7 @@ uint32_t HELPER(recpe_u32)(uint32_t a, void *fpstp)
 return deposit32(0, (32 - 9), 9, estimate);
 }
 
-uint32_t HELPER(rsqrte_u32)(uint32_t a, void *fpstp)
+uint32_t HELPER(rsqrte_u32)(uint32_t a)
 {
 int estimate;
 
-- 
2.20.1




[PATCH 03/31] target/arm: Implement SVE2 integer pairwise add and accumulate long

2020-03-26 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h| 14 
 target/arm/sve.decode  |  5 +
 target/arm/sve_helper.c| 44 ++
 target/arm/translate-sve.c | 39 +
 4 files changed, 102 insertions(+)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index 11d627981d..854cd97fdf 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -158,6 +158,20 @@ DEF_HELPER_FLAGS_5(sve_umulh_zpzz_s, TCG_CALL_NO_RWG,
 DEF_HELPER_FLAGS_5(sve_umulh_zpzz_d, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_5(sve2_sadalp_zpzz_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_sadalp_zpzz_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_sadalp_zpzz_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve2_uadalp_zpzz_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_uadalp_zpzz_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_uadalp_zpzz_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_5(sve_sdiv_zpzz_s, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(sve_sdiv_zpzz_d, TCG_CALL_NO_RWG,
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index 58e0b808e9..6691145854 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -1102,3 +1102,8 @@ MUL_zzz 0100 .. 1 . 0110 00 . .  
@rd_rn_rm
 SMULH_zzz   0100 .. 1 . 0110 10 . .  @rd_rn_rm
 UMULH_zzz   0100 .. 1 . 0110 11 . .  @rd_rn_rm
 PMUL_zzz0100 00 1 . 0110 01 . .  @rd_rn_rm_e0
+
+### SVE2 Integer - Predicated
+
+SADALP_zpzz 01000100 .. 000 100 101 ... . .  @rdm_pg_rn
+UADALP_zpzz 01000100 .. 000 101 101 ... . .  @rdm_pg_rn
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index d40b1994aa..7dc17421e9 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -517,6 +517,50 @@ DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
 
+static inline uint16_t do_sadalp_h(uint16_t n, uint16_t m)
+{
+int8_t n1 = n, n2 = n >> 8;
+return m + n1 + n2;
+}
+
+static inline uint32_t do_sadalp_s(uint32_t n, uint32_t m)
+{
+int16_t n1 = n, n2 = n >> 16;
+return m + n1 + n2;
+}
+
+static inline uint64_t do_sadalp_d(uint64_t n, uint64_t m)
+{
+int32_t n1 = n, n2 = n >> 32;
+return m + n1 + n2;
+}
+
+DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h)
+DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s)
+DO_ZPZZ_D(sve2_sadalp_zpzz_d, uint64_t, do_sadalp_d)
+
+static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
+{
+uint8_t n1 = n, n2 = n >> 8;
+return m + n1 + n2;
+}
+
+static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m)
+{
+uint16_t n1 = n, n2 = n >> 16;
+return m + n1 + n2;
+}
+
+static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
+{
+uint32_t n1 = n, n2 = n >> 32;
+return m + n1 + n2;
+}
+
+DO_ZPZZ(sve2_uadalp_zpzz_h, int16_t, H1_2, do_uadalp_h)
+DO_ZPZZ(sve2_uadalp_zpzz_s, int32_t, H1_4, do_uadalp_s)
+DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d)
+
 #undef DO_ZPZZ
 #undef DO_ZPZZ_D
 
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index e962f45b32..bc8321f7cd 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -5880,3 +5880,42 @@ static bool trans_PMUL_zzz(DisasContext *s, arg_rrr_esz 
*a)
 {
 return do_sve2_zzz_ool(s, a, gen_helper_gvec_pmul_b);
 }
+
+/*
+ * SVE2 Integer - Predicated
+ */
+
+static bool do_sve2_zpzz_ool(DisasContext *s, arg_rprr_esz *a,
+ gen_helper_gvec_4 *fn)
+{
+if (!dc_isar_feature(aa64_sve2, s)) {
+return false;
+}
+return do_zpzz_ool(s, a, fn);
+}
+
+static bool trans_SADALP_zpzz(DisasContext *s, arg_rprr_esz *a)
+{
+static gen_helper_gvec_4 * const fns[3] = {
+gen_helper_sve2_sadalp_zpzz_h,
+gen_helper_sve2_sadalp_zpzz_s,
+gen_helper_sve2_sadalp_zpzz_d,
+};
+if (a->esz == 0) {
+return false;
+}
+return do_sve2_zpzz_ool(s, a, fns[a->esz - 1]);
+}
+
+static bool trans_UADALP_zpzz(DisasContext *s, arg_rprr_esz *a)
+{
+static gen_helper_gvec_4 * const fns[3] = {
+gen_helper_sve2_uadalp_zpzz_h,
+gen_helper_sve2_uadalp_zpzz_s,
+gen_helper_sve2_uadalp_zpzz_d,
+};
+if (a->esz == 0) {
+return false;
+}
+return do_sve2_zpzz_ool(s, a, fns[a->esz - 1]);
+}
-- 
2.20.1




[PATCH 11/31] target/arm: Implement SVE2 integer add/subtract long

2020-03-26 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h| 24 
 target/arm/sve.decode  | 19 
 target/arm/sve_helper.c| 43 +++
 target/arm/translate-sve.c | 46 ++
 4 files changed, 132 insertions(+)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index 368185944a..475fce7f3a 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -1372,6 +1372,30 @@ DEF_HELPER_FLAGS_5(sve_ftmad_h, TCG_CALL_NO_RWG, void, 
ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(sve_ftmad_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(sve_ftmad_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_4(sve2_saddl_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_saddl_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_saddl_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve2_ssubl_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_ssubl_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_ssubl_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve2_sabdl_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_sabdl_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_sabdl_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve2_uaddl_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_uaddl_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_uaddl_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve2_usubl_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_usubl_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_usubl_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve2_uabdl_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_uabdl_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2_uabdl_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_4(sve_ldr, TCG_CALL_NO_WG, void, env, ptr, tl, int)
 DEF_HELPER_FLAGS_4(sve_str, TCG_CALL_NO_WG, void, env, ptr, tl, int)
 
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index 86aee38668..a239fd3479 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -1161,3 +1161,22 @@ SUQADD  01000100 .. 011 100 100 ... . .  
@rdn_pg_rm
 USQADD  01000100 .. 011 101 100 ... . .  @rdn_pg_rm
 SQSUB_zpzz  01000100 .. 011 110 100 ... . .  @rdm_pg_rn # SQSUBR
 UQSUB_zpzz  01000100 .. 011 111 100 ... . .  @rdm_pg_rn # UQSUBR
+
+ SVE2 Widening Integer Arithmetic
+
+## SVE2 integer add/subtract long
+
+SADDLB  01000101 .. 0 . 00  . .  @rd_rn_rm
+SADDLT  01000101 .. 0 . 00 0001 . .  @rd_rn_rm
+UADDLB  01000101 .. 0 . 00 0010 . .  @rd_rn_rm
+UADDLT  01000101 .. 0 . 00 0011 . .  @rd_rn_rm
+
+SSUBLB  01000101 .. 0 . 00 0100 . .  @rd_rn_rm
+SSUBLT  01000101 .. 0 . 00 0101 . .  @rd_rn_rm
+USUBLB  01000101 .. 0 . 00 0110 . .  @rd_rn_rm
+USUBLT  01000101 .. 0 . 00 0111 . .  @rd_rn_rm
+
+SABDLB  01000101 .. 0 . 00 1100 . .  @rd_rn_rm
+SABDLT  01000101 .. 0 . 00 1101 . .  @rd_rn_rm
+UABDLB  01000101 .. 0 . 00 1110 . .  @rd_rn_rm
+UABDLT  01000101 .. 0 . 00  . .  @rd_rn_rm
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index bee00eaa44..7d7a59f620 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -1088,6 +1088,49 @@ DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
 #undef DO_ZPZ
 #undef DO_ZPZ_D
 
+/*
+ * Three-operand expander, unpredicated, in which the two inputs are
+ * selected from the top or bottom half of the wide column.
+ */
+#define DO_ZZZ_TB(NAME, TYPE, TYPEN, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
+{  \
+intptr_t i, opr_sz = simd_oprsz(desc); \
+int sel1 = (simd_data(desc) & 1) * sizeof(TYPE);   \
+int sel2 = (simd_data(desc) & 2) * (sizeof(TYPE) / 2); \
+for (i = 0; i < opr_sz; i += sizeof(TYPE)) {   \
+TYPE nn = (TYPEN)(*(TYPE *)(vn + i) >> sel1);  \
+TYPE mm = (TYPEN)(*(TYPE *)(vm + i) >> sel2);  \
+*(TYPE *)(vd + i) = OP(nn, mm);\
+}  \
+}
+
+DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, DO_ADD)
+DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, DO_ADD)
+DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, DO_ADD)
+
+DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, DO_SUB)

[PATCH for-5.1 00/31] target/arm: SVE2, part 1

2020-03-26 Thread Richard Henderson
Posting this for early review.  It's based on some other patch
sets that I have posted recently that also touch SVE, listed
below.  But it might just be easier to clone the devel tree [2].
While the branch itself will rebase frequently for development,
I've also created a tag, post-sve2-20200326, for this posting.

This is mostly untested, as the most recently released Foundation
Model does not support SVE2.  Some of the new instructions overlap
with old fashioned NEON, and I can verify that those have not
broken, and show that SVE2 will use the same code path.  But the
predicated insns and bottom/top interleaved insns are not yet
RISU testable, as I have nothing to compare against.

The patches are in general arranged so that one complete group
of insns are added at once.  The groups within the manual [1]
have so far been small-ish.


r~

---

[1] ISA manual: 
https://static.docs.arm.com/ddi0602/d/ISA_A64_xml_futureA-2019-12_OPT.pdf

[2] Devel tree: https://github.com/rth7680/qemu/tree/tgt-arm-sve-2

Based-on: http://patchwork.ozlabs.org/project/qemu-devel/list/?series=163610
("target/arm: sve load/store improvements")

Based-on: http://patchwork.ozlabs.org/project/qemu-devel/list/?series=164500
("target/arm: Use tcg_gen_gvec_5_ptr for sve FMLA/FCMLA")

Based-on: http://patchwork.ozlabs.org/project/qemu-devel/list/?series=164048
("target/arm: Implement ARMv8.5-MemTag, system mode")

Richard Henderson (31):
  target/arm: Add ID_AA64ZFR0 fields and isar_feature_aa64_sve2
  target/arm: Implement SVE2 Integer Multiply - Unpredicated
  target/arm: Implement SVE2 integer pairwise add and accumulate long
  target/arm: Remove fp_status from helper_{recpe,rsqrte}_u32
  target/arm: Implement SVE2 integer unary operations (predicated)
  target/arm: Split out saturating/rounding shifts from neon
  target/arm: Implement SVE2 saturating/rounding bitwise shift left
(predicated)
  target/arm: Implement SVE2 integer halving add/subtract (predicated)
  target/arm: Implement SVE2 integer pairwise arithmetic
  target/arm: Implement SVE2 saturating add/subtract (predicated)
  target/arm: Implement SVE2 integer add/subtract long
  target/arm: Implement SVE2 integer add/subtract interleaved long
  target/arm: Implement SVE2 integer add/subtract wide
  target/arm: Implement SVE2 integer multiply long
  target/arm: Implement PMULLB and PMULLT
  target/arm: Tidy SVE tszimm shift formats
  target/arm: Implement SVE2 bitwise shift left long
  target/arm: Implement SVE2 bitwise exclusive-or interleaved
  target/arm: Implement SVE2 bitwise permute
  target/arm: Implement SVE2 complex integer add
  target/arm: Implement SVE2 integer absolute difference and accumulate
long
  target/arm: Implement SVE2 integer add/subtract long with carry
  target/arm: Create arm_gen_gvec_[us]sra
  target/arm: Create arm_gen_gvec_{u,s}{rshr,rsra}
  target/arm: Implement SVE2 bitwise shift right and accumulate
  target/arm: Create arm_gen_gvec_{sri,sli}
  target/arm: Tidy handle_vec_simd_shri
  target/arm: Implement SVE2 bitwise shift and insert
  target/arm: Vectorize SABD/UABD
  target/arm: Vectorize SABA/UABA
  target/arm: Implement SVE2 integer absolute difference and accumulate

 target/arm/cpu.h   |  31 ++
 target/arm/helper-sve.h| 345 +
 target/arm/helper.h|  81 +++-
 target/arm/translate-a64.h |   9 +
 target/arm/translate.h |  24 +-
 target/arm/vec_internal.h  | 161 
 target/arm/sve.decode  | 217 ++-
 target/arm/helper.c|   3 +-
 target/arm/kvm64.c |   2 +
 target/arm/neon_helper.c   | 515 -
 target/arm/sve_helper.c| 757 ++---
 target/arm/translate-a64.c | 557 +++
 target/arm/translate-sve.c | 557 +++
 target/arm/translate.c | 626 ++
 target/arm/vec_helper.c| 411 
 target/arm/vfp_helper.c|   4 +-
 16 files changed, 3532 insertions(+), 768 deletions(-)
 create mode 100644 target/arm/vec_internal.h

-- 
2.20.1




[PATCH 01/31] target/arm: Add ID_AA64ZFR0 fields and isar_feature_aa64_sve2

2020-03-26 Thread Richard Henderson
Will be used for SVE2 isa subset enablement.

Signed-off-by: Richard Henderson 
---
 target/arm/cpu.h| 16 
 target/arm/helper.c |  3 +--
 target/arm/kvm64.c  |  2 ++
 3 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/target/arm/cpu.h b/target/arm/cpu.h
index e9f049c8d8..2314e3c18c 100644
--- a/target/arm/cpu.h
+++ b/target/arm/cpu.h
@@ -900,6 +900,7 @@ struct ARMCPU {
 uint64_t id_aa64mmfr2;
 uint64_t id_aa64dfr0;
 uint64_t id_aa64dfr1;
+uint64_t id_aa64zfr0;
 } isar;
 uint32_t midr;
 uint32_t revidr;
@@ -1860,6 +1861,16 @@ FIELD(ID_AA64DFR0, PMSVER, 32, 4)
 FIELD(ID_AA64DFR0, DOUBLELOCK, 36, 4)
 FIELD(ID_AA64DFR0, TRACEFILT, 40, 4)
 
+FIELD(ID_AA64ZFR0, SVEVER, 0, 4)
+FIELD(ID_AA64ZFR0, AES, 4, 4)
+FIELD(ID_AA64ZFR0, BITPERM, 16, 4)
+FIELD(ID_AA64ZFR0, BFLOAT16, 20, 4)
+FIELD(ID_AA64ZFR0, SHA3, 32, 4)
+FIELD(ID_AA64ZFR0, SM4, 40, 4)
+FIELD(ID_AA64ZFR0, I8MM, 44, 4)
+FIELD(ID_AA64ZFR0, F32MM, 52, 4)
+FIELD(ID_AA64ZFR0, F64MM, 56, 4)
+
 FIELD(ID_DFR0, COPDBG, 0, 4)
 FIELD(ID_DFR0, COPSDBG, 4, 4)
 FIELD(ID_DFR0, MMAPDBG, 8, 4)
@@ -3839,6 +3850,11 @@ static inline bool isar_feature_aa64_ccidx(const 
ARMISARegisters *id)
 return FIELD_EX64(id->id_aa64mmfr2, ID_AA64MMFR2, CCIDX) != 0;
 }
 
+static inline bool isar_feature_aa64_sve2(const ARMISARegisters *id)
+{
+return FIELD_EX64(id->id_aa64zfr0, ID_AA64ZFR0, SVEVER) != 0;
+}
+
 /*
  * Feature tests for "does this exist in either 32-bit or 64-bit?"
  */
diff --git a/target/arm/helper.c b/target/arm/helper.c
index b3bc33db41..3767002995 100644
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@@ -7500,8 +7500,7 @@ void register_cp_regs_for_features(ARMCPU *cpu)
   .opc0 = 3, .opc1 = 0, .crn = 0, .crm = 4, .opc2 = 4,
   .access = PL1_R, .type = ARM_CP_CONST,
   .accessfn = access_aa64_tid3,
-  /* At present, only SVEver == 0 is defined anyway.  */
-  .resetvalue = 0 },
+  .resetvalue = cpu->isar.id_aa64zfr0 },
 { .name = "ID_AA64PFR5_EL1_RESERVED", .state = ARM_CP_STATE_AA64,
   .opc0 = 3, .opc1 = 0, .crn = 0, .crm = 4, .opc2 = 5,
   .access = PL1_R, .type = ARM_CP_CONST,
diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c
index be5b31c2b0..eda4679fcd 100644
--- a/target/arm/kvm64.c
+++ b/target/arm/kvm64.c
@@ -555,6 +555,8 @@ bool kvm_arm_get_host_cpu_features(ARMHostCPUFeatures *ahcf)
   ARM64_SYS_REG(3, 0, 0, 7, 1));
 err |= read_sys_reg64(fdarray[2], >isar.id_aa64mmfr2,
   ARM64_SYS_REG(3, 0, 0, 7, 2));
+err |= read_sys_reg64(fdarray[2], >isar.id_aa64zfr0,
+  ARM64_SYS_REG(3, 0, 0, 4, 4));
 
 /*
  * Note that if AArch32 support is not present in the host,
-- 
2.20.1




[PATCH 02/31] target/arm: Implement SVE2 Integer Multiply - Unpredicated

2020-03-26 Thread Richard Henderson
For MUL, we can rely on generic support.  For SMULH and UMULH,
create some trivial helpers.  For PMUL, back in a21bb78e5817,
we organized helper_gvec_pmul_b in preparation for this use.

Signed-off-by: Richard Henderson 
---
 target/arm/helper.h| 10 
 target/arm/sve.decode  |  9 
 target/arm/translate-sve.c | 51 
 target/arm/vec_helper.c| 96 ++
 4 files changed, 166 insertions(+)

diff --git a/target/arm/helper.h b/target/arm/helper.h
index d5f1c87192..80bc129763 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -688,6 +688,16 @@ DEF_HELPER_FLAGS_2(frint64_s, TCG_CALL_NO_RWG, f32, f32, 
ptr)
 DEF_HELPER_FLAGS_2(frint32_d, TCG_CALL_NO_RWG, f64, f64, ptr)
 DEF_HELPER_FLAGS_2(frint64_d, TCG_CALL_NO_RWG, f64, f64, ptr)
 
+DEF_HELPER_FLAGS_4(gvec_smulh_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_smulh_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_smulh_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_smulh_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_umulh_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_umulh_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_umulh_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_umulh_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_4(gvec_sshl_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_sshl_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_ushl_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index 4f580a25e7..58e0b808e9 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -1093,3 +1093,12 @@ ST1_zprz1110010 .. 00 . 100 ... . . \
 @rprr_scatter_store xs=0 esz=3 scale=0
 ST1_zprz1110010 .. 00 . 110 ... . . \
 @rprr_scatter_store xs=1 esz=3 scale=0
+
+ SVE2 Support
+
+### SVE2 Integer Multiply - Unpredicated
+
+MUL_zzz 0100 .. 1 . 0110 00 . .  @rd_rn_rm
+SMULH_zzz   0100 .. 1 . 0110 10 . .  @rd_rn_rm
+UMULH_zzz   0100 .. 1 . 0110 11 . .  @rd_rn_rm
+PMUL_zzz0100 00 1 . 0110 01 . .  @rd_rn_rm_e0
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index acf962b6b0..e962f45b32 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -5829,3 +5829,54 @@ static bool trans_MOVPRFX_z(DisasContext *s, arg_rpr_esz 
*a)
 }
 return true;
 }
+
+/*
+ * SVE2 Integer Multiply - Unpredicated
+ */
+
+static bool trans_MUL_zzz(DisasContext *s, arg_rrr_esz *a)
+{
+if (!dc_isar_feature(aa64_sve2, s)) {
+return false;
+}
+return do_vector3_z(s, tcg_gen_gvec_mul, a->esz, a->rd, a->rn, a->rm);
+}
+
+static bool do_sve2_zzz_ool(DisasContext *s, arg_rrr_esz *a,
+gen_helper_gvec_3 *fn)
+{
+if (fn == NULL || !dc_isar_feature(aa64_sve2, s)) {
+return false;
+}
+if (sve_access_check(s)) {
+unsigned vsz = vec_full_reg_size(s);
+tcg_gen_gvec_3_ool(vec_full_reg_offset(s, a->rd),
+   vec_full_reg_offset(s, a->rn),
+   vec_full_reg_offset(s, a->rm),
+   vsz, vsz, 0, fn);
+}
+return true;
+}
+
+static bool trans_SMULH_zzz(DisasContext *s, arg_rrr_esz *a)
+{
+static gen_helper_gvec_3 * const fns[4] = {
+gen_helper_gvec_smulh_b, gen_helper_gvec_smulh_h,
+gen_helper_gvec_smulh_s, gen_helper_gvec_smulh_d,
+};
+return do_sve2_zzz_ool(s, a, fns[a->esz]);
+}
+
+static bool trans_UMULH_zzz(DisasContext *s, arg_rrr_esz *a)
+{
+static gen_helper_gvec_3 * const fns[4] = {
+gen_helper_gvec_umulh_b, gen_helper_gvec_umulh_h,
+gen_helper_gvec_umulh_s, gen_helper_gvec_umulh_d,
+};
+return do_sve2_zzz_ool(s, a, fns[a->esz]);
+}
+
+static bool trans_PMUL_zzz(DisasContext *s, arg_rrr_esz *a)
+{
+return do_sve2_zzz_ool(s, a, gen_helper_gvec_pmul_b);
+}
diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
index 8017bd88c4..00dc38c9db 100644
--- a/target/arm/vec_helper.c
+++ b/target/arm/vec_helper.c
@@ -1257,3 +1257,99 @@ void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, 
uint32_t desc)
 }
 }
 #endif
+
+/*
+ * NxN -> N highpart multiply
+ *
+ * TODO: expose this as a generic vector operation.
+ */
+
+void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+intptr_t i, opr_sz = simd_oprsz(desc);
+int8_t *d = vd, *n = vn, *m = vm;
+
+for (i = 0; i < opr_sz; ++i) {
+d[i] = ((int32_t)n[i] * m[i]) >> 8;
+}
+clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+intptr_t i, opr_sz = 

Re: [PATCH 0/3] target/mips: Add loongson gs464 core

2020-03-26 Thread Aleksandar Markovic
12:05 Sre, 25.03.2020. Jiaxun Yang  је написао/ла:
>
> Loongson gs464 core can be found in Loongson-3A1000 processor.
> This patchset add minimal support for that core.
> There are still some instructions missing, I'm going to work on
> them later.
>
> The corresponding hw board is also missing. I'm using modified kernel
> for malta for testing purpose and planing to take the design of Lemote's
> KVM virtual machine.
>
> Official manual of this core can be found here [1] (In Chinese).
> My collection of instruction documents mainly based on Chinese
> version of manual, binutils gas code and experiments on real machine
> can be found here [2] (In English).
>
> [1]:
http://loongson.cn/uploadfile/cpu/3A1000/Loongson_3A1000_cpu_user_2.pdf
> [2]: https://github.com/FlyGoat/loongson-insn/blob/master/loongson-ext.md
>

Thanks, Jiaxun!

Just to mention what you probably know: since this is a new feature, this
is too late for 5.0, so we are shooting for integrating it in 5.1.

Speak to you later of course in more details.

Yours,
Aleksandar

> Jiaxun Yang (3):
>   target/mips: Introduce loongson ext & mmi ASE flags
>   target/mips: Add loongson ext lsdc2 instrustions
>   target/mips: Add loongson gs464 core
>
>  target/mips/mips-defs.h  |   2 +
>  target/mips/translate.c  | 166 ++-
>  target/mips/translate_init.inc.c |  25 -
>  3 files changed, 188 insertions(+), 5 deletions(-)
>
> --
> 2.26.0.rc2
>
>


Re: [PATCH v8 00/74] per-CPU locks

2020-03-26 Thread Aleksandar Markovic
21:37 Čet, 26.03.2020. Robert Foley  је написао/ла:
>
> V7: https://lists.gnu.org/archive/html/qemu-devel/2019-03/msg00786.html
>
> This is a continuation of the series created by Emilio Cota.
> We are picking up this patch set with the goal to apply
> any fixes or updates needed to get this accepted.
>

Thanks for this work, Robert.

However, I just hope you don't intend to request integrating the series in
5.0. The right timing for such a wide-influencing patch is at the beginning
of the dev cycle, not really at the end of the (5.0) cycle, IMHO.

Yours,
Aleksandar

> Quoting an earlier patch in the series:
> "For context, the goal of this series is to substitute the BQL for the
> per-CPU locks in many places, notably the execution loop in cpus.c.
> This leads to better scalability for MTTCG, since CPUs don't have
> to acquire a contended global lock (the BQL) every time they
> stop executing code.
> See the last commit for some performance numbers."
>
> Listed below are the changes for this version of the patch,
> aside from the merge related changes.
>
> Changes for V8:
> - Fixed issue where in rr mode we could destroy the BQL twice.
>   Added new function cpu_mutex_destroy().
> - Removed g_assert(qemu_mutex_iothread_locked())
>   from qemu_tcg_rr_all_cpu_threads_idle().  There is an existing
>   case where we call qemu_tcg_rr_all_cpu_threads_idle() without
>   the BQL held, so we cannot assert on the lock here.
> - Found/fixed bug that had been hit in testing previously during
>   the last consideration of this patch.
>   We reproduced the issue hit in the qtest: bios-tables-test.
>   The issue was introduced by dropping the BQL, and found us
>   (very rarely) missing the condition variable wakeup in
>   qemu_tcg_rr_cpu_thread_fn().
> - ppc: convert to cpu_halted
>   - Converted new code for cpu_halted and cpu_halted_set.
> - hw/semihosting: convert to cpu_halted_set
>   -  Added this patch as this code was new and needed converting.
> - ppc/translate_init.inc.c
>   - Translated some new code here to use cpu_has_work_with_iothread_lock.
> - ppc/sapr_hcall.c Translated new code to cpu_halted
> - i386/hax-all.c - converted new code to cpu_interrupt_request and
cpu_halted
> - mips/kvm.c - converted new code to cpu_halted
> - Some changes were related to files that moved, cpu.c and cpu.h
>   moved to hw/core/, and some changes needed to be put
>   there manually during the merge.
>
> Emilio G. Cota (69):
>   cpu: convert queued work to a QSIMPLEQ
>   cpu: rename cpu->work_mutex to cpu->lock
>   cpu: introduce cpu_mutex_lock/unlock
>   cpu: make qemu_work_cond per-cpu
>   cpu: move run_on_cpu to cpus-common
>   cpu: introduce process_queued_cpu_work_locked
>   cpu: make per-CPU locks an alias of the BQL in TCG rr mode
>   tcg-runtime: define helper_cpu_halted_set
>   ppc: convert to helper_cpu_halted_set
>   cris: convert to helper_cpu_halted_set
>   hppa: convert to helper_cpu_halted_set
>   m68k: convert to helper_cpu_halted_set
>   alpha: convert to helper_cpu_halted_set
>   microblaze: convert to helper_cpu_halted_set
>   cpu: define cpu_halted helpers
>   tcg-runtime: convert to cpu_halted_set
>   arm: convert to cpu_halted
>   ppc: convert to cpu_halted
>   sh4: convert to cpu_halted
>   i386: convert to cpu_halted
>   lm32: convert to cpu_halted
>   m68k: convert to cpu_halted
>   mips: convert to cpu_halted
>   riscv: convert to cpu_halted
>   s390x: convert to cpu_halted
>   sparc: convert to cpu_halted
>   xtensa: convert to cpu_halted
>   gdbstub: convert to cpu_halted
>   openrisc: convert to cpu_halted
>   cpu-exec: convert to cpu_halted
>   cpu: convert to cpu_halted
>   cpu: define cpu_interrupt_request helpers
>   exec: use cpu_reset_interrupt
>   arm: convert to cpu_interrupt_request
>   i386: convert to cpu_interrupt_request
>   i386/kvm: convert to cpu_interrupt_request
>   i386/hax-all: convert to cpu_interrupt_request
>   i386/whpx-all: convert to cpu_interrupt_request
>   i386/hvf: convert to cpu_request_interrupt
>   ppc: convert to cpu_interrupt_request
>   sh4: convert to cpu_interrupt_request
>   cris: convert to cpu_interrupt_request
>   hppa: convert to cpu_interrupt_request
>   lm32: convert to cpu_interrupt_request
>   m68k: convert to cpu_interrupt_request
>   mips: convert to cpu_interrupt_request
>   nios: convert to cpu_interrupt_request
>   s390x: convert to cpu_interrupt_request
>   alpha: convert to cpu_interrupt_request
>   moxie: convert to cpu_interrupt_request
>   sparc: convert to cpu_interrupt_request
>   openrisc: convert to cpu_interrupt_request
>   unicore32: convert to cpu_interrupt_request
>   microblaze: convert to cpu_interrupt_request
>   accel/tcg: convert to cpu_interrupt_request
>   cpu: convert to interrupt_request
>   cpu: call .cpu_has_work with the CPU lock held
>   cpu: introduce cpu_has_work_with_iothread_lock
>   ppc: convert to cpu_has_work_with_iothread_lock
>   mips: convert to cpu_has_work_with_iothread_lock
>   s390x: convert to 

[PATCH for 5.0 v1 2/2] riscv: AND stage-1 and stage-2 protection flags

2020-03-26 Thread Alistair Francis
Take the result of stage-1 and stage-2 page table walks and AND the two
protection flags together. This way we require both to set permissions
instead of just stage-2.

Signed-off-by: Alistair Francis 
---
 target/riscv/cpu_helper.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/target/riscv/cpu_helper.c b/target/riscv/cpu_helper.c
index f36d184b7b..50e13a064f 100644
--- a/target/riscv/cpu_helper.c
+++ b/target/riscv/cpu_helper.c
@@ -707,7 +707,7 @@ bool riscv_cpu_tlb_fill(CPUState *cs, vaddr address, int 
size,
 #ifndef CONFIG_USER_ONLY
 vaddr im_address;
 hwaddr pa = 0;
-int prot;
+int prot, prot2;
 bool pmp_violation = false;
 bool m_mode_two_stage = false;
 bool hs_mode_two_stage = false;
@@ -757,13 +757,15 @@ bool riscv_cpu_tlb_fill(CPUState *cs, vaddr address, int 
size,
 /* Second stage lookup */
 im_address = pa;
 
-ret = get_physical_address(env, , , im_address,
+ret = get_physical_address(env, , , im_address,
access_type, mmu_idx, false, true);
 
 qemu_log_mask(CPU_LOG_MMU,
 "%s 2nd-stage address=%" VADDR_PRIx " ret %d physical "
 TARGET_FMT_plx " prot %d\n",
-__func__, im_address, ret, pa, prot);
+__func__, im_address, ret, pa, prot2);
+
+prot &= prot2;
 
 if (riscv_feature(env, RISCV_FEATURE_PMP) &&
 (ret == TRANSLATE_SUCCESS) &&
-- 
2.26.0




[PATCH for 5.0 v1 0/2] RISC-V: Fix Hypervisor guest user space

2020-03-26 Thread Alistair Francis
This series fixes two bugs in the RISC-V two stage lookup
implementation. This fixes the Hypervisor userspace failing to start.

Alistair Francis (2):
  riscv: Don't use stage-2 PTE lookup protection flags
  riscv: AND stage-1 and stage-2 protection flags

 target/riscv/cpu_helper.c | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

-- 
2.26.0




[PATCH for 5.0 v1 1/2] riscv: Don't use stage-2 PTE lookup protection flags

2020-03-26 Thread Alistair Francis
When doing the first of a two-stage lookup (Hypervisor extensions) don't
set the current protection flags from the second stage lookup of the
base address PTE.

Signed-off-by: Alistair Francis 
---
 target/riscv/cpu_helper.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/target/riscv/cpu_helper.c b/target/riscv/cpu_helper.c
index d3ba9efb02..f36d184b7b 100644
--- a/target/riscv/cpu_helper.c
+++ b/target/riscv/cpu_helper.c
@@ -452,10 +452,11 @@ restart:
 hwaddr pte_addr;
 
 if (two_stage && first_stage) {
+int vbase_prot;
 hwaddr vbase;
 
 /* Do the second stage translation on the base PTE address. */
-get_physical_address(env, , prot, base, access_type,
+get_physical_address(env, , _prot, base, access_type,
  mmu_idx, false, true);
 
 pte_addr = vbase + idx * ptesize;
-- 
2.26.0




Re: [PATCH-for-5.0 08/12] hw/mips/boston: Add missing error-propagation code

2020-03-26 Thread Aleksandar Markovic
21:18 Sre, 25.03.2020. Philippe Mathieu-Daudé  је
написао/ла:
>
> Running the coccinelle script produced:
>
>   $ spatch \
> --macro-file scripts/cocci-macro-file.h --include-headers \
> --sp-file
scripts/coccinelle/object_property_missing_error_propagate.cocci \
> --keep-comments --smpl-spacing --dir hw
>
>   [[manual check required: error_propagate() might be missing in
object_property_set_int() hw/mips/boston.c:462:4]]
>   [[manual check required: error_propagate() might be missing in
object_property_set_str() hw/mips/boston.c:460:4]]
>
> Since the uses are inside a MachineClass::init() function,
> directly use _fatal instead of error_propagate().
>
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
>  hw/mips/boston.c | 17 ++---
>  1 file changed, 6 insertions(+), 11 deletions(-)
>
> diff --git a/hw/mips/boston.c b/hw/mips/boston.c
> index 98ecd25e8e..2e821ca7d6 100644
> --- a/hw/mips/boston.c
> +++ b/hw/mips/boston.c
> @@ -425,121 +425,116 @@ xilinx_pcie_init(MemoryRegion *sys_mem, uint32_t
bus_nr,
>  static void boston_mach_init(MachineState *machine)
>  {
>  DeviceState *dev;
>  BostonState *s;
> -Error *err = NULL;
>  MemoryRegion *flash, *ddr_low_alias, *lcd, *platreg;
>  MemoryRegion *sys_mem = get_system_memory();
>  XilinxPCIEHost *pcie2;
>  PCIDevice *ahci;
>  DriveInfo *hd[6];
>  Chardev *chr;
>  int fw_size, fit_err;
>  bool is_64b;
>
>  if ((machine->ram_size % GiB) ||
>  (machine->ram_size > (2 * GiB))) {
>  error_report("Memory size must be 1GB or 2GB");
>  exit(1);
>  }
>
>  dev = qdev_create(NULL, TYPE_MIPS_BOSTON);
>  qdev_init_nofail(dev);
>
>  s = BOSTON(dev);
>  s->mach = machine;
>
>  if (!cpu_supports_cps_smp(machine->cpu_type)) {
>  error_report("Boston requires CPUs which support CPS");
>  exit(1);
>  }
>
>  is_64b = cpu_supports_isa(machine->cpu_type, ISA_MIPS64);
>
>  sysbus_init_child_obj(OBJECT(machine), "cps", OBJECT(>cps),
>sizeof(s->cps), TYPE_MIPS_CPS);
>  object_property_set_str(OBJECT(>cps), machine->cpu_type,
"cpu-type",
> -);
> -object_property_set_int(OBJECT(>cps), machine->smp.cpus,
"num-vp", );
> -object_property_set_bool(OBJECT(>cps), true, "realized", );
> -
> -if (err != NULL) {
> -error_report("%s", error_get_pretty(err));
> -exit(1);
> -}
> -
> +_fatal);
> +object_property_set_int(OBJECT(>cps), machine->smp.cpus, "num-vp",
> +_fatal);
> +object_property_set_bool(OBJECT(>cps), true, "realized",
_fatal);
>  sysbus_mmio_map_overlap(SYS_BUS_DEVICE(>cps), 0, 0, 1);
>
>  flash =  g_new(MemoryRegion, 1);
> -memory_region_init_rom(flash, NULL, "boston.flash", 128 * MiB, );
> +memory_region_init_rom(flash, NULL, "boston.flash", 128 * MiB,
> +   _fatal);
>  memory_region_add_subregion_overlap(sys_mem, 0x1800, flash, 0);
>
>  memory_region_add_subregion_overlap(sys_mem, 0x8000,
machine->ram, 0);
>
>  ddr_low_alias = g_new(MemoryRegion, 1);
>  memory_region_init_alias(ddr_low_alias, NULL, "boston_low.ddr",
>   machine->ram, 0,
>   MIN(machine->ram_size, (256 * MiB)));
>  memory_region_add_subregion_overlap(sys_mem, 0, ddr_low_alias, 0);
>
>  xilinx_pcie_init(sys_mem, 0,
>   0x1000, 32 * MiB,
>   0x4000, 1 * GiB,
>   get_cps_irq(>cps, 2), false);
>
>  xilinx_pcie_init(sys_mem, 1,
>   0x1200, 32 * MiB,
>   0x2000, 512 * MiB,
>   get_cps_irq(>cps, 1), false);
>
>  pcie2 = xilinx_pcie_init(sys_mem, 2,
>   0x1400, 32 * MiB,
>   0x1600, 1 * MiB,
>   get_cps_irq(>cps, 0), true);
>
>  platreg = g_new(MemoryRegion, 1);
>  memory_region_init_io(platreg, NULL, _platreg_ops, s,
>"boston-platregs", 0x1000);
>  memory_region_add_subregion_overlap(sys_mem, 0x17ffd000, platreg, 0);
>
>  s->uart = serial_mm_init(sys_mem, 0x17ffe000, 2,
>   get_cps_irq(>cps, 3), 1000,
>   serial_hd(0), DEVICE_NATIVE_ENDIAN);
>
>  lcd = g_new(MemoryRegion, 1);
>  memory_region_init_io(lcd, NULL, _lcd_ops, s, "boston-lcd",
0x8);
>  memory_region_add_subregion_overlap(sys_mem, 0x17fff000, lcd, 0);
>
>  chr = qemu_chr_new("lcd", "vc:320x240", NULL);
>  qemu_chr_fe_init(>lcd_display, chr, NULL);
>  qemu_chr_fe_set_handlers(>lcd_display, NULL, NULL,
>   boston_lcd_event, NULL, s, NULL, true);
>
>  ahci =
pci_create_simple_multifunction(_BRIDGE(>root)->sec_bus,
>  

Re: [PATCH-for-5.0 09/12] hw/mips/mips_malta: Add missing error-propagation code

2020-03-26 Thread Aleksandar Markovic
21:19 Sre, 25.03.2020. Philippe Mathieu-Daudé  је
написао/ла:
>
> Running the coccinelle script produced:
>
>   $ spatch \
> --macro-file scripts/cocci-macro-file.h --include-headers \
> --sp-file
scripts/coccinelle/object_property_missing_error_propagate.cocci \
> --keep-comments --smpl-spacing --dir hw
>
>   [[manual check required: error_propagate() might be missing in
object_property_set_int() hw/mips/mips_malta.c:1193:4]]
>   [[manual check required: error_propagate() might be missing in
object_property_set_str() hw/mips/mips_malta.c:1192:4]]
>
> Add the missing error_propagate() after manual review by adding
> a Error* parameter to create_cps().
>
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
>  hw/mips/mips_malta.c | 19 ++-
>  1 file changed, 14 insertions(+), 5 deletions(-)
>
> diff --git a/hw/mips/mips_malta.c b/hw/mips/mips_malta.c
> index e4c4de1b4e..8d43cfd41b 100644
> --- a/hw/mips/mips_malta.c
> +++ b/hw/mips/mips_malta.c
> @@ -1183,22 +1183,31 @@ static void create_cpu_without_cps(MachineState
*ms,
>  }
>
>  static void create_cps(MachineState *ms, MaltaState *s,
> -   qemu_irq *cbus_irq, qemu_irq *i8259_irq)
> +   qemu_irq *cbus_irq, qemu_irq *i8259_irq,
> +   Error **errp)
>  {
>  Error *err = NULL;
>
>  sysbus_init_child_obj(OBJECT(s), "cps", OBJECT(>cps),
sizeof(s->cps),
>TYPE_MIPS_CPS);
>  object_property_set_str(OBJECT(>cps), ms->cpu_type, "cpu-type",
);
> +if (err) {
> +error_propagate(errp, err);
> +return;
> +}
>  object_property_set_int(OBJECT(>cps), ms->smp.cpus, "num-vp",
);
> +if (err) {
> +error_propagate(errp, err);
> +return;
> +}
>  object_property_set_bool(OBJECT(>cps), true, "realized", );
> -if (err != NULL) {
> -error_report("%s", error_get_pretty(err));
> -exit(1);
> +if (err) {
> +error_propagate(errp, err);
> +return;
>  }
>
>  sysbus_mmio_map_overlap(SYS_BUS_DEVICE(>cps), 0, 0, 1);
>
>  *i8259_irq = get_cps_irq(>cps, 3);
>  *cbus_irq = NULL;
>  }
> @@ -1206,9 +1215,9 @@ static void create_cps(MachineState *ms, MaltaState
*s,
>  static void mips_create_cpu(MachineState *ms, MaltaState *s,
>  qemu_irq *cbus_irq, qemu_irq *i8259_irq)
>  {
>  if ((ms->smp.cpus > 1) && cpu_supports_cps_smp(ms->cpu_type)) {
> -create_cps(ms, s, cbus_irq, i8259_irq);
> +create_cps(ms, s, cbus_irq, i8259_irq, _fatal);
>  } else {
>  create_cpu_without_cps(ms, cbus_irq, i8259_irq);
>  }
>  }
> --
> 2.21.1
>

Reviewed-by: Aleksandar Markovic 


Re: [PATCH-for-5.0 07/12] hw/mips/cps: Add missing error-propagation code

2020-03-26 Thread Aleksandar Markovic
21:18 Sre, 25.03.2020. Philippe Mathieu-Daudé  је
написао/ла:
>
> Patch created mechanically by running:
>
>   $ spatch \
> --macro-file scripts/cocci-macro-file.h --include-headers \
> --sp-file
scripts/coccinelle/object_property_missing_error_propagate.cocci \
> --keep-comments --smpl-spacing --in-place --dir hw
>
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
>  hw/mips/cps.c | 52 +++
>  1 file changed, 52 insertions(+)
>
> diff --git a/hw/mips/cps.c b/hw/mips/cps.c
> index 92b9b1a5f6..d682633401 100644
> --- a/hw/mips/cps.c
> +++ b/hw/mips/cps.c
> @@ -68,100 +68,152 @@ static bool cpu_mips_itu_supported(CPUMIPSState
*env)
>  static void mips_cps_realize(DeviceState *dev, Error **errp)
>  {
>  MIPSCPSState *s = MIPS_CPS(dev);
>  CPUMIPSState *env;
>  MIPSCPU *cpu;
>  int i;
>  Error *err = NULL;
>  target_ulong gcr_base;
>  bool itu_present = false;
>  bool saar_present = false;
>
>  for (i = 0; i < s->num_vp; i++) {
>  cpu = MIPS_CPU(cpu_create(s->cpu_type));
>
>  /* Init internal devices */
>  cpu_mips_irq_init_cpu(cpu);
>  cpu_mips_clock_init(cpu);
>
>  env = >env;
>  if (cpu_mips_itu_supported(env)) {
>  itu_present = true;
>  /* Attach ITC Tag to the VP */
>  env->itc_tag = mips_itu_get_tag_region(>itu);
>  env->itu = >itu;
>  }
>  qemu_register_reset(main_cpu_reset, cpu);
>  }
>
>  cpu = MIPS_CPU(first_cpu);
>  env = >env;
>  saar_present = (bool)env->saarp;
>
>  /* Inter-Thread Communication Unit */
>  if (itu_present) {
>  sysbus_init_child_obj(OBJECT(dev), "itu", >itu,
sizeof(s->itu),
>TYPE_MIPS_ITU);
>  object_property_set_int(OBJECT(>itu), 16, "num-fifo", );
> +if (err) {
> +error_propagate(errp, err);
> +return;
> +}
>  object_property_set_int(OBJECT(>itu), 16, "num-semaphores",
);
> +if (err) {
> +error_propagate(errp, err);
> +return;
> +}
>  object_property_set_bool(OBJECT(>itu), saar_present,
"saar-present",
>   );
> +if (err) {
> +error_propagate(errp, err);
> +return;
> +}
>  if (saar_present) {
>  s->itu.saar = >CP0_SAAR;
>  }
>  object_property_set_bool(OBJECT(>itu), true, "realized",
);
> +if (err) {
> +error_propagate(errp, err);
> +return;
> +}
>  if (err != NULL) {
>  error_propagate(errp, err);
>  return;
>  }
>
>  memory_region_add_subregion(>container, 0,
>
sysbus_mmio_get_region(SYS_BUS_DEVICE(>itu), 0));
>  }
>
>  /* Cluster Power Controller */
>  sysbus_init_child_obj(OBJECT(dev), "cpc", >cpc, sizeof(s->cpc),
>TYPE_MIPS_CPC);
>  object_property_set_int(OBJECT(>cpc), s->num_vp, "num-vp", );
> +if (err) {
> +error_propagate(errp, err);
> +return;
> +}
>  object_property_set_int(OBJECT(>cpc), 1, "vp-start-running",
);
> +if (err) {
> +error_propagate(errp, err);
> +return;
> +}
>  object_property_set_bool(OBJECT(>cpc), true, "realized", );
>  if (err != NULL) {
>  error_propagate(errp, err);
>  return;
>  }
>
>  memory_region_add_subregion(>container, 0,
>
 sysbus_mmio_get_region(SYS_BUS_DEVICE(>cpc), 0));
>
>  /* Global Interrupt Controller */
>  sysbus_init_child_obj(OBJECT(dev), "gic", >gic, sizeof(s->gic),
>TYPE_MIPS_GIC);
>  object_property_set_int(OBJECT(>gic), s->num_vp, "num-vp", );
> +if (err) {
> +error_propagate(errp, err);
> +return;
> +}
>  object_property_set_int(OBJECT(>gic), 128, "num-irq", );
> +if (err) {
> +error_propagate(errp, err);
> +return;
> +}
>  object_property_set_bool(OBJECT(>gic), true, "realized", );
>  if (err != NULL) {
>  error_propagate(errp, err);
>  return;
>  }
>
>  memory_region_add_subregion(>container, 0,
>
 sysbus_mmio_get_region(SYS_BUS_DEVICE(>gic), 0));
>
>  /* Global Configuration Registers */
>  gcr_base = env->CP0_CMGCRBase << 4;
>
>  sysbus_init_child_obj(OBJECT(dev), "gcr", >gcr, sizeof(s->gcr),
>TYPE_MIPS_GCR);
>  object_property_set_int(OBJECT(>gcr), s->num_vp, "num-vp", );
> +if (err) {
> +error_propagate(errp, err);
> +return;
> +}
>  object_property_set_int(OBJECT(>gcr), 0x800, "gcr-rev", );
> +if (err) {
> +error_propagate(errp, err);
> +return;
> +}
>  object_property_set_int(OBJECT(>gcr), gcr_base, "gcr-base", );
> +if (err) {
> +error_propagate(errp, err);
> +return;
> +}
>  

[PATCH v5 3/6] 9pfs: make v9fs_readdir_response_size() public

2020-03-26 Thread Christian Schoenebeck
Rename function v9fs_readdir_data_size() -> v9fs_readdir_response_size()
and make it callable from other units. So far this function is only
used by 9p.c, however subsequent patch requires the function to be
callable from another 9pfs unit. And as we're at it; also make it clear
for what this function is used for.

Signed-off-by: Christian Schoenebeck 
---
 hw/9pfs/9p.c | 10 --
 hw/9pfs/9p.h |  1 +
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
index 1aff4f1fa8..bd8a7cbfac 100644
--- a/hw/9pfs/9p.c
+++ b/hw/9pfs/9p.c
@@ -2322,7 +2322,13 @@ out_nofid:
 pdu_complete(pdu, err);
 }
 
-static size_t v9fs_readdir_data_size(V9fsString *name)
+/**
+ * Returns size required in Rreaddir response for the passed dirent @p name.
+ *
+ * @param name - directory entry's name (i.e. file name, directory name)
+ * @returns required size in bytes
+ */
+size_t v9fs_readdir_response_size(V9fsString *name)
 {
 /*
  * Size of each dirent on the wire: size of qid (13) + size of offset (8)
@@ -2357,7 +2363,7 @@ static int coroutine_fn v9fs_do_readdir(V9fsPDU *pdu, 
V9fsFidState *fidp,
 }
 v9fs_string_init();
 v9fs_string_sprintf(, "%s", dent->d_name);
-if ((count + v9fs_readdir_data_size()) > maxsize) {
+if ((count + v9fs_readdir_response_size()) > maxsize) {
 v9fs_readdir_unlock(>fs.dir);
 
 /* Ran out of buffer. Set dir back to old position and return */
diff --git a/hw/9pfs/9p.h b/hw/9pfs/9p.h
index b8f72a3bd9..9553700dbb 100644
--- a/hw/9pfs/9p.h
+++ b/hw/9pfs/9p.h
@@ -419,6 +419,7 @@ void v9fs_path_init(V9fsPath *path);
 void v9fs_path_free(V9fsPath *path);
 void v9fs_path_sprintf(V9fsPath *path, const char *fmt, ...);
 void v9fs_path_copy(V9fsPath *dst, const V9fsPath *src);
+size_t v9fs_readdir_response_size(V9fsString *name);
 int v9fs_name_to_path(V9fsState *s, V9fsPath *dirpath,
   const char *name, V9fsPath *path);
 int v9fs_device_realize_common(V9fsState *s, const V9fsTransport *t,
-- 
2.20.1




[PATCH v5 5/6] 9pfs: T_readdir latency optimization

2020-03-26 Thread Christian Schoenebeck
Make top half really top half and bottom half really bottom half:

Each T_readdir request handling is hopping between threads (main
I/O thread and background I/O driver threads) several times for
every individual directory entry, which sums up to huge latencies
for handling just a single T_readdir request.

Instead of doing that, collect now all required directory entries
(including all potentially required stat buffers for each entry) in
one rush on a background I/O thread from fs driver by calling the
previously added function v9fs_co_readdir_many() instead of
v9fs_co_readdir(), then assemble the entire resulting network
response message for the readdir request on main I/O thread. The
fs driver is still aborting the directory entry retrieval loop
(on the background I/O thread inside of v9fs_co_readdir_many())
as soon as it would exceed the client's requested maximum R_readdir
response size. So this will not introduce a performance penalty on
another end.

Signed-off-by: Christian Schoenebeck 
---
 hw/9pfs/9p.c | 122 +++
 1 file changed, 55 insertions(+), 67 deletions(-)

diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
index bd8a7cbfac..5246d96a08 100644
--- a/hw/9pfs/9p.c
+++ b/hw/9pfs/9p.c
@@ -971,30 +971,6 @@ static int coroutine_fn fid_to_qid(V9fsPDU *pdu, 
V9fsFidState *fidp,
 return 0;
 }
 
-static int coroutine_fn dirent_to_qid(V9fsPDU *pdu, V9fsFidState *fidp,
-  struct dirent *dent, V9fsQID *qidp)
-{
-struct stat stbuf;
-V9fsPath path;
-int err;
-
-v9fs_path_init();
-
-err = v9fs_co_name_to_path(pdu, >path, dent->d_name, );
-if (err < 0) {
-goto out;
-}
-err = v9fs_co_lstat(pdu, , );
-if (err < 0) {
-goto out;
-}
-err = stat_to_qid(pdu, , qidp);
-
-out:
-v9fs_path_free();
-return err;
-}
-
 V9fsPDU *pdu_alloc(V9fsState *s)
 {
 V9fsPDU *pdu = NULL;
@@ -2337,6 +2313,18 @@ size_t v9fs_readdir_response_size(V9fsString *name)
 return 24 + v9fs_string_size(name);
 }
 
+static void v9fs_free_dirents(struct V9fsDirEnt *e)
+{
+struct V9fsDirEnt *next = NULL;
+
+for (; e; e = next) {
+next = e->next;
+g_free(e->dent);
+g_free(e->st);
+g_free(e);
+}
+}
+
 static int coroutine_fn v9fs_do_readdir(V9fsPDU *pdu, V9fsFidState *fidp,
 int32_t maxsize)
 {
@@ -2345,54 +2333,53 @@ static int coroutine_fn v9fs_do_readdir(V9fsPDU *pdu, 
V9fsFidState *fidp,
 V9fsString name;
 int len, err = 0;
 int32_t count = 0;
-off_t saved_dir_pos;
 struct dirent *dent;
+struct stat *st;
+struct V9fsDirEnt *entries = NULL;
 
-/* save the directory position */
-saved_dir_pos = v9fs_co_telldir(pdu, fidp);
-if (saved_dir_pos < 0) {
-return saved_dir_pos;
-}
-
-while (1) {
-v9fs_readdir_lock(>fs.dir);
+/*
+ * inode remapping requires the device id, which in turn might be
+ * different for different directory entries, so if inode remapping is
+ * enabled we have to make a full stat for each directory entry
+ */
+const bool dostat = pdu->s->ctx.export_flags & V9FS_REMAP_INODES;
 
-err = v9fs_co_readdir(pdu, fidp, );
-if (err || !dent) {
-break;
-}
-v9fs_string_init();
-v9fs_string_sprintf(, "%s", dent->d_name);
-if ((count + v9fs_readdir_response_size()) > maxsize) {
-v9fs_readdir_unlock(>fs.dir);
+/*
+ * Fetch all required directory entries altogether on a background IO
+ * thread from fs driver. We don't want to do that for each entry
+ * individually, because hopping between threads (this main IO thread
+ * and background IO driver thread) would sum up to huge latencies.
+ */
+count = v9fs_co_readdir_many(pdu, fidp, , maxsize, dostat);
+if (count < 0) {
+err = count;
+count = 0;
+goto out;
+}
+count = 0;
 
-/* Ran out of buffer. Set dir back to old position and return */
-v9fs_co_seekdir(pdu, fidp, saved_dir_pos);
-v9fs_string_free();
-return count;
-}
+for (struct V9fsDirEnt *e = entries; e; e = e->next) {
+dent = e->dent;
 
 if (pdu->s->ctx.export_flags & V9FS_REMAP_INODES) {
-/*
- * dirent_to_qid() implies expensive stat call for each entry,
- * we must do that here though since inode remapping requires
- * the device id, which in turn might be different for
- * different entries; we cannot make any assumption to avoid
- * that here.
- */
-err = dirent_to_qid(pdu, fidp, dent, );
+st = e->st;
+/* e->st should never be NULL, but just to be sure */
+if (!st) {
+err = -1;
+break;
+}
+
+/* remap inode */
+err 

[PATCH v5 0/6] 9pfs: readdir optimization

2020-03-26 Thread Christian Schoenebeck
As previously mentioned, I was investigating performance issues with 9pfs.
Raw file read/write of 9pfs is actually quite good, provided that client
picked a reasonable high msize (maximum message size). I would recommend
to log a warning on 9p server side if a client attached with a small msize
that would cause performance issues for that reason.

However there are other aspects where 9pfs currently performs suboptimally,
especially readdir handling of 9pfs is extremely slow, a simple readdir
request of a guest typically blocks for several hundred milliseconds or
even several seconds, no matter how powerful the underlying hardware is.
The reason for this performance issue: latency.
Currently 9pfs is heavily dispatching a T_readdir request numerous times
between main I/O thread and a background I/O thread back and forth; in fact
it is actually hopping between threads even multiple times for every single
directory entry during T_readdir request handling which leads in total to
huge latencies for a single T_readdir request.

This patch series aims to address this severe performance issue of 9pfs
T_readdir request handling. The actual performance optimization is patch 5.

v4->v5:

  * Rebased to master (SHA-1 762fa6d7).

  * Dropped benchmark patches (see v4 if you want to run a benchmark on v5).

  * Divided split-readdir test into 3 individual tests, which also fixes the
previously discussed transport error [patch 1].

  * Fixed English spelling for 'split' [patch 1].

  * Rename max_count -> maxsize [NEW patch 2].

  * Divided previous huge readdir optimization patch into individual patches
[patch 3], [patch 4], [patch 5].

  * Added comment on v9fs_readdir_response_size() [patch 3].

  * Renamed v9fs_co_readdir_lowlat() -> v9fs_co_readdir_many() [patch 4].

  * Adjusted comment on v9fs_co_readdir_many() [patch 4].

  * Added comment on v9fs_co_run_in_worker() [NEW patch 6].

  * Adjusted commit log message of several patches.

Message-ID of previous version (v4):
  cover.1579567019.git.qemu_...@crudebyte.com

Christian Schoenebeck (6):
  tests/virtio-9p: added split readdir tests
  9pfs readdir: rename max_count -> maxsize
  9pfs: make v9fs_readdir_response_size() public
  9pfs: add new function v9fs_co_readdir_many()
  9pfs: T_readdir latency optimization
  9pfs: clarify latency of v9fs_co_run_in_worker()

 hw/9pfs/9p.c | 148 ++--
 hw/9pfs/9p.h |  23 +
 hw/9pfs/codir.c  | 181 ---
 hw/9pfs/coth.h   |  15 ++-
 tests/qtest/virtio-9p-test.c | 108 +
 5 files changed, 386 insertions(+), 89 deletions(-)

-- 
2.20.1




[PATCH v5 2/6] 9pfs readdir: rename max_count -> maxsize

2020-03-26 Thread Christian Schoenebeck
Although the 9p protocol specs use the term 'max_count' as argument
for Treaddir, let's rename our variables for that to 'maxsize'
instead, because 'max_count' is semantically completely wrong. This
variable does not count integral entries, it is rather a maximum
size (in bytes) of the destination (response) buffer being filled.

Since this is just refactoring, hence this patch does not introduce
any behaviour change at all.

Signed-off-by: Christian Schoenebeck 
---
 hw/9pfs/9p.c | 20 ++--
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
index 9e046f7acb..1aff4f1fa8 100644
--- a/hw/9pfs/9p.c
+++ b/hw/9pfs/9p.c
@@ -2162,7 +2162,7 @@ static int v9fs_xattr_read(V9fsState *s, V9fsPDU *pdu, 
V9fsFidState *fidp,
 
 static int coroutine_fn v9fs_do_readdir_with_stat(V9fsPDU *pdu,
   V9fsFidState *fidp,
-  uint32_t max_count)
+  uint32_t maxsize)
 {
 V9fsPath path;
 V9fsStat v9stat;
@@ -2199,7 +2199,7 @@ static int coroutine_fn v9fs_do_readdir_with_stat(V9fsPDU 
*pdu,
 if (err < 0) {
 break;
 }
-if ((count + v9stat.size + 2) > max_count) {
+if ((count + v9stat.size + 2) > maxsize) {
 v9fs_readdir_unlock(>fs.dir);
 
 /* Ran out of buffer. Set dir back to old position and return */
@@ -2332,7 +2332,7 @@ static size_t v9fs_readdir_data_size(V9fsString *name)
 }
 
 static int coroutine_fn v9fs_do_readdir(V9fsPDU *pdu, V9fsFidState *fidp,
-int32_t max_count)
+int32_t maxsize)
 {
 size_t size;
 V9fsQID qid;
@@ -2357,7 +2357,7 @@ static int coroutine_fn v9fs_do_readdir(V9fsPDU *pdu, 
V9fsFidState *fidp,
 }
 v9fs_string_init();
 v9fs_string_sprintf(, "%s", dent->d_name);
-if ((count + v9fs_readdir_data_size()) > max_count) {
+if ((count + v9fs_readdir_data_size()) > maxsize) {
 v9fs_readdir_unlock(>fs.dir);
 
 /* Ran out of buffer. Set dir back to old position and return */
@@ -2432,20 +2432,20 @@ static void coroutine_fn v9fs_readdir(void *opaque)
 size_t offset = 7;
 uint64_t initial_offset;
 int32_t count;
-uint32_t max_count;
+uint32_t maxsize;
 V9fsPDU *pdu = opaque;
 V9fsState *s = pdu->s;
 
 retval = pdu_unmarshal(pdu, offset, "dqd", ,
-   _offset, _count);
+   _offset, );
 if (retval < 0) {
 goto out_nofid;
 }
-trace_v9fs_readdir(pdu->tag, pdu->id, fid, initial_offset, max_count);
+trace_v9fs_readdir(pdu->tag, pdu->id, fid, initial_offset, maxsize);
 
 /* Enough space for a R_readdir header: size[4] Rreaddir tag[2] count[4] */
-if (max_count > s->msize - 11) {
-max_count = s->msize - 11;
+if (maxsize > s->msize - 11) {
+maxsize = s->msize - 11;
 warn_report_once(
 "9p: bad client: T_readdir with count > msize - 11"
 );
@@ -2465,7 +2465,7 @@ static void coroutine_fn v9fs_readdir(void *opaque)
 } else {
 v9fs_co_seekdir(pdu, fidp, initial_offset);
 }
-count = v9fs_do_readdir(pdu, fidp, max_count);
+count = v9fs_do_readdir(pdu, fidp, maxsize);
 if (count < 0) {
 retval = count;
 goto out;
-- 
2.20.1




[PATCH v5 1/6] tests/virtio-9p: added split readdir tests

2020-03-26 Thread Christian Schoenebeck
The previous, already existing 'basic' readdir test simply used a
'count' parameter big enough to retrieve all directory entries with a
single Treaddir request.

In the 3 new 'split' readdir tests added by this patch, directory
entries are retrieved, split over several Treaddir requests by picking
small 'count' parameters which force the server to truncate the
response. So the test client sends as many Treaddir requests as
necessary to get all directory entries.

The following 3 new tests are added (executed in this sequence):

1. Split readdir test with count=512
2. Split readdir test with count=256
3. Split readdir test with count=128

This test case sequence is chosen because the smaller the 'count' value,
the higher the chance of errors in case of implementation bugs on server
side.

Signed-off-by: Christian Schoenebeck 
---
 tests/qtest/virtio-9p-test.c | 108 +++
 1 file changed, 108 insertions(+)

diff --git a/tests/qtest/virtio-9p-test.c b/tests/qtest/virtio-9p-test.c
index 2167322985..de30b717b6 100644
--- a/tests/qtest/virtio-9p-test.c
+++ b/tests/qtest/virtio-9p-test.c
@@ -578,6 +578,7 @@ static bool fs_dirents_contain_name(struct V9fsDirent *e, 
const char* name)
 return false;
 }
 
+/* basic readdir test where reply fits into a single response message */
 static void fs_readdir(void *obj, void *data, QGuestAllocator *t_alloc)
 {
 QVirtio9P *v9p = obj;
@@ -631,6 +632,89 @@ static void fs_readdir(void *obj, void *data, 
QGuestAllocator *t_alloc)
 g_free(wnames[0]);
 }
 
+/* readdir test where overall request is split over several messages */
+static void fs_readdir_split(void *obj, void *data, QGuestAllocator *t_alloc,
+ uint32_t count)
+{
+QVirtio9P *v9p = obj;
+alloc = t_alloc;
+char *const wnames[] = { g_strdup(QTEST_V9FS_SYNTH_READDIR_DIR) };
+uint16_t nqid;
+v9fs_qid qid;
+uint32_t nentries, npartialentries;
+struct V9fsDirent *entries, *tail, *partialentries;
+P9Req *req;
+int fid;
+uint64_t offset;
+
+fs_attach(v9p, NULL, t_alloc);
+
+fid = 1;
+offset = 0;
+entries = NULL;
+nentries = 0;
+tail = NULL;
+
+req = v9fs_twalk(v9p, 0, fid, 1, wnames, 0);
+v9fs_req_wait_for_reply(req, NULL);
+v9fs_rwalk(req, , NULL);
+g_assert_cmpint(nqid, ==, 1);
+
+req = v9fs_tlopen(v9p, fid, O_DIRECTORY, 0);
+v9fs_req_wait_for_reply(req, NULL);
+v9fs_rlopen(req, , NULL);
+
+/*
+ * send as many Treaddir requests as required to get all directory
+ * entries
+ */
+while (true) {
+npartialentries = 0;
+partialentries = NULL;
+
+req = v9fs_treaddir(v9p, fid, offset, count, 0);
+v9fs_req_wait_for_reply(req, NULL);
+v9fs_rreaddir(req, , , );
+if (npartialentries > 0 && partialentries) {
+if (!entries) {
+entries = partialentries;
+nentries = npartialentries;
+tail = partialentries;
+} else {
+tail->next = partialentries;
+nentries += npartialentries;
+}
+while (tail->next) {
+tail = tail->next;
+}
+offset = tail->offset;
+} else {
+break;
+}
+}
+
+g_assert_cmpint(
+nentries, ==,
+QTEST_V9FS_SYNTH_READDIR_NFILES + 2 /* "." and ".." */
+);
+
+/*
+ * Check all file names exist in returned entries, ignore their order
+ * though.
+ */
+g_assert_cmpint(fs_dirents_contain_name(entries, "."), ==, true);
+g_assert_cmpint(fs_dirents_contain_name(entries, ".."), ==, true);
+for (int i = 0; i < QTEST_V9FS_SYNTH_READDIR_NFILES; ++i) {
+char *name = g_strdup_printf(QTEST_V9FS_SYNTH_READDIR_FILE, i);
+g_assert_cmpint(fs_dirents_contain_name(entries, name), ==, true);
+g_free(name);
+}
+
+v9fs_free_dirents(entries);
+
+g_free(wnames[0]);
+}
+
 static void fs_walk_no_slash(void *obj, void *data, QGuestAllocator *t_alloc)
 {
 QVirtio9P *v9p = obj;
@@ -793,6 +877,24 @@ static void fs_flush_ignored(void *obj, void *data, 
QGuestAllocator *t_alloc)
 g_free(wnames[0]);
 }
 
+static void fs_readdir_split_128(void *obj, void *data,
+ QGuestAllocator *t_alloc)
+{
+fs_readdir_split(obj, data, t_alloc, 128);
+}
+
+static void fs_readdir_split_256(void *obj, void *data,
+ QGuestAllocator *t_alloc)
+{
+fs_readdir_split(obj, data, t_alloc, 256);
+}
+
+static void fs_readdir_split_512(void *obj, void *data,
+ QGuestAllocator *t_alloc)
+{
+fs_readdir_split(obj, data, t_alloc, 512);
+}
+
 static void register_virtio_9p_test(void)
 {
 qos_add_test("config", "virtio-9p", pci_config, NULL);
@@ -810,6 +912,12 @@ static void register_virtio_9p_test(void)
 qos_add_test("fs/flush/ignored", "virtio-9p", fs_flush_ignored,

[PATCH v5 6/6] 9pfs: clarify latency of v9fs_co_run_in_worker()

2020-03-26 Thread Christian Schoenebeck
As we just fixed a severe performance issue with Treaddir request
handling, clarify this overall issue as a comment on
v9fs_co_run_in_worker() with the intention to hopefully prevent
such performance mistakes in future (and fixing other yet
outstanding ones).

Signed-off-by: Christian Schoenebeck 
---
 hw/9pfs/coth.h | 12 +++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/hw/9pfs/coth.h b/hw/9pfs/coth.h
index a6851822d5..8b6f76840a 100644
--- a/hw/9pfs/coth.h
+++ b/hw/9pfs/coth.h
@@ -19,7 +19,7 @@
 #include "qemu/coroutine.h"
 #include "9p.h"
 
-/*
+/**
  * we want to use bottom half because we want to make sure the below
  * sequence of events.
  *
@@ -28,6 +28,16 @@
  *   3. Enter the coroutine in the worker thread.
  * we cannot swap step 1 and 2, because that would imply worker thread
  * can enter coroutine while step1 is still running
+ *
+ * @b PERFORMANCE @b CONSIDERATIONS: As a rule of thumb, keep in mind
+ * that hopping between threads adds @b latency! So when handling a
+ * 9pfs request, avoid calling v9fs_co_run_in_worker() too often, because
+ * this might otherwise sum up to a significant, huge overall latency for
+ * providing the response for just a single request. For that reason it
+ * is highly recommended to fetch all data from fs driver with a single
+ * fs driver request on a background I/O thread (bottom half) in one rush
+ * first and then eventually assembling the final response from that data
+ * on main I/O thread (top half).
  */
 #define v9fs_co_run_in_worker(code_block)   \
 do {\
-- 
2.20.1




[PATCH v5 4/6] 9pfs: add new function v9fs_co_readdir_many()

2020-03-26 Thread Christian Schoenebeck
The newly added function v9fs_co_readdir_many() retrieves multiple
directory entries with a single fs driver request. It is intended to
replace uses of v9fs_co_readdir(), the latter only retrieves a single
directory entry per fs driver request instead.

The reason for this planned replacement is that for every fs driver
request the coroutine is dispatched from main I/O thread to a
background I/O thread and eventually dispatched back to main I/O
thread. Hopping between threads adds latency. So if a 9pfs Treaddir
request reads a large amount of directory entries, this currently
sums up to huge latencies of several hundred ms or even more. So
using v9fs_co_readdir_many() instead of v9fs_co_readdir() will
provide significant performance improvements.

Signed-off-by: Christian Schoenebeck 
---
 hw/9pfs/9p.h|  22 ++
 hw/9pfs/codir.c | 181 +---
 hw/9pfs/coth.h  |   3 +
 3 files changed, 195 insertions(+), 11 deletions(-)

diff --git a/hw/9pfs/9p.h b/hw/9pfs/9p.h
index 9553700dbb..116977939b 100644
--- a/hw/9pfs/9p.h
+++ b/hw/9pfs/9p.h
@@ -215,6 +215,28 @@ static inline void v9fs_readdir_init(V9fsDir *dir)
 qemu_mutex_init(&dir->readdir_mutex);
 }
 
+/**
+ * Type for 9p fs drivers' (a.k.a. 9p backends) result of readdir requests,
+ * which is a chained list of directory entries.
+ */
+typedef struct V9fsDirEnt {
+/* mandatory (must not be NULL) information for all readdir requests */
+struct dirent *dent;
+/*
+ * optional (may be NULL): A full stat of each directory entry is just
+ * done if explicitly told to fs driver.
+ */
+struct stat *st;
+/*
+ * instead of an array, directory entries are always returned as
+ * chained list, that's because the amount of entries retrieved by fs
+ * drivers is dependent on the individual entries' name (since response
+ * messages are size limited), so the final amount cannot be estimated
+ * beforehand
+ */
+struct V9fsDirEnt *next;
+} V9fsDirEnt;
+
 /*
  * Filled by fs driver on open and other
  * calls.
diff --git a/hw/9pfs/codir.c b/hw/9pfs/codir.c
index 73f9a751e1..45c65a8f5b 100644
--- a/hw/9pfs/codir.c
+++ b/hw/9pfs/codir.c
@@ -18,28 +18,187 @@
 #include "qemu/main-loop.h"
 #include "coth.h"
 
+/*
+ * This is solely executed on a background IO thread.
+ */
+static int do_readdir(V9fsPDU *pdu, V9fsFidState *fidp, struct dirent **dent)
+{
+int err = 0;
+V9fsState *s = pdu->s;
+struct dirent *entry;
+
+errno = 0;
+entry = s->ops->readdir(&s->ctx, &fidp->fs);
+if (!entry && errno) {
+*dent = NULL;
+err = -errno;
+} else {
+*dent = entry;
+}
+return err;
+}
+
+/*
+ * TODO: This will be removed for performance reasons.
+ * Use v9fs_co_readdir_many() instead.
+ */
 int coroutine_fn v9fs_co_readdir(V9fsPDU *pdu, V9fsFidState *fidp,
  struct dirent **dent)
 {
 int err;
-V9fsState *s = pdu->s;
 
 if (v9fs_request_cancelled(pdu)) {
 return -EINTR;
 }
-v9fs_co_run_in_worker(
-{
-struct dirent *entry;
+v9fs_co_run_in_worker({
+err = do_readdir(pdu, fidp, dent);
+});
+return err;
+}
+
+/*
+ * This is solely executed on a background IO thread.
+ *
+ * See v9fs_co_readdir_many() (as its only user) below for details.
+ */
+static int do_readdir_many(V9fsPDU *pdu, V9fsFidState *fidp,
+ struct V9fsDirEnt **entries,
+ int32_t maxsize, bool dostat)
+{
+V9fsState *s = pdu->s;
+V9fsString name;
+int len, err = 0;
+int32_t size = 0;
+off_t saved_dir_pos;
+struct dirent *dent;
+struct V9fsDirEnt *e = NULL;
+V9fsPath path;
+struct stat stbuf;
 
-errno = 0;
-entry = s->ops->readdir(>ctx, >fs);
-if (!entry && errno) {
+*entries = NULL;
+v9fs_path_init(&path);
+
+/*
+ * TODO: Here should be a warn_report_once() if lock failed.
+ *
+ * With a good 9p client we should not get into concurrency here,
+ * because a good client would not use the same fid for concurrent
+ * requests. We do the lock here for safety reasons though. However
+ * the client would then suffer performance issues, so better log that
+ * issue here.
+ */
+v9fs_readdir_lock(&fidp->fs.dir);
+
+/* save the directory position */
+saved_dir_pos = s->ops->telldir(&s->ctx, &fidp->fs);
+if (saved_dir_pos < 0) {
+err = saved_dir_pos;
+goto out;
+}
+
+while (true) {
+/* get directory entry from fs driver */
+err = do_readdir(pdu, fidp, &dent);
+if (err || !dent) {
+break;
+}
+
+/*
+ * stop this loop as soon as it would exceed the allowed maximum
+ * response message size for the directory entries collected so far,
+ * because anything beyond that size would need to be discarded by
+ * 9p controller (main thread / top 

Re: [PULL 0/6] Linux user for 5.0 patches

2020-03-26 Thread Peter Maydell
On Thu, 26 Mar 2020 at 16:43, Laurent Vivier  wrote:
>
> Le 26/03/2020 à 16:42, Peter Maydell a écrit :
> > On Thu, 26 Mar 2020 at 07:24, Laurent Vivier  wrote:
> >>
> >> The following changes since commit 
> >> 736cf607e40674776d752acc201f565723e86045:
> >>
> >>   Update version for v5.0.0-rc0 release (2020-03-24 17:50:00 +)
> >>
> >> are available in the Git repository at:
> >>
> >>   git://github.com/vivier/qemu.git tags/linux-user-for-5.0-pull-request
> >>
> >> for you to fetch changes up to a52f5f87bece827a338d6eb3332e3def86fb9c33:
> >>
> >>   linux-user: Flush out implementation of gettimeofday (2020-03-26 
> >> 08:08:54 +0100)
> >>
> >> 
> >> Emulate x86_64 vsyscalls
> >> Fix syscall_nr.h cleanup
> >>
> >> 
> >
> > Still fails :-(
>
> I would say it was expected...
>
> I think your build dir is corrupted by a previous build.
>
> You should have old .o file without .d file, and thus the .o file is not
> refreshed (check the date of cpu_loop.o). We cannot cleanup that before
> starting the build. The purpose of the cleanup in configure was to avoid
> this kind of problem but I did it badly.
>
> If you want to check incremental build, cleanup your build dir, checkout
> v4.20 or master, build it, and then build the PR branch. it will work:
> it's tested.

Seems like it. I blew away and recreated the build tree, did a build
with master, and then applied this pullreq with my usual process
(so incremental build from there), and it was fine.

Applied, thanks.

Please update the changelog at https://wiki.qemu.org/ChangeLog/5.0
for any user-visible changes.

-- PMM



Re: [PATCH-for-5.0 12/12] hw/riscv/sifive_u: Add missing error-propagation code

2020-03-26 Thread Peter Maydell
On Wed, 25 Mar 2020 at 19:19, Philippe Mathieu-Daudé  wrote:
>
> Running the coccinelle script produced:
>
>   $ spatch \
> --macro-file scripts/cocci-macro-file.h --include-headers \
> --sp-file 
> scripts/coccinelle/object_property_missing_error_propagate.cocci \
> --keep-comments --smpl-spacing --dir hw
>
>   [[manual check required: error_propagate() might be missing in 
> object_property_set_bool() hw/riscv/sifive_u.c:558:4]]
>   [[manual check required: error_propagate() might be missing in 
> object_property_set_bool() hw/riscv/sifive_u.c:561:4]]
>
> Add the missing error_propagate() after manual review.
>
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
>  hw/riscv/sifive_u.c | 8 
>  1 file changed, 8 insertions(+)
>
> diff --git a/hw/riscv/sifive_u.c b/hw/riscv/sifive_u.c
> index 56351c4faa..01e44018cd 100644
> --- a/hw/riscv/sifive_u.c
> +++ b/hw/riscv/sifive_u.c
> @@ -473,113 +473,121 @@ static void 
> riscv_sifive_u_machine_instance_init(Object *obj)
>  static void riscv_sifive_u_soc_realize(DeviceState *dev, Error **errp)
>  {
>  MachineState *ms = MACHINE(qdev_get_machine());
>  SiFiveUSoCState *s = RISCV_U_SOC(dev);
>  const struct MemmapEntry *memmap = sifive_u_memmap;
>  MemoryRegion *system_memory = get_system_memory();
>  MemoryRegion *mask_rom = g_new(MemoryRegion, 1);
>  MemoryRegion *l2lim_mem = g_new(MemoryRegion, 1);
>  qemu_irq plic_gpios[SIFIVE_U_PLIC_NUM_SOURCES];
>  char *plic_hart_config;
>  size_t plic_hart_config_len;
>  int i;
>  Error *err = NULL;
>  NICInfo *nd = _table[0];
>
>  object_property_set_bool(OBJECT(>e_cpus), true, "realized",
>   _abort);
>  object_property_set_bool(OBJECT(>u_cpus), true, "realized",
>   _abort);
>  /*
>   * The cluster must be realized after the RISC-V hart array container,
>   * as the container's CPU object is only created on realize, and the
>   * CPU must exist and have been parented into the cluster before the
>   * cluster is realized.
>   */
>  object_property_set_bool(OBJECT(>e_cluster), true, "realized",
>   _abort);
>  object_property_set_bool(OBJECT(>u_cluster), true, "realized",
>   _abort);

Different bug noticed in passing: these really ought not to be
using error_abort to realize things, as realize is a fairly
likely-to-fail operation on most objects (either now or in
the future if the object implementation changes).

>
>  /* boot rom */
>  memory_region_init_rom(mask_rom, OBJECT(dev), "riscv.sifive.u.mrom",
> memmap[SIFIVE_U_MROM].size, _fatal);
>  memory_region_add_subregion(system_memory, memmap[SIFIVE_U_MROM].base,
>  mask_rom);

>  object_property_set_bool(OBJECT(>prci), true, "realized", );
> +if (err) {
> +error_propagate(errp, err);
> +return;
> +}
>  sysbus_mmio_map(SYS_BUS_DEVICE(>prci), 0, memmap[SIFIVE_U_PRCI].base);
>
>  object_property_set_bool(OBJECT(>otp), true, "realized", );
> +if (err) {
> +error_propagate(errp, err);
> +return;
> +}

The changes made in this patch are fine though:
Reviewed-by: Peter Maydell 

thanks
-- PMM



Re: [PATCH v2] hw/arm/collie: Put StrongARMState* into a CollieMachineState struct

2020-03-26 Thread Richard Henderson
On 3/26/20 1:49 PM, Peter Maydell wrote:
> Coverity complains that the collie_init() function leaks the memory
> allocated in sa1110_init().  This is true but not significant since
> the function is called only once on machine init and the memory must
> remain in existence until QEMU exits anyway.
> 
> Still, we can avoid the technical memory leak by keeping the pointer
> to the StrongARMState inside the machine state struct.  Switch from
> the simple DEFINE_MACHINE() style to defining a subclass of
> TYPE_MACHINE which extends the MachineState struct, and keep the
> pointer there.
> 
> Fixes: CID 1421921
> Signed-off-by: Peter Maydell 
> ---
> v1->v2: folded in the uncommitted change that fixes the
> arm_load_kernel() first argument.

Reviewed-by: Richard Henderson 

r~



Re: [PATCH-for-5.0 11/12] hw/net/xilinx_axienet: Add missing error-propagation code

2020-03-26 Thread Peter Maydell
On Wed, 25 Mar 2020 at 19:19, Philippe Mathieu-Daudé  wrote:
>
> Running the coccinelle script produced:
>
>   $ spatch \
> --macro-file scripts/cocci-macro-file.h --include-headers \
> --sp-file 
> scripts/coccinelle/object_property_missing_error_propagate.cocci \
> --keep-comments --smpl-spacing --dir hw
>
>   [[manual check required: error_propagate() might be missing in 
> object_property_set_link() hw/net/xilinx_axienet.c:969:4]]
>
> Add the missing error_propagate() after manual review.
>
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
>  hw/net/xilinx_axienet.c | 3 +++
>  1 file changed, 3 insertions(+)
>
> diff --git a/hw/net/xilinx_axienet.c b/hw/net/xilinx_axienet.c
> index 704788811a..f11510a71e 100644
> --- a/hw/net/xilinx_axienet.c
> +++ b/hw/net/xilinx_axienet.c
> @@ -948,39 +948,42 @@ static NetClientInfo net_xilinx_enet_info = {
>  static void xilinx_enet_realize(DeviceState *dev, Error **errp)
>  {
>  XilinxAXIEnet *s = XILINX_AXI_ENET(dev);
>  XilinxAXIEnetStreamSlave *ds = 
> XILINX_AXI_ENET_DATA_STREAM(>rx_data_dev);
>  XilinxAXIEnetStreamSlave *cs = XILINX_AXI_ENET_CONTROL_STREAM(
>  
> >rx_control_dev);
>  Error *local_err = NULL;
>
>  object_property_add_link(OBJECT(ds), "enet", "xlnx.axi-ethernet",
>   (Object **) >enet,
>   object_property_allow_set_link,
>   OBJ_PROP_LINK_STRONG,
>   _err);

Again, Coccinelle seems to have not spotted the missing error check here...

>  object_property_add_link(OBJECT(cs), "enet", "xlnx.axi-ethernet",
>   (Object **) >enet,
>   object_property_allow_set_link,
>   OBJ_PROP_LINK_STRONG,
>   _err);
>  if (local_err) {
>  goto xilinx_enet_realize_fail;
>  }
>  object_property_set_link(OBJECT(ds), OBJECT(s), "enet", _err);
> +if (local_err) {
> +goto xilinx_enet_realize_fail;
> +}
>  object_property_set_link(OBJECT(cs), OBJECT(s), "enet", _err);
>  if (local_err) {
>  goto xilinx_enet_realize_fail;
>  }

thanks
-- PMM



Re: [PATCH-for-5.0 10/12] hw/misc/macio/macio: Add missing error-propagation code

2020-03-26 Thread Peter Maydell
On Wed, 25 Mar 2020 at 19:19, Philippe Mathieu-Daudé  wrote:
>
> Running the coccinelle script produced:
>
>   $ spatch \
> --macro-file scripts/cocci-macro-file.h --include-headers \
> --sp-file 
> scripts/coccinelle/object_property_missing_error_propagate.cocci \
> --keep-comments --smpl-spacing --dir hw
>
>   [[manual check required: error_propagate() might be missing in 
> object_property_set_bool() hw/misc/macio/macio.c:350:8]]
>
> Add the missing error_propagate() after manual review.
>
> Signed-off-by: Philippe Mathieu-Daudé 
> ---

Reviewed-by: Peter Maydell 

thanks
-- PMM



Re: [PATCH-for-5.0 09/12] hw/mips/mips_malta: Add missing error-propagation code

2020-03-26 Thread Peter Maydell
On Wed, 25 Mar 2020 at 19:19, Philippe Mathieu-Daudé  wrote:
>
> Running the coccinelle script produced:
>
>   $ spatch \
> --macro-file scripts/cocci-macro-file.h --include-headers \
> --sp-file 
> scripts/coccinelle/object_property_missing_error_propagate.cocci \
> --keep-comments --smpl-spacing --dir hw
>
>   [[manual check required: error_propagate() might be missing in 
> object_property_set_int() hw/mips/mips_malta.c:1193:4]]
>   [[manual check required: error_propagate() might be missing in 
> object_property_set_str() hw/mips/mips_malta.c:1192:4]]
>
> Add the missing error_propagate() after manual review by adding
> a Error* parameter to create_cps().
>
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
>  hw/mips/mips_malta.c | 19 ++-
>  1 file changed, 14 insertions(+), 5 deletions(-)


Reviewed-by: Peter Maydell 

thanks
-- PMM



Re: [PATCH-for-5.0 08/12] hw/mips/boston: Add missing error-propagation code

2020-03-26 Thread Peter Maydell
On Wed, 25 Mar 2020 at 19:18, Philippe Mathieu-Daudé  wrote:
>
> Running the coccinelle script produced:
>
>   $ spatch \
> --macro-file scripts/cocci-macro-file.h --include-headers \
> --sp-file 
> scripts/coccinelle/object_property_missing_error_propagate.cocci \
> --keep-comments --smpl-spacing --dir hw
>
>   [[manual check required: error_propagate() might be missing in 
> object_property_set_int() hw/mips/boston.c:462:4]]
>   [[manual check required: error_propagate() might be missing in 
> object_property_set_str() hw/mips/boston.c:460:4]]
>
> Since the uses are inside a MachineClass::init() function,
> directly use _fatal instead of error_propagate().
>
> Signed-off-by: Philippe Mathieu-Daudé 

Reviewed-by: Peter Maydell 

thanks
-- PMM



Re: [PATCH-for-5.0 06/12] hw/dma/xilinx_axidma: Add missing error-propagation code

2020-03-26 Thread Peter Maydell
On Wed, 25 Mar 2020 at 19:18, Philippe Mathieu-Daudé  wrote:
>
> Running the coccinelle script produced:
>
>   $ spatch \
> --macro-file scripts/cocci-macro-file.h --include-headers \
> --sp-file 
> scripts/coccinelle/object_property_missing_error_propagate.cocci \
> --keep-comments --smpl-spacing --dir hw
>
>   [[manual check required: error_propagate() might be missing in 
> object_property_set_link() hw//dma/xilinx_axidma.c:542:4]]
>
> Add the missing error_propagate() after manual review.
>
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
>  hw/dma/xilinx_axidma.c | 3 +++
>  1 file changed, 3 insertions(+)
>
> diff --git a/hw/dma/xilinx_axidma.c b/hw/dma/xilinx_axidma.c
> index 018f36991b..6e3406321c 100644
> --- a/hw/dma/xilinx_axidma.c
> +++ b/hw/dma/xilinx_axidma.c
> @@ -521,39 +521,42 @@ static const MemoryRegionOps axidma_ops = {
>  static void xilinx_axidma_realize(DeviceState *dev, Error **errp)
>  {
>  XilinxAXIDMA *s = XILINX_AXI_DMA(dev);
>  XilinxAXIDMAStreamSlave *ds = 
> XILINX_AXI_DMA_DATA_STREAM(>rx_data_dev);
>  XilinxAXIDMAStreamSlave *cs = XILINX_AXI_DMA_CONTROL_STREAM(
>  
> >rx_control_dev);
>  Error *local_err = NULL;
>
>  object_property_add_link(OBJECT(ds), "dma", TYPE_XILINX_AXI_DMA,
>   (Object **)>dma,
>   object_property_allow_set_link,
>   OBJ_PROP_LINK_STRONG,
>   _err);

Isn't there also a check on local_err missing here ?

>  object_property_add_link(OBJECT(cs), "dma", TYPE_XILINX_AXI_DMA,
>   (Object **)>dma,
>   object_property_allow_set_link,
>   OBJ_PROP_LINK_STRONG,
>   _err);
>  if (local_err) {
>  goto xilinx_axidma_realize_fail;
>  }
>  object_property_set_link(OBJECT(ds), OBJECT(s), "dma", _err);
> +if (local_err) {
> +goto xilinx_axidma_realize_fail;
> +}
>  object_property_set_link(OBJECT(cs), OBJECT(s), "dma", _err);
>  if (local_err) {
>  goto xilinx_axidma_realize_fail;
>  }

thanks
-- PMM



Re: [PATCH v16 Kernel 2/7] vfio iommu: Remove atomicity of ref_count of pinned pages

2020-03-26 Thread Kirti Wankhede




On 3/26/2020 4:19 PM, Cornelia Huck wrote:

On Wed, 25 Mar 2020 01:02:34 +0530
Kirti Wankhede  wrote:


vfio_pfn.ref_count is always updated by holding iommu->lock, using atomic


s/by/while/



Ok.


variable is overkill.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
Reviewed-by: Eric Auger 
---
  drivers/vfio/vfio_iommu_type1.c | 9 +
  1 file changed, 5 insertions(+), 4 deletions(-)



Reviewed-by: Cornelia Huck 



Thanks,
Kirti



Re: [PATCH-for-5.0 04/12] hw/arm/stm32fx05_soc: Add missing error-propagation code

2020-03-26 Thread Peter Maydell
On Wed, 25 Mar 2020 at 19:18, Philippe Mathieu-Daudé  wrote:
>
> Patch created mechanically by running:
>
>   $ spatch \
> --macro-file scripts/cocci-macro-file.h --include-headers \
> --sp-file 
> scripts/coccinelle/object_property_missing_error_propagate.cocci \
> --keep-comments --smpl-spacing --in-place --dir hw
>
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
>  hw/arm/stm32f205_soc.c | 4 
>  hw/arm/stm32f405_soc.c | 4 
>  2 files changed, 8 insertions(+)

Reviewed-by: Peter Maydell 

thanks
-- PMM



Re: [PATCH-for-5.0 07/12] hw/mips/cps: Add missing error-propagation code

2020-03-26 Thread Peter Maydell
On Wed, 25 Mar 2020 at 19:18, Philippe Mathieu-Daudé  wrote:
>
> Patch created mechanically by running:
>
>   $ spatch \
> --macro-file scripts/cocci-macro-file.h --include-headers \
> --sp-file 
> scripts/coccinelle/object_property_missing_error_propagate.cocci \
> --keep-comments --smpl-spacing --in-place --dir hw
>
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
>  hw/mips/cps.c | 52 +++
>  1 file changed, 52 insertions(+)
>

>  /* Inter-Thread Communication Unit */
>  if (itu_present) {
>  sysbus_init_child_obj(OBJECT(dev), "itu", >itu, sizeof(s->itu),
>TYPE_MIPS_ITU);
>  object_property_set_int(OBJECT(>itu), 16, "num-fifo", );
> +if (err) {
> +error_propagate(errp, err);
> +return;
> +}
>  object_property_set_int(OBJECT(>itu), 16, "num-semaphores", );
> +if (err) {
> +error_propagate(errp, err);
> +return;
> +}
>  object_property_set_bool(OBJECT(>itu), saar_present, 
> "saar-present",
>   );
> +if (err) {
> +error_propagate(errp, err);
> +return;
> +}
>  if (saar_present) {
>  s->itu.saar = >CP0_SAAR;
>  }
>  object_property_set_bool(OBJECT(>itu), true, "realized", );
> +if (err) {
> +error_propagate(errp, err);
> +return;
> +}
>  if (err != NULL) {
>  error_propagate(errp, err);
>  return;
>  }

I think Coccinelle has been fooled here by the slightly non-idiomatic
use of "err != NULL" in the guard and has inserted a duplicate
check...

>  memory_region_add_subregion(>container, 0,
> sysbus_mmio_get_region(SYS_BUS_DEVICE(>itu), 
> 0));
>  }
>
>  /* Cluster Power Controller */
>  sysbus_init_child_obj(OBJECT(dev), "cpc", >cpc, sizeof(s->cpc),
>TYPE_MIPS_CPC);
>  object_property_set_int(OBJECT(>cpc), s->num_vp, "num-vp", );
> +if (err) {
> +error_propagate(errp, err);
> +return;
> +}
>  object_property_set_int(OBJECT(>cpc), 1, "vp-start-running", );
> +if (err) {
> +error_propagate(errp, err);
> +return;
> +}
>  object_property_set_bool(OBJECT(>cpc), true, "realized", );
>  if (err != NULL) {
>  error_propagate(errp, err);
>  return;
>  }

...but oddly it gets it right here and in a couple of other cases
in this patch.

thanks
-- PMM



Re: [PATCH v16 Kernel 1/7] vfio: KABI for migration interface for device state

2020-03-26 Thread Kirti Wankhede




On 3/26/2020 4:11 PM, Cornelia Huck wrote:

On Wed, 25 Mar 2020 01:02:33 +0530
Kirti Wankhede  wrote:


- Defined MIGRATION region type and sub-type.

- Defined vfio_device_migration_info structure which will be placed at the
   0th offset of migration region to get/set VFIO device related
   information. Defined members of structure and usage on read/write access.

- Defined device states and state transition details.

- Defined sequence to be followed while saving and resuming VFIO device.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
  include/uapi/linux/vfio.h | 228 ++
  1 file changed, 228 insertions(+)


(...)


+struct vfio_device_migration_info {
+   __u32 device_state; /* VFIO device state */
+#define VFIO_DEVICE_STATE_STOP  (0)
+#define VFIO_DEVICE_STATE_RUNNING   (1 << 0)
+#define VFIO_DEVICE_STATE_SAVING(1 << 1)
+#define VFIO_DEVICE_STATE_RESUMING  (1 << 2)
+#define VFIO_DEVICE_STATE_MASK  (VFIO_DEVICE_STATE_RUNNING | \
+VFIO_DEVICE_STATE_SAVING |  \
+VFIO_DEVICE_STATE_RESUMING)
+
+#define VFIO_DEVICE_STATE_VALID(state) \
+   (state & VFIO_DEVICE_STATE_RESUMING ? \
+   (state & VFIO_DEVICE_STATE_MASK) == VFIO_DEVICE_STATE_RESUMING : 1)
+
+#define VFIO_DEVICE_STATE_IS_ERROR(state) \
+   ((state & VFIO_DEVICE_STATE_MASK) == (VFIO_DEVICE_STATE_SAVING | \
+ VFIO_DEVICE_STATE_RESUMING))
+
+#define VFIO_DEVICE_STATE_SET_ERROR(state) \
+   ((state & ~VFIO_DEVICE_STATE_MASK) | VFIO_DEVICE_SATE_SAVING | \
+VFIO_DEVICE_STATE_RESUMING)
+
+   __u32 reserved;
+   __u64 pending_bytes;
+   __u64 data_offset;
+   __u64 data_size;
+} __attribute__((packed));


The 'packed' should not even be needed, I think?



Right, Above structure is padded properly. Removing it.


+
  /*
   * The MSIX mappable capability informs that MSIX data of a BAR can be mmapped
   * which allows direct access to non-MSIX registers which happened to be 
within


Generally, this looks sane to me; however, we should really have
something under Documentation/ in the long run that describes how this
works, so that you can find out about the protocol without having to
dig through headers.



But the documentation will have almost the same text as in this comment. 
Should we replicate it?


Thanks,
Kirti




Re: [PATCH v16 Kernel 5/7] vfio iommu: Update UNMAP_DMA ioctl to get dirty bitmap before unmap

2020-03-26 Thread Kirti Wankhede




On 3/25/2020 7:48 AM, Yan Zhao wrote:

On Wed, Mar 25, 2020 at 03:32:37AM +0800, Kirti Wankhede wrote:

DMA mapped pages, including those pinned by mdev vendor drivers, might
get unpinned and unmapped while migration is active and device is still
running. For example, in pre-copy phase while guest driver could access
those pages, host device or vendor driver can dirty these mapped pages.
Such pages should be marked dirty so as to maintain memory consistency
for a user making use of dirty page tracking.

To get bitmap during unmap, user should allocate memory for bitmap, set
size of allocated memory, set page size to be considered for bitmap and
set flag VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
  drivers/vfio/vfio_iommu_type1.c | 54 ++---
  include/uapi/linux/vfio.h   | 10 
  2 files changed, 60 insertions(+), 4 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 27ed069c5053..b98a8d79e13a 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -982,7 +982,8 @@ static int verify_bitmap_size(uint64_t npages, uint64_t 
bitmap_size)
  }
  
  static int vfio_dma_do_unmap(struct vfio_iommu *iommu,

-struct vfio_iommu_type1_dma_unmap *unmap)
+struct vfio_iommu_type1_dma_unmap *unmap,
+struct vfio_bitmap *bitmap)
  {
uint64_t mask;
struct vfio_dma *dma, *dma_last = NULL;
@@ -1033,6 +1034,10 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
 * will be returned if these conditions are not met.  The v2 interface
 * will only return success and a size of zero if there were no
 * mappings within the range.
+*
+* When VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP flag is set, unmap request
+* must be for single mapping. Multiple mappings with this flag set is
+* not supported.
 */
if (iommu->v2) {
dma = vfio_find_dma(iommu, unmap->iova, 1);
@@ -1040,6 +1045,13 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
ret = -EINVAL;
goto unlock;
}
+
+   if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
+   (dma->iova != unmap->iova || dma->size != unmap->size)) {

potential NULL pointer!

And could you address the comments in v14?
How to handle DSI unmaps in vIOMMU
(https://lore.kernel.org/kvm/20200323011041.GB5456@joy-OptiPlex-7040/)



Sorry, I drafted reply to it, but I missed to send, it remained in my drafts

>
> it happens in vIOMMU Domain level invalidation of IOTLB
> (domain-selective invalidation, see vtd_iotlb_domain_invalidate() in 
qemu).

> common in VTD lazy mode, and NOT just happening once at boot time.
> rather than invalidate page by page, it batches the page invalidation.
> so, when this invalidation takes place, even higher level page tables
> have been invalid and therefore it has to invalidate a bigger 
combined range.

> That's why we see IOVAs are mapped in 4k pages, but are unmapped in 2M
> pages.
>
> I think those UNMAPs should also have GET_DIRTY_BIMTAP flag on, right?


vtd_iotlb_domain_invalidate()
  vtd_sync_shadow_page_table()
vtd_sync_shadow_page_table_range(vtd_as, , 0, UINT64_MAX)
  vtd_page_walk()
vtd_page_walk_level() - walk over specific level for IOVA range
  vtd_page_walk_one()
memory_region_notify_iommu()
...
  vfio_iommu_map_notify()

In the above trace, isn't page walk will take care of creating proper 
IOTLB entry which should be same as created during mapping for that 
IOTLB entry?



>>>
>>> Such unmap would callback vfio_iommu_map_notify() in QEMU. In
>>> vfio_iommu_map_notify(), unmap is called on same range <iotlb->iova,
>>> iotlb->iova + iotlb->addr_mask + 1> which was used for map. Secondly unmap with 
bitmap

>>> will be called only when device state has _SAVING flag set.
>>
> in this case, iotlb->addr_mask in unmap is 0x20 -1.
> different than 0x1000 -1 used for map.
>> It might be helpful for Yan, and everyone else, to see the latest QEMU
>> patch series.  Thanks,
>>
> yes, please. also curious of log_sync part for vIOMMU. given most 
IOVAs in

> address space are unmapped and therefore no IOTLBs are able to be found.
>

Qemu patches compatible with v16 version are at:
https://www.mail-archive.com/qemu-devel@nongnu.org/msg691806.html

Hope that helps.

Thanks,
Kirti




Re: [PATCH v15 Kernel 1/7] vfio: KABI for migration interface for device state

2020-03-26 Thread Kirti Wankhede




On 3/26/2020 3:03 PM, Christoph Hellwig wrote:

s/KABI/UAPI/ in the subject and anywhere else in the series.



Ok.


Please avoid __packed__ structures and just properly pad them, they
have a major performance impact on some platforms and will cause
compiler warnings when taking addresses of members.



Yes, removing it.

Thanks,
Kirti



Re: [PATCH v16 Kernel 4/7] vfio iommu: Implementation of ioctl for dirty pages tracking.

2020-03-26 Thread Kirti Wankhede




On 3/25/2020 7:41 AM, Yan Zhao wrote:

On Wed, Mar 25, 2020 at 05:18:52AM +0800, Kirti Wankhede wrote:

VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
- Start dirty pages tracking while migration is active
- Stop dirty pages tracking.
- Get dirty pages bitmap. Its user space application's responsibility to
   copy content of dirty pages from source to destination during migration.

To prevent DoS attack, memory for bitmap is allocated per vfio_dma
structure. Bitmap size is calculated considering smallest supported page
size. Bitmap is allocated for all vfio_dmas when dirty logging is enabled

Bitmap is populated for already pinned pages when bitmap is allocated for
a vfio_dma with the smallest supported page size. Update bitmap from
pinning functions when tracking is enabled. When user application queries
bitmap, check if requested page size is same as page size used to
populated bitmap. If it is equal, copy bitmap, but if not equal, return
error.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
  drivers/vfio/vfio_iommu_type1.c | 266 +++-
  1 file changed, 260 insertions(+), 6 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 70aeab921d0f..874a1a7ae925 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -71,6 +71,7 @@ struct vfio_iommu {
unsigned intdma_avail;
boolv2;
boolnesting;
+   booldirty_page_tracking;
  };
  
  struct vfio_domain {

@@ -91,6 +92,7 @@ struct vfio_dma {
boollock_cap;   /* capable(CAP_IPC_LOCK) */
struct task_struct  *task;
struct rb_root  pfn_list;   /* Ex-user pinned pfn list */
+   unsigned long   *bitmap;
  };
  
  struct vfio_group {

@@ -125,7 +127,21 @@ struct vfio_regions {
  #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)   \
(!list_empty(>domain_list))
  
+#define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)

+
+/*
+ * Input argument of number of bits to bitmap_set() is unsigned integer, which
+ * further casts to signed integer for unaligned multi-bit operation,
+ * __bitmap_set().
+ * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte,
+ * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page
+ * system.
+ */
+#define DIRTY_BITMAP_PAGES_MAX (uint64_t)(INT_MAX - 1)
+#define DIRTY_BITMAP_SIZE_MAX   DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
+
  static int put_pfn(unsigned long pfn, int prot);
+static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu);
  
  /*

   * This code handles mapping and unmapping of user data buffers
@@ -175,6 +191,77 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, 
struct vfio_dma *old)
rb_erase(>node, >dma_list);
  }
  
+

+static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, uint64_t pgsize)
+{
+   uint64_t npages = dma->size / pgsize;
+
+   if (npages > DIRTY_BITMAP_PAGES_MAX)
+   return -EINVAL;
+
+   dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages), GFP_KERNEL);
+   if (!dma->bitmap)
+   return -ENOMEM;
+
+   return 0;
+}
+
+static void vfio_dma_bitmap_free(struct vfio_dma *dma)
+{
+   kfree(dma->bitmap);
+   dma->bitmap = NULL;
+}
+
+static void vfio_dma_populate_bitmap(struct vfio_dma *dma, uint64_t pgsize)
+{
+   struct rb_node *p;
+
+   if (RB_EMPTY_ROOT(>pfn_list))
+   return;
+
+   for (p = rb_first(>pfn_list); p; p = rb_next(p)) {
+   struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node);
+
+   bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) / pgsize, 1);
+   }
+}
+
+static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, uint64_t pgsize)
+{
+   struct rb_node *n = rb_first(>dma_list);
+
+   for (; n; n = rb_next(n)) {
+   struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
+   int ret;
+
+   ret = vfio_dma_bitmap_alloc(dma, pgsize);
+   if (ret) {
+   struct rb_node *p = rb_prev(n);
+
+   for (; p; p = rb_prev(p)) {
+   struct vfio_dma *dma = rb_entry(n,
+   struct vfio_dma, node);
+
+   vfio_dma_bitmap_free(dma);
+   }
+   return ret;
+   }
+   vfio_dma_populate_bitmap(dma, pgsize);
+   }
+   return 0;
+}
+
+static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu)
+{
+   struct rb_node *n = rb_first(>dma_list);
+
+   for (; n; n = rb_next(n)) {
+   struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
+
+   vfio_dma_bitmap_free(dma);
+   }
+}
+
  /*
  

  1   2   3   4   >