Re: [RFC v3 18/21] vfio-pci: Add a new VFIO_REGION_TYPE_NESTED region type

2019-01-11 Thread Alex Williamson
On Tue,  8 Jan 2019 11:26:30 +0100
Eric Auger  wrote:

> This patch adds a new 64kB region aiming to report nested mode
> translation faults.
> 
> The region contains a header with the size of the queue,
> the producer and consumer indices and then the actual
> fault queue data. The producer is updated by the kernel while
> the consumer is updated by the userspace.
> 
> Signed-off-by: Eric Auger 
> 
> ---
> ---
>  drivers/vfio/pci/vfio_pci.c | 102 +++-
>  drivers/vfio/pci/vfio_pci_private.h |   2 +
>  include/uapi/linux/vfio.h   |  15 
>  3 files changed, 118 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
> index ff60bd1ea587..2ba181ab2edd 100644
> --- a/drivers/vfio/pci/vfio_pci.c
> +++ b/drivers/vfio/pci/vfio_pci.c
> @@ -56,6 +56,11 @@ module_param(disable_idle_d3, bool, S_IRUGO | S_IWUSR);
>  MODULE_PARM_DESC(disable_idle_d3,
>"Disable using the PCI D3 low power state for idle, unused 
> devices");
>  
> +#define VFIO_FAULT_REGION_SIZE 0x1

Why 64K?

> +#define VFIO_FAULT_QUEUE_SIZE\
> + ((VFIO_FAULT_REGION_SIZE - sizeof(struct vfio_fault_region_header)) / \
> + sizeof(struct iommu_fault))
> +
>  static inline bool vfio_vga_disabled(void)
>  {
>  #ifdef CONFIG_VFIO_PCI_VGA
> @@ -1226,6 +1231,100 @@ static const struct vfio_device_ops vfio_pci_ops = {
>  static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev);
>  static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck);
>  
> +static size_t
> +vfio_pci_dma_fault_rw(struct vfio_pci_device *vdev, char __user *buf,
> +   size_t count, loff_t *ppos, bool iswrite)
> +{
> + unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS;
> + void *base = vdev->region[i].data;
> + loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
> +
> + if (pos >= vdev->region[i].size)
> + return -EINVAL;
> +
> + count = min(count, (size_t)(vdev->region[i].size - pos));
> +
> + if (copy_to_user(buf, base + pos, count))
> + return -EFAULT;
> +
> + *ppos += count;
> +
> + return count;
> +}
> +
> +static int vfio_pci_dma_fault_mmap(struct vfio_pci_device *vdev,
> +struct vfio_pci_region *region,
> +struct vm_area_struct *vma)
> +{
> + u64 phys_len, req_len, pgoff, req_start;
> + unsigned long long addr;
> + unsigned int index;
> +
> + index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
> +
> + if (vma->vm_end < vma->vm_start)
> + return -EINVAL;
> + if ((vma->vm_flags & VM_SHARED) == 0)
> + return -EINVAL;
> +
> + phys_len = VFIO_FAULT_REGION_SIZE;
> +
> + req_len = vma->vm_end - vma->vm_start;
> + pgoff = vma->vm_pgoff &
> + ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
> + req_start = pgoff << PAGE_SHIFT;
> +
> + if (req_start + req_len > phys_len)
> + return -EINVAL;
> +
> + addr = virt_to_phys(vdev->fault_region);
> + vma->vm_private_data = vdev;
> + vma->vm_pgoff = (addr >> PAGE_SHIFT) + pgoff;
> +
> + return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
> +req_len, vma->vm_page_prot);
> +}
> +
> +void vfio_pci_dma_fault_release(struct vfio_pci_device *vdev,
> + struct vfio_pci_region *region)
> +{
> +}
> +
> +static const struct vfio_pci_regops vfio_pci_dma_fault_regops = {
> + .rw = vfio_pci_dma_fault_rw,
> + .mmap   = vfio_pci_dma_fault_mmap,
> + .release= vfio_pci_dma_fault_release,
> +};
> +
> +static int vfio_pci_init_dma_fault_region(struct vfio_pci_device *vdev)
> +{
> + u32 flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE |
> + VFIO_REGION_INFO_FLAG_MMAP;
> + int ret;
> +
> + spin_lock_init(>fault_queue_lock);
> +
> + vdev->fault_region = kmalloc(VFIO_FAULT_REGION_SIZE, GFP_KERNEL);
> + if (!vdev->fault_region)
> + return -ENOMEM;
> +
> + ret = vfio_pci_register_dev_region(vdev,
> + VFIO_REGION_TYPE_NESTED,
> + VFIO_REGION_SUBTYPE_NESTED_FAULT_REGION,
> + _pci_dma_fault_regops, VFIO_FAULT_REGION_SIZE,
> + flags, vdev->fault_region);
> + if (ret) {
> + kfree(vdev->fault_region);
> + return ret;
> + }
> +
> + vdev->fault_region->header.prod = 0;
> + vdev->fault_region->header.cons = 0;
> + vdev->fault_region->header.reserved = 0;

Use kzalloc above or else we're leaking kernel memory to userspace
anyway.

> + vdev->fault_region->header.size = VFIO_FAULT_QUEUE_SIZE;
> + return 0;
> +}
> +
>  static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id 
> *id)
>  {
>   struct vfio_pci_device *vdev;
> @@ -1300,7 +1399,7 @@ static int vfio_pci_probe(struct pci_dev *pdev, const 
> 

Re: [RFC v3 06/21] vfio: VFIO_IOMMU_BIND_MSI

2019-01-11 Thread Alex Williamson
On Fri, 11 Jan 2019 16:02:44 -0700
Alex Williamson  wrote:

> On Tue,  8 Jan 2019 11:26:18 +0100
> Eric Auger  wrote:
> 
> > This patch adds the VFIO_IOMMU_BIND_MSI ioctl which aims at
> > passing the guest MSI binding to the host.
> > 
> > Signed-off-by: Eric Auger 
> > 
> > ---
> > 
> > v2 -> v3:
> > - adapt to new proto of bind_guest_msi
> > - directly use vfio_iommu_for_each_dev
> > 
> > v1 -> v2:
> > - s/vfio_iommu_type1_guest_msi_binding/vfio_iommu_type1_bind_guest_msi
> > ---
> >  drivers/vfio/vfio_iommu_type1.c | 27 +++
> >  include/uapi/linux/vfio.h   |  7 +++
> >  2 files changed, 34 insertions(+)
> > 
> > diff --git a/drivers/vfio/vfio_iommu_type1.c 
> > b/drivers/vfio/vfio_iommu_type1.c
> > index c3ba3f249438..59229f6e2d84 100644
> > --- a/drivers/vfio/vfio_iommu_type1.c
> > +++ b/drivers/vfio/vfio_iommu_type1.c
> > @@ -1673,6 +1673,15 @@ static int vfio_cache_inv_fn(struct device *dev, 
> > void *data)
> > return iommu_cache_invalidate(d, dev, >info);
> >  }
> >  
> > +static int vfio_bind_guest_msi_fn(struct device *dev, void *data)
> > +{
> > +   struct vfio_iommu_type1_bind_guest_msi *ustruct =
> > +   (struct vfio_iommu_type1_bind_guest_msi *)data;
> > +   struct iommu_domain *d = iommu_get_domain_for_dev(dev);
> > +
> > +   return iommu_bind_guest_msi(d, dev, >binding);
> > +}
> > +
> >  static int
> >  vfio_set_pasid_table(struct vfio_iommu *iommu,
> >   struct vfio_iommu_type1_set_pasid_table *ustruct)
> > @@ -1792,6 +1801,24 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
> >   vfio_cache_inv_fn);
> > mutex_unlock(>lock);
> > return ret;
> > +   } else if (cmd == VFIO_IOMMU_BIND_MSI) {
> > +   struct vfio_iommu_type1_bind_guest_msi ustruct;
> > +   int ret;
> > +
> > +   minsz = offsetofend(struct vfio_iommu_type1_bind_guest_msi,
> > +   binding);
> > +
> > +   if (copy_from_user(, (void __user *)arg, minsz))
> > +   return -EFAULT;
> > +
> > +   if (ustruct.argsz < minsz || ustruct.flags)
> > +   return -EINVAL;
> > +
> > +   mutex_lock(>lock);
> > +   ret = vfio_iommu_for_each_dev(iommu, ,
> > + vfio_bind_guest_msi_fn);  
> 
> The vfio_iommu_for_each_dev() interface is fine for invalidation, where
> a partial failure requires no unwind, but it's not sufficiently robust
> here.

Additionally, what happens as devices are added and removed from the
guest?  Are we designing an interface that specifically precludes
hotplug?  Thanks,

Alex
 
> > +   mutex_unlock(>lock);
> > +   return ret;
> > }
> >  
> > return -ENOTTY;
> > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> > index 11a07165e7e1..352e795a93c8 100644
> > --- a/include/uapi/linux/vfio.h
> > +++ b/include/uapi/linux/vfio.h
> > @@ -774,6 +774,13 @@ struct vfio_iommu_type1_cache_invalidate {
> >  };
> >  #define VFIO_IOMMU_CACHE_INVALIDATE  _IO(VFIO_TYPE, VFIO_BASE + 23)
> >  
> > +struct vfio_iommu_type1_bind_guest_msi {
> > +   __u32   argsz;
> > +   __u32   flags;
> > +   struct iommu_guest_msi_binding binding;
> > +};
> > +#define VFIO_IOMMU_BIND_MSI  _IO(VFIO_TYPE, VFIO_BASE + 24)  
> 
> -ENOCOMMENTS  MSIs are setup and torn down, is this only a machine init
> sort of interface?  How does the user un-bind?  Thanks,
> 
> Alex
> 
> > +
> >  /*  Additional API for SPAPR TCE (Server POWERPC) IOMMU  */
> >  
> >  /*  
> 

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [RFC v3 06/21] vfio: VFIO_IOMMU_BIND_MSI

2019-01-11 Thread Alex Williamson
On Tue,  8 Jan 2019 11:26:18 +0100
Eric Auger  wrote:

> This patch adds the VFIO_IOMMU_BIND_MSI ioctl which aims at
> passing the guest MSI binding to the host.
> 
> Signed-off-by: Eric Auger 
> 
> ---
> 
> v2 -> v3:
> - adapt to new proto of bind_guest_msi
> - directly use vfio_iommu_for_each_dev
> 
> v1 -> v2:
> - s/vfio_iommu_type1_guest_msi_binding/vfio_iommu_type1_bind_guest_msi
> ---
>  drivers/vfio/vfio_iommu_type1.c | 27 +++
>  include/uapi/linux/vfio.h   |  7 +++
>  2 files changed, 34 insertions(+)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index c3ba3f249438..59229f6e2d84 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -1673,6 +1673,15 @@ static int vfio_cache_inv_fn(struct device *dev, void 
> *data)
>   return iommu_cache_invalidate(d, dev, >info);
>  }
>  
> +static int vfio_bind_guest_msi_fn(struct device *dev, void *data)
> +{
> + struct vfio_iommu_type1_bind_guest_msi *ustruct =
> + (struct vfio_iommu_type1_bind_guest_msi *)data;
> + struct iommu_domain *d = iommu_get_domain_for_dev(dev);
> +
> + return iommu_bind_guest_msi(d, dev, >binding);
> +}
> +
>  static int
>  vfio_set_pasid_table(struct vfio_iommu *iommu,
> struct vfio_iommu_type1_set_pasid_table *ustruct)
> @@ -1792,6 +1801,24 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
> vfio_cache_inv_fn);
>   mutex_unlock(>lock);
>   return ret;
> + } else if (cmd == VFIO_IOMMU_BIND_MSI) {
> + struct vfio_iommu_type1_bind_guest_msi ustruct;
> + int ret;
> +
> + minsz = offsetofend(struct vfio_iommu_type1_bind_guest_msi,
> + binding);
> +
> + if (copy_from_user(, (void __user *)arg, minsz))
> + return -EFAULT;
> +
> + if (ustruct.argsz < minsz || ustruct.flags)
> + return -EINVAL;
> +
> + mutex_lock(>lock);
> + ret = vfio_iommu_for_each_dev(iommu, ,
> +   vfio_bind_guest_msi_fn);

The vfio_iommu_for_each_dev() interface is fine for invalidation, where
a partial failure requires no unwind, but it's not sufficiently robust
here.

> + mutex_unlock(>lock);
> + return ret;
>   }
>  
>   return -ENOTTY;
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 11a07165e7e1..352e795a93c8 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -774,6 +774,13 @@ struct vfio_iommu_type1_cache_invalidate {
>  };
>  #define VFIO_IOMMU_CACHE_INVALIDATE  _IO(VFIO_TYPE, VFIO_BASE + 23)
>  
> +struct vfio_iommu_type1_bind_guest_msi {
> + __u32   argsz;
> + __u32   flags;
> + struct iommu_guest_msi_binding binding;
> +};
> +#define VFIO_IOMMU_BIND_MSI  _IO(VFIO_TYPE, VFIO_BASE + 24)

-ENOCOMMENTS  MSIs are setup and torn down, is this only a machine init
sort of interface?  How does the user un-bind?  Thanks,

Alex

> +
>  /*  Additional API for SPAPR TCE (Server POWERPC) IOMMU  */
>  
>  /*

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [RFC v3 04/21] vfio: VFIO_IOMMU_SET_PASID_TABLE

2019-01-11 Thread Alex Williamson
On Tue,  8 Jan 2019 11:26:16 +0100
Eric Auger  wrote:

> From: "Liu, Yi L" 
> 
> This patch adds VFIO_IOMMU_SET_PASID_TABLE ioctl which aims at
> passing the virtual iommu guest configuration to the VFIO driver
> downto to the iommu subsystem.
> 
> Signed-off-by: Jacob Pan 
> Signed-off-by: Liu, Yi L 
> Signed-off-by: Eric Auger 
> 
> ---
> v2 -> v3:
> - s/BIND_PASID_TABLE/SET_PASID_TABLE
> 
> v1 -> v2:
> - s/BIND_GUEST_STAGE/BIND_PASID_TABLE
> - remove the struct device arg
> ---
>  drivers/vfio/vfio_iommu_type1.c | 31 +++
>  include/uapi/linux/vfio.h   |  8 
>  2 files changed, 39 insertions(+)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 7651cfb14836..d9dd23f64f00 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -1644,6 +1644,24 @@ static int vfio_domains_have_iommu_cache(struct 
> vfio_iommu *iommu)
>   return ret;
>  }
>  
> +static int
> +vfio_set_pasid_table(struct vfio_iommu *iommu,
> +   struct vfio_iommu_type1_set_pasid_table *ustruct)
> +{
> + struct vfio_domain *d;
> + int ret = 0;
> +
> + mutex_lock(>lock);
> +
> + list_for_each_entry(d, >domain_list, next) {
> + ret = iommu_set_pasid_table(d->domain, >config);
> + if (ret)
> + break;
> + }

There's no unwind on failure here, leaves us in an inconsistent state
should something go wrong or domains don't have homogeneous PASID
support.  What's expected to happen if a PASID table is already set for
a domain, does it replace the old one or return -EBUSY?

> + mutex_unlock(>lock);
> + return ret;
> +}
> +
>  static long vfio_iommu_type1_ioctl(void *iommu_data,
>  unsigned int cmd, unsigned long arg)
>  {
> @@ -1714,6 +1732,19 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
>  
>   return copy_to_user((void __user *)arg, , minsz) ?
>   -EFAULT : 0;
> + } else if (cmd == VFIO_IOMMU_SET_PASID_TABLE) {
> + struct vfio_iommu_type1_set_pasid_table ustruct;
> +
> + minsz = offsetofend(struct vfio_iommu_type1_set_pasid_table,
> + config);
> +
> + if (copy_from_user(, (void __user *)arg, minsz))
> + return -EFAULT;
> +
> + if (ustruct.argsz < minsz || ustruct.flags)
> + return -EINVAL;
> +
> + return vfio_set_pasid_table(iommu, );
>   }
>  
>   return -ENOTTY;
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 02bb7ad6e986..0d9f4090c95d 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -14,6 +14,7 @@
>  
>  #include 
>  #include 
> +#include 
>  
>  #define VFIO_API_VERSION 0
>  
> @@ -759,6 +760,13 @@ struct vfio_iommu_type1_dma_unmap {
>  #define VFIO_IOMMU_ENABLE_IO(VFIO_TYPE, VFIO_BASE + 15)
>  #define VFIO_IOMMU_DISABLE   _IO(VFIO_TYPE, VFIO_BASE + 16)
>  
> +struct vfio_iommu_type1_set_pasid_table {
> + __u32   argsz;
> + __u32   flags;
> + struct iommu_pasid_table_config config;
> +};
> +#define VFIO_IOMMU_SET_PASID_TABLE   _IO(VFIO_TYPE, VFIO_BASE + 22)

-ENOCOMMENTS  Thanks,

Alex

> +
>  /*  Additional API for SPAPR TCE (Server POWERPC) IOMMU  */
>  
>  /*

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [RFC v3 03/21] iommu: Introduce bind_guest_msi

2019-01-11 Thread Alex Williamson
On Tue,  8 Jan 2019 11:26:15 +0100
Eric Auger  wrote:

> On ARM, MSI are translated by the SMMU. An IOVA is allocated
> for each MSI doorbell. If both the host and the guest are exposed
> with SMMUs, we end up with 2 different IOVAs allocated by each.
> guest allocates an IOVA (gIOVA) to map onto the guest MSI
> doorbell (gDB). The Host allocates another IOVA (hIOVA) to map
> onto the physical doorbell (hDB).
> 
> So we end up with 2 untied mappings:
>  S1S2
> gIOVA->gDB
>   hIOVA->gDB
   ^^^ hDB

> Currently the PCI device is programmed by the host with hIOVA
> as MSI doorbell. So this does not work.
> 
> This patch introduces an API to pass gIOVA/gDB to the host so
> that gIOVA can be reused by the host instead of re-allocating
> a new IOVA. So the goal is to create the following nested mapping:
> 
>  S1S2
> gIOVA->gDB ->hDB
> 
> and program the PCI device with gIOVA MSI doorbell.
> 
> Signed-off-by: Eric Auger 
> 
> ---
> 
> v2 -> v3:
> - add a struct device handle
> ---
>  drivers/iommu/iommu.c  | 10 ++
>  include/linux/iommu.h  | 13 +
>  include/uapi/linux/iommu.h |  6 ++
>  3 files changed, 29 insertions(+)
> 
> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> index b2e248770508..ea11442e7054 100644
> --- a/drivers/iommu/iommu.c
> +++ b/drivers/iommu/iommu.c
> @@ -1431,6 +1431,16 @@ static void __iommu_detach_device(struct iommu_domain 
> *domain,
>   trace_detach_device_from_domain(dev);
>  }
>  
> +int iommu_bind_guest_msi(struct iommu_domain *domain, struct device *dev,
> +  struct iommu_guest_msi_binding *binding)
> +{
> + if (unlikely(!domain->ops->bind_guest_msi))
> + return -ENODEV;
> +
> + return domain->ops->bind_guest_msi(domain, dev, binding);
> +}
> +EXPORT_SYMBOL_GPL(iommu_bind_guest_msi);
> +
>  void iommu_detach_device(struct iommu_domain *domain, struct device *dev)
>  {
>   struct iommu_group *group;
> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
> index 96d59886f230..244c1a3d5989 100644
> --- a/include/linux/iommu.h
> +++ b/include/linux/iommu.h
> @@ -235,6 +235,9 @@ struct iommu_ops {
>   int (*cache_invalidate)(struct iommu_domain *domain, struct device *dev,
>   struct iommu_cache_invalidate_info *inv_info);
>  
> + int (*bind_guest_msi)(struct iommu_domain *domain, struct device *dev,
> +   struct iommu_guest_msi_binding *binding);
> +
>   unsigned long pgsize_bitmap;
>  };
>  
> @@ -301,6 +304,9 @@ extern int iommu_set_pasid_table(struct iommu_domain 
> *domain,
>  extern int iommu_cache_invalidate(struct iommu_domain *domain,
>   struct device *dev,
>   struct iommu_cache_invalidate_info *inv_info);
> +extern int iommu_bind_guest_msi(struct iommu_domain *domain, struct device 
> *dev,
> + struct iommu_guest_msi_binding *binding);
> +
>  extern struct iommu_domain *iommu_get_domain_for_dev(struct device *dev);
>  extern struct iommu_domain *iommu_get_dma_domain(struct device *dev);
>  extern int iommu_map(struct iommu_domain *domain, unsigned long iova,
> @@ -724,6 +730,13 @@ iommu_cache_invalidate(struct iommu_domain *domain,
>   return -ENODEV;
>  }
>  
> +static inline
> +int iommu_bind_guest_msi(struct iommu_domain *domain, struct device *dev,
> +  struct iommu_guest_msi_binding *binding)
> +{
> + return -ENODEV;
> +}
> +
>  #endif /* CONFIG_IOMMU_API */
>  
>  #ifdef CONFIG_IOMMU_DEBUGFS
> diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h
> index 4605f5cfac84..f28cd9a1aa96 100644
> --- a/include/uapi/linux/iommu.h
> +++ b/include/uapi/linux/iommu.h
> @@ -142,4 +142,10 @@ struct iommu_cache_invalidate_info {
>   __u64   arch_id;
>   __u64   addr;
>  };
> +
> +struct iommu_guest_msi_binding {
> + __u64   iova;
> + __u64   gpa;
> + __u32   granule;

What's granule?  The size?  This looks a lot like just a stage 1
mapping interface, I can't really figure out from the description how
this matches to any specific MSI mapping.  Zero comments in the code
or headers here about how this is supposed to work.  Thanks,

Alex
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [RFC v3 02/21] iommu: Introduce cache_invalidate API

2019-01-11 Thread Alex Williamson
On Tue,  8 Jan 2019 11:26:14 +0100
Eric Auger  wrote:

> From: "Liu, Yi L" 
> 
> In any virtualization use case, when the first translation stage
> is "owned" by the guest OS, the host IOMMU driver has no knowledge
> of caching structure updates unless the guest invalidation activities
> are trapped by the virtualizer and passed down to the host.
> 
> Since the invalidation data are obtained from user space and will be
> written into physical IOMMU, we must allow security check at various
> layers. Therefore, generic invalidation data format are proposed here,
> model specific IOMMU drivers need to convert them into their own format.
> 
> Signed-off-by: Liu, Yi L 
> Signed-off-by: Jean-Philippe Brucker 
> Signed-off-by: Jacob Pan 
> Signed-off-by: Ashok Raj 
> Signed-off-by: Eric Auger 
> 
> ---
> v1 -> v2:
> - add arch_id field
> - renamed tlb_invalidate into cache_invalidate as this API allows
>   to invalidate context caches on top of IOTLBs
> 
> v1:
> renamed sva_invalidate into tlb_invalidate and add iommu_ prefix in
> header. Commit message reworded.
> ---
>  drivers/iommu/iommu.c  | 14 ++
>  include/linux/iommu.h  | 14 ++
>  include/uapi/linux/iommu.h | 95 ++
>  3 files changed, 123 insertions(+)
> 
> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> index 0f2b7f1fc7c8..b2e248770508 100644
> --- a/drivers/iommu/iommu.c
> +++ b/drivers/iommu/iommu.c
> @@ -1403,6 +1403,20 @@ int iommu_set_pasid_table(struct iommu_domain *domain,
>  }
>  EXPORT_SYMBOL_GPL(iommu_set_pasid_table);
>  
> +int iommu_cache_invalidate(struct iommu_domain *domain, struct device *dev,
> +struct iommu_cache_invalidate_info *inv_info)
> +{
> + int ret = 0;
> +
> + if (unlikely(!domain->ops->cache_invalidate))
> + return -ENODEV;
> +
> + ret = domain->ops->cache_invalidate(domain, dev, inv_info);
> +
> + return ret;
> +}
> +EXPORT_SYMBOL_GPL(iommu_cache_invalidate);
> +
>  static void __iommu_detach_device(struct iommu_domain *domain,
> struct device *dev)
>  {
> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
> index 1da2a2357ea4..96d59886f230 100644
> --- a/include/linux/iommu.h
> +++ b/include/linux/iommu.h
> @@ -186,6 +186,7 @@ struct iommu_resv_region {
>   * @of_xlate: add OF master IDs to iommu grouping
>   * @pgsize_bitmap: bitmap of all possible supported page sizes
>   * @set_pasid_table: set pasid table
> + * @cache_invalidate: invalidate translation caches
>   */
>  struct iommu_ops {
>   bool (*capable)(enum iommu_cap);
> @@ -231,6 +232,9 @@ struct iommu_ops {
>   int (*set_pasid_table)(struct iommu_domain *domain,
>  struct iommu_pasid_table_config *cfg);
>  
> + int (*cache_invalidate)(struct iommu_domain *domain, struct device *dev,
> + struct iommu_cache_invalidate_info *inv_info);
> +
>   unsigned long pgsize_bitmap;
>  };
>  
> @@ -294,6 +298,9 @@ extern void iommu_detach_device(struct iommu_domain 
> *domain,
>   struct device *dev);
>  extern int iommu_set_pasid_table(struct iommu_domain *domain,
>struct iommu_pasid_table_config *cfg);
> +extern int iommu_cache_invalidate(struct iommu_domain *domain,
> + struct device *dev,
> + struct iommu_cache_invalidate_info *inv_info);
>  extern struct iommu_domain *iommu_get_domain_for_dev(struct device *dev);
>  extern struct iommu_domain *iommu_get_dma_domain(struct device *dev);
>  extern int iommu_map(struct iommu_domain *domain, unsigned long iova,
> @@ -709,6 +716,13 @@ int iommu_set_pasid_table(struct iommu_domain *domain,
>  {
>   return -ENODEV;
>  }
> +static inline int
> +iommu_cache_invalidate(struct iommu_domain *domain,
> +struct device *dev,
> +struct iommu_cache_invalidate_info *inv_info)
> +{
> + return -ENODEV;
> +}
>  
>  #endif /* CONFIG_IOMMU_API */
>  
> diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h
> index 7a7cf7a3de7c..4605f5cfac84 100644
> --- a/include/uapi/linux/iommu.h
> +++ b/include/uapi/linux/iommu.h
> @@ -47,4 +47,99 @@ struct iommu_pasid_table_config {
>   };
>  };
>  
> +/**
> + * enum iommu_inv_granularity - Generic invalidation granularity
> + * @IOMMU_INV_GRANU_DOMAIN_ALL_PASID:TLB entries or PASID caches of 
> all
> + *   PASIDs associated with a domain ID
> + * @IOMMU_INV_GRANU_PASID_SEL:   TLB entries or PASID cache 
> associated
> + *   with a PASID and a domain
> + * @IOMMU_INV_GRANU_PAGE_PASID:  TLB entries of selected page 
> range
> + *   within a PASID
> + *
> + * When an invalidation request is passed down to IOMMU to flush translation
> + * caches, it may carry different granularity 

Re: [PATCH v7 10/25] ACPI / APEI: Tell firmware the estatus queue consumed the records

2019-01-11 Thread Borislav Petkov
On Fri, Jan 11, 2019 at 06:09:28PM +, James Morse wrote:
> We can return on ENOENT out earlier, as nothing needs doing in that case. Its
> what the GHES_TO_CLEAR spaghetti is for, we can probably move the ack thing 
> into
> ghes_clear_estatus(), that way that thing means 'I'm done with this memory'.

That actually sounds nice and other code in the kernel already does
that: when a failure has been encountered during reading status, you
free up resources right then and there. No need for passing retvals back
and forth. And this would simplify the spaghetti. Which is something
good(tm) on its own!

Thx.

-- 
Regards/Gruss,
Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH v7 10/25] ACPI / APEI: Tell firmware the estatus queue consumed the records

2019-01-11 Thread Borislav Petkov
On Fri, Jan 11, 2019 at 06:25:21PM +, James Morse wrote:
> We ack it in the corrupt-record case too, because we are done with the
> memory.

Ok, so the only thing that we need to do unconditionally is ACK in order
to free the memory. Or is there an exception to that set of steps in
error handling?

> I think it is. 18.3.2.8 of ACPI v6.2 (search for Generic Hardware Error Source
> version 2", then below the table):
> * OSPM detects error (via interrupt/exception or polling the block status)
> * OSPM copies the error status block
> * OSPM clears the block status field of the error status block
> * OSPM acknowledges the error via Read Ack register
> 
> The ENOENT case is excluded by 'polling the block status'.

Ok, so we signal the absence of an error record with ENOENT.

if (!buf_paddr)
return -ENOENT;

Can that even happen?

Also, in that case, what would happen if we ACK the error anyway? We'd
confuse the firmware?

I sure hope firmware is prepared for spurious ACKs :)

> Unsurprisingly the spec doesn't consider the case that firmware generates
> corrupt records!

You mean the EIO case?

Not surprised at all. But we do not report that record so all good.

Thx.

-- 
Regards/Gruss,
Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


ESORICS 2019 - Call for Workshop Proposals [24th European Symposium on Research in Computer Security, Luxembourg, 2019]

2019-01-11 Thread Publicity Chair ESORICS 2019
[apologies for cross-posting]

==
CALL for WORKSHOP PROPOSALS: ESORICS 2019
24th European Symposium on Research in Computer Security
University of Luxembourg, Luxembourg, September 23-27, 2019

Website: https://esorics2019.uni.lu/workshops/
==

Overview

ESORICS is the annual European research event in Computer Security.
The Symposium started in 1990 and has been held in several European
countries, attracting a wide international audience from both the
academic and industrial communities.

Proposals are solicited for Workshops to be held in conjunction with
ESORICS 2019. A Workshop should aim at providing a forum on emerging
topics of high interest to the security and privacy community.

In the Workshop selection, particular consideration will be paid to:
* Its potential interest for the security and privacy community
* Its novelty with respect to other forums, especially with
  respect to other ESORICS workshops.
* Its likely impact on the target community, including likely
  high participation.

A workshop can be either one day or two days in length.


Important Dates

* Workshop proposals due: February 8, 2019
* Notification of decision: March 15, 2019


Submitting a Workshop Proposal

To submit a proposal send an email to the Workshop Chair with
with the information requested in the application form:
* Joaquin Garcia-Alfaro (jgalf...@ieee.org)

The following application form should be used.
(also available in Word format at http://j.mp/ESORICS19wsh)


==
ESORICS 2019 - WORKSHOP APPLICATION FORM:

- Title of the Workshop:

- Duration (maximum 2 days):

- Draft "Call for Papers", articulating the workshop's scope
  and topics:

- Brief summary and justification for the workshop, including
  anticipated benefits to the ESORICS community:

- Planned activities:

- Expected number of participants:

- Workshop organizers:
* PC Chair(s):
* General Chair(s):

- Workshop deadlines:
* Submission deadline:
* Notification to authors:
* Camera-ready versions:

- Planned publication of Workshop proceedings:

- Data of last three years for the workshop (if applicable):
* Number of submissions:
* Number of accepted papers:
* Number of participants:
* Publication of proceedings:
* Venue (co-location):
==
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [RFC v3 01/21] iommu: Introduce set_pasid_table API

2019-01-11 Thread Alex Williamson
On Tue,  8 Jan 2019 11:26:13 +0100
Eric Auger  wrote:

> From: Jacob Pan 
> 
> In virtualization use case, when a guest is assigned
> a PCI host device, protected by a virtual IOMMU on a guest,
> the physical IOMMU must be programmed to be consistent with
> the guest mappings. If the physical IOMMU supports two
> translation stages it makes sense to program guest mappings
> onto the first stage/level (ARM/VTD terminology) while to host
> owns the stage/level 2.
> 
> In that case, it is mandated to trap on guest configuration
> settings and pass those to the physical iommu driver.
> 
> This patch adds a new API to the iommu subsystem that allows
> to set the pasid table information.
> 
> A generic iommu_pasid_table_config struct is introduced in
> a new iommu.h uapi header. This is going to be used by the VFIO
> user API. We foresee at least two specializations of this struct,
> for PASID table passing and ARM SMMUv3.
> 
> Signed-off-by: Jean-Philippe Brucker 
> Signed-off-by: Liu, Yi L 
> Signed-off-by: Ashok Raj 
> Signed-off-by: Jacob Pan 
> Signed-off-by: Eric Auger 
> 
> ---
> 
> This patch generalizes the API introduced by Jacob & co-authors in
> https://lwn.net/Articles/754331/
> 
> v2 -> v3:
> - replace unbind/bind by set_pasid_table
> - move table pointer and pasid bits in the generic part of the struct
> 
> v1 -> v2:
> - restore the original pasid table name
> - remove the struct device * parameter in the API
> - reworked iommu_pasid_smmuv3
> ---
>  drivers/iommu/iommu.c  | 10 
>  include/linux/iommu.h  | 14 +++
>  include/uapi/linux/iommu.h | 50 ++
>  3 files changed, 74 insertions(+)
>  create mode 100644 include/uapi/linux/iommu.h
> 
> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> index 3ed4db334341..0f2b7f1fc7c8 100644
> --- a/drivers/iommu/iommu.c
> +++ b/drivers/iommu/iommu.c
> @@ -1393,6 +1393,16 @@ int iommu_attach_device(struct iommu_domain *domain, 
> struct device *dev)
>  }
>  EXPORT_SYMBOL_GPL(iommu_attach_device);
>  
> +int iommu_set_pasid_table(struct iommu_domain *domain,
> +   struct iommu_pasid_table_config *cfg)
> +{
> + if (unlikely(!domain->ops->set_pasid_table))
> + return -ENODEV;
> +
> + return domain->ops->set_pasid_table(domain, cfg);
> +}
> +EXPORT_SYMBOL_GPL(iommu_set_pasid_table);
> +
>  static void __iommu_detach_device(struct iommu_domain *domain,
> struct device *dev)
>  {
> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
> index e90da6b6f3d1..1da2a2357ea4 100644
> --- a/include/linux/iommu.h
> +++ b/include/linux/iommu.h
> @@ -25,6 +25,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #define IOMMU_READ   (1 << 0)
>  #define IOMMU_WRITE  (1 << 1)
> @@ -184,6 +185,7 @@ struct iommu_resv_region {
>   * @domain_window_disable: Disable a particular window for a domain
>   * @of_xlate: add OF master IDs to iommu grouping
>   * @pgsize_bitmap: bitmap of all possible supported page sizes
> + * @set_pasid_table: set pasid table
>   */
>  struct iommu_ops {
>   bool (*capable)(enum iommu_cap);
> @@ -226,6 +228,9 @@ struct iommu_ops {
>   int (*of_xlate)(struct device *dev, struct of_phandle_args *args);
>   bool (*is_attach_deferred)(struct iommu_domain *domain, struct device 
> *dev);
>  
> + int (*set_pasid_table)(struct iommu_domain *domain,
> +struct iommu_pasid_table_config *cfg);
> +
>   unsigned long pgsize_bitmap;
>  };
>  
> @@ -287,6 +292,8 @@ extern int iommu_attach_device(struct iommu_domain 
> *domain,
>  struct device *dev);
>  extern void iommu_detach_device(struct iommu_domain *domain,
>   struct device *dev);
> +extern int iommu_set_pasid_table(struct iommu_domain *domain,
> +  struct iommu_pasid_table_config *cfg);
>  extern struct iommu_domain *iommu_get_domain_for_dev(struct device *dev);
>  extern struct iommu_domain *iommu_get_dma_domain(struct device *dev);
>  extern int iommu_map(struct iommu_domain *domain, unsigned long iova,
> @@ -696,6 +703,13 @@ const struct iommu_ops *iommu_ops_from_fwnode(struct 
> fwnode_handle *fwnode)
>   return NULL;
>  }
>  
> +static inline
> +int iommu_set_pasid_table(struct iommu_domain *domain,
> +   struct iommu_pasid_table_config *cfg)
> +{
> + return -ENODEV;
> +}
> +
>  #endif /* CONFIG_IOMMU_API */
>  
>  #ifdef CONFIG_IOMMU_DEBUGFS
> diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h
> new file mode 100644
> index ..7a7cf7a3de7c
> --- /dev/null
> +++ b/include/uapi/linux/iommu.h
> @@ -0,0 +1,50 @@
> +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
> +/*
> + * IOMMU user API definitions
> + *
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 

Re: [PATCH v7 10/25] ACPI / APEI: Tell firmware the estatus queue consumed the records

2019-01-11 Thread James Morse
Hi Boris,

On 11/01/2019 17:45, Borislav Petkov wrote:
> On Fri, Jan 11, 2019 at 10:32:23AM -0500, Tyler Baicar wrote:
>> The kernel would have no way of knowing what to do here.
> 
> What do you mean, there's no way of knowing what to do? It needs to
> clear registers so that the next error can get reported properly.
> 
> Or of the status read failed and it doesn't need to do anything, then it
> shouldn't.

I think we're speaking at cross-purposes. If the error-detecting-hardware has
some state, that's firmware's problem to deal with.
What we're dealing with here is the memory we read the error records from.


> Whatever it is, the kernel either needs to do something in the error
> case to clean up, or nothing if the firmware doesn't need anything done
> in the error case; *or* ack the error in the success case.

We ack it in the corrupt-record case too, because we are done with the memory.


> This should all be written down somewhere in that GHES v2
> spec/doc/writeup whatever, explaining what the OS is supposed to do to
> signal the error has been read by the OS.

I think it is. 18.3.2.8 of ACPI v6.2 (search for Generic Hardware Error Source
version 2", then below the table):
* OSPM detects error (via interrupt/exception or polling the block status)
* OSPM copies the error status block
* OSPM clears the block status field of the error status block
* OSPM acknowledges the error via Read Ack register

The ENOENT case is excluded by 'polling the block status'.
Unsurprisingly the spec doesn't consider the case that firmware generates
corrupt records!


Thanks,

James
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [RFC v3 01/21] iommu: Introduce set_pasid_table API

2019-01-11 Thread Jean-Philippe Brucker
On 08/01/2019 10:26, Eric Auger wrote:
> From: Jacob Pan 
> 
> In virtualization use case, when a guest is assigned
> a PCI host device, protected by a virtual IOMMU on a guest,
> the physical IOMMU must be programmed to be consistent with
> the guest mappings. If the physical IOMMU supports two
> translation stages it makes sense to program guest mappings
> onto the first stage/level (ARM/VTD terminology) while to host
> owns the stage/level 2.
> 
> In that case, it is mandated to trap on guest configuration
> settings and pass those to the physical iommu driver.
> 
> This patch adds a new API to the iommu subsystem that allows
> to set the pasid table information.
> 
> A generic iommu_pasid_table_config struct is introduced in
> a new iommu.h uapi header. This is going to be used by the VFIO
> user API. We foresee at least two specializations of this struct,
> for PASID table passing and ARM SMMUv3.

Last sentence is a bit confusing. With SMMUv3 it is also used for the
PASID table, even when it only has one entry and PASID is disabled.

> Signed-off-by: Jean-Philippe Brucker 
> Signed-off-by: Liu, Yi L 
> Signed-off-by: Ashok Raj 
> Signed-off-by: Jacob Pan 
> Signed-off-by: Eric Auger 
> 
> ---
> 
> This patch generalizes the API introduced by Jacob & co-authors in
> https://lwn.net/Articles/754331/
> 
> v2 -> v3:
> - replace unbind/bind by set_pasid_table
> - move table pointer and pasid bits in the generic part of the struct
> 
> v1 -> v2:
> - restore the original pasid table name
> - remove the struct device * parameter in the API
> - reworked iommu_pasid_smmuv3
> ---
>  drivers/iommu/iommu.c  | 10 
>  include/linux/iommu.h  | 14 +++
>  include/uapi/linux/iommu.h | 50 ++
>  3 files changed, 74 insertions(+)
>  create mode 100644 include/uapi/linux/iommu.h
> 
> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> index 3ed4db334341..0f2b7f1fc7c8 100644
> --- a/drivers/iommu/iommu.c
> +++ b/drivers/iommu/iommu.c
> @@ -1393,6 +1393,16 @@ int iommu_attach_device(struct iommu_domain *domain, 
> struct device *dev)
>  }
>  EXPORT_SYMBOL_GPL(iommu_attach_device);
>  
> +int iommu_set_pasid_table(struct iommu_domain *domain,
> +   struct iommu_pasid_table_config *cfg)
> +{
> + if (unlikely(!domain->ops->set_pasid_table))
> + return -ENODEV;
> +
> + return domain->ops->set_pasid_table(domain, cfg);
> +}
> +EXPORT_SYMBOL_GPL(iommu_set_pasid_table);
> +
>  static void __iommu_detach_device(struct iommu_domain *domain,
> struct device *dev)
>  {
> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
> index e90da6b6f3d1..1da2a2357ea4 100644
> --- a/include/linux/iommu.h
> +++ b/include/linux/iommu.h
> @@ -25,6 +25,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #define IOMMU_READ   (1 << 0)
>  #define IOMMU_WRITE  (1 << 1)
> @@ -184,6 +185,7 @@ struct iommu_resv_region {
>   * @domain_window_disable: Disable a particular window for a domain
>   * @of_xlate: add OF master IDs to iommu grouping
>   * @pgsize_bitmap: bitmap of all possible supported page sizes
> + * @set_pasid_table: set pasid table
>   */
>  struct iommu_ops {
>   bool (*capable)(enum iommu_cap);
> @@ -226,6 +228,9 @@ struct iommu_ops {
>   int (*of_xlate)(struct device *dev, struct of_phandle_args *args);
>   bool (*is_attach_deferred)(struct iommu_domain *domain, struct device 
> *dev);
>  
> + int (*set_pasid_table)(struct iommu_domain *domain,
> +struct iommu_pasid_table_config *cfg);
> +
>   unsigned long pgsize_bitmap;
>  };
>  
> @@ -287,6 +292,8 @@ extern int iommu_attach_device(struct iommu_domain 
> *domain,
>  struct device *dev);
>  extern void iommu_detach_device(struct iommu_domain *domain,
>   struct device *dev);
> +extern int iommu_set_pasid_table(struct iommu_domain *domain,
> +  struct iommu_pasid_table_config *cfg);
>  extern struct iommu_domain *iommu_get_domain_for_dev(struct device *dev);
>  extern struct iommu_domain *iommu_get_dma_domain(struct device *dev);
>  extern int iommu_map(struct iommu_domain *domain, unsigned long iova,
> @@ -696,6 +703,13 @@ const struct iommu_ops *iommu_ops_from_fwnode(struct 
> fwnode_handle *fwnode)
>   return NULL;
>  }
>  
> +static inline
> +int iommu_set_pasid_table(struct iommu_domain *domain,
> +   struct iommu_pasid_table_config *cfg)
> +{
> + return -ENODEV;
> +}
> +
>  #endif /* CONFIG_IOMMU_API */
>  
>  #ifdef CONFIG_IOMMU_DEBUGFS
> diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h
> new file mode 100644
> index ..7a7cf7a3de7c
> --- /dev/null
> +++ b/include/uapi/linux/iommu.h
> @@ -0,0 +1,50 @@
> +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
> +/*
> + * IOMMU user API definitions
> + *
> + *
> + * This program is 

Re: [PATCH v7 10/25] ACPI / APEI: Tell firmware the estatus queue consumed the records

2019-01-11 Thread James Morse
Hi guys,

On 11/01/2019 15:32, Tyler Baicar wrote:
> On Fri, Jan 11, 2019 at 7:03 AM Borislav Petkov  wrote:
>> On Thu, Jan 10, 2019 at 04:01:27PM -0500, Tyler Baicar wrote:
>>> On Thu, Jan 10, 2019 at 1:23 PM James Morse  wrote:
>>
>> +if (is_hest_type_generic_v2(ghes) && 
>> ghes_ack_error(ghes->generic_v2))
>
> Since ghes_ack_error() is always prepended with this check, you could
> push it down into the function:
>
> ghes_ack_error(ghes)
> ...
>
>   if (!is_hest_type_generic_v2(ghes))
>   return 0;
>
> and simplify the two callsites :)

 Great idea! ...

 .. huh. Turns out for ghes_proc() we discard any errors other than ENOENT 
 from
 ghes_read_estatus() if is_hest_type_generic_v2(). This masks EIO.

 Most of the error sources discard the result, the worst thing I can find is
 ghes_irq_func() will return IRQ_HANDLED, instead of IRQ_NONE when we didn't
 really handle the IRQ. They're registered as SHARED, but I don't have an 
 example
 of what goes wrong next.

 I think this will also stop the spurious handling code kicking in to shut 
 it up
 if its broken and screaming. Unlikely, but not impossible.

[]

>>> Looks good to me, I guess there's no harm in acking invalid error status 
>>> blocks.

Great, I didn't miss something nasty...


>> Err, why?
> 
> If ghes_read_estatus() fails, then either there was no error populated or the
> error status block was invalid.
> If the error status block is invalid, then the kernel doesn't know what 
> happened
> in hardware.

What do we mean by 'hardware' here? We're receiving a corrupt report of
something via memory.
The GHESv2 ack just means we're done with the memory. I think it exists because
the external-agent can't peek into the CPU to see if its returned from the
notification.


> I originally thought this was changing what's acked, but it's just changing 
> the
> return value of ghes_proc() when ghes_read_estatus() returns -EIO.

Sorry, that will be due to my bad description.


>> I don't know what the firmware glue does on ARM but if I'd have to
>> remain logical - which is hard to do with firmware - the proper thing to
>> do would be this:
>>
>> rc = ghes_read_estatus(ghes, _paddr);
>> if (rc) {
>> ghes_reset_hardware();
> 
> The kernel would have no way of knowing what to do here.

Is there anything wrong with what we do today? We stamp on the records so that
we don't processes them again. (especially if is polled), and we tell firmware
it can re-use this memory.

(I think we should return an error, or print a ratelimited warning for corrupt
records)


>> }
>>
>> /* clear estatus and bla bla */
>>
>> /* Now, I'm in the success case: */
>>  ghes_ack_error();
>>
>>
>> This way, you have the error path clear of something unexpected happened
>> when reading the hardware, obvious and separated. ghes_reset_hardware()
>> clears the registers and does the necessary steps to put the hardware in
>> good state again so that it can report the next error.
>>
>> And the success path simply acks the error and does possibly the same
>> thing. The naming of the functions is important though, to denote what
>> gets called when.

I think this duplicates the record-stamping/acking. If there is anything in that
memory region, the action for processed/copied/ignored-because-its-corrupt is
the same.

We can return on ENOENT out earlier, as nothing needs doing in that case. Its
what the GHES_TO_CLEAR spaghetti is for, we can probably move the ack thing into
ghes_clear_estatus(), that way that thing means 'I'm done with this memory'.

Something like:
-
rc = ghes_read_estatus();
if (rc == -ENOENT)
return 0;

if (!rc) {
ghes_do_proc() and friends;
}

ghes_clear_estatus();

return rc;
-

We would no longer return errors from the ack code, I suspect that can only
happen for a corrupt gas, which we would have caught earlier as we rely on the
mapping being cached.



Thanks,

James
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH kvmtool v2 4/6] arm: Support firmware loading

2019-01-11 Thread Andre Przywara
On Thu, 10 Jan 2019 14:20:44 +
Julien Thierry  wrote:

Hi,

> Implement firmware image loading for arm and set the boot start
> address to the firmware address.
> 
> Add an option for the user to specify where to map the firmware.

Many thanks for making this optional!

> Signed-off-by: Julien Thierry 
> ---
>  arm/fdt.c| 14 +++-
>  arm/include/arm-common/kvm-config-arch.h |  5 ++-
>  arm/kvm.c| 61
> +++- 3 files changed, 77 insertions(+), 3
> deletions(-)
> 
> diff --git a/arm/fdt.c b/arm/fdt.c
> index 664bb62..2936986 100644
> --- a/arm/fdt.c
> +++ b/arm/fdt.c
> @@ -131,7 +131,19 @@ static int setup_fdt(struct kvm *kvm)
>   /* /chosen */
>   _FDT(fdt_begin_node(fdt, "chosen"));
>   _FDT(fdt_property_cell(fdt, "linux,pci-probe-only", 1));
> - _FDT(fdt_property_string(fdt, "bootargs",
> kvm->cfg.real_cmdline)); +
> + if (kvm->cfg.firmware_filename) {
> + /*
> +  * When using a firmware, command line is not passed
> through DT,
> +  * or the firmware can add it itself
> +  */
> + if (kvm->cfg.kernel_cmdline)
> + pr_warning("Ignoring custom bootargs: %s\n",
> +kvm->cfg.kernel_cmdline);
> + } else
> + _FDT(fdt_property_string(fdt, "bootargs",
> +  kvm->cfg.real_cmdline));
> +
>   _FDT(fdt_property_u64(fdt, "kaslr-seed",
> kvm->cfg.arch.kaslr_seed)); 
>   /* Initrd */
> diff --git a/arm/include/arm-common/kvm-config-arch.h
> b/arm/include/arm-common/kvm-config-arch.h index 6a196f1..5734c46
> 100644 --- a/arm/include/arm-common/kvm-config-arch.h
> +++ b/arm/include/arm-common/kvm-config-arch.h
> @@ -11,6 +11,7 @@ struct kvm_config_arch {
>   boolhas_pmuv3;
>   u64 kaslr_seed;
>   enum irqchip_type irqchip;
> + u64 fw_addr;
>  };
>  
>  int irqchip_parser(const struct option *opt, const char *arg, int
> unset); @@ -30,6 +31,8 @@ int irqchip_parser(const struct option
> *opt, const char *arg, int unset); OPT_CALLBACK('\0', "irqchip",
> &(cfg)->irqchip,  \
> "[gicv2|gicv2m|gicv3|gicv3-its]", \
> "Type of interrupt controller to emulate in the guest",   \
> -  irqchip_parser, NULL),
> +  irqchip_parser,
> NULL),\
> + OPT_U64('\0', "firmware-address",
> &(cfg)->fw_addr,  \
> + "Address where firmware should be loaded"),
>  
>  #endif /* ARM_COMMON__KVM_CONFIG_ARCH_H */
> diff --git a/arm/kvm.c b/arm/kvm.c
> index c6843e5..d3f8f5d 100644
> --- a/arm/kvm.c
> +++ b/arm/kvm.c
> @@ -169,9 +169,68 @@ bool kvm__arch_load_kernel_image(struct kvm
> *kvm, int fd_kernel, int fd_initrd, return true;
>  }
>  
> +static bool validate_fw_addr(struct kvm *kvm, u64 fw_addr)
> +{
> + u64 ram_phys;
> +
> + ram_phys = host_to_guest_flat(kvm, kvm->ram_start);
> +
> + if (fw_addr < ram_phys || fw_addr >= ram_phys +
> kvm->ram_size) {

I am OK with this check for now, but was wondering if we really *need*
to have the firmware in RAM? Couldn't we possibly put it into some
ROM/flash/nvmem device memory area and utilise XIP, for instance? I
understand that EDKII does not support this at the moment, but was just
thinking whether this would be an artificial restriction for other
firmware ports.

But I guess we can relax this later on, possibly when we support this
properly.

> + pr_err("Provide --firmware-address an address in
> RAM: "
> +"0x%016llx - 0x%016llx",
> +ram_phys, ram_phys + kvm->ram_size);
> +
> + return false;
> + }
> +
> + return true;
> +}
> +
>  bool kvm__load_firmware(struct kvm *kvm, const char
> *firmware_filename) {
> - return false;
> + u64 fw_addr = kvm->cfg.arch.fw_addr;
> + void *host_pos;
> + void *limit;
> + ssize_t fw_sz;
> + int fd;
> +
> + limit = kvm->ram_start + kvm->ram_size;
> +
> + /* For default firmware address, lets load it at the
> begining of RAM */
> + if (fw_addr == 0)
> + fw_addr = ARM_MEMORY_AREA;
> +
> + if (!validate_fw_addr(kvm, fw_addr))
> + die("Bad firmware destination: 0x%016llx", fw_addr);
> +
> + fd = open(firmware_filename, O_RDONLY);
> + if (fd < 0)
> + return false;
> +
> + host_pos = guest_flat_to_host(kvm, fw_addr);
> + if (!host_pos || host_pos < kvm->ram_start)
> + return false;
> +
> + fw_sz = read_file(fd, host_pos, limit - host_pos);
> + if (fw_sz < 0)
> + die("failed to load firmware");
> + close(fd);
> +
> + /* Kernel isn't loaded by kvm, point start address to
> firmware */
> + kvm->arch.kern_guest_start = fw_addr;
> +
> + /* Load dtb just after the 

Re: [RFC v3 17/21] iommu/smmuv3: Report non recoverable faults

2019-01-11 Thread Jean-Philippe Brucker
On 08/01/2019 10:26, Eric Auger wrote:
> When a stage 1 related fault event is read from the event queue,
> let's propagate it to potential external fault listeners, ie. users
> who registered a fault handler.
> 
> Signed-off-by: Eric Auger 
> ---
>  drivers/iommu/arm-smmu-v3.c | 124 
>  1 file changed, 113 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
> index 999ee470a2ae..6a711cbbb228 100644
> --- a/drivers/iommu/arm-smmu-v3.c
> +++ b/drivers/iommu/arm-smmu-v3.c
> @@ -168,6 +168,26 @@
>  #define ARM_SMMU_PRIQ_IRQ_CFG1   0xd8
>  #define ARM_SMMU_PRIQ_IRQ_CFG2   0xdc
>  
> +/* Events */
> +#define ARM_SMMU_EVT_F_UUT   0x01
> +#define ARM_SMMU_EVT_C_BAD_STREAMID  0x02
> +#define ARM_SMMU_EVT_F_STE_FETCH 0x03
> +#define ARM_SMMU_EVT_C_BAD_STE   0x04
> +#define ARM_SMMU_EVT_F_BAD_ATS_TREQ  0x05
> +#define ARM_SMMU_EVT_F_STREAM_DISABLED   0x06
> +#define ARM_SMMU_EVT_F_TRANSL_FORBIDDEN  0x07
> +#define ARM_SMMU_EVT_C_BAD_SUBSTREAMID   0x08
> +#define ARM_SMMU_EVT_F_CD_FETCH  0x09
> +#define ARM_SMMU_EVT_C_BAD_CD0x0a
> +#define ARM_SMMU_EVT_F_WALK_EABT 0x0b
> +#define ARM_SMMU_EVT_F_TRANSLATION   0x10
> +#define ARM_SMMU_EVT_F_ADDR_SIZE 0x11
> +#define ARM_SMMU_EVT_F_ACCESS0x12
> +#define ARM_SMMU_EVT_F_PERMISSION0x13
> +#define ARM_SMMU_EVT_F_TLB_CONFLICT  0x20
> +#define ARM_SMMU_EVT_F_CFG_CONFLICT  0x21
> +#define ARM_SMMU_EVT_E_PAGE_REQUEST  0x24
> +
>  /* Common MSI config fields */
>  #define MSI_CFG0_ADDR_MASK   GENMASK_ULL(51, 2)
>  #define MSI_CFG2_SH  GENMASK(5, 4)
> @@ -333,6 +353,11 @@
>  #define EVTQ_MAX_SZ_SHIFT7
>  
>  #define EVTQ_0_IDGENMASK_ULL(7, 0)
> +#define EVTQ_0_SUBSTREAMID   GENMASK_ULL(31, 12)
> +#define EVTQ_0_STREAMID  GENMASK_ULL(63, 32)
> +#define EVTQ_1_S2GENMASK_ULL(39, 39)
> +#define EVTQ_1_CLASS GENMASK_ULL(40, 41)
> +#define EVTQ_3_FETCH_ADDRGENMASK_ULL(51, 3)
>  
>  /* PRI queue */
>  #define PRIQ_ENT_DWORDS  2
> @@ -1270,7 +1295,6 @@ static int arm_smmu_init_l2_strtab(struct 
> arm_smmu_device *smmu, u32 sid)
>   return 0;
>  }
>  
> -__maybe_unused
>  static struct arm_smmu_master_data *
>  arm_smmu_find_master(struct arm_smmu_device *smmu, u32 sid)
>  {
> @@ -1296,24 +1320,102 @@ arm_smmu_find_master(struct arm_smmu_device *smmu, 
> u32 sid)
>   return master;
>  }
>  
> +static void arm_smmu_report_event(struct arm_smmu_device *smmu, u64 *evt)
> +{
> + u64 fetch_addr = FIELD_GET(EVTQ_3_FETCH_ADDR, evt[3]);
> + u32 sid = FIELD_GET(EVTQ_0_STREAMID, evt[0]);
> + bool s1 = !FIELD_GET(EVTQ_1_S2, evt[1]);
> + u8 type = FIELD_GET(EVTQ_0_ID, evt[0]);
> + struct arm_smmu_master_data *master;
> + struct iommu_fault_event event;
> + bool propagate = true;
> + u64 addr = evt[2];
> + int i;
> +
> + master = arm_smmu_find_master(smmu, sid);
> + if (WARN_ON(!master))
> + return;
> +
> + event.fault.type = IOMMU_FAULT_DMA_UNRECOV;
> +
> + switch (type) {
> + case ARM_SMMU_EVT_C_BAD_STREAMID:
> + event.fault.reason = IOMMU_FAULT_REASON_SOURCEID_INVALID;
> + break;
> + case ARM_SMMU_EVT_F_STREAM_DISABLED:
> + case ARM_SMMU_EVT_C_BAD_SUBSTREAMID:
> + event.fault.reason = IOMMU_FAULT_REASON_PASID_INVALID;
> + break;
> + case ARM_SMMU_EVT_F_CD_FETCH:
> + event.fault.reason = IOMMU_FAULT_REASON_PASID_FETCH;
> + break;
> + case ARM_SMMU_EVT_F_WALK_EABT:
> + event.fault.reason = IOMMU_FAULT_REASON_WALK_EABT;
> + event.fault.addr = addr;
> + event.fault.fetch_addr = fetch_addr;
> + propagate = s1;
> + break;
> + case ARM_SMMU_EVT_F_TRANSLATION:
> + event.fault.reason = IOMMU_FAULT_REASON_PTE_FETCH;
> + event.fault.addr = addr;
> + event.fault.fetch_addr = fetch_addr;
> + propagate = s1;
> + break;
> + case ARM_SMMU_EVT_F_PERMISSION:
> + event.fault.reason = IOMMU_FAULT_REASON_PERMISSION;
> + event.fault.addr = addr;
> + propagate = s1;
> + break;
> + case ARM_SMMU_EVT_F_ACCESS:
> + event.fault.reason = IOMMU_FAULT_REASON_ACCESS;
> + event.fault.addr = addr;
> + propagate = s1;
> + break;
> + case ARM_SMMU_EVT_C_BAD_STE:
> + event.fault.reason = 
> IOMMU_FAULT_REASON_BAD_DEVICE_CONTEXT_ENTRY;
> + break;
> + case ARM_SMMU_EVT_C_BAD_CD:
> + event.fault.reason = IOMMU_FAULT_REASON_BAD_PASID_ENTRY;
> + break;
> + case ARM_SMMU_EVT_F_ADDR_SIZE:
> + event.fault.reason = 

Re: [PATCH v7 10/25] ACPI / APEI: Tell firmware the estatus queue consumed the records

2019-01-11 Thread Borislav Petkov
On Fri, Jan 11, 2019 at 10:32:23AM -0500, Tyler Baicar wrote:
> The kernel would have no way of knowing what to do here.

What do you mean, there's no way of knowing what to do? It needs to
clear registers so that the next error can get reported properly.

Or of the status read failed and it doesn't need to do anything, then it
shouldn't.

Whatever it is, the kernel either needs to do something in the error
case to clean up, or nothing if the firmware doesn't need anything done
in the error case; *or* ack the error in the success case.

This should all be written down somewhere in that GHES v2
spec/doc/writeup whatever, explaining what the OS is supposed to do to
signal the error has been read by the OS.

-- 
Regards/Gruss,
Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [RFC v3 11/21] iommu/smmuv3: Implement cache_invalidate

2019-01-11 Thread Jean-Philippe Brucker
On 08/01/2019 10:26, Eric Auger wrote:
> Implement IOMMU_INV_TYPE_TLB invalidations. When
> nr_pages is null we interpret this as a context
> invalidation.
> 
> Signed-off-by: Eric Auger 
> 
> ---
> 
> The user API needs to be refined to discriminate context
> invalidations from NH_VA invalidations. Also the leaf attribute
> is not yet properly handled.
> 
> v2 -> v3:
> - replace __arm_smmu_tlb_sync by arm_smmu_cmdq_issue_sync
> 
> v1 -> v2:
> - properly pass the asid
> ---
>  drivers/iommu/arm-smmu-v3.c | 40 +
>  1 file changed, 40 insertions(+)
> 
> diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
> index 0e006babc8a6..ca72e0ce92f6 100644
> --- a/drivers/iommu/arm-smmu-v3.c
> +++ b/drivers/iommu/arm-smmu-v3.c
> @@ -2293,6 +2293,45 @@ static int arm_smmu_set_pasid_table(struct 
> iommu_domain *domain,
>   return ret;
>  }
>  
> +static int
> +arm_smmu_cache_invalidate(struct iommu_domain *domain, struct device *dev,
> +   struct iommu_cache_invalidate_info *inv_info)
> +{
> + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
> + struct arm_smmu_device *smmu = smmu_domain->smmu;
> +
> + if (smmu_domain->stage != ARM_SMMU_DOMAIN_NESTED)
> + return -EINVAL;
> +
> + if (!smmu)
> + return -EINVAL;
> +
> + switch (inv_info->hdr.type) {
> + case IOMMU_INV_TYPE_TLB:
> + /*
> +  * TODO: On context invalidation, the userspace sets nr_pages
> +  * to 0. Refine the API to add a dedicated flags and also
> +  * properly handle the leaf parameter.
> +  */

That's what inv->granularity is for: if inv->granularity is PASID_SEL,
then the invalidation is for the whole context (and nr_pages, size,
addr, etc. should be ignored). If inv->granularity is PAGE_PASID, then
it's a range. The names could probably be improved but it's already in
the API

Thanks,
Jean

> + if (!inv_info->nr_pages) {
> + smmu_domain->s1_cfg.cd.asid = inv_info->arch_id;
> + arm_smmu_tlb_inv_context(smmu_domain);
> + } else {
> + size_t granule = 1 << (inv_info->size + 12);
> + size_t size = inv_info->nr_pages * granule;
> +
> + smmu_domain->s1_cfg.cd.asid = inv_info->arch_id;
> + arm_smmu_tlb_inv_range_nosync(inv_info->addr, size,
> +   granule, false,
> +   smmu_domain);
> + arm_smmu_cmdq_issue_sync(smmu);
> + }
> + return 0;
> + default:
> + return -EINVAL;
> + }
> +}
> +
>  static struct iommu_ops arm_smmu_ops = {
>   .capable= arm_smmu_capable,
>   .domain_alloc   = arm_smmu_domain_alloc,
> @@ -2312,6 +2351,7 @@ static struct iommu_ops arm_smmu_ops = {
>   .get_resv_regions   = arm_smmu_get_resv_regions,
>   .put_resv_regions   = arm_smmu_put_resv_regions,
>   .set_pasid_table= arm_smmu_set_pasid_table,
> + .cache_invalidate   = arm_smmu_cache_invalidate,
>   .pgsize_bitmap  = -1UL, /* Restricted during device attach */
>  };
>  
> 

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [RFC v3 09/21] iommu/smmuv3: Get prepared for nested stage support

2019-01-11 Thread Jean-Philippe Brucker
Hi Eric,

On 08/01/2019 10:26, Eric Auger wrote:
> To allow nested stage support, we need to store both
> stage 1 and stage 2 configurations (and remove the former
> union).
> 
> arm_smmu_write_strtab_ent() is modified to write both stage
> fields in the STE.
> 
> We add a nested_bypass field to the S1 configuration as the first
> stage can be bypassed. Also the guest may force the STE to abort:
> this information gets stored into the nested_abort field.
> 
> Only S2 stage is "finalized" as the host does not configure
> S1 CD, guest does.
> 
> Signed-off-by: Eric Auger 
> 
> ---
> 
> v1 -> v2:
> - invalidate the STE before moving from a live STE config to another
> - add the nested_abort and nested_bypass fields
> ---
>  drivers/iommu/arm-smmu-v3.c | 43 -
>  1 file changed, 33 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
> index 9af68266bbb1..9716a301d9ae 100644
> --- a/drivers/iommu/arm-smmu-v3.c
> +++ b/drivers/iommu/arm-smmu-v3.c
> @@ -212,6 +212,7 @@
>  #define STRTAB_STE_0_CFG_BYPASS  4
>  #define STRTAB_STE_0_CFG_S1_TRANS5
>  #define STRTAB_STE_0_CFG_S2_TRANS6
> +#define STRTAB_STE_0_CFG_NESTED  7
>  
>  #define STRTAB_STE_0_S1FMT   GENMASK_ULL(5, 4)
>  #define STRTAB_STE_0_S1FMT_LINEAR0
> @@ -491,6 +492,10 @@ struct arm_smmu_strtab_l1_desc {
>  struct arm_smmu_s1_cfg {
>   __le64  *cdptr;
>   dma_addr_t  cdptr_dma;
> + /* in nested mode, tells s1 must be bypassed */
> + boolnested_bypass;
> + /* in nested mode, abort is forced by guest */
> + boolnested_abort;
>  
>   struct arm_smmu_ctx_desc {
>   u16 asid;
> @@ -515,6 +520,7 @@ struct arm_smmu_strtab_ent {
>* configured according to the domain type.
>*/
>   boolassigned;
> + boolnested;
>   struct arm_smmu_s1_cfg  *s1_cfg;
>   struct arm_smmu_s2_cfg  *s2_cfg;
>  };
> @@ -629,10 +635,8 @@ struct arm_smmu_domain {
>   boolnon_strict;
>  
>   enum arm_smmu_domain_stage  stage;
> - union {
> - struct arm_smmu_s1_cfg  s1_cfg;
> - struct arm_smmu_s2_cfg  s2_cfg;
> - };
> + struct arm_smmu_s1_cfg  s1_cfg;
> + struct arm_smmu_s2_cfg  s2_cfg;
>  
>   struct iommu_domain domain;
>  
> @@ -1139,10 +1143,11 @@ static void arm_smmu_write_strtab_ent(struct 
> arm_smmu_device *smmu, u32 sid,

Could you also update the "This is hideously complicated..." comment
with the nested case? This function was complicated before, but it
becomes hell when adding nested and SVA support, so we really need the
comments :)

>   break;
>   case STRTAB_STE_0_CFG_S1_TRANS:
>   case STRTAB_STE_0_CFG_S2_TRANS:
> + case STRTAB_STE_0_CFG_NESTED:
>   ste_live = true;
>   break;
>   case STRTAB_STE_0_CFG_ABORT:
> - if (disable_bypass)
> + if (disable_bypass || ste->nested)
>   break;
>   default:
>   BUG(); /* STE corruption */
> @@ -1154,7 +1159,8 @@ static void arm_smmu_write_strtab_ent(struct 
> arm_smmu_device *smmu, u32 sid,
>  
>   /* Bypass/fault */
>   if (!ste->assigned || !(ste->s1_cfg || ste->s2_cfg)) {
> - if (!ste->assigned && disable_bypass)
> + if ((!ste->assigned && disable_bypass) ||
> + (ste->s1_cfg && ste->s1_cfg->nested_abort))

I don't think we're ever reaching this, given that ste->assigned is true
and ste->s2_cfg is set.

Something I find noteworthy is that with STRTAB_STE_0_CFG_ABORT, no
event is recorded in case of DMA fault. For vSMMU you'd want to emulate
the SMMU behavior closely, so you don't want to inject faults if the
guest sets CFG_ABORT, but this way you also can't report errors to the
VMM. If we did want to notify the VMM of faults, we'd need to implement
nested_abort differently, for example by installing an empty context
descriptor with Config=s1translate-s2translate.

>   val |= FIELD_PREP(STRTAB_STE_0_CFG, 
> STRTAB_STE_0_CFG_ABORT);
>   else
>   val |= FIELD_PREP(STRTAB_STE_0_CFG, 
> STRTAB_STE_0_CFG_BYPASS);
> @@ -1172,8 +1178,17 @@ static void arm_smmu_write_strtab_ent(struct 
> arm_smmu_device *smmu, u32 sid,
>   return;
>   }
>  
> + if (ste->nested && ste_live) {
> + /*
> +  * When enabling nested, the STE may be transitionning from

transitioning (my bad)

> +  * s2 to nested and back. Invalidate the STE before changing it.
> +  */
> + dst[0] = cpu_to_le64(0);
> + 

Re: [PATCH v6 0/7] Add virtio-iommu driver

2019-01-11 Thread Jean-Philippe Brucker
On 11/01/2019 12:28, Joerg Roedel wrote:
> Hi Jean-Philippe,
> 
> On Thu, Dec 13, 2018 at 12:50:29PM +, Jean-Philippe Brucker wrote:
>> We already do deferred flush: UNMAP requests are added to the queue by
>> iommu_unmap(), and then flushed out by iotlb_sync(). So we switch to the
>> host only on iotlb_sync(), or when the request queue is full.
> 
> So the mappings can stay in place until iotlb_sync() returns? What
> happens when the guest sends a map-request for a region it sent and
> unmap request before, but did not call iotlb_sync inbetween?

At that point the unmap is still in the request queue, and the host will
handle it before getting to the map request. For correctness requests
are necessarily handled in-order by the host. So if the map and unmap
refer to the same domain and IOVA, the host will remove the old mapping
before creating the new one.

Thanks,
Jean
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH v6 0/7] Add virtio-iommu driver

2019-01-11 Thread Joerg Roedel
Hi Jean-Philippe,

On Thu, Dec 13, 2018 at 12:50:29PM +, Jean-Philippe Brucker wrote:
> We already do deferred flush: UNMAP requests are added to the queue by
> iommu_unmap(), and then flushed out by iotlb_sync(). So we switch to the
> host only on iotlb_sync(), or when the request queue is full.

So the mappings can stay in place until iotlb_sync() returns? What
happens when the guest sends a map-request for a region it sent and
unmap request before, but did not call iotlb_sync inbetween?

Regards,

Joerg
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH v7 10/25] ACPI / APEI: Tell firmware the estatus queue consumed the records

2019-01-11 Thread Borislav Petkov
On Thu, Jan 10, 2019 at 04:01:27PM -0500, Tyler Baicar wrote:
> On Thu, Jan 10, 2019 at 1:23 PM James Morse  wrote:
> > >>
> > >> +if (is_hest_type_generic_v2(ghes) && 
> > >> ghes_ack_error(ghes->generic_v2))
> > >
> > > Since ghes_ack_error() is always prepended with this check, you could
> > > push it down into the function:
> > >
> > > ghes_ack_error(ghes)
> > > ...
> > >
> > >   if (!is_hest_type_generic_v2(ghes))
> > >   return 0;
> > >
> > > and simplify the two callsites :)
> >
> > Great idea! ...
> >
> > .. huh. Turns out for ghes_proc() we discard any errors other than ENOENT 
> > from
> > ghes_read_estatus() if is_hest_type_generic_v2(). This masks EIO.
> >
> > Most of the error sources discard the result, the worst thing I can find is
> > ghes_irq_func() will return IRQ_HANDLED, instead of IRQ_NONE when we didn't
> > really handle the IRQ. They're registered as SHARED, but I don't have an 
> > example
> > of what goes wrong next.
> >
> > I think this will also stop the spurious handling code kicking in to shut 
> > it up
> > if its broken and screaming. Unlikely, but not impossible.
> >
> > Fixed in a prior patch, with Boris' suggestion, ghes_proc()s tail ends up 
> > look
> > like this:
> > --%<--
> > diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> > index 0321d9420b1e..8d1f9930b159 100644
> > --- a/drivers/acpi/apei/ghes.c
> > +++ b/drivers/acpi/apei/ghes.c
> > @@ -700,18 +708,11 @@ static int ghes_proc(struct ghes *ghes)
> >
> >  out:
> > ghes_clear_estatus(ghes, buf_paddr);
> > +   if (rc != -ENOENT)
> > +   rc_ack = ghes_ack_error(ghes);
> >
> > -   if (rc == -ENOENT)
> > -   return rc;
> > -
> > -   /*
> > -* GHESv2 type HEST entries introduce support for error 
> > acknowledgment,
> > -* so only acknowledge the error if this support is present.
> > -*/
> > -   if (is_hest_type_generic_v2(ghes))
> > -   return ghes_ack_error(ghes->generic_v2);
> > -
> > -   return rc;
> > +   /* If rc and rc_ack failed, return the first one */
> > +   return rc ? rc : rc_ack;
> >  }
> > --%<--
> >
> 
> Looks good to me, I guess there's no harm in acking invalid error status 
> blocks.

Err, why?

I don't know what the firmware glue does on ARM but if I'd have to
remain logical - which is hard to do with firmware - the proper thing to
do would be this:

rc = ghes_read_estatus(ghes, _paddr);
if (rc) {
ghes_reset_hardware();
}

/* clear estatus and bla bla */

/* Now, I'm in the success case: */
 ghes_ack_error();


This way, you have the error path clear of something unexpected happened
when reading the hardware, obvious and separated. ghes_reset_hardware()
clears the registers and does the necessary steps to put the hardware in
good state again so that it can report the next error.

And the success path simply acks the error and does possibly the same
thing. The naming of the functions is important though, to denote what
gets called when.

This way you handle all the cases just fine. No looking at the error
type and blabla.

Right?

-- 
Regards/Gruss,
Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH v7 09/25] ACPI / APEI: Generalise the estatus queue's notify code

2019-01-11 Thread Borislav Petkov
On Thu, Jan 10, 2019 at 06:21:21PM +, James Morse wrote:
> Something like:
> ghes_notify_nmi() -> in_nmi_spool_from_list(list) -> 
> in_nmi_queue_one_entry(ghes).

Yah, but make that

ghes_notify_nmi() -> ghes_nmi_spool_from_list(list) -> 
ghes_nmi_queue_one_entry(ghes).

to denote it is the GHES NMI path.

Thx.

-- 
Regards/Gruss,
Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


(ghes|hest)_disable

2019-01-11 Thread Borislav Petkov
Ok,

lemme split this out into a separate thread and add Tony.

On Thu, Jan 10, 2019 at 06:20:35PM +, James Morse wrote:
> > Grrr, what an effing mess that code is! There's hest_disable *and*
> > ghes_disable. Do we really need them both?
> 
> ghes_disable lets you ignore the firmware-first notifications, but still 'use'
> the other error sources:
> drivers/pci/pcie/aer.c picks out the three AER types, and uses 
> apei_hest_parse()
> to know if firmware is controlling AER, even if ghes_disable is set.

Ok, that kinda makes sense.

But look what our sparse documentation says:

hest_disable[ACPI]
Disable Hardware Error Source Table (HEST) support;
corresponding firmware-first mode error processing
logic will be disabled.


and from looking at the code, hest_disable is kinda like the master
switch because it gets evaluated first. Right?

Which sounds to me like we want a generic switch which does:

apei=disable_ff_notifications

to explicitly do exactly that - disable the firmware-first notification
method. And then the master switch will be

apei=disable

And we'll be able to pass whatever options here instead of all those
different _disable switches which need lotsa code staring to figure out
what exactly they even do in the first place.

> x86's arch_apei_enable_cmcff() looks like it disables MCE to get firmware to
> handle them. hest_disable would stop this, but instead ghes_disable keeps 
> that,
> and stops the NOTIFY_NMI being registered.

Yeah, and when you boot with ghes_disable, that would say:

pr_info("HEST: Enabling Firmware First mode for corrected errors.\n");

but there will be no notifications and users will scratch heads.

> (do you consider cmdline arguments as ABI, or hard to justify and hard to 
> remove?)

I don't, frankly. I guess we will have to have a transition period where
we keep them and issue a warning message that users should switch to
"apei=xxx" instead and remove them after a lot of time has passed.

> I don't think its broken enough to justify ripping them out. A user of
> ghes_disable would be someone with broken firmware-first handling of AER. They
> need to know firmware is changing the register values behind their back (so 
> need
> to parse the HEST), but want to ignore the junk notifications. It doesn't 
> sound
> like an unlikely scenario.

Yes, that makes sense.

But I think we should add a generic cmdline arg with suboptions and
document exactly what all those do. Similar to "mce=" on x86 which is
nicely documented in Documentation/x86/x86_64/boot-options.txt.

Right now, only a few people understand what those do and in some of the
cases they do too much/the wrong thing.

Thoughts?

-- 
Regards/Gruss,
Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [RFC v3 14/21] iommu: introduce device fault data

2019-01-11 Thread Jean-Philippe Brucker
On 10/01/2019 18:45, Jacob Pan wrote:
> On Tue,  8 Jan 2019 11:26:26 +0100
> Eric Auger  wrote:
> 
>> From: Jacob Pan 
>>
>> Device faults detected by IOMMU can be reported outside IOMMU
>> subsystem for further processing. This patch intends to provide
>> a generic device fault data such that device drivers can be
>> communicated with IOMMU faults without model specific knowledge.
>>
>> The proposed format is the result of discussion at:
>> https://lkml.org/lkml/2017/11/10/291
>> Part of the code is based on Jean-Philippe Brucker's patchset
>> (https://patchwork.kernel.org/patch/9989315/).
>>
>> The assumption is that model specific IOMMU driver can filter and
>> handle most of the internal faults if the cause is within IOMMU driver
>> control. Therefore, the fault reasons can be reported are grouped
>> and generalized based common specifications such as PCI ATS.
>>
>> Signed-off-by: Jacob Pan 
>> Signed-off-by: Jean-Philippe Brucker 
>> Signed-off-by: Liu, Yi L 
>> Signed-off-by: Ashok Raj 
>> Signed-off-by: Eric Auger 
>> [moved part of the iommu_fault_event struct in the uapi, enriched
>>  the fault reasons to be able to map unrecoverable SMMUv3 errors]
>> ---
>>  include/linux/iommu.h  | 55 -
>>  include/uapi/linux/iommu.h | 83
>> ++ 2 files changed, 136
>> insertions(+), 2 deletions(-)
>>
>> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
>> index 244c1a3d5989..1dedc2d247c2 100644
>> --- a/include/linux/iommu.h
>> +++ b/include/linux/iommu.h
>> @@ -49,13 +49,17 @@ struct bus_type;
>>  struct device;
>>  struct iommu_domain;
>>  struct notifier_block;
>> +struct iommu_fault_event;
>>  
>>  /* iommu fault flags */
>> -#define IOMMU_FAULT_READ0x0
>> -#define IOMMU_FAULT_WRITE   0x1
>> +#define IOMMU_FAULT_READ(1 << 0)
>> +#define IOMMU_FAULT_WRITE   (1 << 1)
>> +#define IOMMU_FAULT_EXEC(1 << 2)
>> +#define IOMMU_FAULT_PRIV(1 << 3)
>>  
>>  typedef int (*iommu_fault_handler_t)(struct iommu_domain *,
>>  struct device *, unsigned long, int, void *);
>> +typedef int (*iommu_dev_fault_handler_t)(struct iommu_fault_event *,
>> void *); 
>>  struct iommu_domain_geometry {
>>  dma_addr_t aperture_start; /* First address that can be
>> mapped*/ @@ -255,6 +259,52 @@ struct iommu_device {
>>  struct device *dev;
>>  };
>>  
>> +/**
>> + * struct iommu_fault_event - Generic per device fault data
>> + *
>> + * - PCI and non-PCI devices
>> + * - Recoverable faults (e.g. page request), information based on
>> PCI ATS
>> + * and PASID spec.
>> + * - Un-recoverable faults of device interest
>> + * - DMA remapping and IRQ remapping faults
>> + *
>> + * @fault: fault descriptor
>> + * @device_private: if present, uniquely identify device-specific
>> + *  private data for an individual page request.
>> + * @iommu_private: used by the IOMMU driver for storing
>> fault-specific
>> + * data. Users should not modify this field before
>> + * sending the fault response.
>> + */
>> +struct iommu_fault_event {
>> +struct iommu_fault fault;
>> +u64 device_private;
> I think we want to move device_private to uapi since it gets injected
> into the guest, then returned by guest in case of page response. For
> VT-d we also need 128 bits of private data. VT-d spec. 7.7.1

Ah, I didn't notice the format changed in VT-d rev3. On that topic, how
do we manage future extensions to the iommu_fault struct? Should we add
~48 bytes of padding after device_private, along with some flags telling
which field is valid, or deal with it using a structure version like we
do for the invalidate and bind structs? In the first case, iommu_fault
wouldn't fit in a 64-byte cacheline anymore, but I'm not sure we care.

> For exception tracking (e.g. unanswered page request), I can add timer
> and list info later when I include PRQ. sounds ok?
>> +u64 iommu_private;
[...]
>> +/**
>> + * struct iommu_fault - Generic fault data
>> + *
>> + * @type contains fault type
>> + * @reason fault reasons if relevant outside IOMMU driver.
>> + * IOMMU driver internal faults are not reported.
>> + * @addr: tells the offending page address
>> + * @fetch_addr: tells the address that caused an abort, if any
>> + * @pasid: contains process address space ID, used in shared virtual
>> memory
>> + * @page_req_group_id: page request group index
>> + * @last_req: last request in a page request group
>> + * @pasid_valid: indicates if the PRQ has a valid PASID
>> + * @prot: page access protection flag:
>> + *  IOMMU_FAULT_READ, IOMMU_FAULT_WRITE
>> + */
>> +
>> +struct iommu_fault {
>> +__u32   type;   /* enum iommu_fault_type */
>> +__u32   reason; /* enum iommu_fault_reason */
>> +__u64   addr;
>> +__u64   fetch_addr;
>> +__u32   pasid;
>> +__u32   page_req_group_id;
>> +__u32   last_req;
>> +__u32   pasid_valid;
>> +__u32   prot;
>> 

Re: [PATCH v7 10/25] ACPI / APEI: Tell firmware the estatus queue consumed the records

2019-01-11 Thread Tyler Baicar
On Thu, Jan 10, 2019 at 1:23 PM James Morse  wrote:
> >>
> >> +if (is_hest_type_generic_v2(ghes) && ghes_ack_error(ghes->generic_v2))
> >
> > Since ghes_ack_error() is always prepended with this check, you could
> > push it down into the function:
> >
> > ghes_ack_error(ghes)
> > ...
> >
> >   if (!is_hest_type_generic_v2(ghes))
> >   return 0;
> >
> > and simplify the two callsites :)
>
> Great idea! ...
>
> .. huh. Turns out for ghes_proc() we discard any errors other than ENOENT from
> ghes_read_estatus() if is_hest_type_generic_v2(). This masks EIO.
>
> Most of the error sources discard the result, the worst thing I can find is
> ghes_irq_func() will return IRQ_HANDLED, instead of IRQ_NONE when we didn't
> really handle the IRQ. They're registered as SHARED, but I don't have an 
> example
> of what goes wrong next.
>
> I think this will also stop the spurious handling code kicking in to shut it 
> up
> if its broken and screaming. Unlikely, but not impossible.
>
> Fixed in a prior patch, with Boris' suggestion, ghes_proc()s tail ends up look
> like this:
> --%<--
> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index 0321d9420b1e..8d1f9930b159 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -700,18 +708,11 @@ static int ghes_proc(struct ghes *ghes)
>
>  out:
> ghes_clear_estatus(ghes, buf_paddr);
> +   if (rc != -ENOENT)
> +   rc_ack = ghes_ack_error(ghes);
>
> -   if (rc == -ENOENT)
> -   return rc;
> -
> -   /*
> -* GHESv2 type HEST entries introduce support for error 
> acknowledgment,
> -* so only acknowledge the error if this support is present.
> -*/
> -   if (is_hest_type_generic_v2(ghes))
> -   return ghes_ack_error(ghes->generic_v2);
> -
> -   return rc;
> +   /* If rc and rc_ack failed, return the first one */
> +   return rc ? rc : rc_ack;
>  }
> --%<--
>

Looks good to me, I guess there's no harm in acking invalid error status blocks.

T
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm