Re: [PATCH 2/4] iommu/virtio: Add probe request

2018-03-23 Thread Robin Murphy

On 14/02/18 14:53, Jean-Philippe Brucker wrote:

When the device offers the probe feature, send a probe request for each
device managed by the IOMMU. Extract RESV_MEM information. When we
encounter a MSI doorbell region, set it up as a IOMMU_RESV_MSI region.
This will tell other subsystems that there is no need to map the MSI
doorbell in the virtio-iommu, because MSIs bypass it.

Signed-off-by: Jean-Philippe Brucker 
---
  drivers/iommu/virtio-iommu.c  | 163 --
  include/uapi/linux/virtio_iommu.h |  37 +
  2 files changed, 193 insertions(+), 7 deletions(-)

diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c
index a9c9245e8ba2..3ac4b38eaf19 100644
--- a/drivers/iommu/virtio-iommu.c
+++ b/drivers/iommu/virtio-iommu.c
@@ -45,6 +45,7 @@ struct viommu_dev {
struct iommu_domain_geometrygeometry;
u64 pgsize_bitmap;
u8  domain_bits;
+   u32 probe_size;
  };
  
  struct viommu_mapping {

@@ -72,6 +73,7 @@ struct viommu_domain {
  struct viommu_endpoint {
struct viommu_dev   *viommu;
struct viommu_domain*vdomain;
+   struct list_headresv_regions;
  };
  
  struct viommu_request {

@@ -140,6 +142,10 @@ static int viommu_get_req_size(struct viommu_dev *viommu,
case VIRTIO_IOMMU_T_UNMAP:
size = sizeof(r->unmap);
break;
+   case VIRTIO_IOMMU_T_PROBE:
+   *bottom += viommu->probe_size;
+   size = sizeof(r->probe) + *bottom;
+   break;
default:
return -EINVAL;
}
@@ -448,6 +454,105 @@ static int viommu_replay_mappings(struct viommu_domain 
*vdomain)
return ret;
  }
  
+static int viommu_add_resv_mem(struct viommu_endpoint *vdev,

+  struct virtio_iommu_probe_resv_mem *mem,
+  size_t len)
+{
+   struct iommu_resv_region *region = NULL;
+   unsigned long prot = IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO;
+
+   u64 addr = le64_to_cpu(mem->addr);
+   u64 size = le64_to_cpu(mem->size);
+
+   if (len < sizeof(*mem))
+   return -EINVAL;
+
+   switch (mem->subtype) {
+   case VIRTIO_IOMMU_RESV_MEM_T_MSI:
+   region = iommu_alloc_resv_region(addr, size, prot,
+IOMMU_RESV_MSI);
+   break;
+   case VIRTIO_IOMMU_RESV_MEM_T_RESERVED:
+   default:
+   region = iommu_alloc_resv_region(addr, size, 0,
+IOMMU_RESV_RESERVED);
+   break;
+   }
+
+   list_add(>resv_regions, >list);
+
+   /*
+* Treat unknown subtype as RESERVED, but urge users to update their
+* driver.
+*/
+   if (mem->subtype != VIRTIO_IOMMU_RESV_MEM_T_RESERVED &&
+   mem->subtype != VIRTIO_IOMMU_RESV_MEM_T_MSI)
+   pr_warn("unknown resv mem subtype 0x%x\n", mem->subtype);


Might as well avoid the extra comparisons by incorporating this into the 
switch statement, i.e.:


default:
dev_warn(vdev->viommu_dev->dev, ...);
/* Fallthrough */
case VIRTIO_IOMMU_RESV_MEM_T_RESERVED:
...

(dev_warn is generally preferable to pr_warn when feasible)


+
+   return 0;
+}
+
+static int viommu_probe_endpoint(struct viommu_dev *viommu, struct device *dev)
+{
+   int ret;
+   u16 type, len;
+   size_t cur = 0;
+   struct virtio_iommu_req_probe *probe;
+   struct virtio_iommu_probe_property *prop;
+   struct iommu_fwspec *fwspec = dev->iommu_fwspec;
+   struct viommu_endpoint *vdev = fwspec->iommu_priv;
+
+   if (!fwspec->num_ids)
+   /* Trouble ahead. */
+   return -EINVAL;
+
+   probe = kzalloc(sizeof(*probe) + viommu->probe_size +
+   sizeof(struct virtio_iommu_req_tail), GFP_KERNEL);
+   if (!probe)
+   return -ENOMEM;
+
+   probe->head.type = VIRTIO_IOMMU_T_PROBE;
+   /*
+* For now, assume that properties of an endpoint that outputs multiple
+* IDs are consistent. Only probe the first one.
+*/
+   probe->endpoint = cpu_to_le32(fwspec->ids[0]);
+
+   ret = viommu_send_req_sync(viommu, probe);
+   if (ret)
+   goto out_free;
+
+   prop = (void *)probe->properties;
+   type = le16_to_cpu(prop->type) & VIRTIO_IOMMU_PROBE_T_MASK;
+
+   while (type != VIRTIO_IOMMU_PROBE_T_NONE &&
+  cur < viommu->probe_size) {
+   len = le16_to_cpu(prop->length);
+
+   switch (type) {
+   case VIRTIO_IOMMU_PROBE_T_RESV_MEM:
+   ret = viommu_add_resv_mem(vdev, (void *)prop->value, 
len);
+   break;
+

Re: [PATCH 1/4] iommu: Add virtio-iommu driver

2018-03-23 Thread Robin Murphy

On 14/02/18 14:53, Jean-Philippe Brucker wrote:

The virtio IOMMU is a para-virtualized device, allowing to send IOMMU
requests such as map/unmap over virtio-mmio transport without emulating
page tables. This implementation handles ATTACH, DETACH, MAP and UNMAP
requests.

The bulk of the code transforms calls coming from the IOMMU API into
corresponding virtio requests. Mappings are kept in an interval tree
instead of page tables.

Signed-off-by: Jean-Philippe Brucker 
---
  MAINTAINERS   |   6 +
  drivers/iommu/Kconfig |  11 +
  drivers/iommu/Makefile|   1 +
  drivers/iommu/virtio-iommu.c  | 960 ++
  include/uapi/linux/virtio_ids.h   |   1 +
  include/uapi/linux/virtio_iommu.h | 116 +
  6 files changed, 1095 insertions(+)
  create mode 100644 drivers/iommu/virtio-iommu.c
  create mode 100644 include/uapi/linux/virtio_iommu.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 3bdc260e36b7..2a181924d420 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14818,6 +14818,12 @@ S: Maintained
  F:drivers/virtio/virtio_input.c
  F:include/uapi/linux/virtio_input.h
  
+VIRTIO IOMMU DRIVER

+M: Jean-Philippe Brucker 
+S: Maintained
+F: drivers/iommu/virtio-iommu.c
+F: include/uapi/linux/virtio_iommu.h
+
  VIRTUAL BOX GUEST DEVICE DRIVER
  M:Hans de Goede 
  M:Arnd Bergmann 
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index f3a21343e636..1ea0ec74524f 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -381,4 +381,15 @@ config QCOM_IOMMU
help
  Support for IOMMU on certain Qualcomm SoCs.
  
+config VIRTIO_IOMMU

+   bool "Virtio IOMMU driver"
+   depends on VIRTIO_MMIO
+   select IOMMU_API
+   select INTERVAL_TREE
+   select ARM_DMA_USE_IOMMU if ARM
+   help
+ Para-virtualised IOMMU driver with virtio.
+
+ Say Y here if you intend to run this kernel as a guest.
+
  endif # IOMMU_SUPPORT
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index 1fb695854809..9c68be1365e1 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -29,3 +29,4 @@ obj-$(CONFIG_EXYNOS_IOMMU) += exynos-iommu.o
  obj-$(CONFIG_FSL_PAMU) += fsl_pamu.o fsl_pamu_domain.o
  obj-$(CONFIG_S390_IOMMU) += s390-iommu.o
  obj-$(CONFIG_QCOM_IOMMU) += qcom_iommu.o
+obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o
diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c
new file mode 100644
index ..a9c9245e8ba2
--- /dev/null
+++ b/drivers/iommu/virtio-iommu.c
@@ -0,0 +1,960 @@
+/*
+ * Virtio driver for the paravirtualized IOMMU
+ *
+ * Copyright (C) 2018 ARM Limited
+ * Author: Jean-Philippe Brucker 
+ *
+ * SPDX-License-Identifier: GPL-2.0


This wants to be a // comment at the very top of the file (thankfully 
the policy is now properly documented in-tree since 
Documentation/process/license-rules.rst got merged)



+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#define MSI_IOVA_BASE  0x800
+#define MSI_IOVA_LENGTH0x10
+
+struct viommu_dev {
+   struct iommu_device iommu;
+   struct device   *dev;
+   struct virtio_device*vdev;
+
+   struct ida  domain_ids;
+
+   struct virtqueue*vq;
+   /* Serialize anything touching the request queue */
+   spinlock_t  request_lock;
+
+   /* Device configuration */
+   struct iommu_domain_geometrygeometry;
+   u64 pgsize_bitmap;
+   u8  domain_bits;
+};
+
+struct viommu_mapping {
+   phys_addr_t paddr;
+   struct interval_tree_node   iova;
+   union {
+   struct virtio_iommu_req_map map;
+   struct virtio_iommu_req_unmap unmap;
+   } req;
+};
+
+struct viommu_domain {
+   struct iommu_domain domain;
+   struct viommu_dev   *viommu;
+   struct mutexmutex;
+   unsigned intid;
+
+   spinlock_t  mappings_lock;
+   struct rb_root_cached   mappings;
+
+   /* Number of endpoints attached to this domain */
+   unsigned long   endpoints;
+};
+
+struct viommu_endpoint {
+   struct viommu_dev   *viommu;
+   struct viommu_domain*vdomain;
+};
+
+struct viommu_request {
+   struct scatterlist  top;
+   struct scatterlist  bottom;
+
+   int 

RE: [PATCH 1/4] iommu: Add virtio-iommu driver

2018-03-23 Thread Tian, Kevin
> From: Tian, Kevin
> Sent: Thursday, March 22, 2018 6:06 PM
> 
> > From: Robin Murphy [mailto:robin.mur...@arm.com]
> > Sent: Wednesday, March 21, 2018 10:24 PM
> >
> > On 21/03/18 13:14, Jean-Philippe Brucker wrote:
> > > On 21/03/18 06:43, Tian, Kevin wrote:
> > > [...]
> > >>> +
> > >>> +#include 
> > >>> +
> > >>> +#define MSI_IOVA_BASE  0x800
> > >>> +#define MSI_IOVA_LENGTH0x10
> > >>
> > >> this is ARM specific, and according to virtio-iommu spec isn't it
> > >> better probed on the endpoint instead of hard-coding here?
> > >
> > > These values are arbitrary, not really ARM-specific even if ARM is the
> > > only user yet: we're just reserving a random IOVA region for mapping
> > MSIs.
> > > It is hard-coded because of the way iommu-dma.c works, but I don't
> > quite
> > > remember why that allocation isn't dynamic.
> >
> > The host kernel needs to have *some* MSI region in place before the
> > guest can start configuring interrupts, otherwise it won't know what
> > address to give to the underlying hardware. However, as soon as the host
> > kernel has picked a region, host userspace needs to know that it can no
> > longer use addresses in that region for DMA-able guest memory. It's a
> > lot easier when the address is fixed in hardware and the host userspace
> > will never be stupid enough to try and VFIO_IOMMU_DMA_MAP it, but in
> > the
> > more general case where MSI writes undergo IOMMU address translation
> > so
> > it's an arbitrary IOVA, this has the potential to conflict with stuff
> > like guest memory hotplug.
> >
> > What we currently have is just the simplest option, with the host kernel
> > just picking something up-front and pretending to host userspace that
> > it's a fixed hardware address. There's certainly scope for it to be a
> > bit more dynamic in the sense of adding an interface to let userspace
> > move it around (before attaching any devices, at least), but I don't
> > think it's feasible for the host kernel to second-guess userspace enough
> > to make it entirely transparent like it is in the DMA API domain case.
> >
> > Of course, that's all assuming the host itself is using a virtio-iommu
> > (e.g. in a nested virt or emulation scenario). When it's purely within a
> > guest then an MSI reservation shouldn't matter so much, since the guest
> > won't be anywhere near the real hardware configuration anyway.
> >
> > Robin.
> 
> Curious since anyway we are defining a new iommu architecture
> is it possible to avoid those ARM-specific burden completely?
> 

OK, after some study around those tricks below is my learning:

- MSI_IOVA window is used only on request (iommu_dma_get
_msi_page), not meant to take effect on all architectures once 
initialized. e.g. ARM GIC does it but not x86. So it is reasonable 
for virtio-iommu driver to implement such capability;

- I thought whether hardware MSI doorbell can be always reported
on virtio-iommu since it's newly defined. Looks there is a problem
if underlying IOMMU is sw-managed MSI style - valid mapping is
expected in all level of translations, meaning guest has to manage
stage-1 mapping in nested configuration since stage-1 is owned
by guest. 

Then virtio-iommu is naturally expected to report the same MSI 
model as supported by underlying hardware. Below are some
further thoughts along this route (use 'IOMMU' to represent the
physical one and 'virtio-iommu' for virtual one):



In the scope of current virtio-iommu spec v.6, there is no nested
consideration yet. Guest driver is expected to use MAP/UNMAP
interface on assigned endpoints. In this case the MAP requests
(IOVA->GPA) is caught and maintained within Qemu which then 
further talks to VFIO to map IOVA->HPA in IOMMU.

Qemu can learn the MSI model of IOMMU from sysfs.

For hardware MSI doorbell (x86 and some ARM):
* Host kernel reports to Qemu as IOMMU_RESV_MSI
* Qemu report to guest as VIRTIO_IOMMU_RESV_MEM_T_MSI
* Guest takes the range as IOMMU_RESV_MSI. reserved
* Qemu MAP database has no mapping for the doorbell
* Physical IOMMU page table has no mapping for the doorbell
* MSI from passthrough device bypass IOMMU
* MSI from emulated device bypass virtio-iommu

For software MSI doorbell (most ARM):
* Host kernel reports to Qemu as IOMMU_RESV_SW_MSI
* Qemu report to guest as VIRTIO_IOMMU_RESV_MEM_T_RESERVED
* Guest takes the range as IOMMU_RESV_RESERVED
* vGIC requests to map 'GPA of the virtual doorbell'
* a map request (IOVA->GPA) sent on endpoint
* Qemu maintains the mapping in MAP database
* but no VFIO_MAP request since it's purely virtual
* GIC requests to map 'HPA of the physical doorbell'
* e.g. triggered by VFIO enable msi
* IOMMU now includes a valid mapping (IOVA->HPA)
* MSI from emulated device go through Qemu MAP
database (IOVA->'GPA of virtual doorbell') and then hit vGIC
* MSI from passthrough device go through IOMMU
(IOVA->'HPA of physical doorbell') and then hit GIC

In this case, host