[RFC PATCH v2 4/5] ACPI/IORT: Support paravirtualized IOMMU

2017-11-17 Thread Jean-Philippe Brucker
To describe the virtual topology in relation to a virtio-iommu device,
ACPI-based systems use a "paravirtualized IOMMU" IORT node. Add support
for it.

This is an RFC because the IORT specification doesn't describe the
paravirtualized node at the moment; it is only provided as an example in
the virtio-iommu spec. What we need to do first is confirm that x86
kernels are able to use the IORT driver with the virtio-iommu. There isn't
anything specific to arm64 in the driver, but there might be other blockers
we're not aware of (I know, for example, that x86 also requires custom DMA
ops rather than the iommu-dma ones, but that's unrelated), so this needs
to be tested on the x86 prototype.

Rationale: virtio-iommu requires an ACPI table to be passed between host
and guest that describes its relation to PCI and platform endpoints in the
virtual system: a table that maps PCI RIDs and integrated devices to IOMMU
device IDs, telling the IOMMU driver which endpoints it manages.

As far as I'm aware, there are three existing tables that solve this
problem: Intel DMAR, AMD IVRS and ARM IORT. The first two are specific to
Intel VT-d and AMD IOMMU respectively, while the third describes multiple
remapping devices -- currently only ARM IOMMUs and MSI controllers, but it
is easy to extend.

The IORT table and driver are the easiest to extend and they do the job,
so rather than introducing a fourth solution to a generic problem, reuse
what exists. An example ID mapping is sketched below.
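
For reference, IORT already encodes this kind of translation with ID
mapping entries (struct acpi_iort_id_mapping in include/acpi/actbl2.h).
Below is a minimal sketch of the mapping a virtual IORT could carry for
a root complex whose PCI RIDs translate 1:1 to virtio-iommu endpoint
IDs. The values are made up for illustration (in particular the node
offset), and note that the IORT spec defines the ID count as the number
of IDs in the range minus one.

#include <acpi/actbl2.h>

/* Hypothetical 1:1 mapping of RIDs 0x0000-0xffff to endpoint IDs */
static const struct acpi_iort_id_mapping example_mapping = {
	.input_base       = 0x0000,	/* first RID in the range */
	.id_count         = 0xffff,	/* number of IDs minus one */
	.output_base      = 0x0000,	/* first endpoint ID */
	.output_reference = 0x48,	/* table offset of the pviommu node (placeholder) */
	.flags            = 0,
};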

Signed-off-by: Jean-Philippe Brucker 
---
 drivers/acpi/arm64/iort.c | 95 +++
 drivers/iommu/Kconfig |  1 +
 include/acpi/actbl2.h | 18 +-
 3 files changed, 106 insertions(+), 8 deletions(-)

diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
index fde279b0a6d8..c7132e4a0560 100644
--- a/drivers/acpi/arm64/iort.c
+++ b/drivers/acpi/arm64/iort.c
@@ -29,7 +29,8 @@
 #define IORT_TYPE_MASK(type)   (1 << (type))
 #define IORT_MSI_TYPE  (1 << ACPI_IORT_NODE_ITS_GROUP)
 #define IORT_IOMMU_TYPE((1 << ACPI_IORT_NODE_SMMU) |   \
-   (1 << ACPI_IORT_NODE_SMMU_V3))
+   (1 << ACPI_IORT_NODE_SMMU_V3) | \
+   (1 << ACPI_IORT_NODE_PARAVIRT))
 
 /* Until ACPICA headers cover IORT rev. C */
 #ifndef ACPI_IORT_SMMU_V3_CAVIUM_CN99XX
@@ -616,6 +617,8 @@ static inline bool iort_iommu_driver_enabled(u8 type)
return IS_BUILTIN(CONFIG_ARM_SMMU_V3);
case ACPI_IORT_NODE_SMMU:
return IS_BUILTIN(CONFIG_ARM_SMMU);
+   case ACPI_IORT_NODE_PARAVIRT:
+   return IS_BUILTIN(CONFIG_VIRTIO_IOMMU);
default:
pr_warn("IORT node type %u does not describe an SMMU\n", type);
return false;
@@ -1062,6 +1065,48 @@ static bool __init arm_smmu_is_coherent(struct acpi_iort_node *node)
return smmu->flags & ACPI_IORT_SMMU_COHERENT_WALK;
 }
 
+static int __init paravirt_count_resources(struct acpi_iort_node *node)
+{
+   struct acpi_iort_pviommu *pviommu;
+
+   pviommu = (struct acpi_iort_pviommu *)node->node_data;
+
+   /* Mem + IRQs */
+   return 1 + pviommu->interrupt_count;
+}
+
+static void __init paravirt_init_resources(struct resource *res,
+  struct acpi_iort_node *node)
+{
+   int i;
+   int num_res = 0;
+   int hw_irq, trigger;
+   struct acpi_iort_pviommu *pviommu;
+
+   pviommu = (struct acpi_iort_pviommu *)node->node_data;
+
+   res[num_res].start = pviommu->base_address;
+   res[num_res].end = pviommu->base_address + pviommu->span - 1;
+   res[num_res].flags = IORESOURCE_MEM;
+   num_res++;
+
+   for (i = 0; i < pviommu->interrupt_count; i++) {
+   hw_irq = IORT_IRQ_MASK(pviommu->interrupts[i]);
+   trigger = IORT_IRQ_TRIGGER_MASK(pviommu->interrupts[i]);
+
+   acpi_iort_register_irq(hw_irq, "pviommu", trigger, &res[num_res++]);
+   }
+}
+
+static bool __init paravirt_is_coherent(struct acpi_iort_node *node)
+{
+   struct acpi_iort_pviommu *pviommu;
+
+   pviommu = (struct acpi_iort_pviommu *)node->node_data;
+
+   return pviommu->flags & ACPI_IORT_NODE_PV_CACHE_COHERENT;
+}
+
 struct iort_iommu_config {
const char *name;
int (*iommu_init)(struct acpi_iort_node *node);
@@ -1088,6 +1133,13 @@ static const struct iort_iommu_config iort_arm_smmu_cfg __initconst = {
.iommu_init_resources = arm_smmu_init_resources
 };
 
+static const struct iort_iommu_config iort_paravirt_cfg __initconst = {
+   .name = "pviommu",
+   .iommu_is_coherent = paravirt_is_coherent,
+   .iommu_count_resources = paravirt_count_resources,
+   .iommu_init_resources = paravirt_init_resources
+};
+
 static __init
 const struct iort_iommu_config *iort_get_iommu_cfg(struct acpi_iort_node *node)
 {
@@ -1096,18 +1148,22 @@ const struct iort_iommu_config *iort_get_iommu_cfg(struct acpi_iort_node

[RFC PATCH v2 1/5] iommu: Add virtio-iommu driver

2017-11-17 Thread Jean-Philippe Brucker
The virtio IOMMU is a para-virtualized device that allows sending IOMMU
requests such as map/unmap over the virtio-mmio transport without
emulating page tables. This implementation handles ATTACH, DETACH, MAP
and UNMAP requests.

The bulk of the code is to create requests and send them through virtio.
Implementing the IOMMU API is fairly straightforward since the
virtio-iommu MAP/UNMAP interface is almost identical.
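
To illustrate how thin the translation layer is, here is a rough sketch
of filling a map request from the arguments of an IOMMU API map() call.
The struct below is illustrative only: the field names and opcode value
approximate the v0.5 spec, and the real layout lives in
include/uapi/linux/virtio_iommu.h in this series.

#include <linux/types.h>
#include <asm/byteorder.h>

#define EXAMPLE_T_MAP	0x42	/* placeholder opcode, not the real value */

/* Illustrative request layout, not the actual uapi struct */
struct example_req_map {
	u8	type;
	u8	reserved[3];
	__le32	domain;		/* domain the endpoint is attached to */
	__le64	virt_addr;	/* IOVA */
	__le64	phys_addr;	/* guest-physical address */
	__le64	size;
	__le32	flags;		/* R/W/X permissions */
};

static void example_fill_map_req(struct example_req_map *req, u32 domain_id,
				 unsigned long iova, u64 paddr, size_t size,
				 u32 flags)
{
	/* An IOMMU API map() translates almost field for field */
	req->type	= EXAMPLE_T_MAP;
	req->domain	= cpu_to_le32(domain_id);
	req->virt_addr	= cpu_to_le64(iova);
	req->phys_addr	= cpu_to_le64(paddr);
	req->size	= cpu_to_le64(size);
	req->flags	= cpu_to_le32(flags);
}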

Signed-off-by: Jean-Philippe Brucker 
---
 drivers/iommu/Kconfig |  11 +
 drivers/iommu/Makefile|   1 +
 drivers/iommu/virtio-iommu.c  | 958 ++
 include/uapi/linux/virtio_ids.h   |   1 +
 include/uapi/linux/virtio_iommu.h | 140 ++
 5 files changed, 1111 insertions(+)
 create mode 100644 drivers/iommu/virtio-iommu.c
 create mode 100644 include/uapi/linux/virtio_iommu.h

diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 17b212f56e6a..7271e59e8b23 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -403,4 +403,15 @@ config QCOM_IOMMU
help
  Support for IOMMU on certain Qualcomm SoCs.
 
+config VIRTIO_IOMMU
+   bool "Virtio IOMMU driver"
+   depends on VIRTIO_MMIO
+   select IOMMU_API
+   select INTERVAL_TREE
+   select ARM_DMA_USE_IOMMU if ARM
+   help
+ Para-virtualised IOMMU driver with virtio.
+
+ Say Y here if you intend to run this kernel as a guest.
+
 endif # IOMMU_SUPPORT
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index dca71fe1c885..432242f3a328 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -31,3 +31,4 @@ obj-$(CONFIG_EXYNOS_IOMMU) += exynos-iommu.o
 obj-$(CONFIG_FSL_PAMU) += fsl_pamu.o fsl_pamu_domain.o
 obj-$(CONFIG_S390_IOMMU) += s390-iommu.o
 obj-$(CONFIG_QCOM_IOMMU) += qcom_iommu.o
+obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o
diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c
new file mode 100644
index ..feb8c8925c3a
--- /dev/null
+++ b/drivers/iommu/virtio-iommu.c
@@ -0,0 +1,958 @@
+/*
+ * Virtio driver for the paravirtualized IOMMU
+ *
+ * Copyright (C) 2017 ARM Limited
+ * Author: Jean-Philippe Brucker 
+ *
+ * SPDX-License-Identifier: GPL-2.0
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#define MSI_IOVA_BASE  0x800
+#define MSI_IOVA_LENGTH0x10
+
+struct viommu_dev {
+   struct iommu_device iommu;
+   struct device   *dev;
+   struct virtio_device*vdev;
+
+   struct ida  domain_ids;
+
+   struct virtqueue*vq;
+   /* Serialize anything touching the request queue */
+   spinlock_t  request_lock;
+
+   /* Device configuration */
+   struct iommu_domain_geometrygeometry;
+   u64 pgsize_bitmap;
+   u8  domain_bits;
+};
+
+struct viommu_mapping {
+   phys_addr_t paddr;
+   struct interval_tree_node   iova;
+   union {
+   struct virtio_iommu_req_map map;
+   struct virtio_iommu_req_unmap unmap;
+   } req;
+};
+
+struct viommu_domain {
+   struct iommu_domain domain;
+   struct viommu_dev   *viommu;
+   struct mutexmutex;
+   unsigned intid;
+
+   spinlock_t  mappings_lock;
+   struct rb_root_cached   mappings;
+
+   /* Number of endpoints attached to this domain */
+   refcount_t  endpoints;
+};
+
+struct viommu_endpoint {
+   struct viommu_dev   *viommu;
+   struct viommu_domain*vdomain;
+};
+
+struct viommu_request {
+   struct scatterlist  top;
+   struct scatterlist  bottom;
+
+   int written;
+   struct list_headlist;
+};
+
+#define to_viommu_domain(domain) container_of(domain, struct viommu_domain, domain)
+
+/* Virtio transport */
+
+static int viommu_status_to_errno(u8 status)
+{
+   switch (status) {
+   case VIRTIO_IOMMU_S_OK:
+   return 0;
+   case VIRTIO_IOMMU_S_UNSUPP:
+   return -ENOSYS;
+   case VIRTIO_IOMMU_S_INVAL:
+   return -EINVAL;
+   case VIRTIO_IOMMU_S_RANGE:
+   return -ERANGE;
+   case VIRTIO_IOMMU_S_NOENT:
+   return -ENOENT;
+   case VIRTIO_IOMMU_S_FAULT:
+   return -EFAULT;
+   case VIRTIO_IOMMU_S_IOERR:
+   case VIRTIO_IOMMU_S_DEVERR:
+   default:
+   return -EIO;
+   }
+}
+
+/*
+ * viommu_get_req_size - compute request size
+ *
+ * A virtio-iommu request is split in

[RFC PATCH v2 3/5] iommu/virtio-iommu: Add event queue

2017-11-17 Thread Jean-Philippe Brucker
The event queue offers a way for the device to report access faults from
endpoints. It is implemented on virtqueue #1: whenever the host needs to
signal a fault, it fills one of the buffers offered by the guest and
interrupts it.
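
For context, the guest has to keep virtqueue #1 stocked with writable
buffers, otherwise the host has nowhere to write fault reports. A
minimal sketch of that pattern follows; the helper name and calling
context are hypothetical, and it assumes the viommu_event structure
added by this patch, with evts[] living at least as long as the device.

#include <linux/gfp.h>
#include <linux/scatterlist.h>
#include <linux/virtio.h>

static int example_fill_event_vq(struct virtqueue *vq,
				 struct viommu_event *evts, size_t nr_evts)
{
	int ret;
	size_t i;
	struct scatterlist sg[1];

	for (i = 0; i < nr_evts; i++) {
		/* Device-writable buffer for one fault report */
		sg_init_one(sg, &evts[i], sizeof(struct viommu_event));
		ret = virtqueue_add_inbuf(vq, sg, 1, &evts[i], GFP_KERNEL);
		if (ret)
			return ret;
	}

	virtqueue_kick(vq);
	return 0;
}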

Signed-off-by: Jean-Philippe Brucker 
---
 drivers/iommu/virtio-iommu.c  | 138 ++
 include/uapi/linux/virtio_iommu.h |  18 +
 2 files changed, 142 insertions(+), 14 deletions(-)

diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c
index 79e0add94e05..fe0d449bf489 100644
--- a/drivers/iommu/virtio-iommu.c
+++ b/drivers/iommu/virtio-iommu.c
@@ -30,6 +30,12 @@
 #define MSI_IOVA_BASE  0x800
 #define MSI_IOVA_LENGTH0x10
 
+enum viommu_vq_idx {
+   VIOMMU_REQUEST_VQ   = 0,
+   VIOMMU_EVENT_VQ = 1,
+   VIOMMU_NUM_VQS  = 2,
+};
+
 struct viommu_dev {
struct iommu_device iommu;
struct device   *dev;
@@ -37,7 +43,7 @@ struct viommu_dev {
 
struct ida  domain_ids;
 
-   struct virtqueue*vq;
+   struct virtqueue*vqs[VIOMMU_NUM_VQS];
/* Serialize anything touching the request queue */
spinlock_t  request_lock;
 
@@ -84,6 +90,15 @@ struct viommu_request {
struct list_headlist;
 };
 
+#define VIOMMU_FAULT_RESV_MASK 0xff00
+
+struct viommu_event {
+   union {
+   u32 head;
+   struct virtio_iommu_fault fault;
+   };
+};
+
#define to_viommu_domain(domain) container_of(domain, struct viommu_domain, domain)
 
 /* Virtio transport */
@@ -160,12 +175,13 @@ static int viommu_receive_resp(struct viommu_dev *viommu, int nr_sent,
unsigned int len;
int nr_received = 0;
struct viommu_request *req, *pending;
+   struct virtqueue *vq = viommu->vqs[VIOMMU_REQUEST_VQ];
 
pending = list_first_entry_or_null(sent, struct viommu_request, list);
if (WARN_ON(!pending))
return 0;
 
-   while ((req = virtqueue_get_buf(viommu->vq, &len)) != NULL) {
+   while ((req = virtqueue_get_buf(vq, &len)) != NULL) {
if (req != pending) {
dev_warn(viommu->dev, "discarding stale request\n");
continue;
@@ -202,6 +218,7 @@ static int _viommu_send_reqs_sync(struct viommu_dev *viommu,
 * dies.
 */
unsigned long timeout_ms = 1000;
+   struct virtqueue *vq = viommu->vqs[VIOMMU_REQUEST_VQ];
 
*nr_sent = 0;
 
@@ -211,15 +228,14 @@ static int _viommu_send_reqs_sync(struct viommu_dev *viommu,
sg[0] = &req->top;
sg[1] = &req->bottom;
 
-   ret = virtqueue_add_sgs(viommu->vq, sg, 1, 1, req,
-   GFP_ATOMIC);
+   ret = virtqueue_add_sgs(vq, sg, 1, 1, req, GFP_ATOMIC);
if (ret)
break;
 
list_add_tail(&req->list, &pending);
}
 
-   if (i && !virtqueue_kick(viommu->vq))
+   if (i && !virtqueue_kick(vq))
return -EPIPE;
 
timeout = ktime_add_ms(ktime_get(), timeout_ms * i);
@@ -554,6 +570,70 @@ static int viommu_probe_endpoint(struct viommu_dev *viommu, struct device *dev)
return 0;
 }
 
+static int viommu_fault_handler(struct viommu_dev *viommu,
+   struct virtio_iommu_fault *fault)
+{
+   char *reason_str;
+
+   u8 reason   = fault->reason;
+   u32 flags   = le32_to_cpu(fault->flags);
+   u32 endpoint= le32_to_cpu(fault->endpoint);
+   u64 address = le64_to_cpu(fault->address);
+
+   switch (reason) {
+   case VIRTIO_IOMMU_FAULT_R_DOMAIN:
+   reason_str = "domain";
+   break;
+   case VIRTIO_IOMMU_FAULT_R_MAPPING:
+   reason_str = "page";
+   break;
+   case VIRTIO_IOMMU_FAULT_R_UNKNOWN:
+   default:
+   reason_str = "unknown";
+   break;
+   }
+
+   /* TODO: find EP by ID and report_iommu_fault */
+   if (flags & VIRTIO_IOMMU_FAULT_F_ADDRESS)
+   dev_err_ratelimited(viommu->dev, "%s fault from EP %u at %#llx [%s%s%s]\n",
+   reason_str, endpoint, address,
+   flags & VIRTIO_IOMMU_FAULT_F_READ ? "R" : "",
+   flags & VIRTIO_IOMMU_FAULT_F_WRITE ? "W" : "",
+   flags & VIRTIO_IOMMU_FAULT_F_EXEC ? "X" : "");
+   else
+   dev_err_ratelimited(viommu->dev, "%s fault from EP %u\n",
+   reason_str, endpoint);
+
+   return 0;
+}
+
+static void viommu_event_handler(struct virtqueue *vq)
+{
+   int ret;
+   unsigned int len;
+   struct scatterlist sg[1];
+  

[RFC PATCH v2 2/5] iommu/virtio-iommu: Add probe request

2017-11-17 Thread Jean-Philippe Brucker
When the device offers the probe feature, send a probe request for each
device managed by the IOMMU. Extract RESV_MEM information. When we
encounter an MSI doorbell region, set it up as an IOMMU_RESV_MSI region.
This will tell other subsystems that there is no need to map the MSI
doorbell in the virtio-iommu, because MSIs bypass it.
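
The probe reply is a sequence of type/length/value properties terminated
by a NONE type, so parsing amounts to a bounded TLV walk, which is what
viommu_probe_endpoint() below implements. A generic sketch of the
pattern, with an illustrative property header standing in for struct
virtio_iommu_probe_property from the uapi header:

#include <linux/types.h>
#include <asm/byteorder.h>

/* Illustrative TLV header */
struct example_prop {
	__le16	type;
	__le16	length;
	u8	value[];
};

static void example_parse_props(void *buf, size_t buf_size)
{
	size_t cur = 0;

	while (cur + sizeof(struct example_prop) <= buf_size) {
		struct example_prop *prop = buf + cur;
		u16 type = le16_to_cpu(prop->type);
		u16 len = le16_to_cpu(prop->length);

		if (!type)	/* a NONE type terminates the list */
			break;

		/* dispatch on type here, e.g. a RESV_MEM property */

		cur += sizeof(*prop) + len;
	}
}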

Signed-off-by: Jean-Philippe Brucker 
---
 drivers/iommu/virtio-iommu.c  | 165 --
 include/uapi/linux/virtio_iommu.h |  37 +
 2 files changed, 195 insertions(+), 7 deletions(-)

diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c
index feb8c8925c3a..79e0add94e05 100644
--- a/drivers/iommu/virtio-iommu.c
+++ b/drivers/iommu/virtio-iommu.c
@@ -45,6 +45,7 @@ struct viommu_dev {
struct iommu_domain_geometrygeometry;
u64 pgsize_bitmap;
u8  domain_bits;
+   u32 probe_size;
 };
 
 struct viommu_mapping {
@@ -72,6 +73,7 @@ struct viommu_domain {
 struct viommu_endpoint {
struct viommu_dev   *viommu;
struct viommu_domain*vdomain;
+   struct list_headresv_regions;
 };
 
 struct viommu_request {
@@ -139,6 +141,10 @@ static int viommu_get_req_size(struct viommu_dev *viommu,
case VIRTIO_IOMMU_T_UNMAP:
size = sizeof(r->unmap);
break;
+   case VIRTIO_IOMMU_T_PROBE:
+   *bottom += viommu->probe_size;
+   size = sizeof(r->probe) + *bottom;
+   break;
default:
return -EINVAL;
}
@@ -448,6 +454,106 @@ static int viommu_replay_mappings(struct viommu_domain *vdomain)
return ret;
 }
 
+static int viommu_add_resv_mem(struct viommu_endpoint *vdev,
+  struct virtio_iommu_probe_resv_mem *mem,
+  size_t len)
+{
+   struct iommu_resv_region *region = NULL;
+   unsigned long prot = IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO;
+
+   u64 addr = le64_to_cpu(mem->addr);
+   u64 size = le64_to_cpu(mem->size);
+
+   if (len < sizeof(*mem))
+   return -EINVAL;
+
+   switch (mem->subtype) {
+   case VIRTIO_IOMMU_RESV_MEM_T_MSI:
+   region = iommu_alloc_resv_region(addr, size, prot, IOMMU_RESV_MSI);
+   break;
+   case VIRTIO_IOMMU_RESV_MEM_T_RESERVED:
+   default:
+   region = iommu_alloc_resv_region(addr, size, 0, IOMMU_RESV_RESERVED);
+   break;
+   }
+
+   /* The allocation can fail; don't link or dereference a NULL region */
+   if (!region)
+   return -ENOMEM;
+
+   list_add(&region->list, &vdev->resv_regions);
+
+   if (mem->subtype != VIRTIO_IOMMU_RESV_MEM_T_RESERVED &&
+   mem->subtype != VIRTIO_IOMMU_RESV_MEM_T_MSI) {
+   /* Please update your driver. */
+   pr_warn("unknown resv mem subtype 0x%x\n", mem->subtype);
+   return -EINVAL;
+   }
+
+   return 0;
+}
+
+static int viommu_probe_endpoint(struct viommu_dev *viommu, struct device *dev)
+{
+   int ret;
+   u16 type, len;
+   size_t cur = 0;
+   struct virtio_iommu_req_probe *probe;
+   struct virtio_iommu_probe_property *prop;
+   struct iommu_fwspec *fwspec = dev->iommu_fwspec;
+   struct viommu_endpoint *vdev = fwspec->iommu_priv;
+
+   if (!fwspec->num_ids)
+   /* Trouble ahead. */
+   return -EINVAL;
+
+   probe = kzalloc(sizeof(*probe) + viommu->probe_size +
+   sizeof(struct virtio_iommu_req_tail), GFP_KERNEL);
+   if (!probe)
+   return -ENOMEM;
+
+   probe->head.type = VIRTIO_IOMMU_T_PROBE;
+   /*
+* For now, assume that properties of an endpoint that outputs multiple
+* IDs are consistent. Only probe the first one.
+*/
+   probe->endpoint = cpu_to_le32(fwspec->ids[0]);
+
+   ret = viommu_send_req_sync(viommu, probe);
+   if (ret) {
+   kfree(probe);
+   return ret;
+   }
+
+   prop = (void *)probe->properties;
+   type = le16_to_cpu(prop->type) & VIRTIO_IOMMU_PROBE_T_MASK;
+
+   while (type != VIRTIO_IOMMU_PROBE_T_NONE &&
+  cur < viommu->probe_size) {
+   len = le16_to_cpu(prop->length);
+
+   switch (type) {
+   case VIRTIO_IOMMU_PROBE_T_RESV_MEM:
+   ret = viommu_add_resv_mem(vdev, (void *)prop->value, len);
+   break;
+   default:
+   dev_dbg(dev, "unknown viommu prop 0x%x\n", type);
+   }
+
+   if (ret)
+   dev_err(dev, "failed to parse viommu prop 0x%x\n", type);
+
+   cur += sizeof(*prop) + len;
+   if (cur >= viommu->probe_size)
+   break;
+
+   prop = (void *)probe->properties + cur;
+

[RFC PATCH v2 0/5] Add virtio-iommu driver

2017-11-17 Thread Jean-Philippe Brucker
Implement the virtio-iommu driver, following version 0.5 of the
specification [1]. The previous version of this code was sent back in
April [2], implementing the first public RFC. Since then there has been
lots of progress and discussion on the specification side, and I think
the driver is in good shape now.

The reason patches 1-3 are only RFC is that I'm waiting on feedback from
the Virtio TC to reserve a device ID.

List of changes since previous RFC:
* Add per-endpoint probe request, for hardware MSI and reserved regions.
* Add a virtqueue for the device to report translation faults. Only
  non-recoverable ones at the moment.
* Removed the iommu_map_sg specialization for now, because none of the
  device drivers I use for testing (virtio, ixgbe and internal DMA
  engines) seem to use map_sg. This kind of feature is a lot more
  interesting when accompanied by benchmark numbers, and can be added back
  during future optimization work.
* Many fixes and cleanup

The driver works out of the box on DT-based systems, but ACPI support
still needs to be tested and discussed. In the specification I proposed
IORT tables as a nice candidate for describing the virtual topology.
Patches 4 and 5 propose small changes to the IORT driver for
instantiating a paravirtualized IOMMU. The IORT node is described in the
specification [1]. x86 support will also require some hacks, since the
driver is based on the IOMMU DMA ops, which x86 doesn't use.

Eric's latest QEMU device [3] works with v0.4. For the moment you can use
the kvmtool device [4] to test v0.5 on arm64, and inject arbitrary faults
with the debug tool. The driver can also be pulled from my Linux tree [5].

[1] https://www.spinics.net/lists/kvm/msg157402.html
[2] https://patchwork.kernel.org/patch/9670273/
[3] https://lists.gnu.org/archive/html/qemu-arm/2017-09/msg00413.html
[4] git://linux-arm.org/kvmtool-jpb.git virtio-iommu/base
[5] git://linux-arm.org/linux-jpb.git virtio-iommu/v0.5-dev

Jean-Philippe Brucker (5):
  iommu: Add virtio-iommu driver
  iommu/virtio-iommu: Add probe request
  iommu/virtio-iommu: Add event queue
  ACPI/IORT: Support paravirtualized IOMMU
  ACPI/IORT: Move IORT to the ACPI folder

 drivers/acpi/Kconfig  |3 +
 drivers/acpi/Makefile |1 +
 drivers/acpi/arm64/Kconfig|3 -
 drivers/acpi/arm64/Makefile   |1 -
 drivers/acpi/{arm64 => }/iort.c   |   95 ++-
 drivers/iommu/Kconfig |   12 +
 drivers/iommu/Makefile|1 +
 drivers/iommu/virtio-iommu.c  | 1219 +
 include/acpi/actbl2.h |   18 +-
 include/uapi/linux/virtio_ids.h   |1 +
 include/uapi/linux/virtio_iommu.h |  195 ++
 11 files changed, 1537 insertions(+), 12 deletions(-)
 rename drivers/acpi/{arm64 => }/iort.c (92%)
 create mode 100644 drivers/iommu/virtio-iommu.c
 create mode 100644 include/uapi/linux/virtio_iommu.h

-- 
2.14.3



[RFC PATCH v2 5/5] ACPI/IORT: Move IORT to the ACPI folder

2017-11-17 Thread Jean-Philippe Brucker
IORT can be used (by QEMU) to describe a virtual topology containing an
architecture-agnostic paravirtualized device. The rationale behind this
blasphemy is explained in patch 4/5.

In order to build IORT for x86 systems, the driver has to be moved outside
of arm64/. Since there is nothing specific to arm64 in the driver, it
simply requires moving Makefile and Kconfig entries.

Signed-off-by: Jean-Philippe Brucker 
---
 drivers/acpi/Kconfig| 3 +++
 drivers/acpi/Makefile   | 1 +
 drivers/acpi/arm64/Kconfig  | 3 ---
 drivers/acpi/arm64/Makefile | 1 -
 drivers/acpi/{arm64 => }/iort.c | 0
 5 files changed, 4 insertions(+), 4 deletions(-)
 rename drivers/acpi/{arm64 => }/iort.c (100%)

diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig
index 5b1938f4b626..ce40275646c8 100644
--- a/drivers/acpi/Kconfig
+++ b/drivers/acpi/Kconfig
@@ -536,4 +536,7 @@ if ARM64
 source "drivers/acpi/arm64/Kconfig"
 endif
 
+config ACPI_IORT
+   bool
+
 endif  # ACPI
diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile
index cd1abc9bc325..689c470c013b 100644
--- a/drivers/acpi/Makefile
+++ b/drivers/acpi/Makefile
@@ -112,3 +112,4 @@ video-objs  += acpi_video.o video_detect.o
 obj-y  += dptf/
 
 obj-$(CONFIG_ARM64)+= arm64/
+obj-$(CONFIG_ACPI_IORT)+= iort.o
diff --git a/drivers/acpi/arm64/Kconfig b/drivers/acpi/arm64/Kconfig
index 5a6f80fce0d6..403f917ab274 100644
--- a/drivers/acpi/arm64/Kconfig
+++ b/drivers/acpi/arm64/Kconfig
@@ -2,8 +2,5 @@
 # ACPI Configuration for ARM64
 #
 
-config ACPI_IORT
-   bool
-
 config ACPI_GTDT
bool
diff --git a/drivers/acpi/arm64/Makefile b/drivers/acpi/arm64/Makefile
index 1017def2ea12..47925dc6cfc8 100644
--- a/drivers/acpi/arm64/Makefile
+++ b/drivers/acpi/arm64/Makefile
@@ -1,2 +1 @@
-obj-$(CONFIG_ACPI_IORT)+= iort.o
 obj-$(CONFIG_ACPI_GTDT)+= gtdt.o
diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/iort.c
similarity index 100%
rename from drivers/acpi/arm64/iort.c
rename to drivers/acpi/iort.c
-- 
2.14.3



[PATCH] KVM: arm/arm64: VGIC: extend !vgic_is_initialized guard

2017-11-17 Thread Andre Przywara
Commit f39d16cbabf9 ("KVM: arm/arm64: Guard kvm_vgic_map_is_active against
!vgic_initialized") introduced a check whether the VGIC has been
initialized before accessing the spinlock and the VGIC data structure.
However, the vgic_get_irq() call in the variable declaration sneaked
through the net, so let's make sure that it too gets called only after
we have actually allocated the arrays this function accesses.

Signed-off-by: Andre Przywara 
---
 virt/kvm/arm/vgic/vgic.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c
index e54ef2fdf73d..967983a33ab2 100644
--- a/virt/kvm/arm/vgic/vgic.c
+++ b/virt/kvm/arm/vgic/vgic.c
@@ -786,13 +786,14 @@ void vgic_kick_vcpus(struct kvm *kvm)
 
 bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq)
 {
-   struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, virt_irq);
+   struct vgic_irq *irq;
bool map_is_active;
unsigned long flags;
 
if (!vgic_initialized(vcpu->kvm))
return false;
 
+   irq = vgic_get_irq(vcpu->kvm, vcpu, virt_irq);
spin_lock_irqsave(&irq->irq_lock, flags);
map_is_active = irq->hw && irq->active;
spin_unlock_irqrestore(&irq->irq_lock, flags);
-- 
2.14.1



Re: [RFC PATCH 0/4] Initial KVM SVE support hacks

2017-11-17 Thread Dave Martin
On Fri, Nov 17, 2017 at 04:38:51PM +, Dave Martin wrote:
> Throwing out an RFC here now that I've got something _sort of_ working.
> 
> This is based on the base SVE patches as now present in
> torvalds/master [1], but not on Christoffer's SVE optimisations (for

(That should be: Christoffer's _VHE_ optimisations, though if he
has some SVE optimisations I would naturally be interested...)

[...]

Cheers
---Dave


[RFC PATCH 1/4] arm64: fpsimd: Abstract out binding of task's fpsimd context to the cpu.

2017-11-17 Thread Dave Martin
There is currently some duplicate logic to associate current's
FPSIMD context with the cpu when loading FPSIMD state into the cpu
regs.

Subsequent patches will update that logic, so in order to ensure it
only needs to be done in one place, this patch factors the relevant
code out into a new function fpsimd_bind_to_cpu().

Signed-off-by: Dave Martin 
---
 arch/arm64/kernel/fpsimd.c | 25 +++--
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 143b3e7..007140b 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -992,6 +992,18 @@ void fpsimd_signal_preserve_current_state(void)
 }
 
 /*
+ * Associate current's FPSIMD context with this cpu
+ * Preemption must be disabled when calling this function.
+ */
+static void fpsimd_bind_to_cpu(void)
+{
+   struct fpsimd_state *st = &current->thread.fpsimd_state;
+
+   __this_cpu_write(fpsimd_last_state, st);
+   st->cpu = smp_processor_id();
+}
+
+/*
  * Load the userland FPSIMD state of 'current' from memory, but only if the
  * FPSIMD state already held in the registers is /not/ the most recent FPSIMD
  * state of 'current'
@@ -1004,11 +1016,8 @@ void fpsimd_restore_current_state(void)
local_bh_disable();
 
if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) {
-   struct fpsimd_state *st = &current->thread.fpsimd_state;
-
task_fpsimd_load();
-   __this_cpu_write(fpsimd_last_state, st);
-   st->cpu = smp_processor_id();
+   fpsimd_bind_to_cpu();
}
 
local_bh_enable();
@@ -1032,12 +1041,8 @@ void fpsimd_update_current_state(struct fpsimd_state 
*state)
}
task_fpsimd_load();
 
-   if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) {
-   struct fpsimd_state *st = &current->thread.fpsimd_state;
-
-   __this_cpu_write(fpsimd_last_state, st);
-   st->cpu = smp_processor_id();
-   }
+   if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE))
+   fpsimd_bind_to_cpu();
 
local_bh_enable();
 }
-- 
2.1.4



[RFC PATCH 4/4] arm64/sve: KVM: Basic SVE support

2017-11-17 Thread Dave Martin
This patch is a flattened bunch of hacks for adding SVE support to
KVM.  It's intended as a starting point for comments: it is not
intended to be complete or final!

** This patch has suspected bugs and has undergone minimal testing: do
not merge **

Notes:

struct kvm_vcpu_arch does not currently include space for a guest's
SVE state, so supporting SVE in guests requires some additional
space to be allocated.  Statically allocating space per-vcpu is
wasteful, especially if this allocation is based on the theoretical
future arch maximum vector length SVE_VL_MAX.

A pointer to dynamically allocated memory would require that memory
to be mapped into hyp.  Hyp mappings cannot currently be torn down
dynamically, so this would result in a mess of kernel heap memory
getting mapped into hyp over time.

This patch adopts a compromise: enough space is allocated at the
end of each kvm_vcpu to store the SVE state, sized according to the
maximum vector length supported by the hardware.  Naturally, if the
hardware does not support SVE, no extra space is allocated at all.
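
A sketch of the sizing rule described above; the helper name is
hypothetical, and SVE_SIG_REGS_SIZE (the signal frame's register block
size) is used as a stand-in for whatever per-vcpu SVE storage size the
final code settles on:

#include <linux/kvm_host.h>
#include <asm/cpufeature.h>
#include <asm/fpsimd.h>
#include <asm/sigcontext.h>

static size_t example_vcpu_alloc_size(void)
{
	unsigned int vq;

	if (!system_supports_sve())
		return sizeof(struct kvm_vcpu);

	/* Reserve room after the vcpu for SVE state at the hardware max VL */
	vq = sve_vq_from_vl(sve_max_vl);
	return ALIGN(sizeof(struct kvm_vcpu), 16) + SVE_SIG_REGS_SIZE(vq);
}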

Context switching is implemented by adding alternative SVE code at
each site where FPSIMD context is handled.  SVE is unconditionally
provided to the guest if the host supports it.  This is a bit
crude, but good enough for a proof-of-concept implementation.

ZCR_EL1 and ZCR_EL2 are added to the sys_regs list unconditionally,
which will break userspace snapshot/restore compatibility.
Possibly a more flexible approach is needed.  The inclusion of
ZCR_EL2 here is a bit odd too: this is a feature configuration
control rather than a guest register -- it is used to clamp the
maximum vector length available to the guest.  Currently it is just
set by default to correspond to the host's maximum.
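
For reference, ZCR_ELx.LEN encodes the vector length as (LEN + 1) * 128
bits, so the clamp described above boils down to something like the
sketch below (function name hypothetical):

/* Effective guest vector quadwords, clamped by ZCR_EL2 and the hardware */
static unsigned int example_guest_vq(u64 zcr_el2)
{
	unsigned int vq_from_zcr = (zcr_el2 & 0xf) + 1;	/* LEN is bits [3:0] */
	unsigned int vq_max = sve_vq_from_vl(sve_max_vl);

	return min(vq_from_zcr, vq_max);
}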

Questions
-

 * Should userspace be able to control the maximum SVE vector
   length available to the guest, and what's the most natural way
   to do it?

   For example, it would be necessary to limit the vector length to
   the lowest common denominator in order to support migration
   across a cluster where the maximum hardware vector length
   differs between nodes.

 * Combined alternatives are really awkward.  Is there any way to
   use the standard static key based features tests in hyp?

TODO


 * Allow userspace feature control, to choose whether to expose SVE
   to a guest.

 * Attempt to port some of the KVM entry code to C, at least for the
   __fpsimd_guest_restore stuff.  The extra complexity with SVE looks
   unsustainable.

 * Figure out ABI for exposing SVE regs via the ioctl interface.

*Bugs*
--

Currently there is nothing stopping KVM userspace from
changing the guest's ZCR_EL2 after boot via the ioctl interface:
this breaks architectural assumptions in the guest, and should
really be forbidden.  Also, this is a latent trigger for
buffer overruns, if creation of guests with limited VL is
someday permitted.

Signed-off-by: Dave Martin 
---
 arch/arm64/include/asm/fpsimdmacros.h |  8 +
 arch/arm64/include/asm/kvm_host.h | 30 ++
 arch/arm64/include/asm/kvm_hyp.h  |  4 +++
 arch/arm64/include/asm/sysreg.h   |  1 +
 arch/arm64/kernel/asm-offsets.c   |  8 +
 arch/arm64/kernel/entry-fpsimd.S  |  1 -
 arch/arm64/kvm/handle_exit.c  |  2 +-
 arch/arm64/kvm/hyp/entry.S| 60 ---
 arch/arm64/kvm/hyp/fpsimd.S   | 12 +++
 arch/arm64/kvm/hyp/hyp-entry.S|  7 
 arch/arm64/kvm/hyp/switch.c   | 46 ++-
 arch/arm64/kvm/reset.c| 18 +++
 arch/arm64/kvm/sys_regs.c | 39 ---
 virt/kvm/arm/arm.c| 12 +--
 14 files changed, 221 insertions(+), 27 deletions(-)

diff --git a/arch/arm64/include/asm/fpsimdmacros.h b/arch/arm64/include/asm/fpsimdmacros.h
index e050d76..e2124c8 100644
--- a/arch/arm64/include/asm/fpsimdmacros.h
+++ b/arch/arm64/include/asm/fpsimdmacros.h
@@ -17,6 +17,12 @@
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
+#ifndef __ARM64_FPSIMDMACROS_H
+#define __ARM64_FPSIMDMACROS_H
+
+#include 
+#include 
+
 .macro fpsimd_save state, tmpnr
stp q0, q1, [\state, #16 * 0]
stp q2, q3, [\state, #16 * 2]
@@ -223,3 +229,5 @@
ldr w\nxtmp, [\xpfpsr, #4]
msr fpcr, x\nxtmp
 .endm
+
+#endif /* ! __ARM64_FPSIMDMACROS_H */
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 674912d..7045682 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -22,6 +22,7 @@
 #ifndef __ARM64_KVM_HOST_H__
 #define __ARM64_KVM_HOST_H__
 
+#include 
 #include 
 #include 
 #include 
@@ -29,6 +30,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define __KVM_HAVE_ARCH_INTC_INITIALIZED
 
@@ -102,6 +104,8 @@ enum vcpu_sysreg {
SCTLR_EL1,  /* System Control Register 

[RFC PATCH 2/4] arm64/sve: KVM: Avoid dereference of dead task during guest entry

2017-11-17 Thread Dave Martin
When deciding whether to invalidate FPSIMD state cached in the cpu,
the backend function sve_flush_cpu_state() attempts to dereference
__this_cpu_read(fpsimd_last_state).  However, this is not safe:
there is no guarantee that the pointer is still valid, because the
task could have exited in the meantime.  For this reason, this
percpu pointer should only be assigned or compared, never
dereferenced.

This means that we need another means to get the appropriate value
of TIF_SVE for the associated task.

This patch solves this issue by adding a cached copy of the TIF_SVE
flag in fpsimd_last_state, which we can check without dereferencing
the task pointer.

Signed-off-by: Dave Martin 
---
 arch/arm64/kernel/fpsimd.c | 28 
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 007140b..3dc8058 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -114,7 +114,12 @@
  *   returned from the 2nd syscall yet, TIF_FOREIGN_FPSTATE is still set so
  *   whatever is in the FPSIMD registers is not saved to memory, but discarded.
  */
-static DEFINE_PER_CPU(struct fpsimd_state *, fpsimd_last_state);
+struct fpsimd_last_state_struct {
+   struct fpsimd_state *st;
+   bool sve_in_use;
+};
+
+static DEFINE_PER_CPU(struct fpsimd_last_state_struct, fpsimd_last_state);
 
 /* Default VL for tasks that don't set it explicitly: */
 static int sve_default_vl = -1;
@@ -905,7 +910,7 @@ void fpsimd_thread_switch(struct task_struct *next)
 */
struct fpsimd_state *st = &next->thread.fpsimd_state;
 
-   if (__this_cpu_read(fpsimd_last_state) == st
+   if (__this_cpu_read(fpsimd_last_state.st) == st
&& st->cpu == smp_processor_id())
clear_tsk_thread_flag(next, TIF_FOREIGN_FPSTATE);
else
@@ -997,9 +1002,12 @@ void fpsimd_signal_preserve_current_state(void)
  */
 static void fpsimd_bind_to_cpu(void)
 {
+   struct fpsimd_last_state_struct *last =
+   this_cpu_ptr(&fpsimd_last_state);
struct fpsimd_state *st = &current->thread.fpsimd_state;
 
-   __this_cpu_write(fpsimd_last_state, st);
+   last->st = st;
+   last->sve_in_use = test_thread_flag(TIF_SVE);
st->cpu = smp_processor_id();
 }
 
@@ -1057,7 +1065,7 @@ void fpsimd_flush_task_state(struct task_struct *t)
 
 static inline void fpsimd_flush_cpu_state(void)
 {
-   __this_cpu_write(fpsimd_last_state, NULL);
+   __this_cpu_write(fpsimd_last_state.st, NULL);
 }
 
 /*
@@ -1070,14 +1078,10 @@ static inline void fpsimd_flush_cpu_state(void)
 #ifdef CONFIG_ARM64_SVE
 void sve_flush_cpu_state(void)
 {
-   struct fpsimd_state *const fpstate = __this_cpu_read(fpsimd_last_state);
-   struct task_struct *tsk;
-
-   if (!fpstate)
-   return;
+   struct fpsimd_last_state_struct const *last =
+   this_cpu_ptr(&fpsimd_last_state);
 
-   tsk = container_of(fpstate, struct task_struct, thread.fpsimd_state);
-   if (test_tsk_thread_flag(tsk, TIF_SVE))
+   if (last->st && last->sve_in_use)
fpsimd_flush_cpu_state();
 }
 #endif /* CONFIG_ARM64_SVE */
@@ -1272,7 +1276,7 @@ static inline void fpsimd_pm_init(void) { }
 #ifdef CONFIG_HOTPLUG_CPU
 static int fpsimd_cpu_dead(unsigned int cpu)
 {
-   per_cpu(fpsimd_last_state, cpu) = NULL;
+   per_cpu(fpsimd_last_state.st, cpu) = NULL;
return 0;
 }
 
-- 
2.1.4



[RFC PATCH 3/4] arm64/sve: KVM: Ensure user SVE use traps after vcpu execution

2017-11-17 Thread Dave Martin
Currently, SVE use can remain untrapped if a KVM vcpu thread is
preempted inside the kernel and we then switch back to some user
thread.

This patch ensures that SVE traps for userspace are enabled before
switching away from the vcpu thread.

In an attempt to preserve some clarity about why and when this is
needed, kvm_fpsimd_flush_cpu_state() is used as a hook for doing
this.  This means that this function needs to be called after
exiting the vcpu instead of before entry: this patch moves the call
as appropriate.  As a side-effect, this will avoid the call if vcpu
entry is short-circuited by a signal etc.

Signed-off-by: Dave Martin 
---
 arch/arm64/kernel/fpsimd.c | 2 ++
 virt/kvm/arm/arm.c | 6 +++---
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 3dc8058..3b135eb 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -1083,6 +1083,8 @@ void sve_flush_cpu_state(void)
 
if (last->st && last->sve_in_use)
fpsimd_flush_cpu_state();
+
+   sve_user_disable();
 }
 #endif /* CONFIG_ARM64_SVE */
 
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index 772bf74..554b157 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -651,9 +651,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 */
preempt_disable();
 
-   /* Flush FP/SIMD state that can't survive guest entry/exit */
-   kvm_fpsimd_flush_cpu_state();
-
kvm_pmu_flush_hwstate(vcpu);
 
local_irq_disable();
@@ -754,6 +751,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
guest_exit();
trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));
 
+   /* Flush FP/SIMD state that can't survive guest entry/exit */
+   kvm_fpsimd_flush_cpu_state();
+
preempt_enable();
 
ret = handle_exit(vcpu, run, ret);
-- 
2.1.4



[RFC PATCH 0/4] Initial KVM SVE support hacks

2017-11-17 Thread Dave Martin
Throwing out an RFC here now that I've got something _sort of_ working.

This is based on the base SVE patches as now present in
torvalds/master [1], but not on Christoffer's SVE optimisations (for
now).

In a slightly older version of this code I have seen host user tasks
reaching task_fpsimd_save() with the wrong ZCR setup.  This _might_
now be fixed, but if reviewers can pay particular attention to dodgy
looking trapping control or missing context switching that would be
much appreciated!


Notes:

Currently, I grow the vcpu slab size to be large enough to accommodate
the SVE state dangling at the end, and extend the existing FPSIMD
handling paths in KVM to deal with SVE.

Guests are allowed to use SVE whenever the system supports it, and
full SVE context is tracked unconditionally for each vcpu.


The next things to decide are

 a) how broken this implementation approach is

 b) how (or whether) to enable userspace to control whether SVE is
available to the guest and if so with what maximum vector length

 c) what the ioctl interface should look like.


Patches 1-3 contain some relevant tweaks and fixes.

Patch 4 contains the actual implementation: this is consciously a bit
of a hack today -- more detailed notes in the commit message.


[1] c9b012e5f4a1 ("Merge tag 'arm64-upstream' of git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux")


Dave Martin (4):
  arm64: fpsimd: Abstract out binding of task's fpsimd context to the
cpu.
  arm64/sve: KVM: Avoid dereference of dead task during guest entry
  arm64/sve: KVM: Ensure user SVE use traps after vcpu execution
  arm64/sve: KVM: Basic SVE support

 arch/arm64/include/asm/fpsimdmacros.h |  8 +
 arch/arm64/include/asm/kvm_host.h | 30 ++
 arch/arm64/include/asm/kvm_hyp.h  |  4 +++
 arch/arm64/include/asm/sysreg.h   |  1 +
 arch/arm64/kernel/asm-offsets.c   |  8 +
 arch/arm64/kernel/entry-fpsimd.S  |  1 -
 arch/arm64/kernel/fpsimd.c| 53 +++
 arch/arm64/kvm/handle_exit.c  |  2 +-
 arch/arm64/kvm/hyp/entry.S| 60 ---
 arch/arm64/kvm/hyp/fpsimd.S   | 12 +++
 arch/arm64/kvm/hyp/hyp-entry.S|  7 
 arch/arm64/kvm/hyp/switch.c   | 46 ++-
 arch/arm64/kvm/reset.c| 18 +++
 arch/arm64/kvm/sys_regs.c | 39 ---
 virt/kvm/arm/arm.c| 18 ---
 15 files changed, 256 insertions(+), 51 deletions(-)

-- 
2.1.4



Re: [PULL 00/27] KVM/ARM GICv4 Support for v4.15

2017-11-17 Thread Paolo Bonzini
On 13/11/2017 10:17, Christoffer Dall wrote:
>   git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git 
> tags/kvm-arm-gicv4-for-v4.15

Pulled, thanks.

Paolo