This patch adds vfio iommu support for Freescale IOMMU (PAMU -
Peripheral Access Management Unit).

The Freescale PAMU is an aperture-based IOMMU with the following
characteristics.  Each device has an entry in a table in memory
describing the iova->phys mapping. The mapping has:
   -an overall aperture that is power of 2 sized, and has a start iova that
    is naturally aligned
   -has 1 or more windows within the aperture
   -number of windows must be power of 2, max is 256
   -size of each window is determined by aperture size / # of windows
   -iova of each window is aperture start iova + (window index * window size)
   -the mapped region in each window can be different than
    the window size...mapping size must be a power of 2
   -physical address of the mapping must be naturally aligned
    with the mapping size

Some of the code is derived from TYPE1 iommu (drivers/vfio/vfio_iommu_type1.c).

Signed-off-by: Bharat Bhushan <bharat.bhus...@freescale.com>
---
v1->v2
 - Use lock around msi-dma list
 - check for overlap between dma and msi-dma pages
 - Some code cleanup as per various comments

 drivers/vfio/Kconfig               |    6 +
 drivers/vfio/Makefile              |    1 +
 drivers/vfio/vfio_iommu_fsl_pamu.c | 1003 ++++++++++++++++++++++++++++++++++++
 include/uapi/linux/vfio.h          |  100 ++++
 4 files changed, 1110 insertions(+), 0 deletions(-)
 create mode 100644 drivers/vfio/vfio_iommu_fsl_pamu.c

diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index 26b3d9d..7d1da26 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -8,11 +8,17 @@ config VFIO_IOMMU_SPAPR_TCE
        depends on VFIO && SPAPR_TCE_IOMMU
        default n
 
+config VFIO_IOMMU_FSL_PAMU
+       tristate
+       depends on VFIO
+       default n
+
 menuconfig VFIO
        tristate "VFIO Non-Privileged userspace driver framework"
        depends on IOMMU_API
        select VFIO_IOMMU_TYPE1 if X86
        select VFIO_IOMMU_SPAPR_TCE if (PPC_POWERNV || PPC_PSERIES)
+       select VFIO_IOMMU_FSL_PAMU if FSL_PAMU
        help
          VFIO provides a framework for secure userspace device drivers.
          See Documentation/vfio.txt for more details.
diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
index c5792ec..7461350 100644
--- a/drivers/vfio/Makefile
+++ b/drivers/vfio/Makefile
@@ -1,4 +1,5 @@
 obj-$(CONFIG_VFIO) += vfio.o
 obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_common.o vfio_iommu_type1.o
 obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_common.o vfio_iommu_spapr_tce.o
+obj-$(CONFIG_VFIO_IOMMU_FSL_PAMU) += vfio_iommu_common.o vfio_iommu_fsl_pamu.o
 obj-$(CONFIG_VFIO_PCI) += pci/
diff --git a/drivers/vfio/vfio_iommu_fsl_pamu.c b/drivers/vfio/vfio_iommu_fsl_pamu.c
new file mode 100644
index 0000000..66efc84
--- /dev/null
+++ b/drivers/vfio/vfio_iommu_fsl_pamu.c
@@ -0,0 +1,1003 @@
+/*
+ * VFIO: IOMMU DMA mapping support for FSL PAMU IOMMU
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright (C) 2013 Freescale Semiconductor, Inc.
+ *
+ *     Author: Bharat Bhushan <bharat.bhus...@freescale.com>
+ *
+ * This file is derived from drivers/vfio/vfio_iommu_type1.c
+ *
+ * The Freescale PAMU is an aperture-based IOMMU with the following
+ * characteristics.  Each device has an entry in a table in memory
+ * describing the iova->phys mapping. The mapping has:
+ *  -an overall aperture that is power of 2 sized, and has a start iova that
+ *   is naturally aligned
+ *  -has 1 or more windows within the aperture
+ *     -number of windows must be power of 2, max is 256
+ *     -size of each window is determined by aperture size / # of windows
+ *     -iova of each window is aperture start + window index * window size
+ *     -the mapped region in each window can be different than
+ *      the window size...mapping size must be a power of 2
+ *     -physical address of the mapping must be naturally aligned
+ *      with the mapping size
+ */
+
+#include <linux/compat.h>
+#include <linux/device.h>
+#include <linux/fs.h>
+#include <linux/iommu.h>
+#include <linux/math64.h>      /* div64_u64 */
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/pci.h>         /* pci_bus_type */
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/vfio.h>
+#include <linux/hugetlb.h>
+#include <linux/msi.h>
+#include <asm/fsl_pamu_stash.h>
+
+#include "vfio_iommu_common.h"
+
+#define DRIVER_VERSION  "0.1"
+#define DRIVER_AUTHOR   "Bharat Bhushan <bharat.bhus...@freescale.com>"
+#define DRIVER_DESC     "FSL PAMU IOMMU driver for VFIO"
+
+/*
+ * Per-container state: one iommu domain plus the bookkeeping for every
+ * DMA and MSI mapping installed through it.
+ */
+struct vfio_iommu {
+       struct iommu_domain     *domain;
+       struct mutex            lock;           /* Held by the map/unmap paths */
+       dma_addr_t              aperture_start; /* From VFIO_ATTR_GEOMETRY */
+       dma_addr_t              aperture_end;   /* From VFIO_ATTR_GEOMETRY */
+       dma_addr_t              page_size;      /* Maximum mapped Page size */
+       int                     nsubwindows;    /* Number of subwindows */
+       struct rb_root          dma_list;       /* Tree of struct vfio_dma */
+       struct list_head        msi_dma_list;   /* List of struct vfio_msi_dma */
+       struct list_head        group_list;     /* List of struct vfio_group */
+};
+
+/*
+ * One contiguous DMA mapping; entries live in the vfio_iommu dma_list
+ * red-black tree and are looked up by iova range.
+ */
+struct vfio_dma {
+       struct rb_node          node;
+       dma_addr_t              iova;           /* Device address */
+       unsigned long           vaddr;          /* Process virtual addr */
+       size_t                  size;           /* Map size (bytes) */
+       int                     prot;           /* IOMMU_READ/WRITE */
+};
+
+/*
+ * One MSI bank mapped at a device address; entries are linked on the
+ * vfio_iommu msi_dma_list.
+ */
+struct vfio_msi_dma {
+       struct list_head        next;
+       dma_addr_t              iova;           /* Device address */
+       size_t                  size;           /* MSI page size */
+       int                     bank_id;        /* MSI bank index from userspace */
+       int                     prot;           /* IOMMU_READ/WRITE */
+};
+
+/* An iommu group tracked by a container; linked on vfio_iommu group_list. */
+struct vfio_group {
+       struct iommu_group      *iommu_group;
+       struct list_head        next;
+};
+
+/*
+ * Translate an iova into the index of the sub-window that contains it.
+ *
+ * The caller must make sure the windows have been configured, i.e. that
+ * iommu->page_size (the size of one sub-window) is non-zero, and that
+ * iova does not lie below iommu->aperture_start.
+ *
+ * page_size is a dma_addr_t and may not fit in 32 bits, so a full 64-bit
+ * division is used; do_div() only accepts a 32-bit divisor and would
+ * silently truncate a window size of 4GiB or more.
+ */
+static int iova_to_win(struct vfio_iommu *iommu, dma_addr_t iova)
+{
+       u64 offset = iova - iommu->aperture_start;
+
+       return (int)div64_u64(offset, iommu->page_size);
+}
+
+/*
+ * Write 0 to the DOMAIN_ATTR_FSL_PAMU_ENABLE attribute of the domain.
+ * Returns whatever iommu_domain_set_attr() returns.
+ */
+static int vfio_disable_iommu_domain(struct vfio_iommu *iommu)
+{
+       int state = 0;
+
+       return iommu_domain_set_attr(iommu->domain,
+                                    DOMAIN_ATTR_FSL_PAMU_ENABLE, &state);
+}
+
+/*
+ * Write 1 to the DOMAIN_ATTR_FSL_PAMU_ENABLE attribute of the domain.
+ * Returns whatever iommu_domain_set_attr() returns.
+ */
+static int vfio_enable_iommu_domain(struct vfio_iommu *iommu)
+{
+       int state = 1;
+
+       return iommu_domain_set_attr(iommu->domain,
+                                    DOMAIN_ATTR_FSL_PAMU_ENABLE, &state);
+}
+
+/*
+ * Disable the iommu domain once neither an MSI mapping nor a DMA mapping
+ * is left; does nothing while at least one mapping remains.
+ */
+static void vfio_check_and_disable_iommu(struct vfio_iommu *iommu)
+{
+       if (!list_empty(&iommu->msi_dma_list))
+               return;
+
+       if (rb_first(&iommu->dma_list))
+               return;
+
+       vfio_disable_iommu_domain(iommu);
+}
+
+/*
+ * Return the first entry on msi_dma_list whose range intersects
+ * [start, start + size), or NULL when no MSI mapping overlaps it.
+ */
+static struct vfio_msi_dma *vfio_find_msi_dma(struct vfio_iommu *iommu,
+                                             dma_addr_t start, size_t size)
+{
+       struct vfio_msi_dma *entry;
+
+       list_for_each_entry(entry, &iommu->msi_dma_list, next) {
+               /* Requested range ends at or before this entry starts */
+               bool below = (start + size) <= entry->iova;
+               /* Requested range starts at or after this entry ends */
+               bool above = start >= (entry->iova + entry->size);
+
+               if (!below && !above)
+                       return entry;
+       }
+
+       return NULL;
+}
+
+/*
+ * Walk the dma_list tree and return an entry whose range intersects
+ * [start, start + size), or NULL when no DMA mapping overlaps it.
+ */
+static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
+                                     dma_addr_t start, size_t size)
+{
+       struct rb_node *cur = iommu->dma_list.rb_node;
+
+       while (cur) {
+               struct vfio_dma *entry = rb_entry(cur, struct vfio_dma, node);
+
+               if (start + size <= entry->iova) {
+                       /* Wholly below this entry: descend left */
+                       cur = cur->rb_left;
+                       continue;
+               }
+
+               if (start >= entry->iova + entry->size) {
+                       /* Wholly above this entry: descend right */
+                       cur = cur->rb_right;
+                       continue;
+               }
+
+               return entry;
+       }
+
+       return NULL;
+}
+
+/*
+ * Link @new into the dma_list tree.  An entry that ends at or before an
+ * existing node's iova goes to that node's left, anything else to its right.
+ */
+static void vfio_insert_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
+{
+       struct rb_node **slot = &iommu->dma_list.rb_node;
+       struct rb_node *parent = NULL;
+
+       while (*slot) {
+               struct vfio_dma *cur;
+
+               parent = *slot;
+               cur = rb_entry(parent, struct vfio_dma, node);
+
+               slot = (new->iova + new->size <= cur->iova) ?
+                       &parent->rb_left : &parent->rb_right;
+       }
+
+       rb_link_node(&new->node, parent, slot);
+       rb_insert_color(&new->node, &iommu->dma_list);
+}
+
+/*
+ * Unlink @old from the dma_list tree without freeing it, then disable the
+ * iommu domain if that left no DMA or MSI mapping behind.
+ */
+static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
+{
+       struct rb_root *tree = &iommu->dma_list;
+
+       rb_erase(&old->node, tree);
+       vfio_check_and_disable_iommu(iommu);
+}
+
+/*
+ * Disable every sub-window covering [iova, iova + *size) and unpin the
+ * pages behind it.  Windows for which iommu_iova_to_phys() yields page
+ * frame 0 are skipped.  On return *size holds the number of bytes walked
+ * (a whole number of sub-windows).  Always returns 0.
+ */
+static int vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
+                           dma_addr_t iova, size_t *size)
+{
+       const dma_addr_t first_iova = iova;
+       const unsigned int pages_per_win = iommu->page_size / PAGE_SIZE;
+       const int last_win = iova_to_win(iommu, iova + *size - 1);
+       long released = 0;
+       int win;
+
+       /* Release the pinned pages, one sub-window at a time */
+       for (win = iova_to_win(iommu, iova); win <= last_win; win++) {
+               unsigned long pfn;
+
+               pfn = iommu_iova_to_phys(iommu->domain, iova) >> PAGE_SHIFT;
+               if (pfn) {
+                       iommu_domain_window_disable(iommu->domain, win);
+                       released += vfio_unpin_pages(pfn, pages_per_win,
+                                                    dma->prot, 1);
+               }
+
+               iova += iommu->page_size;
+       }
+
+       vfio_lock_acct(-released);
+       *size = iova - first_iova;
+       return 0;
+}
+
+/*
+ * Unmap the part of @dma that overlaps [start, start + *size) and fix up
+ * the tracking entry accordingly.  Four cases are handled in turn:
+ *  - the request covers the whole entry: unmap it all and free the entry;
+ *  - the request overlaps the low end: unmap, then shrink the entry from
+ *    the bottom (or free it if nothing is left);
+ *  - the request overlaps the high end: unmap, then shrink from the top;
+ *  - the request is strictly inside: unmap and split into two entries.
+ *
+ * On return *size holds the number of bytes reported unmapped by
+ * vfio_unmap_unpin().  Returns 0 on success, -ENOMEM if the tracking
+ * structure for a split cannot be allocated, or the error returned by
+ * vfio_unmap_unpin().
+ */
+static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start,
+                                  size_t *size, struct vfio_dma *dma)
+{
+       size_t offset, overlap, tmp;
+       struct vfio_dma *split;
+       int ret;
+
+       if (!*size)
+               return 0;
+
+       /*
+        * Existing dma region is completely covered, unmap all.  This is
+        * the likely case since userspace tends to map and unmap buffers
+        * in one shot rather than multiple mappings within a buffer.
+        */
+       if (likely(start <= dma->iova &&
+                  start + *size >= dma->iova + dma->size)) {
+               *size = dma->size;
+               ret = vfio_unmap_unpin(iommu, dma, dma->iova, size);
+               if (ret)
+                       return ret;
+
+               /*
+                * Did we remove more than we have?  Should never happen
+                * since a vfio_dma is contiguous in iova and vaddr.
+                */
+               WARN_ON(*size != dma->size);
+
+               vfio_remove_dma(iommu, dma);
+               kfree(dma);
+               return 0;
+       }
+
+       /* Overlap low address of existing range */
+       if (start <= dma->iova) {
+               overlap = start + *size - dma->iova;
+               ret = vfio_unmap_unpin(iommu, dma, dma->iova, &overlap);
+               if (ret)
+                       return ret;
+
+               /* iova changes below, so the entry must leave the tree first */
+               vfio_remove_dma(iommu, dma);
+
+               /*
+                * Check, we may have removed to whole vfio_dma.  If not
+                * fixup and re-insert.
+                */
+               if (overlap < dma->size) {
+                       dma->iova += overlap;
+                       dma->vaddr += overlap;
+                       dma->size -= overlap;
+                       vfio_insert_dma(iommu, dma);
+               } else
+                       kfree(dma);
+
+               *size = overlap;
+               return 0;
+       }
+
+       /* Overlap high address of existing range */
+       if (start + *size >= dma->iova + dma->size) {
+               offset = start - dma->iova;
+               overlap = dma->size - offset;
+
+               ret = vfio_unmap_unpin(iommu, dma, start, &overlap);
+               if (ret)
+                       return ret;
+
+               /* iova is unchanged, so the entry is resized in place */
+               dma->size -= overlap;
+               *size = overlap;
+               return 0;
+       }
+
+       /* Split existing */
+
+       /*
+        * Allocate our tracking structure early even though it may not
+        * be used.  An Allocation failure later loses track of pages and
+        * is more difficult to unwind.
+        */
+       split = kzalloc(sizeof(*split), GFP_KERNEL);
+       if (!split)
+               return -ENOMEM;
+
+       offset = start - dma->iova;
+
+       ret = vfio_unmap_unpin(iommu, dma, start, size);
+       if (ret || !*size) {
+               kfree(split);
+               return ret;
+       }
+
+       /* Remember the original length before the lower half is trimmed */
+       tmp = dma->size;
+
+       /* Resize the lower vfio_dma in place, before the below insert */
+       dma->size = offset;
+
+       /* Insert new for remainder, assuming it didn't all get unmapped */
+       if (likely(offset + *size < tmp)) {
+               split->size = tmp - offset - *size;
+               split->iova = dma->iova + offset + *size;
+               split->vaddr = dma->vaddr + offset + *size;
+               split->prot = dma->prot;
+               vfio_insert_dma(iommu, split);
+       } else
+               kfree(split);
+
+       return 0;
+}
+
+/*
+ * Map DMA region
+ *
+ * Pins the user pages starting at @vaddr and programs one sub-window per
+ * iommu->page_size chunk of the @npage pages, starting at @iova, which
+ * must sit exactly on a sub-window boundary.  Every mapped chunk is
+ * recorded in iommu->dma_list, merged with abutting entries when the
+ * protection and virtual addresses line up.  Once all windows are set the
+ * iommu domain is enabled.
+ *
+ * On any failure everything this call mapped is torn down again and the
+ * error is returned; 0 is returned only when the whole range is mapped.
+ *
+ * vfio_dma_do_map() calls this with iommu->lock held and only after
+ * checking that the range does not overlap an existing mapping.
+ */
+static int vfio_dma_map(struct vfio_iommu *iommu, dma_addr_t iova,
+                         unsigned long vaddr, long npage, int prot)
+{
+       int ret = 0, i;
+       size_t size;
+       unsigned int win, nr_subwindows;
+       dma_addr_t iovamap;
+
+       win = iova_to_win(iommu, iova);
+       if (iova != iommu->aperture_start + iommu->page_size * win) {
+               pr_err("%s iova(%llx) unalligned to window size %llx\n",
+                       __func__, (unsigned long long)iova,
+                       (unsigned long long)iommu->page_size);
+               return -EINVAL;
+       }
+
+       /* total size to be mapped */
+       size = npage << PAGE_SHIFT;
+       nr_subwindows = size >> ilog2(iommu->page_size);
+       iovamap = iova;
+
+       for (i = 0; i < nr_subwindows; i++, win++) {
+               unsigned long pfn;
+               unsigned long nr_pages;
+               dma_addr_t mapsize;
+               struct vfio_dma *dma = NULL;
+               long pinned;
+
+               mapsize = min(iova + size - iovamap, iommu->page_size);
+               nr_pages = mapsize >> PAGE_SHIFT;
+
+               /* Pin a contiguous chunk of memory */
+               pinned = vfio_pin_pages(vaddr, nr_pages, prot, &pfn);
+               if (pinned < 0 || (unsigned long)pinned != nr_pages) {
+                       pr_err("%s unable to pin pages = %lx, pinned(%ld/%lu)\n",
+                               __func__, vaddr, pinned, nr_pages);
+                       /*
+                        * A window needs the whole chunk; give back a
+                        * partial pin instead of leaking it.
+                        */
+                       if (pinned > 0)
+                               vfio_unpin_pages(pfn, pinned, prot, true);
+                       ret = pinned < 0 ? (int)pinned : -EINVAL;
+                       break;
+               }
+
+               ret = iommu_domain_window_enable(iommu->domain, win,
+                                                (phys_addr_t)pfn << PAGE_SHIFT,
+                                                mapsize, prot);
+               if (ret) {
+                       pr_err("%s unable to iommu_map()\n", __func__);
+                       /* Not tracked in dma_list yet, so unpin here */
+                       vfio_unpin_pages(pfn, nr_pages, prot, true);
+                       break;
+               }
+
+               /*
+                * Check if we abut a region below - nothing below 0.
+                * This is the most likely case when mapping chunks of
+                * physically contiguous regions within a virtual address
+                * range.  Update the abutting entry in place since iova
+                * doesn't change.
+                */
+               if (likely(iovamap)) {
+                       struct vfio_dma *tmp;
+                       tmp = vfio_find_dma(iommu, iovamap - 1, 1);
+                       if (tmp && tmp->prot == prot &&
+                           tmp->vaddr + tmp->size == vaddr) {
+                               tmp->size += mapsize;
+                               dma = tmp;
+                       }
+               }
+
+               /*
+                * Check if we abut a region above - nothing above ~0 + 1.
+                * If we abut above and below, remove and free.  If only
+                * abut above, remove, modify, reinsert.
+                */
+               if (likely(iovamap + mapsize)) {
+                       struct vfio_dma *tmp;
+                       tmp = vfio_find_dma(iommu, iovamap + mapsize, 1);
+                       if (tmp && tmp->prot == prot &&
+                           tmp->vaddr == vaddr + mapsize) {
+                               vfio_remove_dma(iommu, tmp);
+                               if (dma) {
+                                       dma->size += tmp->size;
+                                       kfree(tmp);
+                               } else {
+                                       tmp->size += mapsize;
+                                       tmp->iova = iovamap;
+                                       tmp->vaddr = vaddr;
+                                       vfio_insert_dma(iommu, tmp);
+                                       dma = tmp;
+                               }
+                       }
+               }
+
+               if (!dma) {
+                       dma = kzalloc(sizeof(*dma), GFP_KERNEL);
+                       if (!dma) {
+                               /*
+                                * Undo this chunk by hand: it is not in
+                                * dma_list, so the unwind below won't see it.
+                                */
+                               iommu_domain_window_disable(iommu->domain,
+                                                           win);
+                               vfio_unpin_pages(pfn, nr_pages, prot, true);
+                               ret = -ENOMEM;
+                               break;
+                       }
+
+                       dma->size = mapsize;
+                       dma->iova = iovamap;
+                       dma->vaddr = vaddr;
+                       dma->prot = prot;
+                       vfio_insert_dma(iommu, dma);
+               }
+
+               iovamap += mapsize;
+               vaddr += mapsize;
+       }
+
+       /* Every window is programmed: switch translation on */
+       if (!ret)
+               ret = vfio_enable_iommu_domain(iommu);
+
+       if (ret) {
+               struct vfio_dma *tmp;
+
+               /* Tear down whatever part of the request was tracked */
+               while ((tmp = vfio_find_dma(iommu, iova, size))) {
+                       /*
+                        * The callee overwrites the length with the amount
+                        * it removed, so hand it a fresh copy each time.
+                        */
+                       size_t len = size;
+                       int r = vfio_remove_dma_overlap(iommu, iova,
+                                                       &len, tmp);
+                       if (WARN_ON(r || !len))
+                               break;
+               }
+       }
+
+       return ret;
+}
+
+/*
+ * Validate a VFIO_IOMMU_MAP_DMA request and install the mapping.
+ *
+ * Returns -EINVAL when no access flag is given, when iova or vaddr would
+ * wrap, when the sub-windows have not been configured yet, or when the
+ * size is smaller than one sub-window; -EEXIST when the range overlaps an
+ * existing DMA or MSI mapping; otherwise the result of vfio_dma_map().
+ */
+static int vfio_dma_do_map(struct vfio_iommu *iommu,
+                          struct vfio_iommu_type1_dma_map *map)
+{
+       dma_addr_t iova = map->iova;
+       size_t size = map->size;
+       unsigned long vaddr = map->vaddr;
+       int ret = 0, prot = 0;
+       long npage;
+
+       /* READ/WRITE from device perspective */
+       if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
+               prot |= IOMMU_WRITE;
+       if (map->flags & VFIO_DMA_MAP_FLAG_READ)
+               prot |= IOMMU_READ;
+
+       if (!prot)
+               return -EINVAL; /* No READ/WRITE? */
+
+       /* Don't allow IOVA wrap */
+       if (iova + size && iova + size < iova)
+               return -EINVAL;
+
+       /* Don't allow virtual address wrap */
+       if (vaddr + size && vaddr + size < vaddr)
+               return -EINVAL;
+
+       /*
+        * page_size stays 0 until the window count has been set.  The
+        * window lookup divides by it, so refuse to map before then.
+        */
+       if (!iommu->page_size)
+               return -EINVAL;
+
+       /*
+        * FIXME: Currently we only support mapping page-size
+        * of subwindow-size.
+        */
+       if (size < iommu->page_size)
+               return -EINVAL;
+
+       npage = size >> PAGE_SHIFT;
+       if (!npage)
+               return -EINVAL;
+
+       mutex_lock(&iommu->lock);
+
+       /* Check for dma maping and msi_dma mapping */
+       if (vfio_find_dma(iommu, iova, size) ||
+           vfio_find_msi_dma(iommu, iova, size)) {
+               ret = -EEXIST;
+               goto out_lock;
+       }
+
+       ret = vfio_dma_map(iommu, iova, vaddr, npage, prot);
+
+out_lock:
+       mutex_unlock(&iommu->lock);
+       return ret;
+}
+
+/*
+ * Remove every DMA mapping that overlaps [unmap->iova, unmap->iova +
+ * unmap->size).  unmap->size is rewritten with the number of bytes
+ * actually unmapped, which may exceed what was asked for.  Returns the
+ * result of the last vfio_remove_dma_overlap() call, or 0 if nothing
+ * overlapped.
+ */
+static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
+                            struct vfio_iommu_type1_dma_unmap *unmap)
+{
+       size_t total = 0;
+       int ret = 0;
+
+       mutex_lock(&iommu->lock);
+
+       for (;;) {
+               struct vfio_dma *dma;
+               size_t chunk = unmap->size;
+
+               dma = vfio_find_dma(iommu, unmap->iova, unmap->size);
+               if (!dma)
+                       break;
+
+               ret = vfio_remove_dma_overlap(iommu, unmap->iova, &chunk, dma);
+               if (ret || !chunk)
+                       break;
+
+               total += chunk;
+       }
+
+       mutex_unlock(&iommu->lock);
+
+       /*
+        * We may unmap more than requested, update the unmap struct so
+        * userspace can know.
+        */
+       unmap->size = total;
+
+       return ret;
+}
+
+/*
+ * Read one domain attribute (geometry, window count or stash target) and
+ * store it in @pamu_attr->attr_info.
+ *
+ * attr_info is written only when the iommu layer reports success, so a
+ * failed query never copies uninitialised stack contents into a structure
+ * that is handed back to userspace.
+ *
+ * Returns 0 on success, -EINVAL for an unknown attribute, or the error
+ * from iommu_domain_get_attr().
+ */
+static int vfio_handle_get_attr(struct vfio_iommu *iommu,
+                        struct vfio_pamu_attr *pamu_attr)
+{
+       int ret = 0;
+
+       switch (pamu_attr->attribute) {
+       case VFIO_ATTR_GEOMETRY: {
+               struct iommu_domain_geometry geom;
+
+               ret = iommu_domain_get_attr(iommu->domain,
+                                         DOMAIN_ATTR_GEOMETRY, &geom);
+               if (ret)
+                       break;
+
+               pamu_attr->attr_info.attr.aperture_start = geom.aperture_start;
+               pamu_attr->attr_info.attr.aperture_end = geom.aperture_end;
+               break;
+       }
+       case VFIO_ATTR_WINDOWS: {
+               u32 count;
+
+               ret = iommu_domain_get_attr(iommu->domain,
+                                     DOMAIN_ATTR_WINDOWS, &count);
+               if (ret)
+                       break;
+
+               pamu_attr->attr_info.windows = count;
+               break;
+       }
+       case VFIO_ATTR_PAMU_STASH: {
+               struct pamu_stash_attribute stash;
+
+               ret = iommu_domain_get_attr(iommu->domain,
+                                     DOMAIN_ATTR_FSL_PAMU_STASH, &stash);
+               if (ret)
+                       break;
+
+               pamu_attr->attr_info.stash.cpu = stash.cpu;
+               pamu_attr->attr_info.stash.cache = stash.cache;
+               break;
+       }
+
+       default:
+               pr_err("%s Error: Invalid attribute (%d)\n",
+                        __func__, pamu_attr->attribute);
+               return -EINVAL;
+       }
+
+       return ret;
+}
+
+/*
+ * Apply one domain attribute (geometry, window count or stash target)
+ * taken from @pamu_attr->attr_info.
+ *
+ * The cached aperture, window count and window size in @iommu are updated
+ * only after the iommu layer has accepted the new value, so they never
+ * describe a configuration the domain rejected.
+ *
+ * Returns 0 on success, -EINVAL for an unknown attribute or a window
+ * count of zero, or the error from iommu_domain_set_attr().
+ */
+static int vfio_handle_set_attr(struct vfio_iommu *iommu,
+                        struct vfio_pamu_attr *pamu_attr)
+{
+       int ret = 0;
+
+       switch (pamu_attr->attribute) {
+       case VFIO_ATTR_GEOMETRY: {
+               struct iommu_domain_geometry geom;
+
+               geom.aperture_start = pamu_attr->attr_info.attr.aperture_start;
+               geom.aperture_end = pamu_attr->attr_info.attr.aperture_end;
+               geom.force_aperture = 1;
+               ret = iommu_domain_set_attr(iommu->domain,
+                                         DOMAIN_ATTR_GEOMETRY, &geom);
+               if (!ret) {
+                       iommu->aperture_start = geom.aperture_start;
+                       iommu->aperture_end = geom.aperture_end;
+               }
+               break;
+       }
+       case VFIO_ATTR_WINDOWS: {
+               u32 count = pamu_attr->attr_info.windows;
+               u64 size = iommu->aperture_end - iommu->aperture_start + 1;
+
+               /* ilog2(0) is undefined; there is always at least 1 window */
+               if (!count) {
+                       ret = -EINVAL;
+                       break;
+               }
+
+               ret = iommu_domain_set_attr(iommu->domain,
+                                     DOMAIN_ATTR_WINDOWS, &count);
+               if (!ret) {
+                       iommu->nsubwindows = pamu_attr->attr_info.windows;
+                       /* window size = aperture size / number of windows */
+                       iommu->page_size = size >> ilog2(count);
+               }
+
+               break;
+       }
+       case VFIO_ATTR_PAMU_STASH: {
+               struct pamu_stash_attribute stash;
+
+               stash.cpu = pamu_attr->attr_info.stash.cpu;
+               stash.cache = pamu_attr->attr_info.stash.cache;
+               ret = iommu_domain_set_attr(iommu->domain,
+                                     DOMAIN_ATTR_FSL_PAMU_STASH, &stash);
+               break;
+       }
+
+       default:
+               pr_err("%s Error: Invalid attribute (%d)\n",
+                        __func__, pamu_attr->attribute);
+               return -EINVAL;
+       }
+
+       return ret;
+}
+
+/*
+ * iommu_group_for_each_dev() callback: hand the MSI bank id and iova in
+ * @data (a struct vfio_msi_dma) to msi_set_iova() for the PCI device
+ * behind @dev, with 1 as the final argument.
+ */
+static int pci_msi_set_device_iova(struct device *dev, void *data)
+{
+       struct vfio_msi_dma *bank = data;
+
+       return msi_set_iova(to_pci_dev(dev), bank->bank_id, bank->iova, 1);
+}
+
+/*
+ * iommu_group_for_each_dev() callback: hand the MSI bank id and iova in
+ * @data (a struct vfio_msi_dma) to msi_set_iova() for the PCI device
+ * behind @dev, with 0 as the final argument.
+ */
+static int pci_msi_clear_device_iova(struct device *dev, void *data)
+{
+       struct vfio_msi_dma *bank = data;
+
+       return msi_set_iova(to_pci_dev(dev), bank->bank_id, bank->iova, 0);
+}
+
+/*
+ * Set the MSI iova described by @msi_dma on every device of every group
+ * on iommu->group_list.
+ *
+ * Stops at the first group that fails and returns its error, instead of
+ * letting a later, successful group overwrite it.  Returns 0 when all
+ * groups succeed or the list is empty.
+ */
+static int vfio_iommu_set_msi_iova(struct vfio_iommu *iommu,
+                                  struct vfio_msi_dma *msi_dma)
+{
+       struct vfio_group *group;
+       int ret = 0;
+
+       list_for_each_entry(group, &iommu->group_list, next) {
+               ret = iommu_group_for_each_dev(group->iommu_group, msi_dma,
+                                              pci_msi_set_device_iova);
+               if (ret)
+                       break;
+       }
+
+       return ret;
+}
+
+/*
+ * Clear the MSI iova described by @msi_dma on the devices of every group
+ * on iommu->group_list.
+ *
+ * Clearing is best effort: every group is visited even after a failure,
+ * and the first error seen is returned rather than being overwritten by
+ * a later group.  Returns 0 when all groups succeed or the list is empty.
+ */
+static int vfio_iommu_clear_msi_iova(struct vfio_iommu *iommu,
+                                    struct vfio_msi_dma *msi_dma)
+{
+       struct vfio_group *group;
+       int ret = 0;
+
+       list_for_each_entry(group, &iommu->group_list, next) {
+               int err = iommu_group_for_each_dev(group->iommu_group, msi_dma,
+                                                  pci_msi_clear_device_iova);
+               if (err && !ret)
+                       ret = err;
+       }
+
+       return ret;
+}
+
+/*
+ * Map the MSI bank selected by @msi_map->msi_bank_index at
+ * @msi_map->iova and point the devices of all attached groups at it.
+ *
+ * The tracking structure is allocated before the window is enabled, so
+ * an allocation failure can no longer leave an enabled window behind
+ * that nothing tracks.
+ *
+ * Returns -EINVAL when no access flag is given or the sub-windows have
+ * not been configured, -ENOMEM on allocation failure, -EEXIST when the
+ * range overlaps an existing DMA or MSI mapping, or the error from
+ * msi_get_region(), iommu_domain_window_enable() or
+ * vfio_iommu_set_msi_iova().  If only the last step fails the mapping
+ * stays on msi_dma_list, where vfio_do_msi_unmap() can find it.
+ */
+static int vfio_do_msi_map(struct vfio_iommu *iommu,
+                       struct vfio_pamu_msi_bank_map *msi_map)
+{
+       struct msi_region region;
+       struct vfio_msi_dma *msi_dma;
+       int window;
+       int prot = 0;
+       int ret;
+
+       /* READ/WRITE from device perspective */
+       if (msi_map->flags & VFIO_DMA_MAP_FLAG_WRITE)
+               prot |= IOMMU_WRITE;
+       if (msi_map->flags & VFIO_DMA_MAP_FLAG_READ)
+               prot |= IOMMU_READ;
+
+       if (!prot)
+               return -EINVAL; /* No READ/WRITE? */
+
+       ret = msi_get_region(msi_map->msi_bank_index, &region);
+       if (ret) {
+               pr_err("%s MSI region (%d) not found\n", __func__,
+                      msi_map->msi_bank_index);
+               return ret;
+       }
+
+       msi_dma = kzalloc(sizeof(*msi_dma), GFP_KERNEL);
+       if (!msi_dma)
+               return -ENOMEM;
+
+       mutex_lock(&iommu->lock);
+
+       /* The window lookup divides by page_size; it is 0 until configured */
+       if (!iommu->page_size) {
+               ret = -EINVAL;
+               goto out_free;
+       }
+
+       /* Check for dma maping and msi_dma mapping */
+       if (vfio_find_dma(iommu, msi_map->iova, region.size) ||
+           vfio_find_msi_dma(iommu, msi_map->iova, region.size)) {
+               ret = -EEXIST;
+               goto out_free;
+       }
+
+       window = iova_to_win(iommu, msi_map->iova);
+       ret = iommu_domain_window_enable(iommu->domain, window, region.addr,
+                                        region.size, prot);
+       if (ret) {
+               pr_err("%s Error: unable to map msi region\n", __func__);
+               goto out_free;
+       }
+
+       msi_dma->iova = msi_map->iova;
+       msi_dma->size = region.size;
+       msi_dma->bank_id = msi_map->msi_bank_index;
+       msi_dma->prot = prot;
+       list_add(&msi_dma->next, &iommu->msi_dma_list);
+
+       /* Set iova for all the device in iommu-group for the given msi-bank */
+       ret = vfio_iommu_set_msi_iova(iommu, msi_dma);
+
+       mutex_unlock(&iommu->lock);
+       return ret;
+
+out_free:
+       mutex_unlock(&iommu->lock);
+       kfree(msi_dma);
+       return ret;
+}
+
+/* Disable the sub-window that contains @iova. */
+static void vfio_msi_unmap(struct vfio_iommu *iommu, dma_addr_t iova)
+{
+       iommu_domain_window_disable(iommu->domain, iova_to_win(iommu, iova));
+}
+
+/*
+ * Tear down the MSI mapping whose iova equals @msi_unmap->iova: clear the
+ * iova on the devices, disable the window, drop the tracking entry and
+ * disable the domain if nothing else is mapped.  Returns 0 when a mapping
+ * was found and removed, -EINVAL when no mapping has that iova.
+ */
+static int vfio_do_msi_unmap(struct vfio_iommu *iommu,
+                            struct vfio_pamu_msi_bank_unmap *msi_unmap)
+{
+       struct vfio_msi_dma *cur, *found = NULL;
+       int ret = -EINVAL;
+
+       mutex_lock(&iommu->lock);
+
+       list_for_each_entry(cur, &iommu->msi_dma_list, next) {
+               if (cur->iova == msi_unmap->iova) {
+                       found = cur;
+                       break;
+               }
+       }
+
+       if (found) {
+               /* Clear mapping for msi iova page mapping */
+               vfio_iommu_clear_msi_iova(iommu, found);
+               /* Unmap in iommu (PAMU) */
+               vfio_msi_unmap(iommu, found->iova);
+               list_del(&found->next);
+               vfio_check_and_disable_iommu(iommu);
+               kfree(found);
+               ret = 0;
+       }
+
+       mutex_unlock(&iommu->lock);
+       return ret;
+}
+/*
+ * Allocate and initialise the per-container state together with its
+ * iommu domain.  Returns the new vfio_iommu, or ERR_PTR(-EINVAL) when
+ * @arg is not VFIO_FSL_PAMU_IOMMU, ERR_PTR(-ENOMEM) when the state cannot
+ * be allocated, ERR_PTR(-EIO) when the domain cannot be allocated.
+ */
+static void *vfio_iommu_fsl_pamu_open(unsigned long arg)
+{
+       struct vfio_iommu *iommu;
+
+       if (arg != VFIO_FSL_PAMU_IOMMU)
+               return ERR_PTR(-EINVAL);
+
+       iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
+       if (!iommu)
+               return ERR_PTR(-ENOMEM);
+
+       mutex_init(&iommu->lock);
+       iommu->dma_list = RB_ROOT;
+       INIT_LIST_HEAD(&iommu->msi_dma_list);
+       INIT_LIST_HEAD(&iommu->group_list);
+
+       /* Wish we didn't have to know about bus_type here. */
+       iommu->domain = iommu_domain_alloc(&pci_bus_type);
+       if (!iommu->domain)
+               goto err_free;
+
+       return iommu;
+
+err_free:
+       kfree(iommu);
+       return ERR_PTR(-EIO);
+}
+
+/*
+ * Tear down a container: detach and free every group, unmap every DMA
+ * and MSI mapping, disable the domain, then free the domain and the
+ * container state itself.
+ */
+static void vfio_iommu_fsl_pamu_release(void *iommu_data)
+{
+       struct vfio_iommu *iommu = iommu_data;
+       struct vfio_group *grp, *grp_next;
+       struct vfio_msi_dma *bank, *bank_next;
+
+       list_for_each_entry_safe(grp, grp_next, &iommu->group_list, next) {
+               iommu_detach_group(iommu->domain, grp->iommu_group);
+               list_del(&grp->next);
+               kfree(grp);
+       }
+
+       /* Keep removing the lowest DMA entry until the tree is empty */
+       for (;;) {
+               struct rb_node *first = rb_first(&iommu->dma_list);
+               struct vfio_dma *dma;
+               size_t size;
+
+               if (!first)
+                       break;
+
+               dma = rb_entry(first, struct vfio_dma, node);
+               size = dma->size;
+               vfio_remove_dma_overlap(iommu, dma->iova, &size, dma);
+               /* Nothing removed means no progress: bail out */
+               if (WARN_ON(!size))
+                       break;
+       }
+
+       list_for_each_entry_safe(bank, bank_next, &iommu->msi_dma_list, next) {
+               vfio_msi_unmap(iommu, bank->iova);
+               list_del(&bank->next);
+               kfree(bank);
+       }
+
+       /* Disable the iommu as there is no valid entry */
+       vfio_disable_iommu_domain(iommu);
+
+       iommu_domain_free(iommu->domain);
+       iommu->domain = NULL;
+       kfree(iommu);
+}
+
+/*
+ * vfio_iommu_fsl_pamu_ioctl - ioctl entry point of the FSL PAMU vfio backend
+ * @iommu_data: the struct vfio_iommu of the container
+ * @cmd:        VFIO ioctl command
+ * @arg:        extension id for VFIO_CHECK_EXTENSION, otherwise a user
+ *              pointer to the argument structure of @cmd
+ *
+ * Every command that takes a structure copies in its leading part, up to
+ * and including the last field used here (minsz), and rejects the request
+ * with -EINVAL when the argsz supplied by the caller is smaller than that.
+ *
+ * Return: 1/0 for VFIO_CHECK_EXTENSION, the value of msi_get_region_count()
+ * for VFIO_IOMMU_PAMU_GET_MSI_BANK_COUNT, the result of the map/unmap
+ * helper for the DMA commands, 0 for the other handled commands, -EFAULT
+ * when user memory cannot be read or written, -EINVAL for a malformed
+ * argument and -ENOTTY for an unknown command.
+ */
+static long vfio_iommu_fsl_pamu_ioctl(void *iommu_data,
+                                     unsigned int cmd, unsigned long arg)
+{
+       struct vfio_iommu *iommu = iommu_data;
+       unsigned long minsz;
+
+       if (cmd == VFIO_CHECK_EXTENSION) {
+               switch (arg) {
+               case VFIO_FSL_PAMU_IOMMU:
+                       return 1;
+               default:
+                       return 0;
+               }
+       } else if (cmd == VFIO_IOMMU_MAP_DMA) {
+               struct vfio_iommu_type1_dma_map map;
+               /* READ and WRITE are the only flags accepted here */
+               uint32_t mask = VFIO_DMA_MAP_FLAG_READ |
+                               VFIO_DMA_MAP_FLAG_WRITE;
+
+               minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
+
+               if (copy_from_user(&map, (void __user *)arg, minsz))
+                       return -EFAULT;
+
+               if (map.argsz < minsz || map.flags & ~mask)
+                       return -EINVAL;
+
+               return vfio_dma_do_map(iommu, &map);
+
+       } else if (cmd == VFIO_IOMMU_UNMAP_DMA) {
+               struct vfio_iommu_type1_dma_unmap unmap;
+               long ret;
+
+               minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
+
+               if (copy_from_user(&unmap, (void __user *)arg, minsz))
+                       return -EFAULT;
+
+               if (unmap.argsz < minsz || unmap.flags)
+                       return -EINVAL;
+
+               ret = vfio_dma_do_unmap(iommu, &unmap);
+               if (ret)
+                       return ret;
+
+               /*
+                * copy_to_user() returns the number of bytes it could not
+                * copy; that count must not leak out as the ioctl result.
+                */
+               if (copy_to_user((void __user *)arg, &unmap, minsz))
+                       return -EFAULT;
+
+               return 0;
+       } else if (cmd == VFIO_IOMMU_PAMU_GET_ATTR) {
+               struct vfio_pamu_attr pamu_attr;
+
+               minsz = offsetofend(struct vfio_pamu_attr, attr_info);
+               if (copy_from_user(&pamu_attr, (void __user *)arg, minsz))
+                       return -EFAULT;
+
+               if (pamu_attr.argsz < minsz)
+                       return -EINVAL;
+
+               /*
+                * NOTE(review): the result of vfio_handle_get_attr() is
+                * dropped - if it can fail, propagate the error instead of
+                * handing back a structure that was not filled in.
+                */
+               vfio_handle_get_attr(iommu, &pamu_attr);
+
+               /* A failed write-back must not be reported as success */
+               if (copy_to_user((void __user *)arg, &pamu_attr, minsz))
+                       return -EFAULT;
+
+               return 0;
+       } else if (cmd == VFIO_IOMMU_PAMU_SET_ATTR) {
+               struct vfio_pamu_attr pamu_attr;
+
+               minsz = offsetofend(struct vfio_pamu_attr, attr_info);
+               if (copy_from_user(&pamu_attr, (void __user *)arg, minsz))
+                       return -EFAULT;
+
+               if (pamu_attr.argsz < minsz)
+                       return -EINVAL;
+
+               /*
+                * NOTE(review): the result of vfio_handle_set_attr() is
+                * dropped - confirm whether a failure should reach the
+                * caller.
+                */
+               vfio_handle_set_attr(iommu, &pamu_attr);
+               return 0;
+       } else if (cmd == VFIO_IOMMU_PAMU_GET_MSI_BANK_COUNT) {
+               return msi_get_region_count();
+       } else if (cmd == VFIO_IOMMU_PAMU_MAP_MSI_BANK) {
+               struct vfio_pamu_msi_bank_map msi_map;
+
+               minsz = offsetofend(struct vfio_pamu_msi_bank_map, iova);
+               if (copy_from_user(&msi_map, (void __user *)arg, minsz))
+                       return -EFAULT;
+
+               if (msi_map.argsz < minsz)
+                       return -EINVAL;
+
+               /*
+                * NOTE(review): the result of vfio_do_msi_map() is dropped
+                * - confirm whether a failure should reach the caller.
+                */
+               vfio_do_msi_map(iommu, &msi_map);
+               return 0;
+       } else if (cmd == VFIO_IOMMU_PAMU_UNMAP_MSI_BANK) {
+               struct vfio_pamu_msi_bank_unmap msi_unmap;
+
+               minsz = offsetofend(struct vfio_pamu_msi_bank_unmap, iova);
+               if (copy_from_user(&msi_unmap, (void __user *)arg, minsz))
+                       return -EFAULT;
+
+               if (msi_unmap.argsz < minsz)
+                       return -EINVAL;
+
+               /*
+                * NOTE(review): the result of vfio_do_msi_unmap() is
+                * dropped - confirm whether a failure should reach the
+                * caller.
+                */
+               vfio_do_msi_unmap(iommu, &msi_unmap);
+               return 0;
+
+       }
+
+       return -ENOTTY;
+}
+
+/*
+ * vfio_iommu_fsl_pamu_attach_group - attach an iommu group to the container
+ * @iommu_data:  the struct vfio_iommu of the container
+ * @iommu_group: group to attach to the container's iommu domain
+ *
+ * The tracking entry is allocated before iommu->lock is taken.  A group
+ * that is already on the container's group list is refused.
+ *
+ * Return: 0 on success, -ENOMEM if the tracking entry cannot be allocated,
+ * -EINVAL if the group is already attached, or the error reported by
+ * iommu_attach_group().
+ */
+static int vfio_iommu_fsl_pamu_attach_group(void *iommu_data,
+                                        struct iommu_group *iommu_group)
+{
+       struct vfio_iommu *iommu = iommu_data;
+       struct vfio_group *new_group, *cur;
+       int ret = -EINVAL;
+
+       new_group = kzalloc(sizeof(*new_group), GFP_KERNEL);
+       if (!new_group)
+               return -ENOMEM;
+
+       mutex_lock(&iommu->lock);
+
+       /* Refuse a group that this container already tracks */
+       list_for_each_entry(cur, &iommu->group_list, next) {
+               if (cur->iommu_group == iommu_group)
+                       goto out_free;
+       }
+
+       ret = iommu_attach_group(iommu->domain, iommu_group);
+       if (ret)
+               goto out_free;
+
+       new_group->iommu_group = iommu_group;
+       list_add(&new_group->next, &iommu->group_list);
+
+       mutex_unlock(&iommu->lock);
+
+       return 0;
+
+out_free:
+       mutex_unlock(&iommu->lock);
+       kfree(new_group);
+       return ret;
+}
+
+/*
+ * vfio_iommu_fsl_pamu_detach_group - detach an iommu group from the container
+ * @iommu_data:  the struct vfio_iommu of the container
+ * @iommu_group: group to detach
+ *
+ * Looks the group up on the container's group list under iommu->lock; when
+ * found it is detached from the iommu domain and its tracking entry is
+ * unlinked and freed.  A group that is not on the list is ignored.
+ */
+static void vfio_iommu_fsl_pamu_detach_group(void *iommu_data,
+                                         struct iommu_group *iommu_group)
+{
+       struct vfio_iommu *iommu = iommu_data;
+       struct vfio_group *entry;
+
+       mutex_lock(&iommu->lock);
+
+       list_for_each_entry(entry, &iommu->group_list, next) {
+               if (entry->iommu_group != iommu_group)
+                       continue;
+
+               iommu_detach_group(iommu->domain, iommu_group);
+               list_del(&entry->next);
+               kfree(entry);
+               /* entry is gone, the walk must stop here */
+               break;
+       }
+
+       mutex_unlock(&iommu->lock);
+}
+
+/*
+ * Callback table of the FSL PAMU backend; it is the object passed to
+ * vfio_register_iommu_driver() / vfio_unregister_iommu_driver() by the
+ * module init and exit functions.
+ */
+static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_fsl_pamu = {
+       .name           = "vfio-iommu-fsl_pamu",
+       .owner          = THIS_MODULE,
+       .open           = vfio_iommu_fsl_pamu_open,
+       .release        = vfio_iommu_fsl_pamu_release,
+       .ioctl          = vfio_iommu_fsl_pamu_ioctl,
+       .attach_group   = vfio_iommu_fsl_pamu_attach_group,
+       .detach_group   = vfio_iommu_fsl_pamu_detach_group,
+};
+
+/*
+ * Module init: register the backend with the vfio core, but only when an
+ * iommu is present on the PCI bus.
+ *
+ * Return: -ENODEV without an iommu on the PCI bus, otherwise the result of
+ * vfio_register_iommu_driver().
+ */
+static int __init vfio_iommu_fsl_pamu_init(void)
+{
+       if (iommu_present(&pci_bus_type))
+               return vfio_register_iommu_driver(
+                               &vfio_iommu_driver_ops_fsl_pamu);
+
+       return -ENODEV;
+}
+
+/* Module exit: unregister the backend that the init function registered. */
+static void __exit vfio_iommu_fsl_pamu_cleanup(void)
+{
+       vfio_unregister_iommu_driver(&vfio_iommu_driver_ops_fsl_pamu);
+}
+
+module_init(vfio_iommu_fsl_pamu_init);
+module_exit(vfio_iommu_fsl_pamu_cleanup);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 0fd47f5..d359055 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -23,6 +23,7 @@
 
 #define VFIO_TYPE1_IOMMU               1
 #define VFIO_SPAPR_TCE_IOMMU           2
+#define VFIO_FSL_PAMU_IOMMU            3
 
 /*
  * The IOCTL interface is designed for extensibility by embedding the
@@ -451,4 +452,103 @@ struct vfio_iommu_spapr_tce_info {
 
 /* ***************************************************************** */
 
+/*********** APIs for VFIO_PAMU type only ****************/
+/*
+ * VFIO_IOMMU_PAMU_GET_ATTR - _IO(VFIO_TYPE, VFIO_BASE + 17,
+ *                               struct vfio_pamu_attr)
+ *
+ * Gets the iommu attributes for the current vfio container.
+ * Caller sets argsz and attribute.  The ioctl fills in
+ * the provided struct vfio_pamu_attr based on the attribute
+ * value that was set.
+ * Return: 0 on success, -errno on failure
+ */
+/*
+ * Argument of VFIO_IOMMU_PAMU_GET_ATTR and VFIO_IOMMU_PAMU_SET_ATTR.
+ * "attribute" selects which member of attr_info is meaningful.
+ *
+ * NOTE(review): three __u32 fields precede a union that contains __u64
+ * members, so the offset of attr_info presumably depends on how the ABI
+ * aligns __u64 - confirm the layout is the same for 32-bit and 64-bit
+ * user space, or add an explicit pad field.
+ */
+struct vfio_pamu_attr {
+       __u32   argsz;  /* caller-provided size; must cover attr_info */
+       __u32   flags;  /* no flags currently */
+#define VFIO_ATTR_GEOMETRY     0
+#define VFIO_ATTR_WINDOWS      1
+#define VFIO_ATTR_PAMU_STASH   2
+       __u32   attribute;      /* one of the VFIO_ATTR_* values above */
+
+       union {
+               /* VFIO_ATTR_GEOMETRY */
+               struct {
+                       /* first addr that can be mapped */
+                       __u64 aperture_start;
+                       /* last addr that can be mapped */
+                       __u64 aperture_end;
+               } attr;
+
+               /* VFIO_ATTR_WINDOWS */
+               __u32 windows;  /* number of windows in the aperture
+                                * initially this will be the max number
+                                * of windows that can be set
+                                */
+               /* VFIO_ATTR_PAMU_STASH */
+               struct {
+                       __u32 cpu;      /* CPU number for stashing */
+                       __u32 cache;    /* cache ID for stashing */
+               } stash;
+       } attr_info;
+};
+#define VFIO_IOMMU_PAMU_GET_ATTR  _IO(VFIO_TYPE, VFIO_BASE + 17)
+
+/*
+ * VFIO_IOMMU_PAMU_SET_ATTR - _IO(VFIO_TYPE, VFIO_BASE + 18,
+ *                               struct vfio_pamu_attr)
+ *
+ * Sets the iommu attributes for the current vfio container.
+ * Caller sets struct vfio_pamu_attr, including argsz and attribute, and
+ * setting any fields that are valid for the attribute.
+ * Return: 0 on success, -errno on failure
+ */
+#define VFIO_IOMMU_PAMU_SET_ATTR  _IO(VFIO_TYPE, VFIO_BASE + 18)
+
+/*
+ * VFIO_IOMMU_PAMU_GET_MSI_BANK_COUNT - _IO(VFIO_TYPE, VFIO_BASE + 19, __u32)
+ *
+ * Returns the number of MSI banks for this platform.  This tells user space
+ * how many aperture windows should be reserved for MSI banks when setting
+ * the PAMU geometry and window count.
+ * Return: __u32 bank count on success, -errno on failure
+ */
+#define VFIO_IOMMU_PAMU_GET_MSI_BANK_COUNT _IO(VFIO_TYPE, VFIO_BASE + 19)
+
+/*
+ * VFIO_IOMMU_PAMU_MAP_MSI_BANK - _IO(VFIO_TYPE, VFIO_BASE + 20,
+ *                                   struct vfio_pamu_msi_bank_map)
+ *
+ * Maps the MSI bank at the specified index and iova.  User space must
+ * call this ioctl once for each MSI bank (count of banks is returned by
+ * VFIO_IOMMU_PAMU_GET_MSI_BANK_COUNT).
+ * Caller provides struct vfio_pamu_msi_bank_map with all fields set.
+ * Return: 0 on success, -errno on failure
+ */
+
+/*
+ * Argument of VFIO_IOMMU_PAMU_MAP_MSI_BANK.
+ *
+ * NOTE(review): three __u32 fields precede the __u64 iova, so the offset
+ * of iova presumably depends on how the ABI aligns __u64 - confirm the
+ * layout is the same for 32-bit and 64-bit user space, or add an explicit
+ * pad field.
+ */
+struct vfio_pamu_msi_bank_map {
+       __u32   argsz;          /* caller-provided size; must cover iova */
+       __u32   flags;          /* no flags currently */
+       __u32   msi_bank_index; /* the index of the MSI bank */
+       __u64   iova;           /* the iova the bank is to be mapped to */
+};
+#define VFIO_IOMMU_PAMU_MAP_MSI_BANK  _IO(VFIO_TYPE, VFIO_BASE + 20)
+
+/*
+ * VFIO_IOMMU_PAMU_UNMAP_MSI_BANK - _IO(VFIO_TYPE, VFIO_BASE + 21,
+ *                                     struct vfio_pamu_msi_bank_unmap)
+ *
+ * Unmaps the MSI bank at the specified iova.
+ * Caller provides struct vfio_pamu_msi_bank_unmap with all fields set.
+ * Operates on VFIO file descriptor (/dev/vfio/vfio).
+ * Return: 0 on success, -errno on failure
+ */
+
+/* Argument of VFIO_IOMMU_PAMU_UNMAP_MSI_BANK. */
+struct vfio_pamu_msi_bank_unmap {
+       __u32   argsz;  /* caller-provided size; must cover iova */
+       __u32   flags;  /* no flags currently */
+       __u64   iova;   /* the iova of the MSI bank mapping to remove */
+};
+#define VFIO_IOMMU_PAMU_UNMAP_MSI_BANK  _IO(VFIO_TYPE, VFIO_BASE + 21)
+
 #endif /* _UAPIVFIO_H */
-- 
1.7.0.4


_______________________________________________
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Reply via email to