Some devices only allow selective DMA faulting. Similar to the selective
dirty page tracking, the vendor driver can call vfio_pin_pages() to
indicate the non-faultable scope. We add a new struct vfio_range to
record it; then, when the IOPF handler receives any page request out
of the scope, we can directly return with an invalid response.
Suggested-by: Kevin Tian
Signed-off-by: Shenming Lu
---
drivers/vfio/vfio.c | 4 +-
drivers/vfio/vfio_iommu_type1.c | 357 +++-
include/linux/vfio.h | 1 +
3 files changed, 358 insertions(+), 4 deletions(-)
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 38779e6fd80c..44c8dfabf7de 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -2013,7 +2013,8 @@ int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage)
container = group->container;
driver = container->iommu_driver;
if (likely(driver && driver->ops->unpin_pages))
- ret = driver->ops->unpin_pages(container->iommu_data, user_pfn,
+ ret = driver->ops->unpin_pages(container->iommu_data,
+ group->iommu_group, user_pfn,
npage);
else
ret = -ENOTTY;
@@ -2112,6 +2113,7 @@ int vfio_group_unpin_pages(struct vfio_group *group,
driver = container->iommu_driver;
if (likely(driver && driver->ops->unpin_pages))
ret = driver->ops->unpin_pages(container->iommu_data,
+ group->iommu_group,
user_iova_pfn, npage);
else
ret = -ENOTTY;
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index dcc93c3b258c..ba2b5a1cf6e9 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -150,10 +150,19 @@ struct vfio_regions {
static struct rb_root iopf_group_list = RB_ROOT;
static DEFINE_MUTEX(iopf_group_list_lock);
+struct vfio_range {
+ struct rb_node node;
+ dma_addr_t base_iova;
+ size_t span;
+ unsigned int ref_count;
+};
+
struct vfio_iopf_group {
struct rb_node node;
struct iommu_group *iommu_group;
struct vfio_iommu *iommu;
+ struct rb_root pinned_range_list;
+ bool selective_faulting;
};
#define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)\
@@ -496,6 +505,255 @@ static void vfio_unlink_iopf_group(struct vfio_iopf_group *old)
mutex_unlock(&iopf_group_list_lock);
}
+/*
+ * Helper functions for range list, handle one page at a time.
+ */
+static struct vfio_range *vfio_find_range(struct rb_root *range_list,
+ dma_addr_t iova)
+{
+ struct rb_node *node = range_list->rb_node;
+ struct vfio_range *range;
+
+ while (node) {
+ range = rb_entry(node, struct vfio_range, node);
+
+ if (iova + PAGE_SIZE <= range->base_iova)
+ node = node->rb_left;
+ else if (iova >= range->base_iova + range->span)
+ node = node->rb_right;
+ else
+ return range;
+ }
+
+ return NULL;
+}
+
+/* Do the possible merge adjacent to the input range. */
+static void vfio_merge_range_list(struct rb_root *range_list,
+ struct vfio_range *range)
+{
+ struct rb_node *node_prev = rb_prev(&range->node);
+ struct rb_node *node_next = rb_next(&range->node);
+
+ if (node_next) {
+ struct vfio_range *range_next = rb_entry(node_next,
+						 struct vfio_range,
+						 node);
+
+ if (range_next->base_iova == (range->base_iova + range->span) &&
+ range_next->ref_count == range->ref_count) {
+ rb_erase(node_next, range_list);
+ range->span += range_next->span;
+ kfree(range_next);
+ }
+ }
+
+ if (node_prev) {
+ struct vfio_range *range_prev = rb_entry(node_prev,
+						 struct vfio_range,
+						 node);
+
+ if (range->base_iova == (range_prev->base_iova + range_prev->span)
+ && range->ref_count == range_prev->ref_count) {
+ rb_erase(&range->node, range_list);
+ range_prev->span += range->span;
+ kfree(range);
+ }
+ }
+}
+
+static void vfio_link_range(struct rb_root *range_list, struct vfio_range *new)
+{
+ struct rb_node **link, *parent = NULL;
+