Update vfio_pci_bar_map() to align BAR mmaps for efficient huge page
mappings. The manual mmap alignment can be removed once mmap(!MAP_FIXED)
on vfio device fds improves to automatically return well-aligned
addresses.

Also add MADV_HUGEPAGE, which encourages the kernel to use huge pages
(e.g. when /sys/kernel/mm/transparent_hugepage/enabled is set to "madvise").

Drop MAP_FILE from mmap(). It is an ignored compatibility flag.

Signed-off-by: Alex Mastro <[email protected]>
---
 tools/testing/selftests/vfio/lib/include/libvfio.h |  9 ++++++++
 tools/testing/selftests/vfio/lib/libvfio.c         | 25 ++++++++++++++++++++++
 tools/testing/selftests/vfio/lib/vfio_pci_device.c | 24 ++++++++++++++++++++-
 3 files changed, 57 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/vfio/lib/include/libvfio.h 
b/tools/testing/selftests/vfio/lib/include/libvfio.h
index 279ddcd70194..1b6da54cc2cb 100644
--- a/tools/testing/selftests/vfio/lib/include/libvfio.h
+++ b/tools/testing/selftests/vfio/lib/include/libvfio.h
@@ -23,4 +23,13 @@
 const char *vfio_selftests_get_bdf(int *argc, char *argv[]);
 char **vfio_selftests_get_bdfs(int *argc, char *argv[], int *nr_bdfs);
 
+/*
+ * Reserve size bytes of virtual address space at an address satisfying
+ * (vaddr % align) == offset.
+ *
+ * Returns the reserved vaddr. The caller is responsible for unmapping
+ * the returned region.
+ */
+void *mmap_reserve(size_t size, size_t align, size_t offset);
+
 #endif /* SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_H */
diff --git a/tools/testing/selftests/vfio/lib/libvfio.c 
b/tools/testing/selftests/vfio/lib/libvfio.c
index a23a3cc5be69..3a3d1ed635c1 100644
--- a/tools/testing/selftests/vfio/lib/libvfio.c
+++ b/tools/testing/selftests/vfio/lib/libvfio.c
@@ -2,6 +2,9 @@
 
 #include <stdio.h>
 #include <stdlib.h>
+#include <sys/mman.h>
+
+#include <linux/align.h>
 
 #include "../../../kselftest.h"
 #include <libvfio.h>
@@ -76,3 +79,25 @@ const char *vfio_selftests_get_bdf(int *argc, char *argv[])
 
        return vfio_selftests_get_bdfs(argc, argv, &nr_bdfs)[0];
 }
+
+void *mmap_reserve(size_t size, size_t align, size_t offset)
+{
+       void *map_base, *map_align;
+       size_t delta;
+
+       VFIO_ASSERT_GT(align, offset);
+       delta = align - offset;
+       /* Over-reserve by align bytes so a suitably aligned start fits. */
+       map_base = mmap(NULL, size + align, PROT_NONE,
+                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+       VFIO_ASSERT_NE(map_base, MAP_FAILED);
+       /* Lowest addr >= map_base with (addr % align) == offset. */
+       map_align = (void *)(ALIGN((uintptr_t)map_base + delta, align) - delta);
+       /* Release the unused head of the reservation, if any. */
+       if (map_align > map_base)
+               VFIO_ASSERT_EQ(munmap(map_base, map_align - map_base), 0);
+       /* Release the unused tail that follows map_align + size. */
+       VFIO_ASSERT_EQ(munmap(map_align + size, map_base + align - map_align), 
0);
+
+       return map_align;
+}
diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_device.c 
b/tools/testing/selftests/vfio/lib/vfio_pci_device.c
index 13fdb4b0b10f..2e0721d58078 100644
--- a/tools/testing/selftests/vfio/lib/vfio_pci_device.c
+++ b/tools/testing/selftests/vfio/lib/vfio_pci_device.c
@@ -12,10 +12,14 @@
 #include <sys/mman.h>
 
 #include <uapi/linux/types.h>
+#include <linux/align.h>
 #include <linux/iommufd.h>
+#include <linux/kernel.h>
 #include <linux/limits.h>
+#include <linux/log2.h>
 #include <linux/mman.h>
 #include <linux/overflow.h>
+#include <linux/sizes.h>
 #include <linux/types.h>
 #include <linux/vfio.h>
 
@@ -124,20 +128,38 @@ static void vfio_pci_region_get(struct vfio_pci_device 
*device, int index,
 static void vfio_pci_bar_map(struct vfio_pci_device *device, int index)
 {
        struct vfio_pci_bar *bar = &device->bars[index];
+       size_t align, size;
        int prot = 0;
+       void *vaddr;
 
        VFIO_ASSERT_LT(index, PCI_STD_NUM_BARS);
        VFIO_ASSERT_NULL(bar->vaddr);
        VFIO_ASSERT_TRUE(bar->info.flags & VFIO_REGION_INFO_FLAG_MMAP);
+       VFIO_ASSERT_TRUE(is_power_of_2(bar->info.size));
 
        if (bar->info.flags & VFIO_REGION_INFO_FLAG_READ)
                prot |= PROT_READ;
        if (bar->info.flags & VFIO_REGION_INFO_FLAG_WRITE)
                prot |= PROT_WRITE;
 
-       bar->vaddr = mmap(NULL, bar->info.size, prot, MAP_FILE | MAP_SHARED,
+       size = bar->info.size;
+
+       /*
+        * Align BAR mmaps to improve page fault granularity during potential
+        * subsequent IOMMU mapping of these BAR vaddr. 1G for x86 is the
+        * largest hugepage size across any architecture, so no benefit from
+        * larger alignment. BARs smaller than 1G will be aligned by their
+        * power-of-two size, guaranteeing sufficient alignment for smaller
+        * hugepages, if present.
+        */
+       align = min_t(size_t, size, SZ_1G);
+
+       vaddr = mmap_reserve(size, align, 0); /* replaced by MAP_FIXED below */
+       bar->vaddr = mmap(vaddr, size, prot, MAP_SHARED | MAP_FIXED,
                          device->fd, bar->info.offset);
        VFIO_ASSERT_NE(bar->vaddr, MAP_FAILED);
+       /* Huge page hint only; the madvise() result is not checked. */
+       madvise(bar->vaddr, size, MADV_HUGEPAGE);
 }
 
 static void vfio_pci_bar_unmap(struct vfio_pci_device *device, int index)

-- 
2.47.3


Reply via email to