Re: [PATCH RFC 07/12] iommufd: Data structure to provide IOVA to PFN mapping

2022-03-22 Thread Alex Williamson
On Fri, 18 Mar 2022 14:27:32 -0300
Jason Gunthorpe  wrote:
> +/*
> + * The area takes a slice of the pages from start_byte to start_byte + length
> + */
> +static struct iopt_area *
> +iopt_alloc_area(struct io_pagetable *iopt, struct iopt_pages *pages,
> + unsigned long iova, unsigned long start_byte,
> + unsigned long length, int iommu_prot, unsigned int flags)
> +{
> + struct iopt_area *area;
> + int rc;
> +
> + area = kzalloc(sizeof(*area), GFP_KERNEL);
> + if (!area)
> + return ERR_PTR(-ENOMEM);
> +
> + area->iopt = iopt;
> + area->iommu_prot = iommu_prot;
> + area->page_offset = start_byte % PAGE_SIZE;
> + area->pages_node.start = start_byte / PAGE_SIZE;
> + if (check_add_overflow(start_byte, length - 1, &area->pages_node.last))
> + return ERR_PTR(-EOVERFLOW);
> + area->pages_node.last = area->pages_node.last / PAGE_SIZE;
> + if (WARN_ON(area->pages_node.last >= pages->npages))
> + return ERR_PTR(-EOVERFLOW);

@area leaked in the above two error cases.
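
One way to plug that (sketch only, reusing the function's existing
error-handling style) would be to route both checks through a label
that frees the allocation:

	if (check_add_overflow(start_byte, length - 1, &area->pages_node.last)) {
		rc = -EOVERFLOW;
		goto out_free;
	}
	area->pages_node.last = area->pages_node.last / PAGE_SIZE;
	if (WARN_ON(area->pages_node.last >= pages->npages)) {
		rc = -EOVERFLOW;
		goto out_free;
	}
	...
out_free:
	kfree(area);
	return ERR_PTR(rc);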

> +
> + down_write(&iopt->iova_rwsem);
> + if (flags & IOPT_ALLOC_IOVA) {
> + rc = iopt_alloc_iova(iopt, &iova,
> +  (uintptr_t)pages->uptr + start_byte,
> +  length);
> + if (rc)
> + goto out_unlock;
> + }
> +
> + if (check_add_overflow(iova, length - 1, &area->node.last)) {
> + rc = -EOVERFLOW;
> + goto out_unlock;
> + }
> +
> + if (!(flags & IOPT_ALLOC_IOVA)) {
> + if ((iova & (iopt->iova_alignment - 1)) ||
> + (length & (iopt->iova_alignment - 1)) || !length) {
> + rc = -EINVAL;
> + goto out_unlock;
> + }
> +
> + /* No reserved IOVA intersects the range */
> + if (interval_tree_iter_first(&iopt->reserved_iova_itree, iova,
> +  area->node.last)) {
> + rc = -ENOENT;
> + goto out_unlock;
> + }
> +
> + /* Check that there is not already a mapping in the range */
> + if (iopt_area_iter_first(iopt, iova, area->node.last)) {
> + rc = -EADDRINUSE;
> + goto out_unlock;
> + }
> + }
> +
> + /*
> +  * The area is inserted with a NULL pages indicating it is not fully
> +  * initialized yet.
> +  */
> + area->node.start = iova;
> + interval_tree_insert(&area->node, &area->iopt->area_itree);
> + up_write(&iopt->iova_rwsem);
> + return area;
> +
> +out_unlock:
> + up_write(&iopt->iova_rwsem);
> + kfree(area);
> + return ERR_PTR(rc);
> +}
...
> +/**
> + * iopt_access_pages() - Return a list of pages under the iova
> + * @iopt: io_pagetable to act on
> + * @iova: Starting IOVA
> + * @length: Number of bytes to access
> + * @out_pages: Output page list
> + * @write: True if access is for writing
> + *
> + * Reads @npages starting at iova and returns the struct page * pointers. These
> + * can be kmap'd by the caller for CPU access.
> + *
> + * The caller must perform iopt_unaccess_pages() when done to balance this.
> + *
> + * iova can be unaligned from PAGE_SIZE. The first returned byte starts at
> + * page_to_phys(out_pages[0]) + (iova % PAGE_SIZE). The caller promises not to
> + * touch memory outside the requested iova slice.
> + *
> + * FIXME: callers that need a DMA mapping via a sgl should create another
> + * interface to build the SGL efficiently
> + */
> +int iopt_access_pages(struct io_pagetable *iopt, unsigned long iova,
> +   unsigned long length, struct page **out_pages, bool write)
> +{
> + unsigned long cur_iova = iova;
> + unsigned long last_iova;
> + struct iopt_area *area;
> + int rc;
> +
> + if (!length)
> + return -EINVAL;
> + if (check_add_overflow(iova, length - 1, &last_iova))
> + return -EOVERFLOW;
> +
> + down_read(&iopt->iova_rwsem);
> + for (area = iopt_area_iter_first(iopt, iova, last_iova); area;
> +  area = iopt_area_iter_next(area, iova, last_iova)) {
> + unsigned long last = min(last_iova, iopt_area_last_iova(area));
> + unsigned long last_index;
> + unsigned long index;
> +
> + /* Need contiguous areas in the access */
> + if (iopt_area_iova(area) < cur_iova || !area->pages) {
^^^
Should this be (cur_iova != iova && iopt_area_iova(area) < cur_iova)?

I can't see how we'd require in-kernel page users to know the iopt_area
alignment from userspace, so I think this needs to skip the first
iteration.  Thanks,
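
In context, the suggested change would look something like (sketch):

		/* Need contiguous areas in the access, except maybe the first */
		if ((cur_iova != iova && iopt_area_iova(area) < cur_iova) ||
		    !area->pages) {
			rc = -EINVAL;
			goto out_remove;
		}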

Alex

> + rc = -EINVAL;
> + goto out_remove;
> + }
> +
> + index = iopt_area_iova_to_index(area, cur_iova);
> + last_index = 

[patch 095/227] mm: enforce pageblock_order < MAX_ORDER

2022-03-22 Thread Andrew Morton
From: David Hildenbrand 
Subject: mm: enforce pageblock_order < MAX_ORDER

Some places in the kernel don't really expect pageblock_order >=
MAX_ORDER, and it looks like this is only possible in corner cases:

1) With CONFIG_DEFERRED_STRUCT_PAGE_INIT we'll end up freeing pageblock_order
   pages via __free_pages_core(), which cannot possibly work.

2) find_zone_movable_pfns_for_nodes() will round up the ZONE_MOVABLE
   start PFN to MAX_ORDER_NR_PAGES. Consequently with a bigger
   pageblock_order, we could have a single pageblock partially managed by
   two zones.

3) compaction code runs into __fragmentation_index() with order
   >= MAX_ORDER, when checking WARN_ON_ONCE(order >= MAX_ORDER). [1]

4) mm/page_reporting.c won't be reporting any pages with default
   page_reporting_order == pageblock_order, as we'll be skipping the
   reporting loop inside page_reporting_process_zone().

5) __rmqueue_fallback() will never be able to steal with
   ALLOC_NOFRAGMENT.

pageblock_order >= MAX_ORDER is weird either way: it's a pure optimization
for making alloc_contig_range(), as used for allocation of gigantic pages,
a little more reliable to succeed.  However, if there is demand for
somewhat reliable allocation of gigantic pages, affected setups should be
using CMA or boottime allocations instead.

So let's make sure that pageblock_order < MAX_ORDER and simplify.

[1] https://lkml.kernel.org/r/87r189a2ks@linux.ibm.com

Link: https://lkml.kernel.org/r/20220214174132.219303-3-da...@redhat.com
Signed-off-by: David Hildenbrand 
Reviewed-by: Zi Yan 
Cc: Aneesh Kumar K.V 
Cc: Benjamin Herrenschmidt 
Cc: Christoph Hellwig 
Cc: Frank Rowand 
Cc: John Garry via iommu 
Cc: Marek Szyprowski 
Cc: Michael Ellerman 
Cc: Michael S. Tsirkin 
Cc: Minchan Kim 
Cc: Paul Mackerras 
Cc: Rob Herring 
Cc: Robin Murphy 
Cc: Vlastimil Babka 
Signed-off-by: Andrew Morton 
---

 drivers/virtio/virtio_mem.c |9 ++--
 include/linux/cma.h |3 --
 include/linux/pageblock-flags.h |7 --
 mm/Kconfig  |3 ++
 mm/page_alloc.c |   32 +++---
 5 files changed, 20 insertions(+), 34 deletions(-)

--- a/drivers/virtio/virtio_mem.c~mm-enforce-pageblock_order-max_order
+++ a/drivers/virtio/virtio_mem.c
@@ -2476,13 +2476,10 @@ static int virtio_mem_init_hotplug(struc
  VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD);
 
/*
-* We want subblocks to span at least MAX_ORDER_NR_PAGES and
-* pageblock_nr_pages pages. This:
-* - Is required for now for alloc_contig_range() to work reliably -
-*   it doesn't properly handle smaller granularity on ZONE_NORMAL.
+* TODO: once alloc_contig_range() works reliably with pageblock
+* granularity on ZONE_NORMAL, use pageblock_nr_pages instead.
 */
-   sb_size = max_t(uint64_t, MAX_ORDER_NR_PAGES,
-   pageblock_nr_pages) * PAGE_SIZE;
+   sb_size = PAGE_SIZE * MAX_ORDER_NR_PAGES;
sb_size = max_t(uint64_t, vm->device_block_size, sb_size);
 
if (sb_size < memory_block_size_bytes() && !force_bbm) {
--- a/include/linux/cma.h~mm-enforce-pageblock_order-max_order
+++ a/include/linux/cma.h
@@ -25,8 +25,7 @@
  * -- can deal with only some pageblocks of a higher-order page being
  *  MIGRATE_CMA, we can use pageblock_nr_pages.
  */
-#define CMA_MIN_ALIGNMENT_PAGES max_t(phys_addr_t, MAX_ORDER_NR_PAGES, \
- pageblock_nr_pages)
+#define CMA_MIN_ALIGNMENT_PAGES MAX_ORDER_NR_PAGES
 #define CMA_MIN_ALIGNMENT_BYTES (PAGE_SIZE * CMA_MIN_ALIGNMENT_PAGES)
 
 struct cma;
--- a/include/linux/pageblock-flags.h~mm-enforce-pageblock_order-max_order
+++ a/include/linux/pageblock-flags.h
@@ -37,8 +37,11 @@ extern unsigned int pageblock_order;
 
 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
 
-/* Huge pages are a constant size */
-#define pageblock_order	HUGETLB_PAGE_ORDER
+/*
+ * Huge pages are a constant size, but don't exceed the maximum allocation
+ * granularity.
+ */
+#define pageblock_order	min_t(unsigned int, HUGETLB_PAGE_ORDER, MAX_ORDER - 1)
 
 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
 
--- a/mm/Kconfig~mm-enforce-pageblock_order-max_order
+++ a/mm/Kconfig
@@ -262,6 +262,9 @@ config HUGETLB_PAGE_SIZE_VARIABLE
  HUGETLB_PAGE_ORDER when there are multiple HugeTLB page sizes available
  on a platform.
 
+ Note that the pageblock_order cannot exceed MAX_ORDER - 1 and will be
+ clamped down to MAX_ORDER - 1.
+
 config CONTIG_ALLOC
def_bool (MEMORY_ISOLATION && COMPACTION) || CMA
 
--- a/mm/page_alloc.c~mm-enforce-pageblock_order-max_order
+++ a/mm/page_alloc.c
@@ -1072,14 +1072,12 @@ static inline void __free_one_page(struc
int migratetype, fpi_t fpi_flags)
 {
struct capture_control *capc = task_capc(zone);
+   unsigned int max_order = pageblock_order;
unsigned long 

[patch 094/227] cma: factor out minimum alignment requirement

2022-03-22 Thread Andrew Morton
From: David Hildenbrand 
Subject: cma: factor out minimum alignment requirement

Patch series "mm: enforce pageblock_order < MAX_ORDER".

Having pageblock_order >= MAX_ORDER seems to be able to happen in corner
cases and some parts of the kernel are not prepared for it.

For example, Aneesh has shown [1] that such kernels can be compiled on
ppc64 with 64k base pages by setting FORCE_MAX_ZONEORDER=8, which will run
into a WARN_ON_ONCE(order >= MAX_ORDER) in compaction code right during
boot.

We can get pageblock_order >= MAX_ORDER when the default hugetlb size is
bigger than the maximum allocation granularity of the buddy, in which case
we are no longer talking about huge pages but instead gigantic pages.
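
As a concrete (illustrative) example: with 64k base pages on ppc64 the
default hugetlb size is 16MB, i.e. HUGETLB_PAGE_ORDER == 8, and building
such a kernel with FORCE_MAX_ZONEORDER=8 also makes MAX_ORDER == 8, so
pageblock_order == MAX_ORDER while the buddy can only allocate up to
order MAX_ORDER - 1 -- it can never allocate or merge a full pageblock.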

Having pageblock_order >= MAX_ORDER can only make alloc_contig_range() of
such gigantic pages more likely to succeed.

Reliable use of gigantic pages either requires boot time allocation or
CMA, no need to overcomplicate some places in the kernel to optimize for
corner cases that are broken in other areas of the kernel.


This patch (of 2):

Let's enforce pageblock_order < MAX_ORDER and simplify.

Especially patch #1 can be regarded as a cleanup before:
[PATCH v5 0/6] Use pageblock_order for cma and alloc_contig_range
alignment. [2]

[1] https://lkml.kernel.org/r/87r189a2ks@linux.ibm.com
[2] https://lkml.kernel.org/r/20220211164135.1803616-1-zi@sent.com

Link: https://lkml.kernel.org/r/20220214174132.219303-2-da...@redhat.com
Signed-off-by: David Hildenbrand 
Reviewed-by: Zi Yan 
Acked-by: Rob Herring 
Cc: Aneesh Kumar K.V 
Cc: Michael Ellerman 
Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Frank Rowand 
Cc: Michael S. Tsirkin 
Cc: Christoph Hellwig 
Cc: Marek Szyprowski 
Cc: Robin Murphy 
Cc: Minchan Kim 
Cc: Vlastimil Babka 
Cc: John Garry via iommu 

Signed-off-by: Andrew Morton 
---

 arch/powerpc/include/asm/fadump-internal.h |5 
 arch/powerpc/kernel/fadump.c   |2 -
 drivers/of/of_reserved_mem.c   |9 ++--
 include/linux/cma.h|9 
 kernel/dma/contiguous.c|4 ---
 mm/cma.c   |   20 ---
 6 files changed, 19 insertions(+), 30 deletions(-)

--- 
a/arch/powerpc/include/asm/fadump-internal.h~cma-factor-out-minimum-alignment-requirement
+++ a/arch/powerpc/include/asm/fadump-internal.h
@@ -19,11 +19,6 @@
 
 #define memblock_num_regions(memblock_type)(memblock.memblock_type.cnt)
 
-/* Alignment per CMA requirement. */
-#define FADUMP_CMA_ALIGNMENT   (PAGE_SIZE <<   \
-max_t(unsigned long, MAX_ORDER - 1,\
-pageblock_order))
-
 /* FAD commands */
 #define FADUMP_REGISTER1
 #define FADUMP_UNREGISTER  2
--- a/arch/powerpc/kernel/fadump.c~cma-factor-out-minimum-alignment-requirement
+++ a/arch/powerpc/kernel/fadump.c
@@ -544,7 +544,7 @@ int __init fadump_reserve_mem(void)
if (!fw_dump.nocma) {
fw_dump.boot_memory_size =
ALIGN(fw_dump.boot_memory_size,
- FADUMP_CMA_ALIGNMENT);
+ CMA_MIN_ALIGNMENT_BYTES);
}
 #endif
 
--- a/drivers/of/of_reserved_mem.c~cma-factor-out-minimum-alignment-requirement
+++ a/drivers/of/of_reserved_mem.c
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "of_private.h"
 
@@ -116,12 +117,8 @@ static int __init __reserved_mem_alloc_s
if (IS_ENABLED(CONFIG_CMA)
&& of_flat_dt_is_compatible(node, "shared-dma-pool")
&& of_get_flat_dt_prop(node, "reusable", NULL)
-   && !nomap) {
-   unsigned long order =
-   max_t(unsigned long, MAX_ORDER - 1, pageblock_order);
-
-   align = max(align, (phys_addr_t)PAGE_SIZE << order);
-   }
+   && !nomap)
+   align = max_t(phys_addr_t, align, CMA_MIN_ALIGNMENT_BYTES);
 
	prop = of_get_flat_dt_prop(node, "alloc-ranges", &len);
if (prop) {
--- a/include/linux/cma.h~cma-factor-out-minimum-alignment-requirement
+++ a/include/linux/cma.h
@@ -20,6 +20,15 @@
 
 #define CMA_MAX_NAME 64
 
+/*
+ * TODO: once the buddy -- especially pageblock merging and alloc_contig_range()
+ * -- can deal with only some pageblocks of a higher-order page being
+ *  MIGRATE_CMA, we can use pageblock_nr_pages.
+ */
+#define CMA_MIN_ALIGNMENT_PAGES max_t(phys_addr_t, MAX_ORDER_NR_PAGES, \
+ pageblock_nr_pages)
+#define CMA_MIN_ALIGNMENT_BYTES (PAGE_SIZE * CMA_MIN_ALIGNMENT_PAGES)
+
 struct cma;
 
 extern unsigned long totalcma_pages;
--- a/kernel/dma/contiguous.c~cma-factor-out-minimum-alignment-requirement
+++ a/kernel/dma/contiguous.c
@@ -399,8 +399,6 @@ static const struct reserved_mem_ops rme
 
 static int __init rmem_cma_setup(struct reserved_mem 

Re: [PATCH v4 1/2] PCI: Rename "pci_dev->untrusted" to "pci_dev->poses_dma_risk"

2022-03-22 Thread Rajat Jain via iommu
On Tue, Mar 22, 2022 at 4:12 AM Rafael J. Wysocki  wrote:
>
> On Tue, Mar 22, 2022 at 10:02 AM Christoph Hellwig  wrote:
> >
> > On Sat, Mar 19, 2022 at 11:29:05PM -0700, Rajat Jain wrote:
> > > Rename the field to make it clearer that the device can execute DMA
> > > attacks on the system, and thus the system may need protection from
> > > such attacks from this device.
> > >
> > > No functional change intended.
> > >
> > > Signed-off-by: Rajat Jain 
> > > ---
> > > v4: Initial version, created based on comments on other patch
> >
> > What a horrible name.  Why not untrusted_dma which captures the
> > intent much better?
>
> FWIW, I like this one much better too.

Sure, no problems. I can change the name to "untrusted_dma".
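
For reference, the renamed field would then look something like this in
struct pci_dev (sketch only, not the final patch):

	unsigned int	untrusted_dma:1;	/* Device can execute DMA attacks */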

Mika, can I carry forward your "Reviewed-by" tag with this name change too?

Thanks & Best Regards,

Rajat


Re: [PATCH v8 01/11] ACPI/IORT: Add temporary RMR node flag definitions

2022-03-22 Thread Robin Murphy

On 2022-02-21 15:43, Shameer Kolothum via iommu wrote:

IORT rev E.d introduces more details into the RMR node Flags
field. Add temporary definitions to describe and access this
Flags field until the ACPICA header is updated to support E.d.

This patch can be reverted once include/acpi/actbl2.h has all the
relevant definitions.

Signed-off-by: Shameer Kolothum 
---
Please find the ACPICA E.d related changes pull request here,
https://github.com/acpica/acpica/pull/752

---
  drivers/acpi/arm64/iort.c | 24 
  1 file changed, 24 insertions(+)

diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
index f2f8f05662de..0730c4dbb700 100644
--- a/drivers/acpi/arm64/iort.c
+++ b/drivers/acpi/arm64/iort.c
@@ -25,6 +25,30 @@
  #define IORT_IOMMU_TYPE   ((1 << ACPI_IORT_NODE_SMMU) | \
(1 << ACPI_IORT_NODE_SMMU_V3))
  
+/*
+ * The following RMR related definitions are temporary and
+ * can be removed once ACPICA headers support IORT rev E.d
+ */
+#ifndef ACPI_IORT_RMR_REMAP_PERMITTED
+#define ACPI_IORT_RMR_REMAP_PERMITTED  (1)
+#endif
+
+#ifndef ACPI_IORT_RMR_ACCESS_PRIVILEGE
+#define ACPI_IORT_RMR_ACCESS_PRIVILEGE (1 << 1)
+#endif
+
+#ifndef ACPI_IORT_RMR_ACCESS_ATTRIBUTES
+#define ACPI_IORT_RMR_ACCESS_ATTRIBUTES(flags) (((flags) >> 2) & 0xFF)
+#endif
+
+#ifndef ACPI_IORT_RMR_ATTR_DEVICE_GRE
+#define ACPI_IORT_RMR_ATTR_DEVICE_GRE  0x03
+#endif
+
+#ifndef ACPI_IORT_RMR_ATTR_NORMAL
+#define ACPI_IORT_RMR_ATTR_NORMAL  0x05


For the record, I've commented directly on the ACPICA pull request that 
I think this should be more clearly named to indicate that it means 
Normal Write-Back Cacheable, rather than being potentially ambiguous 
about cacheability.
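
(For instance, something along the lines of a hypothetical
ACPI_IORT_RMR_ATTR_NORMAL_IWB_OWB, which spells out the Inner/Outer
Write-Back cacheability, rather than the bare ..._NORMAL.)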


Robin.


+#endif
+
  struct iort_its_msi_chip {
struct list_headlist;
struct fwnode_handle*fw_node;


Re: [PATCH v8 10/11] iommu/arm-smmu-v3: Reserve any RMR regions associated with a dev

2022-03-22 Thread Robin Murphy

On 2022-02-21 15:43, Shameer Kolothum wrote:

Get ACPI IORT RMR regions associated with a dev reserved
so that there is a unity mapping for them in SMMU.

Signed-off-by: Shameer Kolothum 
---
  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 13 -
  1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c 
b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index dee3197474b7..ef2972483fd7 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -2759,6 +2759,7 @@ static void arm_smmu_get_resv_regions(struct device *dev,
  {
struct iommu_resv_region *region;
int prot = IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO;
+   struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
  
	region = iommu_alloc_resv_region(MSI_IOVA_BASE, MSI_IOVA_LENGTH,
					 prot, IOMMU_RESV_SW_MSI);
@@ -2768,6 +2769,16 @@ static void arm_smmu_get_resv_regions(struct device *dev,
	list_add_tail(&region->list, head);
  
  	iommu_dma_get_resv_regions(dev, head);

+   iommu_dma_get_rmrs(fwspec->iommu_fwnode, dev, head);
+}
+
+static void arm_smmu_put_resv_regions(struct device *dev,
+ struct list_head *head)
+{
+   struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+
+   iommu_dma_put_rmrs(fwspec->iommu_fwnode, head);
+   generic_iommu_put_resv_regions(dev, head);
  }


Tying in with my comment on patch #5, this should be a common 
iommu_dma_put_resv_regions() helper.
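
Something like this, roughly (sketch only, naming per the suggestion):

	void iommu_dma_put_resv_regions(struct device *dev, struct list_head *head)
	{
		struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);

		if (fwspec)
			iommu_dma_put_rmrs(fwspec->iommu_fwnode, head);
		generic_iommu_put_resv_regions(dev, head);
	}

which every driver could then plug into .put_resv_regions directly
instead of open-coding the RMR cleanup.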


Thanks,
Robin.


  static bool arm_smmu_dev_has_feature(struct device *dev,
@@ -2865,7 +2876,7 @@ static struct iommu_ops arm_smmu_ops = {
.enable_nesting = arm_smmu_enable_nesting,
.of_xlate   = arm_smmu_of_xlate,
.get_resv_regions   = arm_smmu_get_resv_regions,
-   .put_resv_regions   = generic_iommu_put_resv_regions,
+   .put_resv_regions   = arm_smmu_put_resv_regions,
.dev_has_feat   = arm_smmu_dev_has_feature,
.dev_feat_enabled   = arm_smmu_dev_feature_enabled,
.dev_enable_feat= arm_smmu_dev_enable_feature,



Re: [PATCH v8 05/11] ACPI/IORT: Add a helper to retrieve RMR memory regions

2022-03-22 Thread Robin Murphy

On 2022-02-21 15:43, Shameer Kolothum via iommu wrote:

Add helper functions (iort_iommu_get/put_rmrs()) that
retrieves/releases RMR memory descriptors associated
with a given IOMMU. This will be used by IOMMU drivers
to set up necessary mappings.

Invoke it from the generic iommu helper functions.


iommu_dma_get_resv_regions() already exists - please extend that rather 
than adding a parallel implementation of the same thing but different. 
IORT should export a single get_resv_regions helper which combines the 
new RMRs with the existing MSI workaround, and a separate "do I need to 
bypass this StreamID" helper for the SMMU drivers to call directly at 
reset time, since the latter isn't really an iommu-dma responsibility.
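
Roughly, i.e. (hypothetical signatures, to illustrate the split):

	/* Combines the new RMRs with the existing MSI resv region workaround */
	int iort_iommu_get_resv_regions(struct device *dev, struct list_head *head);

	/* For the SMMU drivers to decide on StreamID bypass at reset time */
	bool iort_iommu_stream_id_bypass(struct fwnode_handle *iommu_fwnode, u32 sid);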


I'm happy to do that just by shuffling wrappers around for now - we can 
come back and streamline the code properly afterwards - but the sheer 
amount of indirection currently at play here is so hard to follow that 
it's not even all that easy to see how it's crossing abstraction levels 
improperly.


Thanks,
Robin.


Signed-off-by: Shameer Kolothum 
---
  drivers/acpi/arm64/iort.c | 56 +++
  drivers/iommu/dma-iommu.c |  4 +++
  include/linux/acpi_iort.h | 14 ++
  3 files changed, 74 insertions(+)

diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
index 05da9ebff50a..b2c959c72fb2 100644
--- a/drivers/acpi/arm64/iort.c
+++ b/drivers/acpi/arm64/iort.c
@@ -1055,6 +1055,57 @@ static void iort_find_rmrs(struct acpi_iort_node *iommu, 
struct device *dev,
}
  }
  
+/**
+ * iort_iommu_put_rmrs - Free any memory associated with RMRs.
+ * @iommu_fwnode: fwnode associated with IOMMU
+ * @head: Reserved region list
+ *
+ * This function goes through the provided reserved region list and
+ * frees up memory associated with RMR entries and deletes them from
+ * the list.
+ */
+void iort_iommu_put_rmrs(struct fwnode_handle *iommu_fwnode,
+struct list_head *head)
+{
+   struct iommu_resv_region *e, *tmp;
+
+   /*
+* RMR entries will have mem allocated for fw_data.rmr.sids.
+* Free the mem and delete the node.
+*/
+   list_for_each_entry_safe(e, tmp, head, list) {
+   if (e->fw_data.rmr.sids) {
+   kfree(e->fw_data.rmr.sids);
+   list_del(&e->list);
+   kfree(e);
+   }
+   }
+}
+
+/**
+ * iort_iommu_get_rmrs - Retrieve Reserved Memory Regions (RMRs) associated
+ *			 with a given IOMMU and dev.
+ * @iommu_fwnode: fwnode associated with IOMMU
+ * @dev: Device associated with RMR (optional)
+ * @list: RMR list to be populated
+ *
+ * This function populates the RMR list associated with a given IOMMU and
+ * dev (if provided). If dev is NULL, the function populates all the RMRs
+ * associated with the given IOMMU.
+ */
+void iort_iommu_get_rmrs(struct fwnode_handle *iommu_fwnode, struct device *dev,
+struct list_head *head)
+{
+   struct acpi_iort_node *iommu;
+
+   iommu = iort_get_iort_node(iommu_fwnode);
+   if (!iommu)
+   return;
+
+   iort_find_rmrs(iommu, dev, head);
+}
+
  /**
   * iort_iommu_msi_get_resv_regions - Reserved region driver helper
   * @dev: Device from iommu_get_resv_regions()
@@ -1287,6 +1338,11 @@ int iort_iommu_msi_get_resv_regions(struct device *dev, 
struct list_head *head)
  { return 0; }
  int iort_iommu_configure_id(struct device *dev, const u32 *input_id)
  { return -ENODEV; }
+void iort_iommu_get_rmrs(struct fwnode_handle *fwnode, struct device *dev,
+struct list_head *head)
+{ }
+void iort_iommu_put_rmrs(struct fwnode_handle *fwnode, struct list_head *head)
+{ }
  #endif
  
  static int nc_dma_get_range(struct device *dev, u64 *size)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 65ab01d5128b..b33e4df85de1 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -382,12 +382,16 @@ void iommu_put_dma_cookie(struct iommu_domain *domain)
  void iommu_dma_get_rmrs(struct fwnode_handle *iommu_fwnode, struct device *dev,
			struct list_head *list)
  {
+   if (!is_of_node(iommu_fwnode))
+   iort_iommu_get_rmrs(iommu_fwnode, dev, list);
  }
  EXPORT_SYMBOL(iommu_dma_get_rmrs);
  
  void iommu_dma_put_rmrs(struct fwnode_handle *iommu_fwnode,
			struct list_head *list)
  {
+   if (!is_of_node(iommu_fwnode))
+   iort_iommu_put_rmrs(iommu_fwnode, list);
  }
  EXPORT_SYMBOL(iommu_dma_put_rmrs);
  
diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h

index f1f0842a2cb2..212f7f178ec3 100644
--- a/include/linux/acpi_iort.h
+++ b/include/linux/acpi_iort.h
@@ -38,6 +38,10 @@ int iort_dma_get_ranges(struct device *dev, u64 *size);
  int iort_iommu_configure_id(struct device *dev, const u32 *id_in);
  int iort_iommu_msi_get_resv_regions(struct 

Re: [PATCH v8 02/11] iommu: Introduce a union to struct iommu_resv_region

2022-03-22 Thread Robin Murphy

On 2022-02-21 15:43, Shameer Kolothum wrote:

A union is introduced to struct iommu_resv_region to hold
any firmware specific data. This is in preparation to add
support for IORT RMR reserve regions and the union now holds
the RMR specific information.

Signed-off-by: Shameer Kolothum 
---
  include/linux/iommu.h | 9 +
  1 file changed, 9 insertions(+)

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index de0c57a567c8..b06952a75f95 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -126,6 +126,11 @@ enum iommu_resv_type {
IOMMU_RESV_SW_MSI,
  };
  
+struct iommu_iort_rmr_data {

+   u32 *sids;  /* Stream Ids associated with IORT RMR entry */


Please make this const.

Further nit: capitalisation of "IDs" in the comment, otherwise I might 
worry about the possibility of Stream Egos too :P
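
Put together, that line would then read:

	const u32 *sids;	/* Stream IDs associated with IORT RMR entry */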



+   u32 num_sids;
+};
+
  /**
   * struct iommu_resv_region - descriptor for a reserved memory region
   * @list: Linked list pointers
@@ -133,6 +138,7 @@ enum iommu_resv_type {
   * @length: Length of the region in bytes
   * @prot: IOMMU Protection flags (READ/WRITE/...)
   * @type: Type of the reserved region
+ * @fw_data: FW specific reserved region data


Nit: we've got plenty of room to spell out "Firmware-specific", and it 
never hurts to make documentation as easy to read as possible.


Thanks,
Robin.


   */
  struct iommu_resv_region {
struct list_headlist;
@@ -140,6 +146,9 @@ struct iommu_resv_region {
size_t  length;
int prot;
enum iommu_resv_typetype;
+   union {
+   struct iommu_iort_rmr_data rmr;
+   } fw_data;
  };
  
  /**



[PATCH] iommu/dma: Explicitly sort PCI DMA windows

2022-03-22 Thread Robin Murphy
Originally, creating the dma_ranges resource list in pre-sorted fashion
was the simplest and most efficient way to enforce the order required by
iova_reserve_pci_windows(). However since then at least one PCI host
driver is now re-sorting the list for its own probe-time processing,
which doesn't seem entirely unreasonable, so that basic assumption no
longer holds. Make iommu-dma robust and get the sort order it needs by
explicitly sorting, which means we can also save the effort at creation
time and just build the list in whatever natural order the DT had.

Signed-off-by: Robin Murphy 
---

Looking at this area off the back of the XGene thread[1] made me realise
that we need to do it anyway, regardless of whether it might also happen
to restore the previous XGene behaviour or not. Presumably nobody's
tried to use pcie-cadence-host behind an IOMMU yet...

Boot-tested on Juno to make sure I hadn't got the sort comparison
backwards.

Robin.

[1] https://lore.kernel.org/linux-pci/20220321104843.949645-1-...@kernel.org/

 drivers/iommu/dma-iommu.c | 13 -
 drivers/pci/of.c  |  7 +--
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index b22034975301..91d134c0c9b1 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -414,6 +415,15 @@ static int cookie_init_hw_msi_region(struct 
iommu_dma_cookie *cookie,
return 0;
 }
 
+static int iommu_dma_ranges_sort(void *priv, const struct list_head *a,
+   const struct list_head *b)
+{
+   struct resource_entry *res_a = list_entry(a, typeof(*res_a), node);
+   struct resource_entry *res_b = list_entry(b, typeof(*res_b), node);
+
+   return res_a->res->start > res_b->res->start;
+}
+
 static int iova_reserve_pci_windows(struct pci_dev *dev,
struct iova_domain *iovad)
 {
@@ -432,6 +442,7 @@ static int iova_reserve_pci_windows(struct pci_dev *dev,
}
 
/* Get reserved DMA windows from host bridge */
+   list_sort(NULL, &bridge->dma_ranges, iommu_dma_ranges_sort);
resource_list_for_each_entry(window, &bridge->dma_ranges) {
end = window->res->start - window->offset;
 resv_iova:
@@ -440,7 +451,7 @@ static int iova_reserve_pci_windows(struct pci_dev *dev,
hi = iova_pfn(iovad, end);
reserve_iova(iovad, lo, hi);
} else if (end < start) {
-   /* dma_ranges list should be sorted */
+   /* DMA ranges should be non-overlapping */
dev_err(&dev->dev,
"Failed to reserve IOVA [%pa-%pa]\n",
&start, &end);
diff --git a/drivers/pci/of.c b/drivers/pci/of.c
index cb2e8351c2cc..d176b4bc6193 100644
--- a/drivers/pci/of.c
+++ b/drivers/pci/of.c
@@ -393,12 +393,7 @@ static int devm_of_pci_get_host_bridge_resources(struct 
device *dev,
goto failed;
}
 
-   /* Keep the resource list sorted */
-   resource_list_for_each_entry(entry, ib_resources)
-   if (entry->res->start > res->start)
-   break;
-
-   pci_add_resource_offset(&entry->node, res,
+   pci_add_resource_offset(ib_resources, res,
res->start - range.pci_addr);
}
 
-- 
2.28.0.dirty



Re: [PATCH RFC 04/12] kernel/user: Allow user::locked_vm to be usable for iommufd

2022-03-22 Thread Jason Gunthorpe via iommu
On Tue, Mar 22, 2022 at 05:31:26PM +0100, Niklas Schnelle wrote:

> > > In fact I stumbled over that because the wrong accounting in
> > > io_uring exhausted the limit applied to vfio (I was using a QEMU utilizing
> > > io_uring itself).
> > 
> > I'm pretty interested in this as well, do you have anything you can
> > share?
> 
> This was the issue reported in the following BZ.

Sorry, I was talking about the iouring usage in qemu :)

Thanks,
Jason


Re: [PATCH RFC 04/12] kernel/user: Allow user::locked_vm to be usable for iommufd

2022-03-22 Thread Niklas Schnelle
On Tue, 2022-03-22 at 11:57 -0300, Jason Gunthorpe wrote:
> On Tue, Mar 22, 2022 at 03:28:22PM +0100, Niklas Schnelle wrote:
> > On Fri, 2022-03-18 at 14:27 -0300, Jason Gunthorpe wrote:
> > > Following the pattern of io_uring, perf, skb, and bpf iommfd will use
> > iommufd ^
> > > user->locked_vm for accounting pinned pages. Ensure the value is included
> > > in the struct and export free_uid() as iommufd is modular.
> > > 
> > > user->locked_vm is the correct accounting to use for ulimit because it is
> > > per-user, and the ulimit is not supposed to be per-process. Other
> > > places (vfio, vdpa and infiniband) have used mm->pinned_vm and/or
> > > mm->locked_vm for accounting pinned pages, but this is only per-process
> > > and inconsistent with the majority of the kernel.
> > 
> > Since this will replace parts of vfio this difference seems
> > significant. Can you explain this a bit more?
> 
> I'm not sure what to say more, this is the correct way to account for
this. It is natural to see it is right because the ulimit is supposed
> to be global to the user, not effectively reset every time the user
> creates a new process.
> 
> So checking the ulimit against a per-process variable in the mm_struct
> doesn't make too much sense.

Yes I agree that logically this makes more sense. I was kind of aiming
in the same direction as Alex i.e. it's a conscious decision to do it
right and we need to know where this may lead to differences and how to
handle them.

> 
> > I'm also a bit confused how io_uring handles this. When I stumbled over
> > the problem fixed by 6b7898eb180d ("io_uring: fix imbalanced sqo_mm
> > accounting") and from that commit description I seem to rember that
> > io_uring also accounts in mm->locked_vm too? 
> 
> locked/pinned_pages in the mm is kind of a debugging counter, it
> indicates how many pins the user obtained through this mm. AFAICT its
> only correct use is to report through proc. Things are supposed to
> update it, but there is no reason to limit it as the user limit
> supersedes it.
> 
> The commit you pointed at is fixing that io_uring corrupted the value.
> 
> Since VFIO checks locked/pinned_pages against the ulimit, it would blow up
> when the value was wrong.
> 
> > In fact I stumbled over that because the wrong accounting in
> > io_uring exhausted the limit applied to vfio (I was using a QEMU utilizing
> > io_uring itself).
> 
> I'm pretty interested in this as well, do you have anything you can
> share?

This was the issue reported in the following BZ.

https://bugzilla.kernel.org/show_bug.cgi?id=209025

I stumbled over the same problem on my x86 box and also on s390. I
don't remember exactly what limit this ran into but I suspect it had
something to do with the libvirt resource limits Alex mentioned.
Meaning io_uring had an accounting bug and then vfio / QEMU couldn't
pin memory. I think that libvirt limit is set to allow pinning all of
guest memory plus a bit so the io_uring misaccounting easily tipped it
over.

> 
> Thanks,
> Jason




Re: [PATCH RFC 04/12] kernel/user: Allow user::locked_vm to be usable for iommufd

2022-03-22 Thread Jason Gunthorpe via iommu
On Tue, Mar 22, 2022 at 09:29:23AM -0600, Alex Williamson wrote:

> I'm still picking my way through the series, but the later compat
> interface doesn't mention this difference as an outstanding issue.
> Doesn't this difference need to be accounted in how libvirt manages VM
> resource limits?  

AFAICT, no, but it should be checked.

> AIUI libvirt uses some form of prlimit(2) to set process locked
> memory limits.

Yes, and ulimit does work fully. prlimit adjusts the value:

int do_prlimit(struct task_struct *tsk, unsigned int resource,
struct rlimit *new_rlim, struct rlimit *old_rlim)
{
rlim = tsk->signal->rlim + resource;
[..]
if (new_rlim)
*rlim = *new_rlim;

Which vfio reads back here:

drivers/vfio/vfio_iommu_type1.c:	unsigned long pfn, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
drivers/vfio/vfio_iommu_type1.c:	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

And iommufd does the same read back:

lock_limit =
task_rlimit(pages->source_task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
npages = pages->npinned - pages->last_npinned;
do {
cur_pages = atomic_long_read(&pages->source_user->locked_vm);
new_pages = cur_pages + npages;
if (new_pages > lock_limit)
return -ENOMEM;
} while (atomic_long_cmpxchg(&pages->source_user->locked_vm, cur_pages,
 new_pages) != cur_pages);

So it does work essentially the same.

The difference is more subtle, iouring/etc puts the charge in the user
so it is additive with things like iouring and additively spans all
the users processes.

However vfio is accounting only per-process and only for itself - no
other subsystem uses locked_vm as the charge variable for DMA pins.

The user visible difference will be that a limit X that worked with
VFIO may start to fail after a kernel upgrade as the charge accounting
is now cross user and additive with things like iommufd.

This whole area is a bit peculiar (eg mlock itself works differently),
IMHO, but with most of the places doing pins voting to use
user->locked_vm as the charge it seems the right path in today's
kernel.

Certainly having qemu concurrently using three different subsystems
(vfio, rdma, iouring) issuing FOLL_LONGTERM and all accounting for
RLIMIT_MEMLOCK differently cannot be sane or correct.

I plan to fix RDMA like this as well so at least we can have
consistency within qemu.

Thanks,
Jason


Re: [PATCH RFC 04/12] kernel/user: Allow user::locked_vm to be usable for iommufd

2022-03-22 Thread Alex Williamson
On Tue, 22 Mar 2022 11:57:41 -0300
Jason Gunthorpe  wrote:

> On Tue, Mar 22, 2022 at 03:28:22PM +0100, Niklas Schnelle wrote:
> > On Fri, 2022-03-18 at 14:27 -0300, Jason Gunthorpe wrote:  
> > > 
> > > user->locked_vm is the correct accounting to use for ulimit because it is
> > > per-user, and the ulimit is not supposed to be per-process. Other
> > > places (vfio, vdpa and infiniband) have used mm->pinned_vm and/or
> > > mm->locked_vm for accounting pinned pages, but this is only per-process
> > > and inconsistent with the majority of the kernel.  
> > 
> > Since this will replace parts of vfio this difference seems
> > significant. Can you explain this a bit more?  
> 
> I'm not sure what to say more, this is the correct way to account for
> this. It is natural to see it is right because the ulimit is supposed
> to be global to the user, not effectively reset every time the user
> creates a new process.
> 
> So checking the ulimit against a per-process variable in the mm_struct
> doesn't make too much sense.

I'm still picking my way through the series, but the later compat
interface doesn't mention this difference as an outstanding issue.
Doesn't this difference need to be accounted in how libvirt manages VM
resource limits?  AIUI libvirt uses some form of prlimit(2) to set
process locked memory limits.  A compat interface might be an
interesting feature, but does it only provide ioctl compatibility and
not resource ulimit compatibility?  Thanks,

Alex



Re: [PATCH RFC 04/12] kernel/user: Allow user::locked_vm to be usable for iommufd

2022-03-22 Thread Jason Gunthorpe via iommu
On Tue, Mar 22, 2022 at 03:28:22PM +0100, Niklas Schnelle wrote:
> On Fri, 2022-03-18 at 14:27 -0300, Jason Gunthorpe wrote:
> > Following the pattern of io_uring, perf, skb, and bpf iommfd will use
> iommufd ^
> > user->locked_vm for accounting pinned pages. Ensure the value is included
> > in the struct and export free_uid() as iommufd is modular.
> > 
> > user->locked_vm is the correct accounting to use for ulimit because it is
> > per-user, and the ulimit is not supposed to be per-process. Other
> > places (vfio, vdpa and infiniband) have used mm->pinned_vm and/or
> > mm->locked_vm for accounting pinned pages, but this is only per-process
> > and inconsistent with the majority of the kernel.
> 
> Since this will replace parts of vfio this difference seems
> significant. Can you explain this a bit more?

I'm not sure what to say more, this is the correct way to account for
this. It is natural to see it is right because the ulimit is supposed
to be global to the user, not effectively reset every time the user
creates a new process.

So checking the ulimit against a per-process variable in the mm_struct
doesn't make too much sense.

> I'm also a bit confused how io_uring handles this. When I stumbled over
> the problem fixed by 6b7898eb180d ("io_uring: fix imbalanced sqo_mm
> accounting") and from that commit description I seem to rember that
> io_uring also accounts in mm->locked_vm too? 

locked/pinned_pages in the mm is kind of a debugging counter, it
indicates how many pins the user obtained through this mm. AFAICT its
only correct use is to report through proc. Things are supposed to
update it, but there is no reason to limit it as the user limit
supersedes it.

The commit you pointed at is fixing that io_uring corrupted the value.

Since VFIO checks locked/pinned_pages against the ulimit, it would blow
up when the value was wrong.

> In fact I stumbled over that because the wrong accounting in
> io_uring exhausted the limit applied to vfio (I was using a QEMU utilizing
> io_uring itself).

I'm pretty interested in this as well, do you have anything you can
share?

Thanks,
Jason


Re: [PATCH RFC 03/12] iommufd: File descriptor, context, kconfig and makefiles

2022-03-22 Thread Jason Gunthorpe via iommu
On Tue, Mar 22, 2022 at 03:18:51PM +0100, Niklas Schnelle wrote:
> On Fri, 2022-03-18 at 14:27 -0300, Jason Gunthorpe wrote:
> > This is the basic infrastructure of a new miscdevice to hold the iommufd
> > IOCTL API.
> > 
> > It provides:
> >  - A miscdevice to create file descriptors to run the IOCTL interface over
> > 
> >  - A table based ioctl dispatch and centralized extendable pre-validation
> >step
> > 
> >  - An xarray mapping user ID's to kernel objects. The design has multiple
> >inter-related objects held within a single IOMMUFD fd
> > 
> >  - A simple usage count to build a graph of object relations and protect
> >against hostile userspace racing ioctls
> 
> For me at this point it seems hard to grok what this "graph of object
> relations" is about. Maybe an example would help? I'm assuming this is
> about e.g. the DEVICE -depends-on-> HW_PAGETABLE -depends-on-> IOAS  so
> the arrows in the picture of PATCH 02? 

Yes, it is basically standard reference relations, think
'kref'. Object A referenced B because A has a pointer to B in its
struct.

Therefore B cannot be destroyed before A without creating a dangling
reference.
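
Concretely (illustrative only, not the exact structs in the series):

	struct iommufd_hw_pagetable {		/* "A" */
		struct iommufd_object obj;
		struct iommufd_ioas *ioas;	/* holds a users refcount on "B" */
	};

So the HW_PAGETABLE keeps the IOAS's users count elevated for as long
as that pointer exists, and the IOAS can only be destroyed once every
such holder is gone.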

> Or is it the other way around
> and the IOAS -depends-on-> HW_PAGETABLE because it's about which
> references which? From the rest of the patch I seem to understand that
> this mostly establishes the order of destruction. So is HW_PAGETABLE
> destroyed before the IOAS because a HW_PAGETABLE must never reference
> an invalid/destroyed IOAS or is it the other way around because the IOAS
> has a reference to the HW_PAGETABLES in it? I'd guess the former but
> I'm a bit confused still.

Yes HW_PAGETABLE is first because it is responsible to remove the
iommu_domain from the IOAS and the IOAS cannot be destroyed with
iommu_domains in it.

> > +/*
> > + * The objects for an acyclic graph through the users refcount. This enum 
> > must
> > + * be sorted by type depth first so that destruction completes lower 
> > objects and
> > + * releases the users refcount before reaching higher objects in the graph.
> > + */
> 
> A bit like my first comment I think this would benefit from an example
> of what lower/higher mean in this case. I believe a lower object must only
> reference/depend on higher objects, correct?

Maybe it is too confusing - I was debating using a try and fail
approach instead which achieves the same thing with a little different
complexity. It seems we may need to do this anyhow for nesting..

Would look like this:

/* Destroy the graph from depth first */
while (!xa_empty(&ictx->objects)) {
unsigned int destroyed = 0;
unsigned long index = 0;

xa_for_each(&ictx->objects, index, obj) {
/*
 * Since we are in release elevated users must come from
 * other objects holding the users. We will eventually
 * destroy the object that holds this one and the next
 * pass will progress it.
 */
if (!refcount_dec_if_one(&obj->users))
continue;
destroyed++;
xa_erase(>objects, index);
iommufd_object_ops[obj->type].destroy(obj);
kfree(obj);
}
/* Bug related to users refcount */
if (WARN_ON(!destroyed))
break;
}
kfree(ictx);

> > +struct iommufd_object *iommufd_get_object(struct iommufd_ctx *ictx, u32 id,
> > + enum iommufd_object_type type)
> > +{
> > +   struct iommufd_object *obj;
> > +
> > +   xa_lock(&ictx->objects);
> > +   obj = xa_load(&ictx->objects, id);
> > +   if (!obj || (type != IOMMUFD_OBJ_ANY && obj->type != type) ||
> > +   !iommufd_lock_obj(obj))
> 
> Looking at the code it seems iommufd_lock_obj() locks against
> destroy_rw_sem and increases the reference count but there is also an
> iommufd_put_object_keep_user() which only unlocks the destroy_rw_sem. I
> think I personally would benefit from a comment/commit message
> explaining the lifecycle.
> 
> There is the below comment in iommufd_object_destroy_user() but I think
> it would be better placed near the destroy_rwsem decleration and to
> also explain the interaction between the destroy_rwsem and the
> reference count.

I do prefer it near the destroy because that is the only place that
actually requires the property it gives. The code outside this layer
shouldn't know about this at all beyond following some rules about
iommufd_put_object_keep_user(). Let's add a comment there instead:

/**
 * iommufd_put_object_keep_user() - Release part of the refcount on obj
 * @obj - Object to release
 *
 * Objects have two protections to ensure that userspace has a consistent
 * experience with destruction. Normally objects are locked so that destroy will
 * block 

Re: [PATCH v2 2/2] thunderbolt: Make iommu_dma_protection more accurate

2022-03-22 Thread Robin Murphy

On 2022-03-22 11:41, Mika Westerberg wrote:

Hi Robin,

I tried this now on two Intel systems. One with integrated Thunderbolt
and one with discrete. There was a small issue, see below but once fixed
it worked as expected :)

On Fri, Mar 18, 2022 at 05:42:58PM +, Robin Murphy wrote:

Between me trying to get rid of iommu_present() and Mario wanting to
support the AMD equivalent of DMAR_PLATFORM_OPT_IN, scrutiny has shown
that the iommu_dma_protection attribute is being far too optimistic.
Even if an IOMMU might be present for some PCI segment in the system,
that doesn't necessarily mean it provides translation for the device(s)
we care about. Furthermore, all that DMAR_PLATFORM_OPT_IN really does
is tell us that memory was protected before the kernel was loaded, and
prevent the user from disabling the intel-iommu driver entirely. While
that lets us assume kernel integrity, what matters for actual runtime
DMA protection is whether we trust individual devices, based on the
"external facing" property that we expect firmware to describe for
Thunderbolt ports.

It's proven challenging to determine the appropriate ports accurately
given the variety of possible topologies, so while still not getting a
perfect answer, by putting enough faith in firmware we can at least get
a good bit closer. If we can see that any device near a Thunderbolt NHI
has all the requisites for Kernel DMA Protection, chances are that it
*is* a relevant port, but moreover that implies that firmware is playing
the game overall, so we'll use that to assume that all Thunderbolt ports
should be correctly marked and thus will end up fully protected.

CC: Mario Limonciello 
Signed-off-by: Robin Murphy 
---

v2: Give up trying to look for specific devices, just look for evidence
 that firmware cares at all.

  drivers/thunderbolt/domain.c | 12 +++
  drivers/thunderbolt/nhi.c| 41 
  include/linux/thunderbolt.h  |  2 ++
  3 files changed, 46 insertions(+), 9 deletions(-)

diff --git a/drivers/thunderbolt/domain.c b/drivers/thunderbolt/domain.c
index 7018d959f775..2889a214dadc 100644
--- a/drivers/thunderbolt/domain.c
+++ b/drivers/thunderbolt/domain.c
@@ -7,9 +7,7 @@
   */
  
  #include 

-#include 
  #include 
-#include 
  #include 
  #include 
  #include 
@@ -257,13 +255,9 @@ static ssize_t iommu_dma_protection_show(struct device 
*dev,
 struct device_attribute *attr,
 char *buf)
  {
-   /*
-* Kernel DMA protection is a feature where Thunderbolt security is
-* handled natively using IOMMU. It is enabled when IOMMU is
-* enabled and ACPI DMAR table has DMAR_PLATFORM_OPT_IN set.
-*/
-   return sprintf(buf, "%d\n",
-  iommu_present(&pci_bus_type) && dmar_platform_optin());
+   struct tb *tb = container_of(dev, struct tb, dev);
+
+   return sysfs_emit(buf, "%d\n", tb->nhi->iommu_dma_protection);
  }
  static DEVICE_ATTR_RO(iommu_dma_protection);
  
diff --git a/drivers/thunderbolt/nhi.c b/drivers/thunderbolt/nhi.c

index c73da0532be4..9e396e283792 100644
--- a/drivers/thunderbolt/nhi.c
+++ b/drivers/thunderbolt/nhi.c
@@ -14,6 +14,7 @@
  #include 
  #include 
  #include 
+#include 
  #include 
  #include 
  #include 
@@ -1102,6 +1103,45 @@ static void nhi_check_quirks(struct tb_nhi *nhi)
nhi->quirks |= QUIRK_AUTO_CLEAR_INT;
  }
  
+static int nhi_check_iommu_pdev(struct pci_dev *pdev, void *data)

+{
+   if (!pdev->untrusted ||
+   !dev_iommu_capable(&pdev->dev, IOMMU_CAP_PRE_BOOT_PROTECTION))


This one needs to take the pdev->external_facing into account too
because most of the time there are no existing tunnels when the driver
is loaded so we only see the PCIe root/downstream port. I think this is
enough actually:

if (!pdev->external_facing ||
!dev_iommu_capable(&pdev->dev, IOMMU_CAP_PRE_BOOT_PROTECTION))


Ah yes, my bad, for some reason I got the misapprehension into my head 
that untrusted was propagated to the port as well, not just the devices 
behind it. I'll fix this and tweak the comment below to match.



+   return 0;
+   *(bool *)data = true;
+   return 1; /* Stop walking */
+}
+
+static void nhi_check_iommu(struct tb_nhi *nhi)
+{
+   struct pci_bus *bus = nhi->pdev->bus;
+   bool port_ok = false;
+
+   /*
+* Ideally what we'd do here is grab every PCI device that
+* represents a tunnelling adapter for this NHI and check their
+* status directly, but unfortunately USB4 seems to make it
+* obnoxiously difficult to reliably make any correlation.
+*
+* So for now we'll have to bodge it... Hoping that the system
+* is at least sane enough that an adapter is in the same PCI
+* segment as its NHI, if we can find *something* on that segment
+* which meets the requirements for Kernel DMA Protection, we'll
+   

Re: [PATCH RFC 04/12] kernel/user: Allow user::locked_vm to be usable for iommufd

2022-03-22 Thread Niklas Schnelle
On Fri, 2022-03-18 at 14:27 -0300, Jason Gunthorpe wrote:
> Following the pattern of io_uring, perf, skb, and bpf iommfd will use
iommufd ^
> user->locked_vm for accounting pinned pages. Ensure the value is included
> in the struct and export free_uid() as iommufd is modular.
> 
> user->locked_vm is the correct accounting to use for ulimit because it is
> per-user, and the ulimit is not supposed to be per-process. Other
> places (vfio, vdpa and infiniband) have used mm->pinned_vm and/or
> mm->locked_vm for accounting pinned pages, but this is only per-process
> and inconsistent with the majority of the kernel.

Since this will replace parts of vfio this difference seems
significant. Can you explain this a bit more?

I'm also a bit confused how io_uring handles this. When I stumbled over
the problem fixed by 6b7898eb180d ("io_uring: fix imbalanced sqo_mm
accounting") and from that commit description I seem to rember that
io_uring also accounts in mm->locked_vm too? In fact I stumbled over
that because the wrong accounting in io_uring exhausted the limit applied to
vfio (I was using a QEMU utilizing io_uring itself).

> 
> Signed-off-by: Jason Gunthorpe 
> ---
>  include/linux/sched/user.h | 2 +-
>  kernel/user.c  | 1 +
>  2 files changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h
> index 00ed419dd46413..c47dae71dad3c8 100644
> --- a/include/linux/sched/user.h
> +++ b/include/linux/sched/user.h
> @@ -24,7 +24,7 @@ struct user_struct {
>   kuid_t uid;
>  
>  #if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL) || \
> -defined(CONFIG_NET) || defined(CONFIG_IO_URING)
> +defined(CONFIG_NET) || defined(CONFIG_IO_URING) || IS_ENABLED(CONFIG_IOMMUFD)
>   atomic_long_t locked_vm;
>  #endif
>  #ifdef CONFIG_WATCH_QUEUE
> diff --git a/kernel/user.c b/kernel/user.c
> index e2cf8c22b539a7..d667debeafd609 100644
> --- a/kernel/user.c
> +++ b/kernel/user.c
> @@ -185,6 +185,7 @@ void free_uid(struct user_struct *up)
>   if (refcount_dec_and_lock_irqsave(&up->__count, &uidhash_lock, &flags))
>   free_user(up, flags);
>  }
> +EXPORT_SYMBOL_GPL(free_uid);
>  
>  struct user_struct *alloc_uid(kuid_t uid)
>  {




Re: [PATCH RFC 03/12] iommufd: File descriptor, context, kconfig and makefiles

2022-03-22 Thread Niklas Schnelle
On Fri, 2022-03-18 at 14:27 -0300, Jason Gunthorpe wrote:
> This is the basic infrastructure of a new miscdevice to hold the iommufd
> IOCTL API.
> 
> It provides:
>  - A miscdevice to create file descriptors to run the IOCTL interface over
> 
>  - A table based ioctl dispatch and centralized extendable pre-validation
>step
> 
>  - An xarray mapping user ID's to kernel objects. The design has multiple
>inter-related objects held within a single IOMMUFD fd
> 
>  - A simple usage count to build a graph of object relations and protect
>against hostile userspace racing ioctls

For me at this point it seems hard to grok what this "graph of object
relations" is about. Maybe an example would help? I'm assuming this is
about e.g. the DEVICE -depends-on-> HW_PAGETABLE -depends-on-> IOAS  so
the arrows in the picture of PATCH 02? Or is it the other way around
and the IOAS -depends-on-> HW_PAGETABLE because it's about which
references which? From the rest of the patch I seem to understand that
this mostly establishes the order of destruction. So is HW_PAGETABLE
destroyed before the IOAS because a HW_PAGETABLE must never reference
an invalid/destroyed IOAS or is it the other way around because the IOAS
has a reference to the HW_PAGETABLES in it? I'd guess the former but
I'm a bit confused still.

> 
> The only IOCTL provided in this patch is the generic 'destroy any object
> by handle' operation.
> 
> Signed-off-by: Yi Liu 
> Signed-off-by: Jason Gunthorpe 
> ---
>  .../userspace-api/ioctl/ioctl-number.rst  |   1 +
>  MAINTAINERS   |  10 +
>  drivers/iommu/Kconfig |   1 +
>  drivers/iommu/Makefile|   2 +-
>  drivers/iommu/iommufd/Kconfig |  13 +
>  drivers/iommu/iommufd/Makefile|   5 +
>  drivers/iommu/iommufd/iommufd_private.h   |  95 ++
>  drivers/iommu/iommufd/main.c  | 305 ++
>  include/uapi/linux/iommufd.h  |  55 
>  9 files changed, 486 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/iommu/iommufd/Kconfig
>  create mode 100644 drivers/iommu/iommufd/Makefile
>  create mode 100644 drivers/iommu/iommufd/iommufd_private.h
>  create mode 100644 drivers/iommu/iommufd/main.c
>  create mode 100644 include/uapi/linux/iommufd.h
> 
> diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst 
> b/Documentation/userspace-api/ioctl/ioctl-number.rst
> index e6fce2cbd99ed4..4a041dfc61fe95 100644
> --- a/Documentation/userspace-api/ioctl/ioctl-number.rst
> +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
> @@ -105,6 +105,7 @@ Code  Seq#Include File
>Comments
>  '8'   allSNP8023 
> advanced NIC card
>   
> 
>  ';'   64-7F  linux/vfio.h
> +';'   80-FF  linux/iommufd.h
>  '='   00-3f  uapi/linux/ptp_clock.h  
> 
>  '@'   00-0F  linux/radeonfb.h
> conflict!
>  '@'   00-0F  drivers/video/aty/aty128fb.c
> conflict!
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 1ba1e4af2cbc80..23a9c631051ee8 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -10038,6 +10038,16 @@ L:   linux-m...@vger.kernel.org
>  S:   Maintained
>  F:   drivers/net/ethernet/sgi/ioc3-eth.c
>  
> +IOMMU FD
> +M:   Jason Gunthorpe 
> +M:   Kevin Tian 
> +L:   iommu@lists.linux-foundation.org
> +S:   Maintained
> +F:   Documentation/userspace-api/iommufd.rst
> +F:   drivers/iommu/iommufd/
> +F:   include/uapi/linux/iommufd.h
> +F:   include/linux/iommufd.h
> +
>  IOMAP FILESYSTEM LIBRARY
>  M:   Christoph Hellwig 
>  M:   Darrick J. Wong 
> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
> index 3eb68fa1b8cc02..754d2a9ff64623 100644
> --- a/drivers/iommu/Kconfig
> +++ b/drivers/iommu/Kconfig
> @@ -177,6 +177,7 @@ config MSM_IOMMU
>  
>  source "drivers/iommu/amd/Kconfig"
>  source "drivers/iommu/intel/Kconfig"
> +source "drivers/iommu/iommufd/Kconfig"
>  
>  config IRQ_REMAP
>   bool "Support for Interrupt Remapping"
> diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
> index bc7f730edbb0be..6b38d12692b213 100644
> --- a/drivers/iommu/Makefile
> +++ b/drivers/iommu/Makefile
> @@ -1,5 +1,5 @@
>  # SPDX-License-Identifier: GPL-2.0
> -obj-y += amd/ intel/ arm/
> +obj-y += amd/ intel/ arm/ iommufd/
>  obj-$(CONFIG_IOMMU_API) += iommu.o
>  obj-$(CONFIG_IOMMU_API) += iommu-traces.o
>  obj-$(CONFIG_IOMMU_API) += iommu-sysfs.o
> diff --git a/drivers/iommu/iommufd/Kconfig b/drivers/iommu/iommufd/Kconfig
> new file mode 100644
> index 00..fddd453bb0e764
> --- /dev/null
> +++ b/drivers/iommu/iommufd/Kconfig
> @@ -0,0 +1,13 @@
> +# SPDX-License-Identifier: GPL-2.0-only
> +config IOMMUFD
> + tristate 

Re: [PATCH RFC 10/11] iommu: Make IOPF handling framework generic

2022-03-22 Thread Jason Gunthorpe via iommu
On Tue, Mar 22, 2022 at 01:03:14PM +0800, Lu Baolu wrote:
> On 2022/3/21 20:43, Jason Gunthorpe wrote:
> > On Mon, Mar 21, 2022 at 11:42:16AM +, Jean-Philippe Brucker wrote:
> > 
> > > I tend to disagree with that last part. The fault is caused by a specific
> > > device accessing shared page tables. We should keep that device
> > > information throughout the fault handling, so that we can report it to the
> > > driver when things go wrong.
> > SVA faults should never be reported to drivers??
> > 
> 
> When things go wrong, the corresponding response code will be returned
> to the device through iommu_page_response(). The hardware should then
> report the failure to the device driver and the device driver will
> handle it in the device-specific way. There's no need to propagate the
> I/O page faults to the device driver in any case. Do I understand it
> right?

Something like that, I would expect fault failure to be similar to
accessing something that is not in the iommu map. An Error TLP like
thing toward the device and whatever normal device-specific error
propagation happens.

SVA shouldn't require any special support in the using driver beyond
turning on PRI/ATS

Jason


Re: [PATCH v2 2/2] thunderbolt: Make iommu_dma_protection more accurate

2022-03-22 Thread Mika Westerberg
Hi Robin,

I tried this now on two Intel systems, one with integrated Thunderbolt
and one with discrete. There was a small issue (see below), but once
fixed it worked as expected :)

On Fri, Mar 18, 2022 at 05:42:58PM +, Robin Murphy wrote:
> Between me trying to get rid of iommu_present() and Mario wanting to
> support the AMD equivalent of DMAR_PLATFORM_OPT_IN, scrutiny has shown
> that the iommu_dma_protection attribute is being far too optimistic.
> Even if an IOMMU might be present for some PCI segment in the system,
> that doesn't necessarily mean it provides translation for the device(s)
> we care about. Furthermore, all that DMAR_PLATFORM_OPT_IN really does
> is tell us that memory was protected before the kernel was loaded, and
> prevent the user from disabling the intel-iommu driver entirely. While
> that lets us assume kernel integrity, what matters for actual runtime
> DMA protection is whether we trust individual devices, based on the
> "external facing" property that we expect firmware to describe for
> Thunderbolt ports.
> 
> It's proven challenging to determine the appropriate ports accurately
> given the variety of possible topologies, so while still not getting a
> perfect answer, by putting enough faith in firmware we can at least get
> a good bit closer. If we can see that any device near a Thunderbolt NHI
> has all the requisites for Kernel DMA Protection, chances are that it
> *is* a relevant port, but moreover that implies that firmware is playing
> the game overall, so we'll use that to assume that all Thunderbolt ports
> should be correctly marked and thus will end up fully protected.
> 
> CC: Mario Limonciello 
> Signed-off-by: Robin Murphy 
> ---
> 
> v2: Give up trying to look for specific devices, just look for evidence
> that firmware cares at all.
> 
>  drivers/thunderbolt/domain.c | 12 +++
>  drivers/thunderbolt/nhi.c| 41 
>  include/linux/thunderbolt.h  |  2 ++
>  3 files changed, 46 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/thunderbolt/domain.c b/drivers/thunderbolt/domain.c
> index 7018d959f775..2889a214dadc 100644
> --- a/drivers/thunderbolt/domain.c
> +++ b/drivers/thunderbolt/domain.c
> @@ -7,9 +7,7 @@
>   */
>  
>  #include 
> -#include 
>  #include 
> -#include 
>  #include 
>  #include 
>  #include 
> @@ -257,13 +255,9 @@ static ssize_t iommu_dma_protection_show(struct device *dev,
>  struct device_attribute *attr,
>  char *buf)
>  {
> - /*
> -  * Kernel DMA protection is a feature where Thunderbolt security is
> -  * handled natively using IOMMU. It is enabled when IOMMU is
> -  * enabled and ACPI DMAR table has DMAR_PLATFORM_OPT_IN set.
> -  */
> - return sprintf(buf, "%d\n",
> -iommu_present(&pci_bus_type) && dmar_platform_optin());
> + struct tb *tb = container_of(dev, struct tb, dev);
> +
> + return sysfs_emit(buf, "%d\n", tb->nhi->iommu_dma_protection);
>  }
>  static DEVICE_ATTR_RO(iommu_dma_protection);
>  
> diff --git a/drivers/thunderbolt/nhi.c b/drivers/thunderbolt/nhi.c
> index c73da0532be4..9e396e283792 100644
> --- a/drivers/thunderbolt/nhi.c
> +++ b/drivers/thunderbolt/nhi.c
> @@ -14,6 +14,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -1102,6 +1103,45 @@ static void nhi_check_quirks(struct tb_nhi *nhi)
>   nhi->quirks |= QUIRK_AUTO_CLEAR_INT;
>  }
>  
> +static int nhi_check_iommu_pdev(struct pci_dev *pdev, void *data)
> +{
> + if (!pdev->untrusted ||
> + !dev_iommu_capable(&pdev->dev, IOMMU_CAP_PRE_BOOT_PROTECTION))

This one needs to take pdev->external_facing into account too,
because most of the time there are no existing tunnels when the driver
is loaded, so we only see the PCIe root/downstream port. I think this is
enough actually (see the full callback sketched below):

if (!pdev->external_facing ||
    !dev_iommu_capable(&pdev->dev, IOMMU_CAP_PRE_BOOT_PROTECTION))

> + return 0;
> + *(bool *)data = true;
> + return 1; /* Stop walking */
> +}
> +
> +static void nhi_check_iommu(struct tb_nhi *nhi)
> +{
> + struct pci_bus *bus = nhi->pdev->bus;
> + bool port_ok = false;
> +
> + /*
> +  * Ideally what we'd do here is grab every PCI device that
> +  * represents a tunnelling adapter for this NHI and check their
> +  * status directly, but unfortunately USB4 seems to make it
> +  * obnoxiously difficult to reliably make any correlation.
> +  *
> +  * So for now we'll have to bodge it... Hoping that the system
> +  * is at least sane enough that an adapter is in the same PCI
> +  * segment as its NHI, if we can find *something* on that segment
> +  * which meets the requirements for Kernel DMA Protection, we'll
> +  * take that to imply that firmware is aware and has (hopefully)
> +  * done the right thing in general. We need to know 
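
For reference, the callback with the suggested change folded in would look
roughly like this (a sketch only, assuming the helpers from the v2 patch
above):

static int nhi_check_iommu_pdev(struct pci_dev *pdev, void *data)
{
	/*
	 * Check the "external facing" property rather than pdev->untrusted,
	 * since no tunnels may exist yet when the driver loads and we may
	 * only see the PCIe root/downstream port.
	 */
	if (!pdev->external_facing ||
	    !dev_iommu_capable(&pdev->dev, IOMMU_CAP_PRE_BOOT_PROTECTION))
		return 0;
	*(bool *)data = true;
	return 1; /* Stop walking */
}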

Re: [PATCH v4 1/2] PCI: Rename "pci_dev->untrusted" to "pci_dev->poses_dma_risk"

2022-03-22 Thread Rafael J. Wysocki
On Tue, Mar 22, 2022 at 10:02 AM Christoph Hellwig  wrote:
>
> On Sat, Mar 19, 2022 at 11:29:05PM -0700, Rajat Jain wrote:
> > Rename the field to make it more clear, that the device can execute DMA
> > attacks on the system, and thus the system may need protection from
> > such attacks from this device.
> >
> > No functional change intended.
> >
> > Signed-off-by: Rajat Jain 
> > ---
> > v4: Initial version, created based on comments on other patch
>
> What a horrible name.  Why not untrusted_dma which captures the
> intent much better?

FWIW, I like this one much better too.


Re: [PATCH RFC 10/11] iommu: Make IOPF handling framework generic

2022-03-22 Thread Jean-Philippe Brucker
On Tue, Mar 22, 2022 at 10:24:26AM +, Tian, Kevin wrote:
> > From: Jean-Philippe Brucker 
> > Sent: Tuesday, March 22, 2022 6:06 PM
> > 
> > On Tue, Mar 22, 2022 at 01:00:08AM +, Tian, Kevin wrote:
> > > > From: Jean-Philippe Brucker 
> > > > Sent: Monday, March 21, 2022 7:42 PM
> > > >
> > > > Hi Kevin,
> > > >
> > > > On Mon, Mar 21, 2022 at 08:09:36AM +, Tian, Kevin wrote:
> > > > > > From: Lu Baolu 
> > > > > > Sent: Sunday, March 20, 2022 2:40 PM
> > > > > >
> > > > > > The existing IOPF handling framework only handles the I/O page faults for
> > > > > > SVA. Given that we are able to link iommu domain with each I/O page fault,
> > > > > > we can now make the I/O page fault handling framework more general for
> > > > > > more types of page faults.
> > > > >
> > > > > "make ... generic" in subject line is kind of confusing. Reading this patch I
> > > > > think you really meant changing from per-device fault handling to per-domain
> > > > > fault handling. This is more accurate in concept since the fault is caused by
> > > > > the domain page table.
> > > >
> > > > I tend to disagree with that last part. The fault is caused by a specific
> > > > device accessing shared page tables. We should keep that device
> > > > information throughout the fault handling, so that we can report it to the
> > > > driver when things go wrong. A process can have multiple threads bound to
> > > > different devices, they share the same mm so if the driver wanted to
> > > > signal a misbehaving thread, similarly to a SEGV on the CPU side, it would
> > > > need the device information to precisely report it to userspace.
> > > >
> > >
> > > iommu driver can include the device information in the fault data. But
> > > in concept the IOPF should be reported per domain.
> > 
> > So I don't remember where we left off on that topic, what about fault
> > injection into guests?  In that case device info is more than just
> > diagnostic, fault injection can't work without it. I think we talked about
> > passing a device cookie to userspace, just want to make sure.
> > 
> > > and I agree with Jason that at most we can send SEGV to the entire thread
> > > group since there is no way to associate a DMA back to a thread which
> > > initiates the DMA.
> > 
> > The point is providing the most accurate information to the device driver
> > for diagnostics and debugging. A process opens multiple queues to
> > different devices, then if one of the queues issues invalid DMA, the
> > driver won't even know which queue is broken if you only report the target
> > mm and not the source dev. I don't think we gain anything from discarding
> > the device information from the fault path.
> > 
> 
> In case I didn't make it clear, what I talked about is just having the iommu
> core report IOPF to a per-domain handler vs. a per-device handler; this
> design choice doesn't change what the fault data should include (device,
> pasid, addr, etc.), i.e. it always includes all the information provided by the
> iommu driver no matter how the fault is reported upwards.

Right thanks, I misunderstood.

Thanks,
Jean

> 
> e.g. with iommufd it is iommufd that registers an IOPF handler per managed
> domain and receives IOPFs on those domains. If necessary, iommufd further
> forwards them to userspace, including a device cookie, according to the
> fault data.
> 
> Thanks
> Kevin

RE: [PATCH RFC 10/11] iommu: Make IOPF handling framework generic

2022-03-22 Thread Tian, Kevin
> From: Jean-Philippe Brucker 
> Sent: Tuesday, March 22, 2022 6:06 PM
> 
> On Tue, Mar 22, 2022 at 01:00:08AM +, Tian, Kevin wrote:
> > > From: Jean-Philippe Brucker 
> > > Sent: Monday, March 21, 2022 7:42 PM
> > >
> > > Hi Kevin,
> > >
> > > On Mon, Mar 21, 2022 at 08:09:36AM +, Tian, Kevin wrote:
> > > > > From: Lu Baolu 
> > > > > Sent: Sunday, March 20, 2022 2:40 PM
> > > > >
> > > > > The existing IOPF handling framework only handles the I/O page faults for
> > > > > SVA. Given that we are able to link iommu domain with each I/O page fault,
> > > > > we can now make the I/O page fault handling framework more general for
> > > > > more types of page faults.
> > > >
> > > > "make ... generic" in subject line is kind of confusing. Reading this patch I
> > > > think you really meant changing from per-device fault handling to per-domain
> > > > fault handling. This is more accurate in concept since the fault is caused by
> > > > the domain page table.
> > >
> > > I tend to disagree with that last part. The fault is caused by a specific
> > > device accessing shared page tables. We should keep that device
> > > information throughout the fault handling, so that we can report it to the
> > > driver when things go wrong. A process can have multiple threads bound to
> > > different devices, they share the same mm so if the driver wanted to
> > > signal a misbehaving thread, similarly to a SEGV on the CPU side, it would
> > > need the device information to precisely report it to userspace.
> > >
> >
> > iommu driver can include the device information in the fault data. But
> > in concept the IOPF should be reported per domain.
> 
> So I don't remember where we left off on that topic, what about fault
> injection into guests?  In that case device info is more than just
> diagnostic, fault injection can't work without it. I think we talked about
> passing a device cookie to userspace, just want to make sure.
> 
> > and I agree with Jason that at most we can send SEGV to the entire thread
> > group since there is no way to associate a DMA back to a thread which
> > initiates the DMA.
> 
> The point is providing the most accurate information to the device driver
> for diagnostics and debugging. A process opens multiple queues to
> different devices, then if one of the queues issues invalid DMA, the
> driver won't even know which queue is broken if you only report the target
> mm and not the source dev. I don't think we gain anything from discarding
> the device information from the fault path.
> 

In case I didn't make it clear, what I talked about is just having the iommu
core report IOPF to a per-domain handler vs. a per-device handler; this
design choice doesn't change what the fault data should include (device,
pasid, addr, etc.), i.e. it always includes all the information provided by the
iommu driver no matter how the fault is reported upwards.

e.g. with iommufd it is iommufd that registers an IOPF handler per managed
domain and receives IOPFs on those domains. If necessary, iommufd further
forwards them to userspace, including a device cookie, according to the
fault data.

Thanks
Kevin
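
To make the shape of this concrete, a per-domain registration could look
roughly like the sketch below. The names are invented purely for
illustration and are not the API proposed in the RFC:

#include <linux/iommu.h>

/*
 * Hypothetical sketch only: invented names, not the RFC's actual API.
 * The fault data keeps the originating device so that a consumer such as
 * iommufd can forward a device cookie to userspace along with the fault.
 */
struct iopf_fault_data {
	struct device *dev;	/* originating device, kept for diagnostics */
	u32 pasid;		/* PASID of the faulting context, if valid */
	u64 addr;		/* faulting I/O virtual address */
};

typedef enum iommu_page_response_code
	(*iopf_domain_handler_t)(struct iopf_fault_data *data, void *cookie);

/*
 * One handler per managed domain: all faults hitting the domain's page
 * table are routed here, regardless of which device triggered them.
 */
int iopf_domain_register_handler(struct iommu_domain *domain,
				 iopf_domain_handler_t handler, void *cookie);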

Re: [PATCH RFC 10/11] iommu: Make IOPF handling framework generic

2022-03-22 Thread Jean-Philippe Brucker
On Tue, Mar 22, 2022 at 01:00:08AM +, Tian, Kevin wrote:
> > From: Jean-Philippe Brucker 
> > Sent: Monday, March 21, 2022 7:42 PM
> > 
> > Hi Kevin,
> > 
> > On Mon, Mar 21, 2022 at 08:09:36AM +, Tian, Kevin wrote:
> > > > From: Lu Baolu 
> > > > Sent: Sunday, March 20, 2022 2:40 PM
> > > >
> > > > The existing IOPF handling framework only handles the I/O page faults for
> > > > SVA. Given that we are able to link iommu domain with each I/O page fault,
> > > > we can now make the I/O page fault handling framework more general for
> > > > more types of page faults.
> > >
> > > "make ... generic" in subject line is kind of confusing. Reading this patch I
> > > think you really meant changing from per-device fault handling to per-domain
> > > fault handling. This is more accurate in concept since the fault is caused by
> > > the domain page table.
> > 
> > I tend to disagree with that last part. The fault is caused by a specific
> > device accessing shared page tables. We should keep that device
> > information throughout the fault handling, so that we can report it to the
> > driver when things go wrong. A process can have multiple threads bound to
> > different devices, they share the same mm so if the driver wanted to
> > signal a misbehaving thread, similarly to a SEGV on the CPU side, it would
> > need the device information to precisely report it to userspace.
> > 
> 
> iommu driver can include the device information in the fault data. But
> in concept the IOPF should be reported per domain.

So I don't remember where we left off on that topic, what about fault
injection into guests?  In that case device info is more than just
diagnostic, fault injection can't work without it. I think we talked about
passing a device cookie to userspace, just want to make sure.

> and I agree with Jason that at most we can send SEGV to the entire thread
> group since there is no way to associate a DMA back to a thread which 
> initiates the DMA.

The point is providing the most accurate information to the device driver
for diagnostics and debugging. A process opens multiple queues to
different devices, then if one of the queues issues invalid DMA, the
driver won't even know which queue is broken if you only report the target
mm and not the source dev. I don't think we gain anything from discarding
the device information from the fault path.

Thanks,
Jean

Re: [PATCH RFC 10/11] iommu: Make IOPF handling framework generic

2022-03-22 Thread Jean-Philippe Brucker
On Tue, Mar 22, 2022 at 01:03:14PM +0800, Lu Baolu wrote:
> On 2022/3/21 20:43, Jason Gunthorpe wrote:
> > On Mon, Mar 21, 2022 at 11:42:16AM +, Jean-Philippe Brucker wrote:
> > 
> > > I tend to disagree with that last part. The fault is caused by a specific
> > > device accessing shared page tables. We should keep that device
> > > information throughout the fault handling, so that we can report it to the
> > > driver when things go wrong.
> > SVA faults should never be reported to drivers??
> > 
> 
> When things go wrong, the corresponding response code will be returned
> to the device through iommu_page_response(). The hardware should then
> report the failure to the device driver and the device driver will
> handle it in a device-specific way. There's no need to propagate the
> I/O page faults to the device driver in any case. Do I understand it
> right?

In theory yes, but devices don't necessarily have the ability to report
precise errors, we may have more information.

Thanks,
Jean



Re: [PATCH v2 1/2] iommu: Add capability for pre-boot DMA protection

2022-03-22 Thread Robin Murphy

On 2022-03-22 09:14, Christoph Hellwig wrote:
> On Fri, Mar 18, 2022 at 05:42:57PM +, Robin Murphy wrote:
> > VT-d's dmar_platform_optin() actually represents a combination of
> > properties fairly well standardised by Microsoft as "Pre-boot DMA
> > Protection" and "Kernel DMA Protection"[1]. As such, we can provide
> > interested consumers with an abstracted capability rather than
> > driver-specific interfaces that won't scale. We name it for the former
> > aspect since that's what external callers are most likely to be
> > interested in; the latter is for the IOMMU layer to handle itself.
> > 
> > Also use this as an opportunity to draw a line in the sand and add a
> > new interface so as not to introduce any more callers of iommu_capable()
> > which I also want to get rid of. For now it's a quick'n'dirty wrapper
> > function, but will evolve to subsume the internal interface in future.
> > 
> > [1] 
> > https://docs.microsoft.com/en-us/windows-hardware/design/device-experiences/oem-kernel-dma-protection
> > 
> > Suggested-by: Christoph Hellwig 
> > Signed-off-by: Robin Murphy 
> 
> I can't really think of a way in which I suggested this, but it does
> look like a good interface:

Well, you were the first to say it should be abstracted[1], and since my
initial thought that it could be hidden completely didn't pan out, I
felt I should give you credit for being right all along :)

> Reviewed-by: Christoph Hellwig 

Thanks!

Robin.

[1] https://lore.kernel.org/linux-iommu/yjdduuez%2fdvuz...@infradead.org/


Re: [PATCH v2 2/2] thunderbolt: Make iommu_dma_protection more accurate

2022-03-22 Thread Christoph Hellwig
On Fri, Mar 18, 2022 at 05:42:58PM +, Robin Murphy wrote:
> Between me trying to get rid of iommu_present() and Mario wanting to
> support the AMD equivalent of DMAR_PLATFORM_OPT_IN, scrutiny has shown
> that the iommu_dma_protection attribute is being far too optimistic.
> Even if an IOMMU might be present for some PCI segment in the system,
> that doesn't necessarily mean it provides translation for the device(s)
> we care about. Furthermore, all that DMAR_PLATFORM_OPT_IN really does
> is tell us that memory was protected before the kernel was loaded, and
> prevent the user from disabling the intel-iommu driver entirely. While
> that lets us assume kernel integrity, what matters for actual runtime
> DMA protection is whether we trust individual devices, based on the
> "external facing" property that we expect firmware to describe for
> Thunderbolt ports.
> 
> It's proven challenging to determine the appropriate ports accurately
> given the variety of possible topologies, so while still not getting a
> perfect answer, by putting enough faith in firmware we can at least get
> a good bit closer. If we can see that any device near a Thunderbolt NHI
> has all the requisites for Kernel DMA Protection, chances are that it
> *is* a relevant port, but moreover that implies that firmware is playing
> the game overall, so we'll use that to assume that all Thunderbolt ports
> should be correctly marked and thus will end up fully protected.
> 
> CC: Mario Limonciello 
> Signed-off-by: Robin Murphy 

Looks sensible to me:

Acked-by: Christoph Hellwig 


Re: [PATCH v2 1/2] iommu: Add capability for pre-boot DMA protection

2022-03-22 Thread Christoph Hellwig
On Fri, Mar 18, 2022 at 05:42:57PM +, Robin Murphy wrote:
> VT-d's dmar_platform_optin() actually represents a combination of
> properties fairly well standardised by Microsoft as "Pre-boot DMA
> Protection" and "Kernel DMA Protection"[1]. As such, we can provide
> interested consumers with an abstracted capability rather than
> driver-specific interfaces that won't scale. We name it for the former
> aspect since that's what external callers are most likely to be
> interested in; the latter is for the IOMMU layer to handle itself.
> 
> Also use this as an opportunity to draw a line in the sand and add a
> new interface so as not to introduce any more callers of iommu_capable()
> which I also want to get rid of. For now it's a quick'n'dirty wrapper
> function, but will evolve to subsume the internal interface in future.
> 
> [1] 
> https://docs.microsoft.com/en-us/windows-hardware/design/device-experiences/oem-kernel-dma-protection
> 
> Suggested-by: Christoph Hellwig 
> Signed-off-by: Robin Murphy 

I can't really think of a way in which I suggested this, but it does
look like a good interface:

Reviewed-by: Christoph Hellwig 
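
For context, the consumer side of the new wrapper reduces to a per-device
query; a minimal sketch using the helper from this series (as exercised by
the companion Thunderbolt patch):

#include <linux/iommu.h>
#include <linux/pci.h>

/*
 * Sketch: ask the IOMMU layer, for one specific device, whether the
 * pre-boot DMA protection requirements are satisfied.
 */
static bool dev_has_preboot_dma_protection(struct pci_dev *pdev)
{
	return dev_iommu_capable(&pdev->dev, IOMMU_CAP_PRE_BOOT_PROTECTION);
}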


Re: [PATCH v4 1/2] PCI: Rename "pci_dev->untrusted" to "pci_dev->poses_dma_risk"

2022-03-22 Thread Christoph Hellwig
On Sat, Mar 19, 2022 at 11:29:05PM -0700, Rajat Jain wrote:
> Rename the field to make it more clear, that the device can execute DMA
> attacks on the system, and thus the system may need protection from
> such attacks from this device.
> 
> No functional change intended.
> 
> Signed-off-by: Rajat Jain 
> ---
> v4: Initial version, created based on comments on other patch

What a horrible name.  Why not untrusted_dma which captures the
intent much better?


RE: [PATCH v4 14/32] iommu: introduce iommu_domain_alloc_type and the KVM type

2022-03-22 Thread Tian, Kevin
> From: Jason Gunthorpe
> Sent: Monday, March 21, 2022 10:07 PM
> 
> On Sat, Mar 19, 2022 at 07:51:31AM +, Tian, Kevin wrote:
> > > From: Jason Gunthorpe 
> > > Sent: Friday, March 18, 2022 10:13 PM
> > >
> > > On Fri, Mar 18, 2022 at 02:23:57AM +, Tian, Kevin wrote:
> > >
> > > > Yes, that is another major part of the work besides the iommufd work. And
> > > > it is not compatible with KVM features which rely on the dynamic
> > > > manner of EPT. Though it is a bit questionable whether it's worth
> > > > doing so just to save memory footprint while losing other capabilities,
> > > > it is a requirement for some future security extension in the Intel trusted
> > > > computing architecture. And KVM has been pinning pages for SEV/TDX/etc.
> > > > today, thus some facilities can be reused. But I agree it is not a simple
> > > > task, thus we need to start discussion early to explore various gaps in
> > > > iommu and kvm.
> > >
> > > Yikes. IMHO this might work better going the other way: have KVM
> > > import the iommu_domain and use that as the KVM page table, rather
> > > than vice versa.
> > >
> > > The semantics are a heck of a lot clearer, and it is really obvious
> > > that a lot of KVM becomes disabled if you do this.
> > >
> >
> > This is an interesting angle to look at it. But given pinning is already
> > required in KVM to support SEV/TDX even w/o an assigned device, those
> > restrictions have to be understood by KVM MMU code, which makes
> > a KVM-managed page table under such restrictions closer to being
> > sharable with the IOMMU.
> 
> I thought the SEV/TDX stuff wasn't being done with pinning but via a
> memfd in a special mode that does sort of pin under the covers, but it
> is not necessarily a DMA pin. (it isn't even struct page memory, so
> I'm not even sure what pin means)
> 
> Certainly, there is no inherent problem with SEV/TDX having movable
> memory and KVM could conceivably handle this - but iommu cannot.
> 
> I would not make an equivalence between SEV/TDX and iommu, at least.
> 

Currently SEV does use a DMA pin, i.e. pin_user_pages() in sev_pin_memory().

I'm not sure whether it's a hardware limitation or just a software tradeoff
for simplicity. But having that code does imply that KVM has absorbed
certain restrictions due to that pinning.

But I agree they are not equivalent. e.g. suppose pinning is only applied to
private/encrypted memory in SEV/TDX while iommu requires pinning the
entire guest memory (if no IOPF support on device).

btw no matter whether it's KVM importing the iommu domain or iommufd
importing the KVM page table, in the end the KVM mmu needs to explicitly
mark its page table as shared with the IOMMU and enable all kinds of
restrictions to support that sharing.

Thanks
Kevin
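
For reference, the kind of long-term pinning referred to above looks
roughly like the following (a sketch in the style of sev_pin_memory(),
assuming pin_user_pages_fast() semantics; names are hypothetical):

#include <linux/mm.h>
#include <linux/slab.h>

/* Sketch: pin a range of guest memory for the long term. */
static struct page **pin_guest_range(unsigned long uaddr, unsigned long npages)
{
	struct page **pages;
	long pinned;

	pages = kvmalloc_array(npages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return NULL;

	/*
	 * FOLL_LONGTERM: the pages may stay pinned indefinitely, so the
	 * kernel migrates them out of CMA/ZONE_MOVABLE before pinning.
	 */
	pinned = pin_user_pages_fast(uaddr, npages,
				     FOLL_WRITE | FOLL_LONGTERM, pages);
	if (pinned != npages) {
		if (pinned > 0)
			unpin_user_pages(pages, pinned);
		kvfree(pages);
		return NULL;
	}
	return pages;
}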


[PATCH v2] iommu/vt-d: calculate mask for non-aligned flushes

2022-03-22 Thread David Stevens
From: David Stevens 

Calculate the appropriate mask for non-size-aligned page selective
invalidation. Since psi uses the mask value to mask out the lower order
bits of the target address, properly flushing the iotlb requires using a
mask value such that [pfn, pfn+pages) all lie within the flushed
size-aligned region.  This is not normally an issue because iova.c
always allocates iovas that are aligned to their size. However, iovas
which come from other sources (e.g. userspace via VFIO) may not be
aligned.

Signed-off-by: David Stevens 
---
v1 -> v2:
 - Calculate an appropriate mask for non-size-aligned iovas instead
   of falling back to domain selective flush.

 drivers/iommu/intel/iommu.c | 27 ---
 1 file changed, 24 insertions(+), 3 deletions(-)

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 5b196cfe9ed2..ab2273300346 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -1717,7 +1717,8 @@ static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
  unsigned long pfn, unsigned int pages,
  int ih, int map)
 {
-   unsigned int mask = ilog2(__roundup_pow_of_two(pages));
+   unsigned int aligned_pages = __roundup_pow_of_two(pages);
+   unsigned int mask = ilog2(aligned_pages);
uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
u16 did = domain->iommu_did[iommu->seq_id];
 
@@ -1729,10 +1730,30 @@ static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
if (domain_use_first_level(domain)) {
domain_flush_piotlb(iommu, domain, addr, pages, ih);
} else {
+   unsigned long bitmask = aligned_pages - 1;
+
+   /*
+* PSI masks the low order bits of the base address. If the
+* address isn't aligned to the mask, then compute a mask value
+* needed to ensure the target range is flushed.
+*/
+   if (unlikely(bitmask & pfn)) {
+   unsigned long end_pfn = pfn + pages - 1, shared_bits;
+
+   /*
+* Since end_pfn <= pfn + bitmask, the only way bits
+* higher than bitmask can differ in pfn and end_pfn is
+* by carrying. This means after masking out bitmask,
+* high bits starting with the first set bit in
+* shared_bits are all equal in both pfn and end_pfn.
+*/
+   shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
+   mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
+   }
+
/*
 * Fallback to domain selective flush if no PSI support or
-* the size is too big. PSI requires page size to be 2 ^ x,
-* and the base address is naturally aligned to the size.
+* the size is too big.
 */
if (!cap_pgsel_inv(iommu->cap) ||
mask > cap_max_amask_val(iommu->cap))
-- 
2.35.1.894.gb6a874cedc-goog
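
To sanity-check the computation, the same logic can be modelled in a small
standalone program (a sketch that substitutes compiler builtins for the
kernel's ilog2()/__ffs() helpers and assumes 64-bit unsigned long):

#include <assert.h>
#include <stdio.h>

/* Userspace model of the PSI mask computation from the patch above. */
static unsigned int psi_mask(unsigned long pfn, unsigned long pages)
{
	/* __roundup_pow_of_two(pages) */
	unsigned long aligned_pages = pages == 1 ? 1 :
			1UL << (64 - __builtin_clzl(pages - 1));
	unsigned long bitmask = aligned_pages - 1;
	unsigned int mask = __builtin_ctzl(aligned_pages);	/* ilog2() */

	if (bitmask & pfn) {
		unsigned long end_pfn = pfn + pages - 1;
		unsigned long shared_bits = ~(pfn ^ end_pfn) & ~bitmask;

		mask = shared_bits ? __builtin_ctzl(shared_bits) : 64;
	}
	return mask;
}

int main(void)
{
	/*
	 * Unaligned case: flushing [5, 6] needs mask 2 (region [4, 7]),
	 * since mask 1 would only cover [4, 5] and miss pfn 6.
	 */
	assert(psi_mask(5, 2) == 2);
	/* Size-aligned case keeps the old behaviour: mask = ilog2(16). */
	assert(psi_mask(0x1000, 16) == 4);
	printf("ok\n");
	return 0;
}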
