[PATCH v2] ACPI: VIOT: Fix ACS setup

2022-06-30 Thread Eric Auger
Currently acpi_viot_init() gets called after the PCI devices
have been scanned and pci_enable_acs() has been called. So
pci_request_acs() is not taken into account, leading to wrong
single-IOMMU-group topologies, for instance when dealing with
multi-function root ports.

We cannot simply move acpi_viot_init() earlier, as is done for
the IORT init, because the VIOT parsing relies on the PCI scan.
However we can detect earlier that a VIOT table is present and,
in that case, request ACS. Introduce a new acpi_viot_early_init()
routine that calls pci_request_acs() before the scan.

While at it, guard the call to pci_request_acs() with #ifdef
CONFIG_PCI.

Fixes: 3cf485540e7b ("ACPI: Add driver for the VIOT table")
Signed-off-by: Eric Auger 
Reported-by: Jin Liu 

---

v1 -> v2:
- guard acpi_viot_early_init() content with #ifdef CONFIG_PCI
- do not call acpi_put_table() if acpi_get_table() failed
- change declaration order of acpi_viot_early_init() stub
---
 drivers/acpi/bus.c        |  1 +
 drivers/acpi/viot.c       | 26 ++++++++++++++++++++------
 include/linux/acpi_viot.h |  2 ++
 3 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index 86fa61a21826..906ad8153fd9 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -1400,6 +1400,7 @@ static int __init acpi_init(void)
 
pci_mmcfg_late_init();
acpi_iort_init();
+   acpi_viot_early_init();
acpi_hest_init();
acpi_ghes_init();
acpi_scan_init();
diff --git a/drivers/acpi/viot.c b/drivers/acpi/viot.c
index d2256326c73a..647f11cf165d 100644
--- a/drivers/acpi/viot.c
+++ b/drivers/acpi/viot.c
@@ -248,6 +248,26 @@ static int __init viot_parse_node(const struct acpi_viot_header *hdr)
return ret;
 }
 
+/**
+ * acpi_viot_early_init - Test the presence of VIOT and enable ACS
+ *
+ * If the VIOT does exist, ACS must be enabled. This cannot be
+ * done in acpi_viot_init() which is called after the bus scan
+ */
+void __init acpi_viot_early_init(void)
+{
+#ifdef CONFIG_PCI
+   acpi_status status;
+   struct acpi_table_header *hdr;
+
+   status = acpi_get_table(ACPI_SIG_VIOT, 0, &hdr);
+   if (ACPI_FAILURE(status))
+   return;
+   pci_request_acs();
+   acpi_put_table(hdr);
+#endif
+}
+
 /**
  * acpi_viot_init - Parse the VIOT table
  *
@@ -319,12 +339,6 @@ static int viot_pci_dev_iommu_init(struct pci_dev *pdev, u16 dev_id, void *data)
epid = ((domain_nr - ep->segment_start) << 16) +
dev_id - ep->bdf_start + ep->endpoint_id;
 
-   /*
-* If we found a PCI range managed by the viommu, we're
-* the one that has to request ACS.
-*/
-   pci_request_acs();
-
	return viot_dev_iommu_init(&pdev->dev, ep->viommu,
   epid);
}
diff --git a/include/linux/acpi_viot.h b/include/linux/acpi_viot.h
index 1eb8ee5b0e5f..a5a122431563 100644
--- a/include/linux/acpi_viot.h
+++ b/include/linux/acpi_viot.h
@@ -6,9 +6,11 @@
 #include 
 
 #ifdef CONFIG_ACPI_VIOT
+void __init acpi_viot_early_init(void);
 void __init acpi_viot_init(void);
 int viot_iommu_configure(struct device *dev);
 #else
+static inline void acpi_viot_early_init(void) {}
 static inline void acpi_viot_init(void) {}
 static inline int viot_iommu_configure(struct device *dev)
 {
-- 
2.35.3
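
For context on why the ordering matters: pci_request_acs() only latches
a flag that the PCI core consults while scanning each device, so
requesting ACS after the scan has no effect. A minimal sketch of the
mechanism, paraphrased from drivers/pci/pci.c (simplified, not the
exact kernel code):

/* pci_request_acs() just sets a flag ... */
static bool pci_acs_enable;

void pci_request_acs(void)
{
	pci_acs_enable = true;
}

/* ... which pci_enable_acs() reads as each device is scanned */
static void pci_enable_acs(struct pci_dev *dev)
{
	if (!pci_acs_enable)
		return;	/* nobody requested ACS before the scan */

	/* ... program the device's ACS capability ... */
}

Hence acpi_viot_early_init() must run before acpi_scan_init() for the
request to be honoured.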



Re: [PATCH] ACPI: VIOT: Fix ACS setup

2022-06-29 Thread Eric Auger
Hi Jean

On 6/29/22 11:14, Jean-Philippe Brucker wrote:
> Hi Eric,
> 
> On Mon, Jun 27, 2022 at 02:55:34PM +0200, Eric Auger wrote:
>> Currently acpi_viot_init() gets called after the PCI devices
>> have been scanned and pci_enable_acs() has been called. So
>> pci_request_acs() is not taken into account, leading to wrong
>> single-IOMMU-group topologies, for instance when dealing with
>> multi-function root ports.
>>
>> We cannot simply move acpi_viot_init() earlier, as is done for
>> the IORT init, because the VIOT parsing relies on the PCI scan.
>> However we can detect earlier that a VIOT table is present and,
>> in that case, request ACS. Introduce a new acpi_viot_early_init()
>> routine that calls pci_request_acs() before the scan.
>>
>> Fixes: 3cf485540e7b ("ACPI: Add driver for the VIOT table")
>> Signed-off-by: Eric Auger 
>> Reported-by: Jin Liu 
> 
> Thanks for the fix, the patch makes sense and fixes the issue.
> 
> I wondered whether we should keep the logic where we only request ACS if
> an IOMMU is found to manage a PCI range, but I can't see any harm in
> requesting it regardless (plus there is a precedent with AMD IOMMU).
Yes, that's what I saw too.
> I could imagine some VMM wanting to only put an IOMMU in front of its MMIO
> devices and leave PCI to roam free, but that seems like a stretch.
> 
> There is another issue with the existing code, though: we can't call
> pci_request_acs() when CONFIG_PCI is disabled because no stub is defined.
> Could you wrap the call in an #ifdef?
sure
> 
>> ---
>>  drivers/acpi/bus.c        |  1 +
>>  drivers/acpi/viot.c       | 23 +++++++++++++++++++------
>>  include/linux/acpi_viot.h |  2 ++
>>  3 files changed, 20 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
>> index 86fa61a21826..906ad8153fd9 100644
>> --- a/drivers/acpi/bus.c
>> +++ b/drivers/acpi/bus.c
>> @@ -1400,6 +1400,7 @@ static int __init acpi_init(void)
>>  
>>  pci_mmcfg_late_init();
>>  acpi_iort_init();
>> +acpi_viot_early_init();
>>  acpi_hest_init();
>>  acpi_ghes_init();
>>  acpi_scan_init();
>> diff --git a/drivers/acpi/viot.c b/drivers/acpi/viot.c
>> index d2256326c73a..3c1be123e4d6 100644
>> --- a/drivers/acpi/viot.c
>> +++ b/drivers/acpi/viot.c
>> @@ -248,6 +248,23 @@ static int __init viot_parse_node(const struct acpi_viot_header *hdr)
>>  return ret;
>>  }
>>  
>> +/**
>> + * acpi_viot_early_init - Test the presence of VIOT and enable ACS
>> + *
>> + * If the VIOT does exist, ACS must be enabled. This cannot be
>> + * done in acpi_viot_init() which is called after the bus scan
>> + */
>> +void __init acpi_viot_early_init(void)
>> +{
>> +acpi_status status;
>> +struct acpi_table_header *hdr;
>> +
>> +status = acpi_get_table(ACPI_SIG_VIOT, 0, &hdr);
>> +if (!ACPI_FAILURE(status))
>> +pci_request_acs();
>> +acpi_put_table(hdr);
> 
> I'd rather not call acpi_put_table() in case of failure. I know it is
> handled but it looks fragile and I couldn't find any other user of
> acpi_get_table() doing this.
OK
> 
>> +}
>> +
>>  /**
>>   * acpi_viot_init - Parse the VIOT table
>>   *
>> @@ -319,12 +336,6 @@ static int viot_pci_dev_iommu_init(struct pci_dev *pdev, u16 dev_id, void *data)
>>  epid = ((domain_nr - ep->segment_start) << 16) +
>>  dev_id - ep->bdf_start + ep->endpoint_id;
>>  
>> -/*
>> - * If we found a PCI range managed by the viommu, we're
>> - * the one that has to request ACS.
>> - */
>> -pci_request_acs();
>> -
>>  return viot_dev_iommu_init(&pdev->dev, ep->viommu,
>> epid);
>>  }
>> diff --git a/include/linux/acpi_viot.h b/include/linux/acpi_viot.h
>> index 1eb8ee5b0e5f..e58d60f8ff2e 100644
>> --- a/include/linux/acpi_viot.h
>> +++ b/include/linux/acpi_viot.h
>> @@ -6,10 +6,12 @@
>>  #include 
>>  
>>  #ifdef CONFIG_ACPI_VIOT
>> +void __init acpi_viot_early_init(void);
>>  void __init acpi_viot_init(void);
>>  int viot_iommu_configure(struct device *dev);
>>  #else
>>  static inline void acpi_viot_init(void) {}
>> +static inline void acpi_viot_early_init(void) {}
> 
> nit: different declaration order
OK

Thanks

Eric
> 
> Thanks,
> Jean
> 
> 
>>  static inline int viot_iommu_configure(struct device *dev)
>>  {
>>  return -ENODEV;
>> -- 
>> 2.35.3
>>


[PATCH] ACPI: VIOT: Fix ACS setup

2022-06-27 Thread Eric Auger
Currently acpi_viot_init() gets called after the PCI devices
have been scanned and pci_enable_acs() has been called. So
pci_request_acs() is not taken into account, leading to wrong
single-IOMMU-group topologies, for instance when dealing with
multi-function root ports.

We cannot simply move acpi_viot_init() earlier, as is done for
the IORT init, because the VIOT parsing relies on the PCI scan.
However we can detect earlier that a VIOT table is present and,
in that case, request ACS. Introduce a new acpi_viot_early_init()
routine that calls pci_request_acs() before the scan.

Fixes: 3cf485540e7b ("ACPI: Add driver for the VIOT table")
Signed-off-by: Eric Auger 
Reported-by: Jin Liu 
---
 drivers/acpi/bus.c        |  1 +
 drivers/acpi/viot.c       | 23 +++++++++++++++++++------
 include/linux/acpi_viot.h |  2 ++
 3 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index 86fa61a21826..906ad8153fd9 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -1400,6 +1400,7 @@ static int __init acpi_init(void)
 
pci_mmcfg_late_init();
acpi_iort_init();
+   acpi_viot_early_init();
acpi_hest_init();
acpi_ghes_init();
acpi_scan_init();
diff --git a/drivers/acpi/viot.c b/drivers/acpi/viot.c
index d2256326c73a..3c1be123e4d6 100644
--- a/drivers/acpi/viot.c
+++ b/drivers/acpi/viot.c
@@ -248,6 +248,23 @@ static int __init viot_parse_node(const struct acpi_viot_header *hdr)
return ret;
 }
 
+/**
+ * acpi_viot_early_init - Test the presence of VIOT and enable ACS
+ *
+ * If the VIOT does exist, ACS must be enabled. This cannot be
+ * done in acpi_viot_init() which is called after the bus scan
+ */
+void __init acpi_viot_early_init(void)
+{
+   acpi_status status;
+   struct acpi_table_header *hdr;
+
+   status = acpi_get_table(ACPI_SIG_VIOT, 0, &hdr);
+   if (!ACPI_FAILURE(status))
+   pci_request_acs();
+   acpi_put_table(hdr);
+}
+
 /**
  * acpi_viot_init - Parse the VIOT table
  *
@@ -319,12 +336,6 @@ static int viot_pci_dev_iommu_init(struct pci_dev *pdev, u16 dev_id, void *data)
epid = ((domain_nr - ep->segment_start) << 16) +
dev_id - ep->bdf_start + ep->endpoint_id;
 
-   /*
-* If we found a PCI range managed by the viommu, we're
-* the one that has to request ACS.
-*/
-   pci_request_acs();
-
	return viot_dev_iommu_init(&pdev->dev, ep->viommu,
   epid);
}
diff --git a/include/linux/acpi_viot.h b/include/linux/acpi_viot.h
index 1eb8ee5b0e5f..e58d60f8ff2e 100644
--- a/include/linux/acpi_viot.h
+++ b/include/linux/acpi_viot.h
@@ -6,10 +6,12 @@
 #include 
 
 #ifdef CONFIG_ACPI_VIOT
+void __init acpi_viot_early_init(void);
 void __init acpi_viot_init(void);
 int viot_iommu_configure(struct device *dev);
 #else
 static inline void acpi_viot_init(void) {}
+static inline void acpi_viot_early_init(void) {}
 static inline int viot_iommu_configure(struct device *dev)
 {
return -ENODEV;
-- 
2.35.3



Re: [PATCH] vfio: Remove VFIO_TYPE1_NESTING_IOMMU

2022-05-12 Thread Eric Auger
Hi,

On 5/10/22 20:13, Jason Gunthorpe wrote:
> On Tue, May 10, 2022 at 06:52:06PM +0100, Robin Murphy wrote:
>> On 2022-05-10 17:55, Jason Gunthorpe via iommu wrote:
>>> This control causes the ARM SMMU drivers to choose a stage 2
>>> implementation for the IO pagetable (vs the stage 1 usual default),
>>> however this choice has no visible impact to the VFIO user. Further qemu
>>> never implemented this and no other userspace user is known.
>>>
>>> The original description in commit f5c9ecebaf2a ("vfio/iommu_type1: add
>>> new VFIO_TYPE1_NESTING_IOMMU IOMMU type") suggested this was to "provide
>>> SMMU translation services to the guest operating system" however the rest
>>> of the API to set the guest table pointer for the stage 1 was never
>>> completed, or at least never upstreamed, rendering this part useless dead
>>> code.
>>>
>>> Since the current patches to enable nested translation, aka userspace page
>>> tables, rely on iommufd and will not use the enable_nesting()
>>> iommu_domain_op, remove this infrastructure. However, don't cut too deep
>>> into the SMMU drivers for now expecting the iommufd work to pick it up -
>>> we still need to create S2 IO page tables.
>>>
>>> Remove VFIO_TYPE1_NESTING_IOMMU and everything under it including the
>>> enable_nesting iommu_domain_op.
>>>
>>> Just in-case there is some userspace using this continue to treat
>>> requesting it as a NOP, but do not advertise support any more.
>> I assume the nested translation/guest SVA patches that Eric and Vivek were
>> working on pre-IOMMUFD made use of this, and given that they got quite far
>> along, I wouldn't be too surprised if some eager cloud vendors might have
>> even deployed something based on the patches off the list. 

thank you Robin for the heads up.
> With upstream there is no way to make use of this flag, if someone is
> using it they have other out of tree kernel, vfio, kvm and qemu
> patches to make it all work.
>
> You can see how much is still needed in Eric's tree:
>
> https://github.com/eauger/linux/commits/v5.15-rc7-nested-v16
>
>> I can't help feeling a little wary about removing this until IOMMUFD
>> can actually offer a functional replacement - is it in the way of
>> anything upcoming?
> From an upstream perspective if someone has a patched kernel to
> complete the feature, then they can patch this part in as well, we
> should not carry dead code like this in the kernel and in the uapi.

On the other hand, the code has been in the kernel for 8 years now; I
think we could wait some additional weeks/months until the iommufd
nested integration arrives and gets tested.

Thanks

Eric
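
For reference, the only way userspace could select this mode was through
the type1 container ioctls; a minimal sketch (container_fd is assumed to
be an open /dev/vfio/vfio file descriptor):

#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Sketch: how userspace requested the nesting type1 backend.  After the
 * removal, VFIO_CHECK_EXTENSION no longer advertises the type, while
 * VFIO_SET_IOMMU still accepts it and silently behaves as plain type1.
 */
static int request_nesting(int container_fd)
{
	if (ioctl(container_fd, VFIO_CHECK_EXTENSION,
		  VFIO_TYPE1_NESTING_IOMMU) <= 0)
		return -1;	/* not advertised */

	return ioctl(container_fd, VFIO_SET_IOMMU, VFIO_TYPE1_NESTING_IOMMU);
}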
>
> It is not directly in the way, but this needs to get done at some
> point, I'd rather just get it out of the way.
>
> Thanks,
> Jason
>



Re: [PATCH RFC 00/12] IOMMUFD Generic interface

2022-04-12 Thread Eric Auger
Hi Jason,

On 4/12/22 10:22 PM, Jason Gunthorpe wrote:
> On Tue, Apr 12, 2022 at 10:13:32PM +0200, Eric Auger wrote:
>> Hi,
>>
>> On 3/18/22 6:27 PM, Jason Gunthorpe wrote:
>>> iommufd is the user API to control the IOMMU subsystem as it relates to
>>> managing IO page tables that point at user space memory.
>>>
>>> It takes over from drivers/vfio/vfio_iommu_type1.c (aka the VFIO
>>> container) which is the VFIO specific interface for a similar idea.
>>>
>>> We see a broad need for extended features, some being highly IOMMU device
>>> specific:
>>>  - Binding iommu_domain's to PASID/SSID
>>>  - Userspace page tables, for ARM, x86 and S390
>>>  - Kernel bypass'd invalidation of user page tables
>>>  - Re-use of the KVM page table in the IOMMU
>>>  - Dirty page tracking in the IOMMU
>>>  - Runtime Increase/Decrease of IOPTE size
>>>  - PRI support with faults resolved in userspace
>> This series does not have any concept of group fds anymore and the API
>> is device oriented.
>> I have a question wrt pci bus reset capability.
>>
>> 8b27ee60bfd6 ("vfio-pci: PCI hot reset interface")
>> introduced VFIO_DEVICE_GET_PCI_HOT_RESET_INFO and VFIO_DEVICE_PCI_HOT_RESET
>>
>> Maybe we can reuse VFIO_DEVICE_GET_PCI_HOT_RESET_INFO to retrieve the 
>> devices and iommu groups that need to be checked and involved in the bus 
>> reset. If I understand correctly we now need to make sure the devices are 
>> handled in the same security context (bound to the same iommufd)
>>
>> however VFIO_DEVICE_PCI_HOT_RESET operates on a collection of group fds.
>>
>> How do you see the porting of this functionality onto /dev/iommu?
> I already made a patch that converts VFIO_DEVICE_PCI_HOT_RESET to work
> on a generic notion of a file and the underlying infrastructure to
> allow it to accept either a device or group fd.
>
> Same for the similar issue in KVM.
>
> It is part of three VFIO series I will be posting. First is up here:
>
> https://lore.kernel.org/kvm/0-v1-a8faf768d202+125dd-vfio_mdev_no_group_...@nvidia.com/
>
> Overall the strategy is to contain the vfio_group as an internal detail
> of vfio.ko and external interfaces use either a struct vfio_device *
> or a struct file *
Thank you for the quick reply. Yi and I will look at this series. I
guess we won't support the bus reset functionality in our first QEMU
port onto /dev/iommu until that code stabilizes.

Eric
>
> Jason
>



Re: [PATCH RFC 00/12] IOMMUFD Generic interface

2022-04-12 Thread Eric Auger
Hi,

On 3/18/22 6:27 PM, Jason Gunthorpe wrote:
> iommufd is the user API to control the IOMMU subsystem as it relates to
> managing IO page tables that point at user space memory.
>
> It takes over from drivers/vfio/vfio_iommu_type1.c (aka the VFIO
> container) which is the VFIO specific interface for a similar idea.
>
> We see a broad need for extended features, some being highly IOMMU device
> specific:
>  - Binding iommu_domain's to PASID/SSID
>  - Userspace page tables, for ARM, x86 and S390
>  - Kernel bypass'd invalidation of user page tables
>  - Re-use of the KVM page table in the IOMMU
>  - Dirty page tracking in the IOMMU
>  - Runtime Increase/Decrease of IOPTE size
>  - PRI support with faults resolved in userspace

This series does not have any concept of group fds anymore and the API
is device oriented.
I have a question wrt pci bus reset capability.

8b27ee60bfd6 ("vfio-pci: PCI hot reset interface")
introduced VFIO_DEVICE_GET_PCI_HOT_RESET_INFO and VFIO_DEVICE_PCI_HOT_RESET

Maybe we can reuse VFIO_DEVICE_GET_PCI_HOT_RESET_INFO to retrieve the devices 
and iommu groups that need to be checked and involved in the bus reset. If I 
understand correctly we now need to make sure the devices are handled in the 
same security context (bound to the same iommufd)

however VFIO_DEVICE_PCI_HOT_RESET operates on a collection of group fds.

How do you see the porting of this functionality onto /dev/iommu?

Thanks

Eric
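
For reference, the group-fd flow being asked about looks roughly like
this from userspace (a sketch against the type1-era uAPI in
linux/vfio.h; error handling elided, and the preceding
VFIO_DEVICE_GET_PCI_HOT_RESET_INFO call that enumerates the affected
devices/groups omitted):

#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Sketch: prove ownership of every affected group by passing one open
 * group fd per group, then trigger the bus reset through the device fd.
 */
static int pci_hot_reset(int device_fd, const int *group_fds, int ngroups)
{
	struct vfio_pci_hot_reset *reset;
	size_t sz = sizeof(*reset) + ngroups * sizeof(__s32);
	int i, ret;

	reset = calloc(1, sz);
	reset->argsz = sz;
	reset->count = ngroups;
	for (i = 0; i < ngroups; i++)
		reset->group_fds[i] = group_fds[i];

	ret = ioctl(device_fd, VFIO_DEVICE_PCI_HOT_RESET, reset);
	free(reset);
	return ret;
}

The open question above is what replaces the group_fds[] array once
ownership is modelled per device/iommufd rather than per group.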




>
> As well as a need to access these features beyond just VFIO, VDPA for
> instance, but other classes of accelerator HW are touching on these areas
> now too.
>
> The v1 series proposed re-using the VFIO type 1 data structure, however it
> was suggested that if we are doing this big update then we should also
> come with a data structure that solves the limitations that VFIO type1
> has. Notably this addresses:
>
>  - Multiple IOAS/'containers' and multiple domains inside a single FD
>
>  - Single-pin operation no matter how many domains and containers use
>a page
>
>  - A fine grained locking scheme supporting user managed concurrency for
>multi-threaded map/unmap
>
>  - A pre-registration mechanism to optimize vIOMMU use cases by
>pre-pinning pages
>
>  - Extended ioctl API that can manage these new objects and exposes
>domains directly to user space
>
>  - domains are sharable between subsystems, eg VFIO and VDPA
>
> The bulk of this code is a new data structure design to track how the
> IOVAs are mapped to PFNs.
>
> iommufd intends to be general and consumable by any driver that wants to
> DMA to userspace. From a driver perspective it can largely be dropped in
> in-place of iommu_attach_device() and provides a uniform full feature set
> to all consumers.
>
> As this is a larger project this series is the first step. This series
> provides the iommfd "generic interface" which is designed to be suitable
> for applications like DPDK and VMM flows that are not optimized to
> specific HW scenarios. It is close to being a drop in replacement for the
> existing VFIO type 1.
>
> This is part two of three for an initial sequence:
>  - Move IOMMU Group security into the iommu layer
>
> https://lore.kernel.org/linux-iommu/20220218005521.172832-1-baolu...@linux.intel.com/
>  * Generic IOMMUFD implementation
>  - VFIO ability to consume IOMMUFD
>An early exploration of this is available here:
> https://github.com/luxis1999/iommufd/commits/iommufd-v5.17-rc6
>
> Various parts of the above extended features are in WIP stages currently
> to define how their IOCTL interface should work.
>
> At this point, using the draft VFIO series, unmodified qemu has been
> tested to operate using iommufd on x86 and ARM systems.
>
> Several people have contributed directly to this work: Eric Auger, Kevin
> Tian, Lu Baolu, Nicolin Chen, Yi L Liu. Many more have participated in the
> discussions that lead here, and provided ideas. Thanks to all!
>
> This is on github: https://github.com/jgunthorpe/linux/commits/iommufd
>
> # S390 in-kernel page table walker
> Cc: Niklas Schnelle 
> Cc: Matthew Rosato 
> # AMD Dirty page tracking
> Cc: Joao Martins 
> # ARM SMMU Dirty page tracking
> Cc: Keqian Zhu 
> Cc: Shameerali Kolothum Thodi 
> # ARM SMMU nesting
> Cc: Eric Auger 
> Cc: Jean-Philippe Brucker 
> # Map/unmap performance
> Cc: Daniel Jordan 
> # VDPA
> Cc: "Michael S. Tsirkin" 
> Cc: Jason Wang 
> # Power
> Cc: David Gibson 
> # vfio
> Cc: Alex Williamson 
> Cc: Cornelia Huck 
> Cc: k...@vger.kernel.org
> # iommu
> Cc: iommu@lists.linux-foundation.org
> # Collaborators
> Cc: "Chaitanya Kulkarni" 
> Cc: Nicolin Chen 
> Cc: Lu Baolu 
>

Re: [PATCH RFC 11/12] iommufd: vfio container FD ioctl compatibility

2022-03-24 Thread Eric Auger
Hi,

On 3/24/22 1:33 AM, Jason Gunthorpe wrote:
> On Wed, Mar 23, 2022 at 04:51:25PM -0600, Alex Williamson wrote:
>
>> My overall question here would be whether we can actually achieve a
>> compatibility interface that has sufficient feature transparency that we
>> can dump vfio code in favor of this interface, or will there be enough
>> niche use cases that we need to keep type1 and vfio containers around
>> through a deprecation process?
> Other than SPAPR, I think we can.
>
>> The locked memory differences for one seem like something that
>> libvirt wouldn't want hidden
> I'm first interested to have an understanding how this change becomes
> a real problem in practice that requires libvirt to do something
> different for vfio or iommufd. We can discuss in the other thread
>
> If this is the make or break point then I think we can deal with it
> either by going back to what vfio does now or perhaps some other
> friendly compat approach..
>
>> and we have questions regarding support for vaddr hijacking
> I'm not sure what vaddr hijacking is? Do you mean
> VFIO_DMA_MAP_FLAG_VADDR ? There is a comment that outlines my plan to
> implement it in a functionally compatible way without the deadlock
> problem. I estimate this as a small project.
>
>> and different ideas how to implement dirty page tracking, 
> I don't think this is compatibility. No kernel today triggers qemu to
> use this feature as no kernel supports live migration. No existing
> qemu will trigger this feature with new kernels that support live
> migration v2. Therefore we can adjust qemu's dirty tracking at the
> same time we enable migration v2 in qemu.
>
> With Joao's work we are close to having a solid RFC to come with
> something that can be fully implemented.
>
> Hopefully we can agree to this soon enough that qemu can come with a
> full package of migration v2 support including the dirty tracking
> solution.
>
>> not to mention the missing features that are currently well used,
>> like p2p mappings, coherency tracking, mdev, etc.
> I consider these all mandatory things, they won't be left out.
>
> The reason they are not in the RFC is mostly because supporting them
> requires work outside just this iommufd area, and I'd like this series
> to remain self-contained.
>
> I've already got a draft to add DMABUF support to VFIO PCI which
> nicely solves the follow_pfn security problem, we want to do this for
> another reason already. I'm waiting for some testing feedback before
> posting it. Need some help from Daniel make the DMABUF revoke semantic
> him and I have been talking about. In the worst case can copy the
> follow_pfn approach.
>
> Intel no-snoop is simple enough, just needs some Intel cleanup parts.
>
> mdev will come along with the final VFIO integration, all the really
> hard parts are done already. The VFIO integration is a medium sized
> task overall.
>
> So, I'm not ready to give up yet :)
>
>> Where do we focus attention?  Is symlinking device files our proposal
>> to userspace and is that something achievable, or do we want to use
>> this compatibility interface as a means to test the interface and
>> allow userspace to make use of it for transition, if their use cases
>> allow it, perhaps eventually performing the symlink after deprecation
>> and eventual removal of the vfio container and type1 code?  Thanks,
> symlinking device files is definitely just a suggested way to expedite
> testing.
>
> Things like qemu that are learning to use iommufd-only features should
> learn to directly open iommufd instead of vfio container to activate
> those features.
>
> Looking long down the road I don't think we want to have type 1 and
> iommufd code forever. So, I would like to make an option to compile
> out vfio container support entirely and have that option arrange for
> iommufd to provide the container device node itself.
I am currently working on migrating the QEMU VFIO device onto the new
API because, after our discussions, the compat mode cannot be used to
implement nesting anyway. I hope I will be able to present something
next week.

Thanks

Eric
>
> I think we can get there pretty quickly, or at least I haven't got
> anything that is scaring me alot (beyond SPAPR of course)
>
> For the dpdk/etcs of the world I think we are already there.
>
> Jason
>



Re: [PATCH v8 00/11] ACPI/IORT: Support for IORT RMR node

2022-03-15 Thread Eric Auger
Hi Shameer,
On 2/21/22 4:43 PM, Shameer Kolothum wrote:
> Hi,
>
> Since we now have an updated version[0] of the IORT spec (E.d) which
> addresses the memory attributes issues discussed here [1],
> this series now makes use of it.
>
> The pull request for ACPICA E.d related changes are already
> raised and can be found here,
> https://github.com/acpica/acpica/pull/752
>
> v7 --> v8
>   - Patch #1 has temp definitions for RMR related changes till
> the ACPICA header changes are part of kernel.
>   - No early parsing of RMR node info and is only parsed at the
> time of use.
>   - Changes to the RMR get/put API format compared to the
> previous version.
>   - Support for RMR descriptor shared by multiple stream IDs.
>
> Please take a look and let me know your thoughts.
>
> Thanks,
> Shameer
> [0] https://developer.arm.com/documentation/den0049/ed/
> [1] https://lore.kernel.org/linux-acpi/20210805160319.GB23085@lpieralisi/
>
> From old:
> We have faced issues with 3408iMR RAID controller cards which
> fail to boot when SMMU is enabled. This is because these
> controllers make use of host memory for various caching related
> purposes and when SMMU is enabled the iMR firmware fails to
> access these memory regions as there is no mapping for them.
> IORT RMR provides a way for UEFI to describe and report these
> memory regions so that the kernel can make a unity mapping for
> these in SMMU.
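
In kernel terms, each RMR descriptor ends up expressed as a
direct-mapped reserved region, so the default DMA domain keeps a unity
(IOVA == PA) mapping over it; a sketch using the existing helper (prot
flags simplified; the actual patch derives them from the RMR node
attributes):

/* Sketch: an RMR becomes an IOMMU_RESV_DIRECT reserved region that the
 * IOMMU core identity-maps in the device's default domain.
 */
static struct iommu_resv_region *rmr_to_resv_region(u64 base, u64 length)
{
	return iommu_alloc_resv_region(base, length,
				       IOMMU_READ | IOMMU_WRITE,
				       IOMMU_RESV_DIRECT);
}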
>
> Change History:
>
> v6 --> v7
>  -fix pointed out by Steve to the SMMUv2 SMR bypass install in patch #8.
>
> v5 --> v6
> - Addressed comments from Robin & Lorenzo.
>   : Moved iort_parse_rmr() to acpi_iort_init() from
> iort_init_platform_devices().
>   : Removed use of struct iort_rmr_entry during the initial
> parse. Using struct iommu_resv_region instead.
>   : Report RMR address alignment and overlap errors, but continue.
>   : Reworked arm_smmu_init_bypass_stes() (patch # 6).
> - Updated SMMUv2 bypass SMR code. Thanks to Jon N (patch #8).
> - Set IOMMU protection flags(IOMMU_CACHE, IOMMU_MMIO) based
>   on Type of RMR region. Suggested by Jon N.
>
> v4 --> v5
>  -Added a fw_data union to struct iommu_resv_region and removed
>   struct iommu_rmr (Based on comments from Joerg/Robin).
>  -Added iommu_put_rmrs() to release mem.
>  -Thanks to Steve for verifying on SMMUv2, but not added the Tested-by
>   yet because of the above changes.
>
> v3 -->v4
> -Included the SMMUv2 SMR bypass install changes suggested by
>  Steve(patch #7)
> -As per Robin's comments, RMR reserve implementation is now
>  more generic  (patch #8) and dropped v3 patches 8 and 10.
> -Rebase to 5.13-rc1
>
> RFC v2 --> v3
>  -Dropped RFC tag as the ACPICA header changes are now ready to be
>   part of 5.13[0]. But this series still has a dependency on that patch.
>  -Added IORT E.b related changes(node flags, _DSM function 5 checks for
>   PCIe).
>  -Changed RMR to stream id mapping from M:N to M:1 as per the spec and
>   discussion here[1].
>  -Last two patches add support for SMMUv2(Thanks to Jon Nettleton!)
>
> Jon Nettleton (1):
>   iommu/arm-smmu: Get associated RMR info and install bypass SMR
>
> Shameer Kolothum (10):
>   ACPI/IORT: Add temporary RMR node flag definitions
>   iommu: Introduce a union to struct iommu_resv_region
>   ACPI/IORT: Add helper functions to parse RMR nodes
>   iommu/dma: Introduce generic helper to retrieve RMR info
>   ACPI/IORT: Add a helper to retrieve RMR memory regions
>   iommu/arm-smmu-v3: Introduce strtab init helper
>   iommu/arm-smmu-v3: Refactor arm_smmu_init_bypass_stes() to force
> bypass
>   iommu/arm-smmu-v3: Get associated RMR info and install bypass STE
>   iommu/arm-smmu-v3: Reserve any RMR regions associated with a dev
>   iommu/arm-smmu: Reserve any RMR regions associated with a dev
FYI, the last two patches conflict with
[PATCH v4 9/9] iommu: Split struct iommu_ops
which was applied on the core branch.

Thanks

Eric
>
>  drivers/acpi/arm64/iort.c   | 305 
>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c |  91 --
>  drivers/iommu/arm/arm-smmu/arm-smmu.c   |  65 -
>  drivers/iommu/dma-iommu.c   |  25 ++
>  include/linux/acpi_iort.h   |  14 +
>  include/linux/dma-iommu.h   |  14 +
>  include/linux/iommu.h   |   9 +
>  7 files changed, 504 insertions(+), 19 deletions(-)
>



Re: [PATCH v8 00/11] ACPI/IORT: Support for IORT RMR node

2022-03-14 Thread Eric Auger
Hi Robin

On 3/11/22 11:34 AM, Robin Murphy wrote:
> On 2022-03-11 08:19, Eric Auger wrote:
>> Hi guys,
>>
>> On 2/21/22 4:43 PM, Shameer Kolothum wrote:
>>> Hi,
>>>
>>> Since we now have an updated version[0] of the IORT spec (E.d) which
>>> addresses the memory attributes issues discussed here [1],
>>> this series now makes use of it.
>>>
>>> The pull request for ACPICA E.d related changes are already
>>> raised and can be found here,
>>> https://github.com/acpica/acpica/pull/752
>>>
>>> v7 --> v8
>>>    - Patch #1 has temp definitions for RMR related changes till
>>>  the ACPICA header changes are part of kernel.
>>>    - No early parsing of RMR node info and is only parsed at the
>>>  time of use.
>>>    - Changes to the RMR get/put API format compared to the
>>>  previous version.
>>>    - Support for RMR descriptor shared by multiple stream IDs.
>>>
>>> Please take a look and let me know your thoughts.
>>>
>>> Thanks,
>>> Shameer
>>> [0] https://developer.arm.com/documentation/den0049/ed/
>> I still have a question on the IORT E.d spec (unrelated to this series).
>>
>> The spec mandates that if RMR nodes are presented in the IORT,
>> _DSM function #5 for the PCIe host bridge ACPI device object must return
>> 0, indicating the OS must honour the PCI config that the FW computed at
>> boot time.
>>
>> However implementing this _DSM #5 as above is known to prevent PCI
>> devices with IO ports from working, on aarch64 linux.
>>
>> "
>> The reason is that EFI creates I/O port mappings below
>>  0x1000 (in fact, at 0). However Linux, for legacy reasons, does not
>>  support I/O ports <= 0x1000 on PCI, so the I/O assignment
>> created by EFI
>>  is rejected.
>>      EFI creates the mappings primarily for itself, and up until
>> DSM #5
>>  started to be enforced, all PCI resource allocations that
>> existed at
>>  boot were ignored by Linux and recreated from scratch.
>> "
>>
This is an excerpt of a qemu commit message that reverted the _DSM #5
change (Revert "acpi/gpex: Inform os to keep firmware resource map").
Has the situation changed since July 2021 (i.e. has UEFI been reworked)?
>> [+ Ard]
>
> FWIW I wasn't aware of that, but if it's an issue then it will need to
> be fixed in Linux or UEFI's PCI resource code (arguably if UEFI has
> already allocated from the bottom of I/O space then Linux should be
> safe to assume that there are no legacy PC I/O resources to worry
> about). The DSM is required to prevent bus numbers being reassigned,
> because if that happens then any PCI StreamIDs referenced in IORT may
> suddenly become meaningless and the association of root complex nodes
> and RMRs to physical hardware lost.

Thank you for confirming and explaining the need for DSM #5. Ard, please
could you confirm that the incompatibility with PCI devices with IO
ports is still there?

Eric

>
> Robin.
>
>> Thank you in advance
>>
>> Regards
>>
>> Eric
>>
>>
>>
>>
>>> [1]
>>> https://lore.kernel.org/linux-acpi/20210805160319.GB23085@lpieralisi/
>>>
>>>  From old:
>>> We have faced issues with 3408iMR RAID controller cards which
>>> fail to boot when SMMU is enabled. This is because these
>>> controllers make use of host memory for various caching related
>>> purposes and when SMMU is enabled the iMR firmware fails to
>>> access these memory regions as there is no mapping for them.
>>> IORT RMR provides a way for UEFI to describe and report these
>>> memory regions so that the kernel can make a unity mapping for
>>> these in SMMU.
>>>
>>> Change History:
>>>
>>> v6 --> v7
>>>   -fix pointed out by Steve to the SMMUv2 SMR bypass install in
>>> patch #8.
>>>
>>> v5 --> v6
>>> - Addressed comments from Robin & Lorenzo.
>>>    : Moved iort_parse_rmr() to acpi_iort_init() from
>>>  iort_init_platform_devices().
>>>    : Removed use of struct iort_rmr_entry during the initial
>>>  parse. Using struct iommu_resv_region instead.
>>>    : Report RMR address alignment and overlap errors, but continue.
>>>    : Reworked arm_smmu_init_bypass_stes() (patch # 6).
>>> - Updated SMMUv2 bypass SMR code. Thanks to Jon N (patch #8).
>>> - Set IOMMU protection flags(IOMMU_CACHE, IOMMU_MMIO) based
>>>   on Type of RMR region. Suggested by Jon N.

Re: [PATCH v8 00/11] ACPI/IORT: Support for IORT RMR node

2022-03-11 Thread Eric Auger
Hi guys,

On 2/21/22 4:43 PM, Shameer Kolothum wrote:
> Hi,
>
> Since we now have an updated version[0] of the IORT spec (E.d) which
> addresses the memory attributes issues discussed here [1],
> this series now makes use of it.
>
> The pull request for ACPICA E.d related changes are already
> raised and can be found here,
> https://github.com/acpica/acpica/pull/752
>
> v7 --> v8
>   - Patch #1 has temp definitions for RMR related changes till
> the ACPICA header changes are part of kernel.
>   - No early parsing of RMR node info and is only parsed at the
> time of use.
>   - Changes to the RMR get/put API format compared to the
> previous version.
>   - Support for RMR descriptor shared by multiple stream IDs.
>
> Please take a look and let me know your thoughts.
>
> Thanks,
> Shameer
> [0] https://developer.arm.com/documentation/den0049/ed/
I still have a question on the IORT E.d spec (unrelated to this series).

The spec mandates that if RMR nodes are presented in the IORT,
_DSM function #5 for the PCIe host bridge ACPI device object must return
0, indicating the OS must honour the PCI config that the FW computed at
boot time.
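
For reference, this is roughly how Linux consumes that _DSM on the host
bridge (paraphrased from drivers/acpi/pci_root.c; treat the exact names
and placement as approximate):

/* _DSM function 5 (DSM_PCI_PRESERVE_BOOT_CONFIG) returning 0 tells the
 * OS to keep the firmware-assigned PCI resources for this host bridge.
 */
static void sketch_check_preserve_boot_config(struct pci_bus *bus,
					      struct pci_host_bridge *host_bridge)
{
	union acpi_object *obj;

	obj = acpi_evaluate_dsm_typed(ACPI_HANDLE(bus->bridge),
				      &pci_acpi_dsm_guid, 1,
				      DSM_PCI_PRESERVE_BOOT_CONFIG,
				      NULL, ACPI_TYPE_INTEGER);
	if (obj && obj->integer.value == 0)
		host_bridge->preserve_config = 1;
	ACPI_FREE(obj);
}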

However implementing this _DSM #5 as above is known to prevent PCI
devices with IO ports from working, on aarch64 linux.

"
The reason is that EFI creates I/O port mappings below
    0x1000 (in fact, at 0). However Linux, for legacy reasons, does not
    support I/O ports <= 0x1000 on PCI, so the I/O assignment created by EFI
    is rejected.
   
    EFI creates the mappings primarily for itself, and up until DSM #5
    started to be enforced, all PCI resource allocations that existed at
    boot were ignored by Linux and recreated from scratch.
"

This is an excerpt of a qemu commit message that reverted the _DSM #5
change (Revert "acpi/gpex: Inform os to keep firmware resource map").
Has the situation changed since July 2021 (i.e. has UEFI been reworked)?
[+ Ard]

Thank you in advance

Regards

Eric




> [1] https://lore.kernel.org/linux-acpi/20210805160319.GB23085@lpieralisi/
>
> From old:
> We have faced issues with 3408iMR RAID controller cards which
> fail to boot when SMMU is enabled. This is because these
> controllers make use of host memory for various caching related
> purposes and when SMMU is enabled the iMR firmware fails to
> access these memory regions as there is no mapping for them.
> IORT RMR provides a way for UEFI to describe and report these
> memory regions so that the kernel can make a unity mapping for
> these in SMMU.
>
> Change History:
>
> v6 --> v7
>  -fix pointed out by Steve to the SMMUv2 SMR bypass install in patch #8.
>
> v5 --> v6
> - Addressed comments from Robin & Lorenzo.
>   : Moved iort_parse_rmr() to acpi_iort_init() from
> iort_init_platform_devices().
>   : Removed use of struct iort_rmr_entry during the initial
> parse. Using struct iommu_resv_region instead.
>   : Report RMR address alignment and overlap errors, but continue.
>   : Reworked arm_smmu_init_bypass_stes() (patch # 6).
> - Updated SMMUv2 bypass SMR code. Thanks to Jon N (patch #8).
> - Set IOMMU protection flags(IOMMU_CACHE, IOMMU_MMIO) based
>   on Type of RMR region. Suggested by Jon N.
>
> v4 --> v5
>  -Added a fw_data union to struct iommu_resv_region and removed
>   struct iommu_rmr (Based on comments from Joerg/Robin).
>  -Added iommu_put_rmrs() to release mem.
>  -Thanks to Steve for verifying on SMMUv2, but not added the Tested-by
>   yet because of the above changes.
>
> v3 -->v4
> -Included the SMMUv2 SMR bypass install changes suggested by
>  Steve(patch #7)
> -As per Robin's comments, RMR reserve implementation is now
>  more generic  (patch #8) and dropped v3 patches 8 and 10.
> -Rebase to 5.13-rc1
>
> RFC v2 --> v3
>  -Dropped RFC tag as the ACPICA header changes are now ready to be
>   part of 5.13[0]. But this series still has a dependency on that patch.
>  -Added IORT E.b related changes(node flags, _DSM function 5 checks for
>   PCIe).
>  -Changed RMR to stream id mapping from M:N to M:1 as per the spec and
>   discussion here[1].
>  -Last two patches add support for SMMUv2(Thanks to Jon Nettleton!)
>
> Jon Nettleton (1):
>   iommu/arm-smmu: Get associated RMR info and install bypass SMR
>
> Shameer Kolothum (10):
>   ACPI/IORT: Add temporary RMR node flag definitions
>   iommu: Introduce a union to struct iommu_resv_region
>   ACPI/IORT: Add helper functions to parse RMR nodes
>   iommu/dma: Introduce generic helper to retrieve RMR info
>   ACPI/IORT: Add a helper to retrieve RMR memory regions
>   iommu/arm-smmu-v3: Introduce strtab init helper
>   iommu/arm-smmu-v3: Refactor arm_smmu_init_bypass_stes() to force
> bypass
>   iommu/arm-smmu-v3: Get associated RMR info and install bypass STE
>   iommu/arm-smmu-v3: Reserve any RMR regions associated with a dev
>   iommu/arm-smmu: Reserve any RMR regions associated with a dev
>
>  drivers/acpi/arm64/iort.c   | 305 
>  

Re: [PATCH v8 00/11] ACPI/IORT: Support for IORT RMR node

2022-03-11 Thread Eric Auger
Hi Shameer,

On 2/21/22 4:43 PM, Shameer Kolothum wrote:
> Hi,
>
> Since we now have an updated version[0] of the IORT spec (E.d) which
> addresses the memory attributes issues discussed here [1],
> this series now makes use of it.
>
> The pull request for ACPICA E.d related changes are already
> raised and can be found here,
> https://github.com/acpica/acpica/pull/752
>
> v7 --> v8
>   - Patch #1 has temp definitions for RMR related changes till
> the ACPICA header changes are part of kernel.
>   - No early parsing of RMR node info and is only parsed at the
> time of use.
>   - Changes to the RMR get/put API format compared to the
> previous version.
>   - Support for RMR descriptor shared by multiple stream IDs.

I tested it on the guest side for host MSI SW RESV region flat mapping
(using both the old single mapping layout and the now-allowed multiple
RID ID mapping format) and this worked for me. Feel free to add my

Tested-by: Eric Auger 

Thanks

Eric


>
> Please take a look and let me know your thoughts.
>
> Thanks,
> Shameer
> [0] https://developer.arm.com/documentation/den0049/ed/
> [1] https://lore.kernel.org/linux-acpi/20210805160319.GB23085@lpieralisi/
>
> From old:
> We have faced issues with 3408iMR RAID controller cards which
> fail to boot when SMMU is enabled. This is because these
> controllers make use of host memory for various caching related
> purposes and when SMMU is enabled the iMR firmware fails to
> access these memory regions as there is no mapping for them.
> IORT RMR provides a way for UEFI to describe and report these
> memory regions so that the kernel can make a unity mapping for
> these in SMMU.
>
> Change History:
>
> v6 --> v7
>  -fix pointed out by Steve to the SMMUv2 SMR bypass install in patch #8.
>
> v5 --> v6
> - Addressed comments from Robin & Lorenzo.
>   : Moved iort_parse_rmr() to acpi_iort_init() from
> iort_init_platform_devices().
>   : Removed use of struct iort_rmr_entry during the initial
> parse. Using struct iommu_resv_region instead.
>   : Report RMR address alignment and overlap errors, but continue.
>   : Reworked arm_smmu_init_bypass_stes() (patch # 6).
> - Updated SMMUv2 bypass SMR code. Thanks to Jon N (patch #8).
> - Set IOMMU protection flags(IOMMU_CACHE, IOMMU_MMIO) based
>   on Type of RMR region. Suggested by Jon N.
>
> v4 --> v5
>  -Added a fw_data union to struct iommu_resv_region and removed
>   struct iommu_rmr (Based on comments from Joerg/Robin).
>  -Added iommu_put_rmrs() to release mem.
>  -Thanks to Steve for verifying on SMMUv2, but not added the Tested-by
>   yet because of the above changes.
>
> v3 -->v4
> -Included the SMMUv2 SMR bypass install changes suggested by
>  Steve(patch #7)
> -As per Robin's comments, RMR reserve implementation is now
>  more generic  (patch #8) and dropped v3 patches 8 and 10.
> -Rebase to 5.13-rc1
>
> RFC v2 --> v3
>  -Dropped RFC tag as the ACPICA header changes are now ready to be
>   part of 5.13[0]. But this series still has a dependency on that patch.
>  -Added IORT E.b related changes(node flags, _DSM function 5 checks for
>   PCIe).
>  -Changed RMR to stream id mapping from M:N to M:1 as per the spec and
>   discussion here[1].
>  -Last two patches add support for SMMUv2(Thanks to Jon Nettleton!)
>
> Jon Nettleton (1):
>   iommu/arm-smmu: Get associated RMR info and install bypass SMR
>
> Shameer Kolothum (10):
>   ACPI/IORT: Add temporary RMR node flag definitions
>   iommu: Introduce a union to struct iommu_resv_region
>   ACPI/IORT: Add helper functions to parse RMR nodes
>   iommu/dma: Introduce generic helper to retrieve RMR info
>   ACPI/IORT: Add a helper to retrieve RMR memory regions
>   iommu/arm-smmu-v3: Introduce strtab init helper
>   iommu/arm-smmu-v3: Refactor arm_smmu_init_bypass_stes() to force
> bypass
>   iommu/arm-smmu-v3: Get associated RMR info and install bypass STE
>   iommu/arm-smmu-v3: Reserve any RMR regions associated with a dev
>   iommu/arm-smmu: Reserve any RMR regions associated with a dev
>
>  drivers/acpi/arm64/iort.c   | 305 
>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c |  91 --
>  drivers/iommu/arm/arm-smmu/arm-smmu.c   |  65 -
>  drivers/iommu/dma-iommu.c   |  25 ++
>  include/linux/acpi_iort.h   |  14 +
>  include/linux/dma-iommu.h   |  14 +
>  include/linux/iommu.h   |   9 +
>  7 files changed, 504 insertions(+), 19 deletions(-)
>



Re: [PATCH v8 03/11] ACPI/IORT: Add helper functions to parse RMR nodes

2022-03-10 Thread Eric Auger
Hi Shameer,

On 2/21/22 4:43 PM, Shameer Kolothum wrote:
> The helper functions here parse through the IORT RMR nodes and
> populate a reserved region list corresponding to a given iommu
> and device (optional). They also go through the ID mappings of
> the RMR node and retrieve all the SIDs associated with an RMR
> descriptor.
>
> Signed-off-by: Shameer Kolothum 
> ---
>  drivers/acpi/arm64/iort.c | 225 ++
>  1 file changed, 225 insertions(+)
>
> diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
> index 0730c4dbb700..05da9ebff50a 100644
> --- a/drivers/acpi/arm64/iort.c
> +++ b/drivers/acpi/arm64/iort.c
> @@ -830,6 +830,231 @@ static struct acpi_iort_node *iort_get_msi_resv_iommu(struct device *dev)
>   return NULL;
>  }
>  
> +static void iort_rmr_desc_check_overlap(struct acpi_iort_rmr_desc *desc, u32 count)
> +{
> + int i, j;
> +
> + for (i = 0; i < count; i++) {
> + u64 end, start = desc[i].base_address, length = desc[i].length;
> +
> + end = start + length - 1;
> +
> + /* Check for address overlap */
> + for (j = i + 1; j < count; j++) {
> + u64 e_start = desc[j].base_address;
> + u64 e_end = e_start + desc[j].length - 1;
> +
> + if (start <= e_end && end >= e_start)
> + pr_err(FW_BUG "RMR descriptor[0x%llx - 0x%llx] overlaps, continue anyway\n",
> +        start, end);
> + }
> + }
> +}
> +
> +/*
> + * Please note, we will keep the already allocated RMR reserve
> + * regions in case of a memory allocation failure.
> + */
> +static void iort_rmr_get_resv_regions(struct acpi_iort_node *node,
> +   struct acpi_iort_node *smmu,
> +   u32 *sids, u32 num_sids,
> +   struct list_head *head)
> +{
> + struct acpi_iort_rmr *rmr = (struct acpi_iort_rmr *)node->node_data;
> + struct acpi_iort_rmr_desc *rmr_desc;
> + int i;
> +
> + rmr_desc = ACPI_ADD_PTR(struct acpi_iort_rmr_desc, node,
> + rmr->rmr_offset);
> +
> + iort_rmr_desc_check_overlap(rmr_desc, rmr->rmr_count);
> +
> + for (i = 0; i < rmr->rmr_count; i++, rmr_desc++) {
> + struct iommu_resv_region *region;
> + enum iommu_resv_type type;
> + u32  *sids_copy;
> + int prot = IOMMU_READ | IOMMU_WRITE;
> + u64 addr = rmr_desc->base_address, size = rmr_desc->length;
> +
> + if (!IS_ALIGNED(addr, SZ_64K) || !IS_ALIGNED(size, SZ_64K)) {
> + /* PAGE align base addr and size */
> + addr &= PAGE_MASK;
> + size = PAGE_ALIGN(size + offset_in_page(rmr_desc->base_address));
> +
> + pr_err(FW_BUG "RMR descriptor[0x%llx - 0x%llx] not aligned to 64K, continue with [0x%llx - 0x%llx]\n",
> +rmr_desc->base_address,
> +rmr_desc->base_address + rmr_desc->length - 1,
> +addr, addr + size - 1);
> + }
> +
> + if (rmr->flags & ACPI_IORT_RMR_REMAP_PERMITTED)
> + type = IOMMU_RESV_DIRECT_RELAXABLE;
> + else
> + type = IOMMU_RESV_DIRECT;
> +
> + if (rmr->flags & ACPI_IORT_RMR_ACCESS_PRIVILEGE)
> + prot |= IOMMU_PRIV;
> +
> + /* Attributes 0x00 - 0x03 represents device memory */
> + if (ACPI_IORT_RMR_ACCESS_ATTRIBUTES(rmr->flags) <=
> + ACPI_IORT_RMR_ATTR_DEVICE_GRE)
> + prot |= IOMMU_MMIO;
> + else if (ACPI_IORT_RMR_ACCESS_ATTRIBUTES(rmr->flags) ==
> + ACPI_IORT_RMR_ATTR_NORMAL)
> + prot |= IOMMU_CACHE;
> +
> + /* Create a copy of sids array to associate with this resv region */
> + sids_copy = kmemdup(sids, num_sids * sizeof(*sids), GFP_KERNEL);
> + if (!sids_copy)
> + return;
> +
> + region = iommu_alloc_resv_region(addr, size, prot, type);
> + if (!region) {
> + kfree(sids_copy);
> + return;
> + }
> +
> + region->fw_data.rmr.sids = sids_copy;
> + region->fw_data.rmr.num_sids = num_sids;
> + list_add_tail(&region->list, head);
> + }
> +}
> +
> +static u32 *iort_rmr_alloc_sids(u32 *sids, u32 count, u32 id_start,
> + u32 new_count)
> +{
> + u32 *new_sids;
> + u32 total_count = count + new_count;
> + int i;
> +
> + new_sids = krealloc_array(sids, count + new_count,
> +   sizeof(*new_sids), GFP_KERNEL);
> + if (!new_sids)
> + return NULL;
> +
> + /*Update new 

Re: [PATCH v8 00/11] Fix BUG_ON in vfio_iommu_group_notifier()

2022-03-10 Thread Eric Auger
 Remove DMA_OWNER_NONE. (Joerg)
>   - Change refcount to unsigned int. (Christoph)
>   - Move mutex lock into group set_dma_owner functions. (Christoph)
>   - Add kernel doc for iommu_attach/detach_domain_shared(). (Christoph)
>   - Move dma auto-claim into driver core. (Jason/Christoph)
>
> v5:
>   - 
> https://lore.kernel.org/linux-iommu/20220104015644.2294354-1-baolu...@linux.intel.com/
>   - Move kernel dma ownership auto-claiming from driver core to bus
> callback. (Greg)
>   - Refactor the iommu interfaces to make them more specific.
> (Jason/Robin)
>   - Simplify the dma ownership implementation by removing the owner
> type. (Jason)
>   - Commit message refactoring for PCI drivers. (Bjorn)
>   - Move iommu_attach/detach_device() improvement patches into another
> series as there are a lot of code refactoring and cleanup staffs
> in various device drivers.
>
> v6:
>   - 
> https://lore.kernel.org/linux-iommu/20220218005521.172832-1-baolu...@linux.intel.com/
>   - Refine comments and commit messages.
>   - Rename iommu_group_set_dma_owner() to iommu_group_claim_dma_owner().
>   - Rename iommu_device_use/unuse_kernel_dma() to
> iommu_device_use/unuse_default_domain().
>   - Remove unnecessary EXPORT_SYMBOL_GPL.
>   - Change flag name from no_kernel_api_dma to driver_managed_dma.
>   - Merge 4 "Add driver dma ownership management" patches into single
> one.
>
> v7:
>   - We discussed about adding some fields in driver structure and
> intercepting it in the bus notifier for driver unbinding. We agreed
> that the driver structure should not be used out of the driver core.
>   - As iommu_group_claim/release_dma_owner() are only used by the VFIO,
> there're no use cases for multiple calls for a single group.
>   - Add some commit messages in "vfio: Set DMA ownership for
> VFIO" to describe the intentional enhancement of unsafe bridge
> drivers.
>   - Comments refinement.
>
> v8:
>   - Move iommu_use_default_domain() to the end of .dma_configure
> callback to avoid firmware-data-ordering thing.
> Link: 
> https://lore.kernel.org/linux-iommu/e2698dbe-18e2-1a82-8a12-fe45bc9be...@arm.com/

Feel free to add my T-b
Tested-by: Eric Auger 

Thanks

Eric
>   - Add Acked-by from PCI and VFIO maintainers.
>
> This is based on next branch of linux-iommu tree:
> https://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu.git
> and also available on github:
> https://github.com/LuBaolu/intel-iommu/commits/iommu-dma-ownership-v8
>
> Best regards,
> baolu
>
> Jason Gunthorpe (1):
>   vfio: Delete the unbound_list
>
> Lu Baolu (10):
>   iommu: Add DMA ownership management interfaces
>   driver core: Add dma_cleanup callback in bus_type
>   amba: Stop sharing platform_dma_configure()
>   bus: platform,amba,fsl-mc,PCI: Add device DMA ownership management
>   PCI: pci_stub: Set driver_managed_dma
>   PCI: portdrv: Set driver_managed_dma
>   vfio: Set DMA ownership for VFIO devices
>   vfio: Remove use of vfio_group_viable()
>   vfio: Remove iommu group notifier
>   iommu: Remove iommu group changes notifier
>
>  include/linux/amba/bus.h  |   8 +
>  include/linux/device/bus.h|   3 +
>  include/linux/fsl/mc.h|   8 +
>  include/linux/iommu.h |  54 +++---
>  include/linux/pci.h   |   8 +
>  include/linux/platform_device.h   |  10 +-
>  drivers/amba/bus.c|  37 +++-
>  drivers/base/dd.c |   5 +
>  drivers/base/platform.c   |  21 ++-
>  drivers/bus/fsl-mc/fsl-mc-bus.c   |  24 ++-
>  drivers/iommu/iommu.c | 228 
>  drivers/pci/pci-driver.c  |  18 ++
>  drivers/pci/pci-stub.c|   1 +
>  drivers/pci/pcie/portdrv_pci.c|   2 +
>  drivers/vfio/fsl-mc/vfio_fsl_mc.c |   1 +
>  drivers/vfio/pci/vfio_pci.c   |   1 +
>  drivers/vfio/platform/vfio_amba.c |   1 +
>  drivers/vfio/platform/vfio_platform.c |   1 +
>  drivers/vfio/vfio.c   | 245 ++
>  19 files changed, 338 insertions(+), 338 deletions(-)
>



Re: [PATCH v7 01/11] iommu: Add DMA ownership management interfaces

2022-03-07 Thread Eric Auger
Hi Lu,

On 3/7/22 4:27 AM, Lu Baolu wrote:
> Hi Robin,
>
> On 3/4/22 10:10 PM, Robin Murphy wrote:
>> On 2022-03-04 13:55, Eric Auger wrote:
>>> Hi Robin,
>>>
>>> On 3/4/22 1:22 PM, Robin Murphy wrote:
>>>> On 2022-03-04 10:43, Lu Baolu wrote:
>>>>> Hi Eric,
>>>>>
>>>>> On 2022/3/4 18:34, Eric Auger wrote:
>>>>>> I hit a WARN_ON() when unbinding an e1000e driver just after boot:
>>>>>>
>>>>>> sudo modprobe -v vfio-pci
>>>>>> echo vfio-pci | sudo tee -a
>>>>>> /sys/bus/pci/devices/0004:01:00.0/driver_override
>>>>>> vfio-pci
>>>>>> echo 0004:01:00.0 | sudo tee -a  /sys/bus/pci/drivers/e1000e/unbind
>>>>>>
>>>>>>
>>>>>> [  390.042811] [ cut here ]
>>>>>> [  390.046468] WARNING: CPU: 42 PID: 5589 at
>>>>>> drivers/iommu/iommu.c:3123
>>>>>> iommu_device_unuse_default_domain+0x68/0x100
>>>>>> [  390.056710] Modules linked in: vfio_pci vfio_pci_core vfio_virqfd
>>>>>> vfio_iommu_type1 vfio xt_CHECKSUM xt_MASQUERADE xt_conntrack
>>>>>> ipt_REJECT
>>>>>> nf_reject_ipv4 nft_compat nft_chain_nat nf_nat nf_conntrack
>>>>>> nf_defrag_ipv6 nf_defrag_ipv4 nf_tables nfnetlink bridge stp llc
>>>>>> rfkill
>>>>>> sunrpc vfat fat mlx5_ib ib_uverbs ib_core acpi_ipmi ipmi_ssif
>>>>>> ipmi_devintf ipmi_msghandler cppc_cpufreq drm xfs libcrc32c
>>>>>> mlx5_core sg
>>>>>> mlxfw crct10dif_ce tls ghash_ce sha2_ce sha256_arm64 sha1_ce
>>>>>> sbsa_gwdt
>>>>>> e1000e psample sdhci_acpi ahci_platform sdhci libahci_platform
>>>>>> qcom_emac
>>>>>> mmc_core hdma hdma_mgmt dm_mirror dm_region_hash dm_log dm_mod fuse
>>>>>> [  390.110618] CPU: 42 PID: 5589 Comm: tee Kdump: loaded Not tainted
>>>>>> 5.17.0-rc4-lu-v7-official+ #24
>>>>>> [  390.119384] Hardware name: WIWYNN QDF2400 Reference Evaluation
>>>>>> Platform CV90-LA115-P120/QDF2400 Customer Reference Board, BIOS
>>>>>> 0ACJA570
>>>>>> 11/05/2018
>>>>>> [  390.132492] pstate: a045 (NzCv daif +PAN -UAO -TCO -DIT -SSBS
>>>>>> BTYPE=--)
>>>>>> [  390.139436] pc : iommu_device_unuse_default_domain+0x68/0x100
>>>>>> [  390.145165] lr : iommu_device_unuse_default_domain+0x38/0x100
>>>>>> [  390.150894] sp : 8fbb3bc0
>>>>>> [  390.154193] x29: 8fbb3bc0 x28: 03c0cf6b2400 x27:
>>>>>> 
>>>>>> [  390.161311] x26:  x25:  x24:
>>>>>> 03c0c7cc5720
>>>>>> [  390.168429] x23: 03c0c2b9d150 x22: b4e61df223f8 x21:
>>>>>> b4e61df223f8
>>>>>> [  390.175547] x20: 03c7c03c3758 x19: 03c7c03c3700 x18:
>>>>>> 
>>>>>> [  390.182665] x17:  x16:  x15:
>>>>>> 
>>>>>> [  390.189783] x14:  x13: 0030 x12:
>>>>>> 03c0d519cd80
>>>>>> [  390.196901] x11: 7f7f7f7f7f7f7f7f x10: 0dc0 x9 :
>>>>>> b4e620b54f8c
>>>>>> [  390.204019] x8 : 03c0cf6b3220 x7 : 4ef132bba000 x6 :
>>>>>> 00ff
>>>>>> [  390.211137] x5 : 03c0c2b9f108 x4 : 03c0d51f6438 x3 :
>>>>>> 
>>>>>> [  390.218255] x2 : 03c0cf6b2400 x1 :  x0 :
>>>>>> 
>>>>>> [  390.225374] Call trace:
>>>>>> [  390.227804]  iommu_device_unuse_default_domain+0x68/0x100
>>>>>> [  390.233187]  pci_dma_cleanup+0x38/0x44
>>>>>> [  390.236919]  __device_release_driver+0x1a8/0x260
>>>>>> [  390.241519]  device_driver_detach+0x50/0xd0
>>>>>> [  390.245686]  unbind_store+0xf8/0x120
>>>>>> [  390.249245]  drv_attr_store+0x30/0x44
>>>>>> [  390.252891]  sysfs_kf_write+0x50/0x60
>>>>>> [  390.256537]  kernfs_fop_write_iter+0x134/0x1cc
>>>>>> [  390.260964]  new_sync_write+0xf0/0x18c
>>>>>> [  390.264696]  vfs_write+0x230/0x2d0
>>>>>> [  390.268082] 

Re: [PATCH v7 01/11] iommu: Add DMA ownership management interfaces

2022-03-04 Thread Eric Auger
Hi Robin,

On 3/4/22 1:22 PM, Robin Murphy wrote:
> On 2022-03-04 10:43, Lu Baolu wrote:
>> Hi Eric,
>>
>> On 2022/3/4 18:34, Eric Auger wrote:
>>> I hit a WARN_ON() when unbinding an e1000e driver just after boot:
>>>
>>> sudo modprobe -v vfio-pci
>>> echo vfio-pci | sudo tee -a
>>> /sys/bus/pci/devices/0004:01:00.0/driver_override
>>> vfio-pci
>>> echo 0004:01:00.0 | sudo tee -a  /sys/bus/pci/drivers/e1000e/unbind
>>>
>>>
>>> [  390.042811] [ cut here ]
>>> [  390.046468] WARNING: CPU: 42 PID: 5589 at drivers/iommu/iommu.c:3123
>>> iommu_device_unuse_default_domain+0x68/0x100
>>> [  390.056710] Modules linked in: vfio_pci vfio_pci_core vfio_virqfd
>>> vfio_iommu_type1 vfio xt_CHECKSUM xt_MASQUERADE xt_conntrack ipt_REJECT
>>> nf_reject_ipv4 nft_compat nft_chain_nat nf_nat nf_conntrack
>>> nf_defrag_ipv6 nf_defrag_ipv4 nf_tables nfnetlink bridge stp llc rfkill
>>> sunrpc vfat fat mlx5_ib ib_uverbs ib_core acpi_ipmi ipmi_ssif
>>> ipmi_devintf ipmi_msghandler cppc_cpufreq drm xfs libcrc32c
>>> mlx5_core sg
>>> mlxfw crct10dif_ce tls ghash_ce sha2_ce sha256_arm64 sha1_ce sbsa_gwdt
>>> e1000e psample sdhci_acpi ahci_platform sdhci libahci_platform
>>> qcom_emac
>>> mmc_core hdma hdma_mgmt dm_mirror dm_region_hash dm_log dm_mod fuse
>>> [  390.110618] CPU: 42 PID: 5589 Comm: tee Kdump: loaded Not tainted
>>> 5.17.0-rc4-lu-v7-official+ #24
>>> [  390.119384] Hardware name: WIWYNN QDF2400 Reference Evaluation
>>> Platform CV90-LA115-P120/QDF2400 Customer Reference Board, BIOS
>>> 0ACJA570
>>> 11/05/2018
>>> [  390.132492] pstate: a045 (NzCv daif +PAN -UAO -TCO -DIT -SSBS
>>> BTYPE=--)
>>> [  390.139436] pc : iommu_device_unuse_default_domain+0x68/0x100
>>> [  390.145165] lr : iommu_device_unuse_default_domain+0x38/0x100
>>> [  390.150894] sp : 8fbb3bc0
>>> [  390.154193] x29: 8fbb3bc0 x28: 03c0cf6b2400 x27:
>>> 
>>> [  390.161311] x26:  x25:  x24:
>>> 03c0c7cc5720
>>> [  390.168429] x23: 03c0c2b9d150 x22: b4e61df223f8 x21:
>>> b4e61df223f8
>>> [  390.175547] x20: 03c7c03c3758 x19: 03c7c03c3700 x18:
>>> 
>>> [  390.182665] x17:  x16:  x15:
>>> 
>>> [  390.189783] x14:  x13: 0030 x12:
>>> 03c0d519cd80
>>> [  390.196901] x11: 7f7f7f7f7f7f7f7f x10: 0dc0 x9 :
>>> b4e620b54f8c
>>> [  390.204019] x8 : 03c0cf6b3220 x7 : 4ef132bba000 x6 :
>>> 00ff
>>> [  390.211137] x5 : 03c0c2b9f108 x4 : 03c0d51f6438 x3 :
>>> 
>>> [  390.218255] x2 : 03c0cf6b2400 x1 :  x0 :
>>> 
>>> [  390.225374] Call trace:
>>> [  390.227804]  iommu_device_unuse_default_domain+0x68/0x100
>>> [  390.233187]  pci_dma_cleanup+0x38/0x44
>>> [  390.236919]  __device_release_driver+0x1a8/0x260
>>> [  390.241519]  device_driver_detach+0x50/0xd0
>>> [  390.245686]  unbind_store+0xf8/0x120
>>> [  390.249245]  drv_attr_store+0x30/0x44
>>> [  390.252891]  sysfs_kf_write+0x50/0x60
>>> [  390.256537]  kernfs_fop_write_iter+0x134/0x1cc
>>> [  390.260964]  new_sync_write+0xf0/0x18c
>>> [  390.264696]  vfs_write+0x230/0x2d0
>>> [  390.268082]  ksys_write+0x74/0x100
>>> [  390.271467]  __arm64_sys_write+0x28/0x3c
>>> [  390.275373]  invoke_syscall.constprop.0+0x58/0xf0
>>> [  390.280061]  el0_svc_common.constprop.0+0x160/0x164
>>> [  390.284922]  do_el0_svc+0x34/0xcc
>>> [  390.288221]  el0_svc+0x30/0x140
>>> [  390.291346]  el0t_64_sync_handler+0xa4/0x130
>>> [  390.295599]  el0t_64_sync+0x1a0/0x1a4
>>> [  390.299245] ---[ end trace  ]---
>>>
>>>
>>> I put some traces in the code and I can see that
>>> iommu_device_use_default_domain() is indeed called on the
>>> 0004:01:00.0 e1000e device from pci_dma_configure() but at that time
>>> the iommu group is NULL:
>>> [   10.569427] e1000e 0004:01:00.0: -- ENTRY pci_dma_configure
>>> driver_managed_area=0
>>> [   10.569431] e1000e 0004:01:00.0: 
>>> iommu_device_use_default_domain ENTRY
>>> [   10.569433] e1000e 0004:01:00.0: 
>>> iommu_device_use_default_domain no group
>>> [   10.569435

Re: [PATCH v7 01/11] iommu: Add DMA ownership management interfaces

2022-03-04 Thread Eric Auger
Hi Lu,

On 2/28/22 1:50 AM, Lu Baolu wrote:
> Multiple devices may be placed in the same IOMMU group because they
> cannot be isolated from each other. These devices must either be
> entirely under kernel control or userspace control, never a mixture.
>
> This adds dma ownership management in iommu core and exposes several
> interfaces for the device drivers and the device userspace assignment
> framework (i.e. VFIO), so that any conflict between user and kernel
> controlled dma could be detected at the beginning.
>
> The device driver oriented interfaces are,
>
>   int iommu_device_use_default_domain(struct device *dev);
>   void iommu_device_unuse_default_domain(struct device *dev);
>
> By calling iommu_device_use_default_domain(), the device driver tells
> the iommu layer that the device dma is handled through the kernel DMA
> APIs. The iommu layer will manage the IOVA and use the default domain
> for DMA address translation.
>
> The device user-space assignment framework oriented interfaces are,
>
>   int iommu_group_claim_dma_owner(struct iommu_group *group,
>   void *owner);
>   void iommu_group_release_dma_owner(struct iommu_group *group);
>   bool iommu_group_dma_owner_claimed(struct iommu_group *group);
>
> The device userspace assignment must be disallowed if the DMA owner
> claiming interface returns failure.
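
Just to restate the intended usage as I read it, a minimal sketch of how
both sides would call these interfaces (illustrative only; the example
functions and the exact error handling are hypothetical, not part of
this patch):

	/* Kernel driver side, called from the bus probe/remove path: */
	static int example_probe(struct device *dev)
	{
		int ret;

		/* Declare that this device does kernel DMA through the
		 * default domain; fails if userspace has already claimed
		 * the group. */
		ret = iommu_device_use_default_domain(dev);
		if (ret)
			return ret;

		/* ... regular probe work ... */
		return 0;
	}

	static void example_remove(struct device *dev)
	{
		iommu_device_unuse_default_domain(dev);
	}

	/* Userspace assignment side, e.g. what VFIO would do, with any
	 * unique cookie identifying the owner: */
	static int example_claim(struct iommu_group *group, void *cookie)
	{
		/* Fails if a device in the group is still bound to a
		 * kernel DMA driver. */
		return iommu_group_claim_dma_owner(group, cookie);
	}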
>
> Signed-off-by: Jason Gunthorpe 
> Signed-off-by: Kevin Tian 
> Signed-off-by: Lu Baolu 
> ---
>  include/linux/iommu.h |  31 +
>  drivers/iommu/iommu.c | 153 +-
>  2 files changed, 181 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
> index 9208eca4b0d1..77972ef978b5 100644
> --- a/include/linux/iommu.h
> +++ b/include/linux/iommu.h
> @@ -675,6 +675,13 @@ struct iommu_sva *iommu_sva_bind_device(struct device 
> *dev,
>  void iommu_sva_unbind_device(struct iommu_sva *handle);
>  u32 iommu_sva_get_pasid(struct iommu_sva *handle);
>  
> +int iommu_device_use_default_domain(struct device *dev);
> +void iommu_device_unuse_default_domain(struct device *dev);
> +
> +int iommu_group_claim_dma_owner(struct iommu_group *group, void *owner);
> +void iommu_group_release_dma_owner(struct iommu_group *group);
> +bool iommu_group_dma_owner_claimed(struct iommu_group *group);
> +
>  #else /* CONFIG_IOMMU_API */
>  
>  struct iommu_ops {};
> @@ -1031,6 +1038,30 @@ static inline struct iommu_fwspec 
> *dev_iommu_fwspec_get(struct device *dev)
>  {
>   return NULL;
>  }
> +
> +static inline int iommu_device_use_default_domain(struct device *dev)
> +{
> + return 0;
> +}
> +
> +static inline void iommu_device_unuse_default_domain(struct device *dev)
> +{
> +}
> +
> +static inline int
> +iommu_group_claim_dma_owner(struct iommu_group *group, void *owner)
> +{
> + return -ENODEV;
> +}
> +
> +static inline void iommu_group_release_dma_owner(struct iommu_group *group)
> +{
> +}
> +
> +static inline bool iommu_group_dma_owner_claimed(struct iommu_group *group)
> +{
> + return false;
> +}
>  #endif /* CONFIG_IOMMU_API */
>  
>  /**
> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> index f2c45b85b9fc..eba8e8ccf19d 100644
> --- a/drivers/iommu/iommu.c
> +++ b/drivers/iommu/iommu.c
> @@ -48,6 +48,8 @@ struct iommu_group {
>   struct iommu_domain *default_domain;
>   struct iommu_domain *domain;
>   struct list_head entry;
> + unsigned int owner_cnt;
> + void *owner;
>  };
>  
>  struct group_device {
> @@ -294,7 +296,11 @@ int iommu_probe_device(struct device *dev)
>   mutex_lock(&group->mutex);
>   iommu_alloc_default_domain(group, dev);
>  
> - if (group->default_domain) {
> + /*
> +  * If device joined an existing group which has been claimed, don't
> +  * attach the default domain.
> +  */
> + if (group->default_domain && !group->owner) {
>   ret = __iommu_attach_device(group->default_domain, dev);
>   if (ret) {
>   mutex_unlock(&group->mutex);
> @@ -2109,7 +2115,7 @@ static int __iommu_attach_group(struct iommu_domain 
> *domain,
>  {
>   int ret;
>  
> - if (group->default_domain && group->domain != group->default_domain)
> + if (group->domain && group->domain != group->default_domain)
>   return -EBUSY;
>  
>   ret = __iommu_group_for_each_dev(group, domain,
> @@ -2146,7 +2152,11 @@ static void __iommu_detach_group(struct iommu_domain 
> *domain,
>  {
>   int ret;
>  
> - if (!group->default_domain) {
> + /*
> +  * If the group has been claimed already, do not re-attach the default
> +  * domain.
> +  */
> + if (!group->default_domain || group->owner) {
>   __iommu_group_for_each_dev(group, domain,
>  iommu_group_do_detach_device);
>   group->domain = NULL;
> @@ -3095,3 +3105,140 @@ static ssize_t iommu_group_store_type(struct 
> 

Re: [RFC v16 1/9] iommu: Introduce attach/detach_pasid_table API

2021-12-09 Thread Eric Auger
Hi Jason,

On 12/9/21 4:40 PM, Jason Gunthorpe wrote:
> On Thu, Dec 09, 2021 at 08:50:04AM +0100, Eric Auger wrote:
>
>>> The kernel API should accept the S1ContextPtr IPA and all the parts of
>>> the STE that relate to defining the layout of what the S1Context
>>> points to, and that's it.
>> Yes that's exactly what is done currently. At config time the host must
>> trap guest STE changes (format and S1ContextPtr) and "incorporate" those
>> changes into the stage2 related STE information. The STE is owned by the
>> host kernel as it contains the stage2 information (S2TTB).
> [..]
>
>> Note this series only coped with a single CD in the Context Descriptor
>> Table.
> I'm confused, where does this limit arise?
>
> The kernel accepts as input all the bits in the STE that describe the
> layout of the CDT owned by userspace, shouldn't userspace be able to
> construct all forms of CDT with any number of CDs in them?
>
> Or do you mean this is some qemu limitation?
The upstream vSMMUv3 emulation does not support multiple CDs at the
moment, and since I have no proper means to validate the vSVA case I am
rejecting any attempt from user space to inject guest configs featuring
multiple PASIDs. Also, PASID cache invalidation must be added to this series.

Nevertheless, those limitations have been tackled and overcome by others
in CC, so I don't think there is any blocking issue.

Thanks

Eric
>
> Jason
>



Re: [RFC v16 1/9] iommu: Introduce attach/detach_pasid_table API

2021-12-09 Thread Eric Auger
Hi Kevin,

On 12/9/21 4:21 AM, Tian, Kevin wrote:
>> From: Jason Gunthorpe 
>> Sent: Wednesday, December 8, 2021 8:56 PM
>>
>> On Wed, Dec 08, 2021 at 08:33:33AM +0100, Eric Auger wrote:
>>> Hi Baolu,
>>>
>>> On 12/8/21 3:44 AM, Lu Baolu wrote:
>>>> Hi Eric,
>>>>
>>>> On 12/7/21 6:22 PM, Eric Auger wrote:
>>>>> On 12/6/21 11:48 AM, Joerg Roedel wrote:
>>>>>> On Wed, Oct 27, 2021 at 12:44:20PM +0200, Eric Auger wrote:
>>>>>>> Signed-off-by: Jean-Philippe Brucker
>>>>>>> Signed-off-by: Liu, Yi L
>>>>>>> Signed-off-by: Ashok Raj
>>>>>>> Signed-off-by: Jacob Pan
>>>>>>> Signed-off-by: Eric Auger
>>>>>> This Signed-off-by chain looks dubious, you are the author but the last
>>>>>> one in the chain?
>>>>> The 1st RFC in Aug 2018
>>>>> (https://lists.cs.columbia.edu/pipermail/kvmarm/2018-August/032478.html)
>>>>> said this was a generalization of Jacob's patch
>>>>>
>>>>>
>>>>>    [PATCH v5 01/23] iommu: introduce bind_pasid_table API function
>>>>>
>>>>>
>>>>>
>>>>> https://lists.linuxfoundation.org/pipermail/iommu/2018-May/027647.html
>>>>> So indeed Jacob should be the author. I guess the multiple rebases got
>>>>> this eventually replaced at some point, which is not an excuse. Please
>>>>> forgive me for that.
>>>>> Now the original patch already had this list of SoB so I don't know if I
>>>>> shall simplify it.
>>>> As we have decided to move the nested mode (dual stages) implementation
>>>> onto the developing iommufd framework, what's the value of adding this
>>>> into iommu core?
>>> The iommu_uapi_attach_pasid_table uapi should indeed disappear as it
>>> is bound to be replaced by the fellow /dev/iommu API.
>>> However, until I can rebase on the /dev/iommu code I am obliged to keep it to
>>> maintain this integration, hence the RFC.
>> Indeed, we are getting pretty close to having the base iommufd that we
>> can start adding stuff like this into. Maybe in January, you can look
>> at some parts of what is evolving here:
>>
>> https://github.com/jgunthorpe/linux/commits/iommufd
>> https://github.com/LuBaolu/intel-iommu/commits/iommu-dma-ownership-v2
>> https://github.com/luxis1999/iommufd/commits/iommufd-v5.16-rc2
>>
>> From a progress perspective I would like to start with simple 'page
>> tables in userspace', ie no PASID in this step.
>>
>> 'page tables in userspace' means an iommufd ioctl to create an
>> iommu_domain where the IOMMU HW is directly traversing a
>> device-specific page table structure in user space memory. All the HW
>> today implements this by using another iommu_domain to allow the IOMMU
>> HW DMA access to user memory - ie nesting or multi-stage or whatever.
> One clarification here in case people may get confused based on the
> current iommu_domain definition. Jason brainstormed with us on how
> to represent 'user page table' in the IOMMU sub-system. One is to
> extend iommu_domain as a general representation for any page table
> instance. The other option is to create new representations for user
> page tables and then link them under existing iommu_domain.
>
> This context is based on the 1st option, and as Jason said at the bottom
> we still need to sketch out whether it works as expected.
>
>> This would come along with some ioctls to invalidate the IOTLB.
>>
>> I'm imagining this step as an iommu_group->op->create_user_domain()
>> driver callback which will create a new kind of domain with
>> domain-unique ops. Ie map/unmap related should all be NULL as those
>> are impossible operations.
>>
>> From there the usual struct device (ie RID) attach/detach stuff needs
>> to take care of routing DMAs to this iommu_domain.
> Usage-wise this covers the guest IOVA requirements i.e. when the guest
> kernel enables vIOMMU for kernel DMA-API mappings or for device
> assignment to guest userspace.
>
> For intel this means optimization to the existing shadow-based vIOMMU
> implementation.
>
> For ARM this actually enables guest IOVA usage for the first time (correct
> me Eric?).
Yes that's correct. This is the scope of this series (single PASID)
>  IIRC SMMU doesn't support caching mode, while write-protecting the
> guest I/O page table was considered a no-go. So nesting is considered as
&

Re: [RFC v16 1/9] iommu: Introduce attach/detach_pasid_table API

2021-12-09 Thread Eric Auger
Hi Jason,

On 12/8/21 1:56 PM, Jason Gunthorpe wrote:
> On Wed, Dec 08, 2021 at 08:33:33AM +0100, Eric Auger wrote:
>> Hi Baolu,
>>
>> On 12/8/21 3:44 AM, Lu Baolu wrote:
>>> Hi Eric,
>>>
>>> On 12/7/21 6:22 PM, Eric Auger wrote:
>>>> On 12/6/21 11:48 AM, Joerg Roedel wrote:
>>>>> On Wed, Oct 27, 2021 at 12:44:20PM +0200, Eric Auger wrote:
>>>>>> Signed-off-by: Jean-Philippe Brucker
>>>>>> Signed-off-by: Liu, Yi L
>>>>>> Signed-off-by: Ashok Raj
>>>>>> Signed-off-by: Jacob Pan
>>>>>> Signed-off-by: Eric Auger
>>>>> This Signed-off-by chain looks dubious, you are the author but the last
>>>>> one in the chain?
>>>> The 1st RFC in Aug 2018
>>>> (https://lists.cs.columbia.edu/pipermail/kvmarm/2018-August/032478.html)
>>>> said this was a generalization of Jacob's patch
>>>>
>>>>
>>>>    [PATCH v5 01/23] iommu: introduce bind_pasid_table API function
>>>>
>>>>
>>>>   
>>>> https://lists.linuxfoundation.org/pipermail/iommu/2018-May/027647.html
>>>>
>>>> So indeed Jacob should be the author. I guess the multiple rebases got
>>>> this eventually replaced at some point, which is not an excuse. Please
>>>> forgive me for that.
>>>> Now the original patch already had this list of SoB so I don't know if I
>>>> shall simplify it.
>>> As we have decided to move the nested mode (dual stages) implementation
>>> onto the developing iommufd framework, what's the value of adding this
>>> into iommu core?
>> The iommu_uapi_attach_pasid_table uapi should indeed disappear as it
>> is bound to be replaced by the fellow /dev/iommu API.
>> However, until I can rebase on the /dev/iommu code I am obliged to keep it to
>> maintain this integration, hence the RFC.
> Indeed, we are getting pretty close to having the base iommufd that we
> can start adding stuff like this into. Maybe in January, you can look
> at some parts of what is evolving here:
>
> https://github.com/jgunthorpe/linux/commits/iommufd
> https://github.com/LuBaolu/intel-iommu/commits/iommu-dma-ownership-v2
> https://github.com/luxis1999/iommufd/commits/iommufd-v5.16-rc2
Interesting. thank you for the preview links. I will have a look asap

Eric
>
> From a progress perspective I would like to start with simple 'page
> tables in userspace', ie no PASID in this step.
>
> 'page tables in userspace' means an iommufd ioctl to create an
> iommu_domain where the IOMMU HW is directly traversing a
> device-specific page table structure in user space memory. All the HW
> today implements this by using another iommu_domain to allow the IOMMU
> HW DMA access to user memory - ie nesting or multi-stage or whatever.
>
> This would come along with some ioctls to invalidate the IOTLB.
>
> I'm imagining this step as an iommu_group->op->create_user_domain()
> driver callback which will create a new kind of domain with
> domain-unique ops. Ie map/unmap related should all be NULL as those
> are impossible operations.
>
> From there the usual struct device (ie RID) attach/detach stuff needs
> to take care of routing DMAs to this iommu_domain.
>
> Step two would be to add the ability for an iommufd using driver to
> request that a RID is connected to an iommu_domain. This
> connection can be requested for any kind of iommu_domain, kernel owned
> or user owned.
>
> I don't quite have an answer how exactly the SMMUv3 vs Intel
> difference in PASID routing should be resolved.
>
> to get answers I'm hoping to start building some sketch RFCs for these
> different things on iommufd, hopefully in January. I'm looking at user
> page tables, PASID, dirty tracking and userspace IO fault handling as
> the main features iommufd must tackle.
>
> The purpose of the sketches would be to validate that the HW features
> we want to exposed can work will with the choices the base is making.
>
> Jason
>


Re: [RFC v16 1/9] iommu: Introduce attach/detach_pasid_table API

2021-12-08 Thread Eric Auger
Hi Jason,

On 12/8/21 7:31 PM, Jason Gunthorpe wrote:
> On Wed, Dec 08, 2021 at 05:20:39PM +, Jean-Philippe Brucker wrote:
>> On Wed, Dec 08, 2021 at 08:56:16AM -0400, Jason Gunthorpe wrote:
>>> From a progress perspective I would like to start with simple 'page
>>> tables in userspace', ie no PASID in this step.
>>>
>>> 'page tables in userspace' means an iommufd ioctl to create an
>>> iommu_domain where the IOMMU HW is directly traversing a
>>> device-specific page table structure in user space memory. All the HW
>>> today implements this by using another iommu_domain to allow the IOMMU
>>> HW DMA access to user memory - ie nesting or multi-stage or whatever.
>>>
>>> This would come along with some ioctls to invalidate the IOTLB.
>>>
>>> I'm imagining this step as an iommu_group->op->create_user_domain()
>>> driver callback which will create a new kind of domain with
>>> domain-unique ops. Ie map/unmap related should all be NULL as those
>>> are impossible operations.
>>>
>>> From there the usual struct device (ie RID) attach/detach stuff needs
>>> to take care of routing DMAs to this iommu_domain.
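
If it helps, a rough sketch of what I understand you are imagining
(purely illustrative, following your naming; no such op exists today):

	/* New per-group driver callback creating a domain backed by a
	 * user-owned page table; the returned domain's map/unmap ops
	 * would be left NULL since those operations are impossible: */
	struct iommu_domain *(*create_user_domain)(struct iommu_group *group,
						   const void *user_cfg);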
>>>
>>> Step two would be to add the ability for an iommufd using driver to
>>> request that a RID is connected to an iommu_domain. This
>>> connection can be requested for any kind of iommu_domain, kernel owned
>>> or user owned.
>>>
>>> I don't quite have an answer how exactly the SMMUv3 vs Intel
>>> difference in PASID routing should be resolved.
>> In SMMUv3 the user pgd is always stored in the PASID table (actually
>> called "context descriptor table" but I want to avoid confusion with
>> the VT-d "context table"). And to access the PASID table, the SMMUv3 first
>> translates its GPA into a PA using the stage-2 page table. For userspace to
>> pass individual pgds to the kernel, as opposed to passing whole PASID
>> tables, the host kernel needs to reserve GPA space and map it in stage-2,
>> so it can store the PASID table in there. Userspace manages GPA space.
> It is what I thought. So in the SMMUv3 spec the STE is completely in
> kernel memory, but it points to an S1ContextPtr that must be an IPA if
> the "stage 1 translation tables" are IPA. Only via S1ContextPtr can we
> decode the substream?
Yes that's correct. S1ContextPtr is the IPA of the L1 Context Descriptor
Table which is then indexed by substreamID.

>
> So in SMMUv3 land we don't really ever talk about PASID, we have a
> 'user page table' that is bound to an entire RID and *all* PASIDs.
In ARM terminology the substreamID matches the PASID, and it is what
indexes the L1 Context Descriptor Table.

>
> While Intel would have a 'user page table' that is only bound to a RID
> & PASID
>
>  Certainly it is not a difference we can hide from userspace.
>  
>> This would be easy for a single pgd. In this case the PASID table has a
>> single entry and userspace could just pass one GPA page during
>> registration. However it isn't easily generalized to full PASID support,
>> because managing a multi-level PASID table will require runtime GPA
>> allocation, and that API is awkward. That's why we opted for "attach PASID
>> table" operation rather than "attach page table" (back then the choice was
>> easy since VT-d used the same concept).
> I think the entire context descriptor table should be in userspace,
> and filled in by userspace, as part of the userspace page table.

In ARM nested mode the L1 Context Descriptor Table is fully managed by
the guest and the userspace only needs to trap its S1ContextPtr and pass
it to the host.
>
> The kernel API should accept the S1ContextPtr IPA and all the parts of
> the STE that relate to defining the layout of what the S1Context
> points to, and that's it.
Yes that's exactly what is done currently. At config time the host must
trap guest STE changes (format and S1ContextPtr) and "incorporate" those
changes into the stage2 related STE information. The STE is owned by the
host kernel as it contains the stage2 information (S2TTB).
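
To make the walk concrete for other readers, schematically (illustrative
pseudo-config, not actual driver code; constant and field spelling
abbreviated from this series' uapi, see patch 1/9 for the exact
struct iommu_pasid_table_config layout):

	/*
	 * Nested-mode lookup:
	 *   STE      = strtab[StreamID]       host-owned, holds S2TTB
	 *   CD table = STE.S1ContextPtr       guest IPA, walked via stage-2
	 *   CD       = CD table[SubstreamID]  SubstreamID == PASID
	 *   S1 TTB   = CD.TTB0                guest-owned stage-1 tables
	 *
	 * What userspace forwards after trapping a guest STE update:
	 */
	struct iommu_pasid_table_config cfg = {
		.argsz    = sizeof(cfg),
		.version  = PASID_TABLE_CFG_VERSION_1,
		.format   = IOMMU_PASID_FORMAT_SMMUV3,
		.base_ptr = s1contextptr_ipa, /* guest IPA of the CD table */
	};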

In
https://developer.arm.com/documentation/ihi0070/latest
(ARM_IHI_0070_D_b_System_Memory_Management_Unit_Architecture_Specification.pdf)
synthetic diagrams can be found in section 3.3.2, "StreamIDs to Context
Descriptors"; they give the global view.

Note this series only copes with a single CD in the Context Descriptor
Table.

Thanks

Eric
>
> We should have another mode where the kernel owns everything, and the
> S1ContexPtr is a PA with Stage 2 bypassed.
>
> That part is fine, the more open question is what does the driver
> interface look like when userspace tell something like vfio-pci to
> connect to this thing. At some level the attaching device needs to
> authorize iommufd to take the entire PASID table and RID.
>
> Specifically we cannot use this thing with a mdev, while the Intel
> version of a userspace page table can be.
>
> Maybe that is just some 'allow whole device' flag in an API
>
> Jason
>


Re: [RFC v16 1/9] iommu: Introduce attach/detach_pasid_table API

2021-12-07 Thread Eric Auger
Hi Baolu,

On 12/8/21 3:44 AM, Lu Baolu wrote:
> Hi Eric,
>
> On 12/7/21 6:22 PM, Eric Auger wrote:
>> On 12/6/21 11:48 AM, Joerg Roedel wrote:
>>> On Wed, Oct 27, 2021 at 12:44:20PM +0200, Eric Auger wrote:
>>>> Signed-off-by: Jean-Philippe Brucker
>>>> Signed-off-by: Liu, Yi L
>>>> Signed-off-by: Ashok Raj
>>>> Signed-off-by: Jacob Pan
>>>> Signed-off-by: Eric Auger
>>> This Signed-off-by chain looks dubious, you are the author but the last
>>> one in the chain?
>> The 1st RFC in Aug 2018
>> (https://lists.cs.columbia.edu/pipermail/kvmarm/2018-August/032478.html)
>> said this was a generalization of Jacob's patch
>>
>>
>>    [PATCH v5 01/23] iommu: introduce bind_pasid_table API function
>>
>>
>>   
>> https://lists.linuxfoundation.org/pipermail/iommu/2018-May/027647.html
>>
>> So indeed Jacob should be the author. I guess the multiple rebases got
>> this eventually replaced at some point, which is not an excuse. Please
>> forgive me for that.
>> Now the original patch already had this list of SoB so I don't know if I
>> shall simplify it.
>
> As we have decided to move the nested mode (dual stages) implementation
> onto the developing iommufd framework, what's the value of adding this
> into iommu core?

The iommu_uapi_attach_pasid_table uapi should indeed disappear as it
is bound to be replaced by the fellow /dev/iommu API.
However, until I can rebase on the /dev/iommu code I am obliged to keep it to
maintain this integration, hence the RFC.

Thanks

Eric
>
> Best regards,
> baolu
>


Re: [RFC v16 0/9] SMMUv3 Nested Stage Setup (IOMMU part)

2021-12-07 Thread Eric Auger
Hi Zhangfei,

On 12/7/21 11:35 AM, Zhangfei Gao wrote:
>
>
> On 2021/12/7 6:27 PM, Eric Auger wrote:
>> Hi Zhangfei,
>>
>> On 12/3/21 1:27 PM, Zhangfei Gao wrote:
>>> Hi, Eric
>>>
>>> On 2021/10/27 6:44 PM, Eric Auger wrote:
>>>> This series brings the IOMMU part of HW nested paging support
>>>> in the SMMUv3.
>>>>
>>>> The SMMUv3 driver is adapted to support 2 nested stages.
>>>>
>>>> The IOMMU API is extended to convey the guest stage 1
>>>> configuration and the hook is implemented in the SMMUv3 driver.
>>>>
>>>> This allows the guest to own the stage 1 tables and context
>>>> descriptors (so-called PASID table) while the host owns the
>>>> stage 2 tables and main configuration structures (STE).
>>>>
>>>> This work mainly is provided for test purpose as the upper
>>>> layer integration is under rework and bound to be based on
>>>> /dev/iommu instead of VFIO tunneling. In this version we also get
>>>> rid of the MSI BINDING ioctl, assuming the guest enforces
>>>> flat mapping of host IOVAs used to bind physical MSI doorbells.
>>>> In the current QEMU integration this is achieved by exposing
>>>> RMRs to the guest, using Shameer's series [1]. This approach
>>>> is RFC as the IORT spec is not really meant to do that
>>>> (single mapping flag limitation).
>>>>
>>>> Best Regards
>>>>
>>>> Eric
>>>>
>>>> This series (Host) can be found at:
>>>> https://github.com/eauger/linux/tree/v5.15-rc7-nested-v16
>>>> This includes a rebased VFIO integration (although not meant
>>>> to be upstreamed)
>>>>
>>>> Guest kernel branch can be found at:
>>>> https://github.com/eauger/linux/tree/shameer_rmrr_v7
>>>> featuring [1]
>>>>
>>>> QEMU integration (still based on VFIO and exposing RMRs)
>>>> can be found at:
>>>> https://github.com/eauger/qemu/tree/v6.1.0-rmr-v2-nested_smmuv3_v10
>>>> (use iommu=nested-smmuv3 ARM virt option)
>>>>
>>>> Guest dependency:
>>>> [1] [PATCH v7 0/9] ACPI/IORT: Support for IORT RMR node
>>> Thanks a lot for upgrading these patches.
>>>
>>> I have basically verified these patches on HiSilicon Kunpeng920.
>>> and integrated them into these branches:
>>> https://github.com/Linaro/linux-kernel-uadk/tree/uacce-devel-5.16
>>> https://github.com/Linaro/qemu/tree/v6.1.0-rmr-v2-nested_smmuv3_v10
>>>
>>> Though they are provided for test purposes,
>>>
>>> Tested-by: Zhangfei Gao 
>> Thank you very much. As you mentioned, until we have the /dev/iommu
>> integration this is maintained for testing purposes. The SMMU
>> changes shouldn't be much affected though.
>> The added value of this respin was to propose an MSI binding solution
>> based on RMRs, which simplifies things at kernel level.
>
> The current RMR solution requires UEFI to be enabled,
> and QEMU_EFI.fd has to be provided to start QEMU.
>
> Any plan to support DTB as well? That would be simpler since
> QEMU_EFI.fd would no longer be needed.
Yes the solution is based on ACPI IORT nodes. No clue if some DT
integration is in the works. Shameer?

Thanks

Eric
>
> Thanks
>
>


Re: [RFC v16 0/9] SMMUv3 Nested Stage Setup (IOMMU part)

2021-12-07 Thread Eric Auger
Hi Sumit,

On 12/3/21 2:13 PM, Sumit Gupta wrote:
> Hi Eric,
>
>> This series brings the IOMMU part of HW nested paging support
>> in the SMMUv3.
>>
>> The SMMUv3 driver is adapted to support 2 nested stages.
>>
>> The IOMMU API is extended to convey the guest stage 1
>> configuration and the hook is implemented in the SMMUv3 driver.
>>
>> This allows the guest to own the stage 1 tables and context
>> descriptors (so-called PASID table) while the host owns the
>> stage 2 tables and main configuration structures (STE).
>>
>> This work mainly is provided for test purpose as the upper
>> layer integration is under rework and bound to be based on
>> /dev/iommu instead of VFIO tunneling. In this version we also get
>> rid of the MSI BINDING ioctl, assuming the guest enforces
>> flat mapping of host IOVAs used to bind physical MSI doorbells.
>> In the current QEMU integration this is achieved by exposing
>> RMRs to the guest, using Shameer's series [1]. This approach
>> is RFC as the IORT spec is not really meant to do that
>> (single mapping flag limitation).
>>
>> Best Regards
>>
>> Eric
>>
>> This series (Host) can be found at:
>> https://github.com/eauger/linux/tree/v5.15-rc7-nested-v16
>> This includes a rebased VFIO integration (although not meant
>> to be upstreamed)
>>
>> Guest kernel branch can be found at:
>> https://github.com/eauger/linux/tree/shameer_rmrr_v7
>> featuring [1]
>>
>> QEMU integration (still based on VFIO and exposing RMRs)
>> can be found at:
>> https://github.com/eauger/qemu/tree/v6.1.0-rmr-v2-nested_smmuv3_v10
>> (use iommu=nested-smmuv3 ARM virt option)
>>
>> Guest dependency:
>> [1] [PATCH v7 0/9] ACPI/IORT: Support for IORT RMR node
>>
>> History:
>>
>> v15 -> v16:
>> - guest must support RIL
>> - additional checks in the cache invalidation hook
>> - removal of the MSI BINDING ioctl (tentative replacement
>>    by RMRs)
>>
>>
>> Eric Auger (9):
>>    iommu: Introduce attach/detach_pasid_table API
>>    iommu: Introduce iommu_get_nesting
>>    iommu/smmuv3: Allow s1 and s2 configs to coexist
>>    iommu/smmuv3: Get prepared for nested stage support
>>    iommu/smmuv3: Implement attach/detach_pasid_table
>>    iommu/smmuv3: Allow stage 1 invalidation with unmanaged ASIDs
>>    iommu/smmuv3: Implement cache_invalidate
>>    iommu/smmuv3: report additional recoverable faults
>>    iommu/smmuv3: Disallow nested mode in presence of HW MSI regions
> Hi Eric,
>
> I validated the reworked test patches in v16 from the given
> branches with kernel v5.15 & QEMU v6.2. Verified them with an
> NVMe PCI device assigned to a guest VM.
> Sorry, forgot to update earlier.
>
> Tested-by: Sumit Gupta 

Thank you very much!

Best Regards

Eric
>
> Thanks,
> Sumit Gupta
>


Re: [RFC v16 0/9] SMMUv3 Nested Stage Setup (IOMMU part)

2021-12-07 Thread Eric Auger
Hi Zhangfei,

On 12/3/21 1:27 PM, Zhangfei Gao wrote:
>
> Hi, Eric
>
>> On 2021/10/27 6:44 PM, Eric Auger wrote:
>> This series brings the IOMMU part of HW nested paging support
>> in the SMMUv3.
>>
>> The SMMUv3 driver is adapted to support 2 nested stages.
>>
>> The IOMMU API is extended to convey the guest stage 1
>> configuration and the hook is implemented in the SMMUv3 driver.
>>
>> This allows the guest to own the stage 1 tables and context
>> descriptors (so-called PASID table) while the host owns the
>> stage 2 tables and main configuration structures (STE).
>>
>> This work mainly is provided for test purpose as the upper
>> layer integration is under rework and bound to be based on
>> /dev/iommu instead of VFIO tunneling. In this version we also get
>> rid of the MSI BINDING ioctl, assuming the guest enforces
>> flat mapping of host IOVAs used to bind physical MSI doorbells.
>> In the current QEMU integration this is achieved by exposing
>> RMRs to the guest, using Shameer's series [1]. This approach
>> is RFC as the IORT spec is not really meant to do that
>> (single mapping flag limitation).
>>
>> Best Regards
>>
>> Eric
>>
>> This series (Host) can be found at:
>> https://github.com/eauger/linux/tree/v5.15-rc7-nested-v16
>> This includes a rebased VFIO integration (although not meant
>> to be upstreamed)
>>
>> Guest kernel branch can be found at:
>> https://github.com/eauger/linux/tree/shameer_rmrr_v7
>> featuring [1]
>>
>> QEMU integration (still based on VFIO and exposing RMRs)
>> can be found at:
>> https://github.com/eauger/qemu/tree/v6.1.0-rmr-v2-nested_smmuv3_v10
>> (use iommu=nested-smmuv3 ARM virt option)
>>
>> Guest dependency:
>> [1] [PATCH v7 0/9] ACPI/IORT: Support for IORT RMR node
>
> Thanks a lot for upgrading these patches.
>
> I have basically verified these patches on HiSilicon Kunpeng920.
> and integrated them into these branches:
> https://github.com/Linaro/linux-kernel-uadk/tree/uacce-devel-5.16
> https://github.com/Linaro/qemu/tree/v6.1.0-rmr-v2-nested_smmuv3_v10
>
> Though they are provided for test purposes,
>
> Tested-by: Zhangfei Gao 

Thank you very much. As you mentioned, until we have the /dev/iommu
integration this is maintained for testing purposes. The SMMU
changes shouldn't be much affected though.
The added value of this respin was to propose an MSI binding solution
based on RMRs, which simplifies things at kernel level.

Thanks!

Eric
>
> Thanks
>


Re: [RFC v16 1/9] iommu: Introduce attach/detach_pasid_table API

2021-12-07 Thread Eric Auger
Hi Joerg,

On 12/6/21 11:48 AM, Joerg Roedel wrote:
> On Wed, Oct 27, 2021 at 12:44:20PM +0200, Eric Auger wrote:
>> Signed-off-by: Jean-Philippe Brucker 
>> Signed-off-by: Liu, Yi L 
>> Signed-off-by: Ashok Raj 
>> Signed-off-by: Jacob Pan 
>> Signed-off-by: Eric Auger 
> This Signed-off-by chain looks dubious, you are the author but the last
> one in the chain?
The 1st RFC in Aug 2018
(https://lists.cs.columbia.edu/pipermail/kvmarm/2018-August/032478.html)
said this was a generalization of Jacob's patch


  [PATCH v5 01/23] iommu: introduce bind_pasid_table API function


  https://lists.linuxfoundation.org/pipermail/iommu/2018-May/027647.html

So indeed Jacob should be the author. I guess the multiple rebases got
this eventually replaced at some point, which is not an excuse. Please
forgive me for that.
Now the original patch already had this list of SoB so I don't know if I
shall simplify it.


>
>> +int iommu_uapi_attach_pasid_table(struct iommu_domain *domain,
>> +  void __user *uinfo)
>> +{
> [...]
>
>> +if (pasid_table_data.format == IOMMU_PASID_FORMAT_SMMUV3 &&
>> +pasid_table_data.argsz <
>> +offsetofend(struct iommu_pasid_table_config, 
>> vendor_data.smmuv3))
>> +return -EINVAL;
> This check looks like it belongs in driver specific code.
Indeed, I will fix that in my next respin :-)

Thanks!

Eric
>
> Regards,
>
>   Joerg
>



Re: [PATCH v3 2/5] iommu/virtio: Support bypass domains

2021-12-02 Thread Eric Auger



On 12/1/21 6:33 PM, Jean-Philippe Brucker wrote:
> The VIRTIO_IOMMU_F_BYPASS_CONFIG feature adds a new flag to the ATTACH
> request, that creates a bypass domain. Use it to enable identity
> domains.
>
> When VIRTIO_IOMMU_F_BYPASS_CONFIG is not supported by the device, we
> currently fail attaching to an identity domain. Future patches will
> instead create identity mappings in this case.
>
> Reviewed-by: Kevin Tian 
> Signed-off-by: Jean-Philippe Brucker 
Reviewed-by: Eric Auger 

Eric
> ---
>  drivers/iommu/virtio-iommu.c | 20 +++-
>  1 file changed, 19 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c
> index 80930ce04a16..14dfee76fd19 100644
> --- a/drivers/iommu/virtio-iommu.c
> +++ b/drivers/iommu/virtio-iommu.c
> @@ -71,6 +71,7 @@ struct viommu_domain {
>   struct rb_root_cached   mappings;
>  
>   unsigned long   nr_endpoints;
> + bool    bypass;
>  };
>  
>  struct viommu_endpoint {
> @@ -587,7 +588,9 @@ static struct iommu_domain *viommu_domain_alloc(unsigned 
> type)
>  {
>   struct viommu_domain *vdomain;
>  
> - if (type != IOMMU_DOMAIN_UNMANAGED && type != IOMMU_DOMAIN_DMA)
> + if (type != IOMMU_DOMAIN_UNMANAGED &&
> + type != IOMMU_DOMAIN_DMA &&
> + type != IOMMU_DOMAIN_IDENTITY)
>   return NULL;
>  
>   vdomain = kzalloc(sizeof(*vdomain), GFP_KERNEL);
> @@ -630,6 +633,17 @@ static int viommu_domain_finalise(struct viommu_endpoint 
> *vdev,
>   vdomain->map_flags  = viommu->map_flags;
>   vdomain->viommu = viommu;
>  
> + if (domain->type == IOMMU_DOMAIN_IDENTITY) {
> + if (!virtio_has_feature(viommu->vdev,
> + VIRTIO_IOMMU_F_BYPASS_CONFIG)) {
> + ida_free(&viommu->domain_ids, vdomain->id);
> + vdomain->viommu = NULL;
> + return -EOPNOTSUPP;
> + }
> +
> + vdomain->bypass = true;
> + }
> +
>   return 0;
>  }
>  
> @@ -691,6 +705,9 @@ static int viommu_attach_dev(struct iommu_domain *domain, 
> struct device *dev)
>   .domain = cpu_to_le32(vdomain->id),
>   };
>  
> + if (vdomain->bypass)
> + req.flags |= cpu_to_le32(VIRTIO_IOMMU_ATTACH_F_BYPASS);
> +
>   for (i = 0; i < fwspec->num_ids; i++) {
>   req.endpoint = cpu_to_le32(fwspec->ids[i]);
>  
> @@ -1132,6 +1149,7 @@ static unsigned int features[] = {
>   VIRTIO_IOMMU_F_DOMAIN_RANGE,
>   VIRTIO_IOMMU_F_PROBE,
>   VIRTIO_IOMMU_F_MMIO,
> + VIRTIO_IOMMU_F_BYPASS_CONFIG,
>  };
>  
>  static struct virtio_device_id id_table[] = {



Re: [PATCH v3 1/5] iommu/virtio: Add definitions for VIRTIO_IOMMU_F_BYPASS_CONFIG

2021-12-02 Thread Eric Auger
Hi Jean,

On 12/1/21 6:33 PM, Jean-Philippe Brucker wrote:
> Add definitions for the VIRTIO_IOMMU_F_BYPASS_CONFIG, which supersedes
> VIRTIO_IOMMU_F_BYPASS.
>
> Reviewed-by: Kevin Tian 
> Signed-off-by: Jean-Philippe Brucker 
Reviewed-by: Eric Auger 

Eric
> ---
>  include/uapi/linux/virtio_iommu.h | 8 +++-
>  1 file changed, 7 insertions(+), 1 deletion(-)
>
> diff --git a/include/uapi/linux/virtio_iommu.h 
> b/include/uapi/linux/virtio_iommu.h
> index 237e36a280cb..1ff357f0d72e 100644
> --- a/include/uapi/linux/virtio_iommu.h
> +++ b/include/uapi/linux/virtio_iommu.h
> @@ -16,6 +16,7 @@
>  #define VIRTIO_IOMMU_F_BYPASS        3
>  #define VIRTIO_IOMMU_F_PROBE 4
>  #define VIRTIO_IOMMU_F_MMIO  5
> +#define VIRTIO_IOMMU_F_BYPASS_CONFIG 6
>  
>  struct virtio_iommu_range_64 {
>   __le64  start;
> @@ -36,6 +37,8 @@ struct virtio_iommu_config {
>   struct virtio_iommu_range_32    domain_range;
>   /* Probe buffer size */
>   __le32  probe_size;
> + __u8    bypass;
> + __u8    reserved[3];
>  };
>  
>  /* Request types */
> @@ -66,11 +69,14 @@ struct virtio_iommu_req_tail {
>   __u8    reserved[3];
>  };
>  
> +#define VIRTIO_IOMMU_ATTACH_F_BYPASS (1 << 0)
> +
>  struct virtio_iommu_req_attach {
>   struct virtio_iommu_req_head    head;
>   __le32  domain;
>   __le32  endpoint;
> - __u8    reserved[8];
> + __le32  flags;
> + __u8    reserved[4];
>   struct virtio_iommu_req_tail    tail;
>  };
>  



Re: [PATCH v2 5/5] iommu/virtio: Support identity-mapped domains

2021-11-27 Thread Eric Auger
Hi Jean,

On 11/23/21 4:53 PM, Jean-Philippe Brucker wrote:
> Support identity domains for devices that do not offer the
> VIRTIO_IOMMU_F_BYPASS_CONFIG feature, by creating 1:1 mappings between
> the virtual and physical address space. Identity domains created this
> way still perform noticeably better than DMA domains, because they don't
> have the overhead of setting up and tearing down mappings at runtime.
> The performance difference between this and bypass is minimal in
> comparison.
>
> It does not matter that the physical addresses in the identity mappings
> do not all correspond to memory. By enabling passthrough we are trusting
> the device driver and the device itself to only perform DMA to suitable
> locations. In some cases it may even be desirable to perform DMA to MMIO
> regions.
>
> Reviewed-by: Kevin Tian 
> Signed-off-by: Jean-Philippe Brucker 
> ---
>  drivers/iommu/virtio-iommu.c | 63 +---
>  1 file changed, 58 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c
> index eceb9281c8c1..6a8a52b4297b 100644
> --- a/drivers/iommu/virtio-iommu.c
> +++ b/drivers/iommu/virtio-iommu.c
> @@ -375,6 +375,55 @@ static size_t viommu_del_mappings(struct viommu_domain 
> *vdomain,
>   return unmapped;
>  }
>  
> +/*
> + * Fill the domain with identity mappings, skipping the device's reserved
> + * regions.
> + */
> +static int viommu_domain_map_identity(struct viommu_endpoint *vdev,
> +   struct viommu_domain *vdomain)
> +{
> + int ret;
> + struct iommu_resv_region *resv;
> + u64 iova = vdomain->domain.geometry.aperture_start;
> + u64 limit = vdomain->domain.geometry.aperture_end;
> + u32 flags = VIRTIO_IOMMU_MAP_F_READ | VIRTIO_IOMMU_MAP_F_WRITE;
> + unsigned long granule = 1UL << __ffs(vdomain->domain.pgsize_bitmap);
> +
> + iova = ALIGN(iova, granule);
> + limit = ALIGN_DOWN(limit + 1, granule) - 1;
> +
> + list_for_each_entry(resv, &vdev->resv_regions, list) {
> + u64 resv_start = ALIGN_DOWN(resv->start, granule);
> + u64 resv_end = ALIGN(resv->start + resv->length, granule) - 1;
> +
> + if (resv_end < iova || resv_start > limit)
> + /* No overlap */
> + continue;
> +
> + if (resv_start > iova) {
> + ret = viommu_add_mapping(vdomain, iova, resv_start - 1,
> +  (phys_addr_t)iova, flags);
> + if (ret)
> + goto err_unmap;
> + }
> +
> + if (resv_end >= limit)
> + return 0;
> +
> + iova = resv_end + 1;
> + }
> +
> + ret = viommu_add_mapping(vdomain, iova, limit, (phys_addr_t)iova,
> +  flags);
> + if (ret)
> + goto err_unmap;
> + return 0;
> +
> +err_unmap:
> + viommu_del_mappings(vdomain, 0, iova);
> + return ret;
> +}
> +
>  /*
>   * viommu_replay_mappings - re-send MAP requests
>   *
> @@ -637,14 +686,18 @@ static int viommu_domain_finalise(struct 
> viommu_endpoint *vdev,
>   vdomain->viommu = viommu;
>  
>   if (domain->type == IOMMU_DOMAIN_IDENTITY) {
> - if (!virtio_has_feature(viommu->vdev,
> - VIRTIO_IOMMU_F_BYPASS_CONFIG)) {
> + if (virtio_has_feature(viommu->vdev,
> +VIRTIO_IOMMU_F_BYPASS_CONFIG)) {
> + vdomain->bypass = true;
> + return 0;
> + }
> +
> + ret = viommu_domain_map_identity(vdev, vdomain);
> + if (ret) {
>       ida_free(&viommu->domain_ids, vdomain->id);
> - vdomain->viommu = 0;
> + vdomain->viommu = NULL;
nit: that change could have been done in patch 2
>   return -EOPNOTSUPP;
>   }
> -
> - vdomain->bypass = true;
>   }
>  
>   return 0;
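
As a sanity check I walked viommu_domain_map_identity() through a
concrete layout (hypothetical numbers): 4K granule, aperture
[0x0 .. 0xffff_ffff] and a single 1MB reserved region at 0x8000_0000.
The carving gives:

	MAP  0x0000_0000 .. 0x7fff_ffff    (loop: resv_start > iova)
	skip 0x8000_0000 .. 0x800f_ffff    (iova = resv_end + 1)
	MAP  0x8010_0000 .. 0xffff_ffff    (final viommu_add_mapping())

which looks correct to me.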
Besides
Reviewed-by: Eric Auger 

Eric



Re: [PATCH v2 3/5] iommu/virtio: Sort reserved regions

2021-11-27 Thread Eric Auger
Hi Jean,

On 11/23/21 4:53 PM, Jean-Philippe Brucker wrote:
> To ease identity mapping support, keep the list of reserved regions
> sorted.
>
> Reviewed-by: Kevin Tian 
> Signed-off-by: Jean-Philippe Brucker 
Reviewed-by: Eric Auger 

Eric
> ---
>  drivers/iommu/virtio-iommu.c | 9 +++--
>  1 file changed, 7 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c
> index ee8a7afd667b..d63ec4d11b00 100644
> --- a/drivers/iommu/virtio-iommu.c
> +++ b/drivers/iommu/virtio-iommu.c
> @@ -423,7 +423,7 @@ static int viommu_add_resv_mem(struct viommu_endpoint 
> *vdev,
>   size_t size;
>   u64 start64, end64;
>   phys_addr_t start, end;
> - struct iommu_resv_region *region = NULL;
> + struct iommu_resv_region *region = NULL, *next;
>   unsigned long prot = IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO;
>  
>   start = start64 = le64_to_cpu(mem->start);
> @@ -454,7 +454,12 @@ static int viommu_add_resv_mem(struct viommu_endpoint 
> *vdev,
>   if (!region)
>   return -ENOMEM;
>  
> - list_add(&region->list, &vdev->resv_regions);
> + /* Keep the list sorted */
> + list_for_each_entry(next, &vdev->resv_regions, list) {
> + if (next->start > region->start)
> + break;
> + }
> + list_add_tail(&region->list, &next->list);
>   return 0;
>  }
>  



Re: [PATCH v2 4/5] iommu/virtio: Pass end address to viommu_add_mapping()

2021-11-27 Thread Eric Auger



On 11/23/21 4:53 PM, Jean-Philippe Brucker wrote:
> To support identity mappings, the virtio-iommu driver must be able to
> represent full 64-bit ranges internally. Pass (start, end) instead of
> (start, size) to viommu_add/del_mapping().
>
> Clean comments. The one about the returned size was never true: when
> sweeping the whole address space the returned size will most certainly
> be smaller than 2^64.
>
> Reviewed-by: Kevin Tian 
> Signed-off-by: Jean-Philippe Brucker 
Reviewed-by: Eric Auger 

Eric

> ---
>  drivers/iommu/virtio-iommu.c | 31 +++
>  1 file changed, 15 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c
> index d63ec4d11b00..eceb9281c8c1 100644
> --- a/drivers/iommu/virtio-iommu.c
> +++ b/drivers/iommu/virtio-iommu.c
> @@ -311,8 +311,8 @@ static int viommu_send_req_sync(struct viommu_dev 
> *viommu, void *buf,
>   *
>   * On success, return the new mapping. Otherwise return NULL.
>   */
> -static int viommu_add_mapping(struct viommu_domain *vdomain, unsigned long 
> iova,
> -   phys_addr_t paddr, size_t size, u32 flags)
> +static int viommu_add_mapping(struct viommu_domain *vdomain, u64 iova, u64 
> end,
> +   phys_addr_t paddr, u32 flags)
>  {
>   unsigned long irqflags;
>   struct viommu_mapping *mapping;
> @@ -323,7 +323,7 @@ static int viommu_add_mapping(struct viommu_domain 
> *vdomain, unsigned long iova,
>  
>   mapping->paddr  = paddr;
>   mapping->iova.start = iova;
> - mapping->iova.last  = iova + size - 1;
> + mapping->iova.last  = end;
>   mapping->flags  = flags;
>  
>   spin_lock_irqsave(&vdomain->mappings_lock, irqflags);
> @@ -338,26 +338,24 @@ static int viommu_add_mapping(struct viommu_domain 
> *vdomain, unsigned long iova,
>   *
>   * @vdomain: the domain
>   * @iova: start of the range
> - * @size: size of the range. A size of 0 corresponds to the entire address
> - *   space.
> + * @end: end of the range
>   *
> - * On success, returns the number of unmapped bytes (>= size)
> + * On success, returns the number of unmapped bytes
>   */
>  static size_t viommu_del_mappings(struct viommu_domain *vdomain,
> -   unsigned long iova, size_t size)
> +   u64 iova, u64 end)
>  {
>   size_t unmapped = 0;
>   unsigned long flags;
> - unsigned long last = iova + size - 1;
>   struct viommu_mapping *mapping = NULL;
>   struct interval_tree_node *node, *next;
>  
>   spin_lock_irqsave(&vdomain->mappings_lock, flags);
> - next = interval_tree_iter_first(&vdomain->mappings, iova, last);
> + next = interval_tree_iter_first(&vdomain->mappings, iova, end);
>   while (next) {
>   node = next;
>   mapping = container_of(node, struct viommu_mapping, iova);
> - next = interval_tree_iter_next(node, iova, last);
> + next = interval_tree_iter_next(node, iova, end);
>  
>   /* Trying to split a mapping? */
>   if (mapping->iova.start < iova)
> @@ -656,8 +654,8 @@ static void viommu_domain_free(struct iommu_domain 
> *domain)
>  {
>   struct viommu_domain *vdomain = to_viommu_domain(domain);
>  
> - /* Free all remaining mappings (size 2^64) */
> - viommu_del_mappings(vdomain, 0, 0);
> + /* Free all remaining mappings */
> + viommu_del_mappings(vdomain, 0, ULLONG_MAX);
>  
>   if (vdomain->viommu)
>   ida_free(&vdomain->viommu->domain_ids, vdomain->id);
> @@ -742,6 +740,7 @@ static int viommu_map(struct iommu_domain *domain, 
> unsigned long iova,
>  {
>   int ret;
>   u32 flags;
> + u64 end = iova + size - 1;
>   struct virtio_iommu_req_map map;
>   struct viommu_domain *vdomain = to_viommu_domain(domain);
>  
> @@ -752,7 +751,7 @@ static int viommu_map(struct iommu_domain *domain, 
> unsigned long iova,
>   if (flags & ~vdomain->map_flags)
>   return -EINVAL;
>  
> - ret = viommu_add_mapping(vdomain, iova, paddr, size, flags);
> + ret = viommu_add_mapping(vdomain, iova, end, paddr, flags);
>   if (ret)
>   return ret;
>  
> @@ -761,7 +760,7 @@ static int viommu_map(struct iommu_domain *domain, 
> unsigned long iova,
>   .domain = cpu_to_le32(vdomain->id),
>   .virt_start = cpu_to_le64(iova),
>   .phys_start = cpu_to_le64(paddr),
> - .virt_end   = cpu_to_le64(iova + size - 1),
> + .virt_end   = cpu_to_le64(

Re: [PATCH v2 2/5] iommu/virtio: Support bypass domains

2021-11-27 Thread Eric Auger
Hi Jean,

On 11/23/21 4:52 PM, Jean-Philippe Brucker wrote:
> The VIRTIO_IOMMU_F_BYPASS_CONFIG feature adds a new flag to the ATTACH
> request, that creates a bypass domain. Use it to enable identity
> domains.
>
> When VIRTIO_IOMMU_F_BYPASS_CONFIG is not supported by the device, we
> currently fail attaching to an identity domain. Future patches will
> instead create identity mappings in this case.
>
> Reviewed-by: Kevin Tian 
> Signed-off-by: Jean-Philippe Brucker 
> ---
>  drivers/iommu/virtio-iommu.c | 20 +++-
>  1 file changed, 19 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c
> index 80930ce04a16..ee8a7afd667b 100644
> --- a/drivers/iommu/virtio-iommu.c
> +++ b/drivers/iommu/virtio-iommu.c
> @@ -71,6 +71,7 @@ struct viommu_domain {
>   struct rb_root_cached   mappings;
>  
>   unsigned long   nr_endpoints;
> + bool    bypass;
>  };
>  
>  struct viommu_endpoint {
> @@ -587,7 +588,9 @@ static struct iommu_domain *viommu_domain_alloc(unsigned 
> type)
>  {
>   struct viommu_domain *vdomain;
>  
> - if (type != IOMMU_DOMAIN_UNMANAGED && type != IOMMU_DOMAIN_DMA)
> + if (type != IOMMU_DOMAIN_UNMANAGED &&
> + type != IOMMU_DOMAIN_DMA &&
> + type != IOMMU_DOMAIN_IDENTITY)
>   return NULL;
>  
>   vdomain = kzalloc(sizeof(*vdomain), GFP_KERNEL);
> @@ -630,6 +633,17 @@ static int viommu_domain_finalise(struct viommu_endpoint 
> *vdev,
>   vdomain->map_flags  = viommu->map_flags;
>   vdomain->viommu = viommu;
>  
> + if (domain->type == IOMMU_DOMAIN_IDENTITY) {
> + if (!virtio_has_feature(viommu->vdev,
nit: couldn't the check be done before the ida_alloc_range(),
simplifying the failure cleanup?

Thanks

Eric
> + VIRTIO_IOMMU_F_BYPASS_CONFIG)) {
> + ida_free(&viommu->domain_ids, vdomain->id);
> + vdomain->viommu = 0;
> + return -EOPNOTSUPP;
> + }
> +
> + vdomain->bypass = true;
> + }
> +
>   return 0;
>  }
>  
> @@ -691,6 +705,9 @@ static int viommu_attach_dev(struct iommu_domain *domain, 
> struct device *dev)
>   .domain = cpu_to_le32(vdomain->id),
>   };
>  
> + if (vdomain->bypass)
> + req.flags |= cpu_to_le32(VIRTIO_IOMMU_ATTACH_F_BYPASS);
> +
>   for (i = 0; i < fwspec->num_ids; i++) {
>   req.endpoint = cpu_to_le32(fwspec->ids[i]);
>  
> @@ -1132,6 +1149,7 @@ static unsigned int features[] = {
>   VIRTIO_IOMMU_F_DOMAIN_RANGE,
>   VIRTIO_IOMMU_F_PROBE,
>   VIRTIO_IOMMU_F_MMIO,
> + VIRTIO_IOMMU_F_BYPASS_CONFIG,
>  };
>  
>  static struct virtio_device_id id_table[] = {



Re: [PATCH v2 1/5] iommu/virtio: Add definitions for VIRTIO_IOMMU_F_BYPASS_CONFIG

2021-11-26 Thread Eric Auger
Hi Jean,

On 11/23/21 4:52 PM, Jean-Philippe Brucker wrote:
> Add definitions for the VIRTIO_IOMMU_F_BYPASS_CONFIG, which supersedes
> VIRTIO_IOMMU_F_BYPASS.
>
> Reviewed-by: Kevin Tian 
> Signed-off-by: Jean-Philippe Brucker 
> ---
>  include/uapi/linux/virtio_iommu.h | 8 +++-
>  1 file changed, 7 insertions(+), 1 deletion(-)
>
> diff --git a/include/uapi/linux/virtio_iommu.h 
> b/include/uapi/linux/virtio_iommu.h
> index 237e36a280cb..cafd8cf7febf 100644
> --- a/include/uapi/linux/virtio_iommu.h
> +++ b/include/uapi/linux/virtio_iommu.h
> @@ -16,6 +16,7 @@
>  #define VIRTIO_IOMMU_F_BYPASS        3
>  #define VIRTIO_IOMMU_F_PROBE 4
>  #define VIRTIO_IOMMU_F_MMIO  5
> +#define VIRTIO_IOMMU_F_BYPASS_CONFIG 6
>  
>  struct virtio_iommu_range_64 {
>   __le64  start;
> @@ -36,6 +37,8 @@ struct virtio_iommu_config {
>   struct virtio_iommu_range_32    domain_range;
>   /* Probe buffer size */
>   __le32  probe_size;
> + __u8    bypass;
> + __u8    reserved[7];
In "[PATCH v3] virtio-iommu: Rework the bypass feature" I see

+  u8 bypass;
+  u8 reserved[3];

What exactly was voted on?

Thanks

Eric

>  };
>  
>  /* Request types */
> @@ -66,11 +69,14 @@ struct virtio_iommu_req_tail {
>   __u8    reserved[3];
>  };
>  
> +#define VIRTIO_IOMMU_ATTACH_F_BYPASS (1 << 0)
> +
>  struct virtio_iommu_req_attach {
>   struct virtio_iommu_req_head    head;
>   __le32  domain;
>   __le32  endpoint;
> - __u8    reserved[8];
> + __le32  flags;
> + __u8    reserved[4];
>   struct virtio_iommu_req_tail    tail;
>  };
>  



[RFC v16 9/9] iommu/smmuv3: Disallow nested mode in presence of HW MSI regions

2021-10-27 Thread Eric Auger
Nested mode currently is not compatible with HW MSI reserved regions.
Indeed MSI transactions targeting those MSI doorbells bypass the SMMU.
This would require the guest to also bypass those ranges but the guest
has no information about them.

Let's check nested mode is not attempted in such configuration.

Signed-off-by: Eric Auger 
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 23 +
 1 file changed, 23 insertions(+)

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c 
b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index ddfc069c10ae..12e7d7920f27 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -2488,6 +2488,23 @@ static void arm_smmu_detach_dev(struct arm_smmu_master 
*master)
arm_smmu_install_ste_for_dev(master);
 }
 
+static bool arm_smmu_has_hw_msi_resv_region(struct device *dev)
+{
+   struct iommu_resv_region *region;
+   bool has_msi_resv_region = false;
+   LIST_HEAD(resv_regions);
+
+   iommu_get_resv_regions(dev, &resv_regions);
+   list_for_each_entry(region, &resv_regions, list) {
+   if (region->type == IOMMU_RESV_MSI) {
+   has_msi_resv_region = true;
+   break;
+   }
+   }
+   iommu_put_resv_regions(dev, &resv_regions);
+   return has_msi_resv_region;
+}
+
 static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
 {
int ret = 0;
@@ -2545,6 +2562,12 @@ static int arm_smmu_attach_dev(struct iommu_domain 
*domain, struct device *dev)
ret = -EINVAL;
goto out_unlock;
}
+   /* Nested mode is not compatible with MSI HW reserved regions */
+   if (smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED &&
+   arm_smmu_has_hw_msi_resv_region(dev)) {
+   ret = -EINVAL;
+   goto out_unlock;
+   }
 
master->domain = smmu_domain;
 
-- 
2.26.3



[RFC v16 8/9] iommu/smmuv3: report additional recoverable faults

2021-10-27 Thread Eric Auger
Up to now we have only reported translation faults. Now that
the guest can induce some configuration faults, let's report them
too. Add propagation for BAD_SUBSTREAMID, CD_FETCH, BAD_CD, WALK_EABT.
We also fix the transcoding for some existing translation faults.

Signed-off-by: Eric Auger 

---

v14 -> v15:
- adapt to removal of IOMMU_FAULT_UNRECOV_FETCH_ADDR_VALID
  in [PATCH v13 10/10] iommu/arm-smmu-v3: Add stall support for
  platform devices
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 40 +++--
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h |  4 +++
 2 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c 
b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index e84a7c3e8730..ddfc069c10ae 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -1488,6 +1488,7 @@ static int arm_smmu_handle_evt(struct arm_smmu_device 
*smmu, u64 *evt)
u32 perm = 0;
struct arm_smmu_master *master;
bool ssid_valid = evt[0] & EVTQ_0_SSV;
+   u8 type = FIELD_GET(EVTQ_0_ID, evt[0]);
u32 sid = FIELD_GET(EVTQ_0_SID, evt[0]);
struct iommu_fault_event fault_evt = { };
struct iommu_fault *flt = &fault_evt.fault;
@@ -1540,8 +1541,6 @@ static int arm_smmu_handle_evt(struct arm_smmu_device 
*smmu, u64 *evt)
} else {
flt->type = IOMMU_FAULT_DMA_UNRECOV;
flt->event = (struct iommu_fault_unrecoverable) {
-   .reason = reason,
-   .flags = IOMMU_FAULT_UNRECOV_ADDR_VALID,
.perm = perm,
.addr = FIELD_GET(EVTQ_2_ADDR, evt[2]),
};
@@ -1550,6 +1549,43 @@ static int arm_smmu_handle_evt(struct arm_smmu_device 
*smmu, u64 *evt)
flt->event.flags |= IOMMU_FAULT_UNRECOV_PASID_VALID;
flt->event.pasid = FIELD_GET(EVTQ_0_SSID, evt[0]);
}
+
+   switch (type) {
+   case EVT_ID_TRANSLATION_FAULT:
+   flt->event.reason = IOMMU_FAULT_REASON_PTE_FETCH;
+   flt->event.flags |= IOMMU_FAULT_UNRECOV_ADDR_VALID;
+   break;
+   case EVT_ID_ADDR_SIZE_FAULT:
+   flt->event.reason = IOMMU_FAULT_REASON_OOR_ADDRESS;
+   flt->event.flags |= IOMMU_FAULT_UNRECOV_ADDR_VALID;
+   break;
+   case EVT_ID_ACCESS_FAULT:
+   flt->event.reason = IOMMU_FAULT_REASON_ACCESS;
+   flt->event.flags |= IOMMU_FAULT_UNRECOV_ADDR_VALID;
+   break;
+   case EVT_ID_PERMISSION_FAULT:
+   flt->event.reason = IOMMU_FAULT_REASON_PERMISSION;
+   flt->event.flags |= IOMMU_FAULT_UNRECOV_ADDR_VALID;
+   break;
+   case EVT_ID_BAD_SUBSTREAMID:
+   flt->event.reason = IOMMU_FAULT_REASON_PASID_INVALID;
+   break;
+   case EVT_ID_CD_FETCH:
+   flt->event.reason = IOMMU_FAULT_REASON_PASID_FETCH;
+   flt->event.flags |= 
IOMMU_FAULT_UNRECOV_FETCH_ADDR_VALID;
+   break;
+   case EVT_ID_BAD_CD:
+   flt->event.reason = IOMMU_FAULT_REASON_BAD_PASID_ENTRY;
+   break;
+   case EVT_ID_WALK_EABT:
+   flt->event.reason = IOMMU_FAULT_REASON_WALK_EABT;
+   flt->event.flags |= IOMMU_FAULT_UNRECOV_ADDR_VALID |
+   
IOMMU_FAULT_UNRECOV_FETCH_ADDR_VALID;
+   break;
+   default:
+   /* TODO: report other unrecoverable faults. */
+   return -EFAULT;
+   }
}
 
mutex_lock(&smmu->streams_mutex);
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h 
b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index 05959df01618..b914570ee5ba 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -379,6 +379,10 @@
 
 #define EVTQ_0_ID  GENMASK_ULL(7, 0)
 
+#define EVT_ID_BAD_SUBSTREAMID 0x08
+#define EVT_ID_CD_FETCH0x09
+#define EVT_ID_BAD_CD  0x0a
+#define EVT_ID_WALK_EABT   0x0b
 #define EVT_ID_TRANSLATION_FAULT   0x10
 #define EVT_ID_ADDR_SIZE_FAULT 0x11
 #define EVT_ID_ACCESS_FAULT0x12
-- 
2.26.3



[RFC v16 7/9] iommu/smmuv3: Implement cache_invalidate

2021-10-27 Thread Eric Auger
Implement domain-selective, PASID-selective and page-selective
IOTLB invalidations.
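
For context, the invalidation request consumed by this hook looks as
follows; a sketch only, where "guest_asid" and "iova" are hypothetical
values trapped from a guest TLBI command:

	struct iommu_cache_invalidate_info inv_info = {
		.version	= IOMMU_CACHE_INVALIDATE_INFO_VERSION_1,
		.cache		= IOMMU_CACHE_INV_TYPE_IOTLB,
		.granularity	= IOMMU_INV_GRANU_ADDR,
		.granu.addr_info = {
			.flags		= IOMMU_INV_ADDR_FLAGS_ARCHID |
					  IOMMU_INV_ADDR_FLAGS_LEAF,
			.archid		= guest_asid,	/* hypothetical */
			.addr		= iova,		/* hypothetical */
			.granule_size	= SZ_4K,
			.nb_granules	= 1,
		},
	};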

Signed-off-by: Eric Auger 

---

v15 -> v16:
- make sure the range is set (RIL guest) and check the granule
  size is supported by the physical IOMMU
- use cmd_with_sync

v14 -> v15:
- remove the redundant arm_smmu_cmdq_issue_sync(smmu)
  in IOMMU_INV_GRANU_ADDR case (Zenghui)
- if RIL is not supported by the host, make sure the granule_size
  that is passed by the userspace is supported or fix it
  (Chenxiang)

v13 -> v14:
- Add domain invalidation
- do global inval when asid is not provided with addr
  granularity

v7 -> v8:
- ASID based invalidation using iommu_inv_pasid_info
- check ARCHID/PASID flags in addr based invalidation
- use __arm_smmu_tlb_inv_context and __arm_smmu_tlb_inv_range_nosync

v6 -> v7
- check the uapi version

v3 -> v4:
- adapt to changes in the uapi
- add support for leaf parameter
- do not use arm_smmu_tlb_inv_range_nosync or arm_smmu_tlb_inv_context
  anymore

v2 -> v3:
- replace __arm_smmu_tlb_sync by arm_smmu_cmdq_issue_sync

v1 -> v2:
- properly pass the asid
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 82 +
 1 file changed, 82 insertions(+)

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index d5e722105624..e84a7c3e8730 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -2923,6 +2923,87 @@ static void arm_smmu_detach_pasid_table(struct iommu_domain *domain)
	mutex_unlock(&smmu_domain->init_mutex);
 }
 
+static int
+arm_smmu_cache_invalidate(struct iommu_domain *domain, struct device *dev,
+ struct iommu_cache_invalidate_info *inv_info)
+{
+   struct arm_smmu_cmdq_ent cmd = {.opcode = CMDQ_OP_TLBI_NSNH_ALL};
+   struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+   struct arm_smmu_device *smmu = smmu_domain->smmu;
+
+   if (smmu_domain->stage != ARM_SMMU_DOMAIN_NESTED)
+   return -EINVAL;
+
+   if (!smmu)
+   return -EINVAL;
+
+   if (inv_info->version != IOMMU_CACHE_INVALIDATE_INFO_VERSION_1)
+   return -EINVAL;
+
+   if (inv_info->cache & IOMMU_CACHE_INV_TYPE_PASID ||
+   inv_info->cache & IOMMU_CACHE_INV_TYPE_DEV_IOTLB) {
+   return -ENOENT;
+   }
+
+   if (!(inv_info->cache & IOMMU_CACHE_INV_TYPE_IOTLB))
+   return -EINVAL;
+
+   /* IOTLB invalidation */
+
+   switch (inv_info->granularity) {
+   case IOMMU_INV_GRANU_PASID:
+   {
+		struct iommu_inv_pasid_info *info =
+			&inv_info->granu.pasid_info;
+
+   if (info->flags & IOMMU_INV_ADDR_FLAGS_PASID)
+   return -ENOENT;
+   if (!(info->flags & IOMMU_INV_PASID_FLAGS_ARCHID))
+   return -EINVAL;
+
+   __arm_smmu_tlb_inv_context(smmu_domain, info->archid);
+   return 0;
+   }
+   case IOMMU_INV_GRANU_ADDR:
+   {
+		struct iommu_inv_addr_info *info = &inv_info->granu.addr_info;
+   uint64_t granule_size  = info->granule_size;
+   uint64_t size = info->nb_granules * info->granule_size;
+   bool leaf = info->flags & IOMMU_INV_ADDR_FLAGS_LEAF;
+   int tg;
+
+   if (info->flags & IOMMU_INV_ADDR_FLAGS_PASID)
+   return -ENOENT;
+
+   if (!(info->flags & IOMMU_INV_ADDR_FLAGS_ARCHID))
+   break;
+
+   tg = __ffs(granule_size);
+   if (!granule_size || granule_size & ~(1ULL << tg) ||
+   !(granule_size & smmu->pgsize_bitmap))
+   return -EINVAL;
+
+   /* range invalidation must be used */
+   if (!size)
+   return -EINVAL;
+
+   arm_smmu_tlb_inv_range_domain(info->addr, size,
+ granule_size, leaf,
+ info->archid, smmu_domain);
+   return 0;
+   }
+   case IOMMU_INV_GRANU_DOMAIN:
+   break;
+   default:
+   return -EINVAL;
+   }
+
+   /* Global S1 invalidation */
+   cmd.tlbi.vmid   = smmu_domain->s2_cfg.vmid;
+	arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd);
+   return 0;
+}
+
 static bool arm_smmu_dev_has_feature(struct device *dev,
 enum iommu_dev_features feat)
 {
@@ -3022,6 +3103,7 @@ static struct iommu_ops arm_smmu_ops = {
.put_resv_regions   = generic_iommu_put_resv_regions,
.attach_pasid_table = arm_smmu_attach_pasid_table,
.detach_pasid_table = arm_smmu_detach_pasid_table,
.cache_invalidate   = arm_smmu_cache_invalidate,

[RFC v16 6/9] iommu/smmuv3: Allow stage 1 invalidation with unmanaged ASIDs

2021-10-27 Thread Eric Auger
With nested stage support we will soon need to invalidate
S1 contexts and ranges tagged with an unmanaged ASID, the
latter being managed by the guest. Introduce two helpers
that allow invalidating with externally managed ASIDs.
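
A usage sketch (illustrative only; "guest_asid" stands for an ASID
extracted from a trapped guest invalidation command):

	/* invalidate the whole guest ASID context */
	__arm_smmu_tlb_inv_context(smmu_domain, guest_asid);

	/* invalidate a guest-tagged range; internal callers keep
	 * passing -1 to mean "use the host-managed ASID" */
	arm_smmu_tlb_inv_range_domain(iova, size, granule, leaf,
				      guest_asid, smmu_domain);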

Signed-off-by: Eric Auger 

---

v15 -> v16:
- Use arm_smmu_cmdq_issue_cmd_with_sync()

v14 -> v15:
- Always send CMDQ_OP_TLBI_NH_VA and do not test
  smmu_domain->smmu->features & ARM_SMMU_FEAT_E2H as the guest does
  not run in hyp mode atm (Zenghui).

v13 -> v14
- Actually send the NH_ASID command (reported by Xingang Wang)
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 41 -
 1 file changed, 32 insertions(+), 9 deletions(-)

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index bb2681581283..d5e722105624 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -1871,9 +1871,9 @@ int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, int ssid,
 }
 
 /* IO_PGTABLE API */
-static void arm_smmu_tlb_inv_context(void *cookie)
+static void __arm_smmu_tlb_inv_context(struct arm_smmu_domain *smmu_domain,
+  int ext_asid)
 {
-   struct arm_smmu_domain *smmu_domain = cookie;
struct arm_smmu_device *smmu = smmu_domain->smmu;
struct arm_smmu_cmdq_ent cmd;
 
@@ -1884,7 +1884,12 @@ static void arm_smmu_tlb_inv_context(void *cookie)
 * insertion to guarantee those are observed before the TLBI. Do be
 * careful, 007.
 */
-   if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
+   if (ext_asid >= 0) { /* guest stage 1 invalidation */
+   cmd.opcode  = CMDQ_OP_TLBI_NH_ASID;
+   cmd.tlbi.asid   = ext_asid;
+   cmd.tlbi.vmid   = smmu_domain->s2_cfg.vmid;
+		arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd);
+   } else if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
arm_smmu_tlb_inv_asid(smmu, smmu_domain->s1_cfg.cd.asid);
} else {
cmd.opcode  = CMDQ_OP_TLBI_S12_VMALL;
@@ -1894,6 +1899,13 @@ static void arm_smmu_tlb_inv_context(void *cookie)
arm_smmu_atc_inv_domain(smmu_domain, 0, 0, 0);
 }
 
+static void arm_smmu_tlb_inv_context(void *cookie)
+{
+   struct arm_smmu_domain *smmu_domain = cookie;
+
+   __arm_smmu_tlb_inv_context(smmu_domain, -1);
+}
+
 static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd,
 unsigned long iova, size_t size,
 size_t granule,
@@ -1955,9 +1967,10 @@ static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd,
	arm_smmu_cmdq_batch_submit(smmu, &cmds);
 }
 
-static void arm_smmu_tlb_inv_range_domain(unsigned long iova, size_t size,
- size_t granule, bool leaf,
- struct arm_smmu_domain *smmu_domain)
+static void
+arm_smmu_tlb_inv_range_domain(unsigned long iova, size_t size,
+ size_t granule, bool leaf, int ext_asid,
+ struct arm_smmu_domain *smmu_domain)
 {
struct arm_smmu_cmdq_ent cmd = {
.tlbi = {
@@ -1965,7 +1978,16 @@ static void arm_smmu_tlb_inv_range_domain(unsigned long iova, size_t size,
},
};
 
-   if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
+   if (ext_asid >= 0) {  /* guest stage 1 invalidation */
+   /*
+* At the moment the guest only uses NS-EL1, to be
+* revisited when nested virt gets supported with E2H
+* exposed.
+*/
+   cmd.opcode  = CMDQ_OP_TLBI_NH_VA;
+   cmd.tlbi.asid   = ext_asid;
+   cmd.tlbi.vmid   = smmu_domain->s2_cfg.vmid;
+   } else if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
		cmd.opcode	= smmu_domain->smmu->features & ARM_SMMU_FEAT_E2H ?
				  CMDQ_OP_TLBI_EL2_VA : CMDQ_OP_TLBI_NH_VA;
cmd.tlbi.asid   = smmu_domain->s1_cfg.cd.asid;
@@ -1973,6 +1995,7 @@ static void arm_smmu_tlb_inv_range_domain(unsigned long iova, size_t size,
cmd.opcode  = CMDQ_OP_TLBI_S2_IPA;
cmd.tlbi.vmid   = smmu_domain->s2_cfg.vmid;
}
+
	__arm_smmu_tlb_inv_range(&cmd, iova, size, granule, smmu_domain);
 
/*
@@ -2011,7 +2034,7 @@ static void arm_smmu_tlb_inv_page_nosync(struct iommu_iotlb_gather *gather,
 static void arm_smmu_tlb_inv_walk(unsigned long iova, size_t size,
  size_t granule, void *cookie)
 {
-   arm_smmu_tlb_inv_range_domain(iova, size, granule, false, cookie);
+   arm_smmu_tlb_inv_range_domain(iova, size, granule, false, -1, cookie);
 }
 
 static const struct iommu_flush_ops arm_smmu_flush_ops = {

[RFC v16 5/9] iommu/smmuv3: Implement attach/detach_pasid_table

2021-10-27 Thread Eric Auger
On attach_pasid_table() we program the stage 1 related STE info set
by the guest into the actual physical STEs. At minimum
we need to program the context descriptor GPA and compute
whether the stage 1 is translated/bypassed or aborted.

On detach, both the stage 1 config and the abort flag are unset.
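
For illustration, the kind of configuration this hook receives (a
sketch; "cd_table_gpa" is a hypothetical guest CD table address
trapped by the VMM):

	struct iommu_pasid_table_config cfg = {
		.argsz		= sizeof(cfg),
		.version	= PASID_TABLE_CFG_VERSION_1,
		.format		= IOMMU_PASID_FORMAT_SMMUV3,
		.base_ptr	= cd_table_gpa,		/* hypothetical */
		.config		= IOMMU_PASID_CONFIG_TRANSLATE,
		.pasid_bits	= 0,			/* single CD only */
		.vendor_data.smmuv3.version = PASID_TABLE_SMMUV3_CFG_VERSION_1,
	};

	ret = iommu_attach_pasid_table(domain, &cfg);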

Signed-off-by: Eric Auger 

---
v14 -> v15:
- add a comment before arm_smmu_get_cd_ptr to warn the
  developer this function must not be used in case of nested
  (Keqian)

v13 -> v14:
- on PASID table detach, reset the abort flag (Keqian)

v7 -> v8:
- remove smmu->features check, now done on domain finalize

v6 -> v7:
- check versions and comment the fact we don't need to take
  into account s1dss and s1fmt
v3 -> v4:
- adapt to changes in iommu_pasid_table_config
- different programming convention at s1_cfg/s2_cfg/ste.abort

v2 -> v3:
- callback now is named set_pasid_table and struct fields
  are laid out differently.

v1 -> v2:
- invalidate the STE before changing them
- hold init_mutex
- handle new fields
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 93 +
 1 file changed, 93 insertions(+)

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 5e0917e1226b..bb2681581283 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -1004,6 +1004,10 @@ static void arm_smmu_write_cd_l1_desc(__le64 *dst,
WRITE_ONCE(*dst, cpu_to_le64(val));
 }
 
+/*
+ * Must not be used in case of nested mode where the CD table is owned
+ * by the guest
+ */
 static __le64 *arm_smmu_get_cd_ptr(struct arm_smmu_domain *smmu_domain,
   u32 ssid)
 {
@@ -2809,6 +2813,93 @@ static void arm_smmu_get_resv_regions(struct device *dev,
iommu_dma_get_resv_regions(dev, head);
 }
 
+static int arm_smmu_attach_pasid_table(struct iommu_domain *domain,
+  struct iommu_pasid_table_config *cfg)
+{
+   struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+   struct arm_smmu_master *master;
+   struct arm_smmu_device *smmu;
+   unsigned long flags;
+   int ret = -EINVAL;
+
+   if (cfg->format != IOMMU_PASID_FORMAT_SMMUV3)
+   return -EINVAL;
+
+   if (cfg->version != PASID_TABLE_CFG_VERSION_1 ||
+   cfg->vendor_data.smmuv3.version != PASID_TABLE_SMMUV3_CFG_VERSION_1)
+   return -EINVAL;
+
+	mutex_lock(&smmu_domain->init_mutex);
+
+   smmu = smmu_domain->smmu;
+
+   if (!smmu)
+   goto out;
+
+   if (smmu_domain->stage != ARM_SMMU_DOMAIN_NESTED)
+   goto out;
+
+   switch (cfg->config) {
+   case IOMMU_PASID_CONFIG_ABORT:
+   smmu_domain->s1_cfg.set = false;
+   smmu_domain->abort = true;
+   break;
+   case IOMMU_PASID_CONFIG_BYPASS:
+   smmu_domain->s1_cfg.set = false;
+   smmu_domain->abort = false;
+   break;
+   case IOMMU_PASID_CONFIG_TRANSLATE:
+   /* we do not support S1 <-> S1 transitions */
+   if (smmu_domain->s1_cfg.set)
+   goto out;
+
+   /*
+* we currently support a single CD so s1fmt and s1dss
+* fields are also ignored
+*/
+   if (cfg->pasid_bits)
+   goto out;
+
+   smmu_domain->s1_cfg.cdcfg.cdtab_dma = cfg->base_ptr;
+   smmu_domain->s1_cfg.set = true;
+   smmu_domain->abort = false;
+   break;
+   default:
+   goto out;
+   }
+	spin_lock_irqsave(&smmu_domain->devices_lock, flags);
+	list_for_each_entry(master, &smmu_domain->devices, domain_head)
+		arm_smmu_install_ste_for_dev(master);
+	spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
+   ret = 0;
+out:
+	mutex_unlock(&smmu_domain->init_mutex);
+   return ret;
+}
+
+static void arm_smmu_detach_pasid_table(struct iommu_domain *domain)
+{
+   struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+   struct arm_smmu_master *master;
+   unsigned long flags;
+
+	mutex_lock(&smmu_domain->init_mutex);
+
+   if (smmu_domain->stage != ARM_SMMU_DOMAIN_NESTED)
+   goto unlock;
+
+   smmu_domain->s1_cfg.set = false;
+   smmu_domain->abort = false;
+
+	spin_lock_irqsave(&smmu_domain->devices_lock, flags);
+	list_for_each_entry(master, &smmu_domain->devices, domain_head)
+		arm_smmu_install_ste_for_dev(master);
+	spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
+
+unlock:
+	mutex_unlock(&smmu_domain->init_mutex);
+}
+
 static bool arm_smmu_dev_has_feature(struct device *dev,
 enum iommu_dev_features feat)
 {
@@ -2906,6 +2997,8 @@ static struct iommu_ops arm_smmu_ops = {

[RFC v16 4/9] iommu/smmuv3: Get prepared for nested stage support

2021-10-27 Thread Eric Auger
When nested stage translation is setup, both s1_cfg and
s2_cfg are set.

We introduce a new smmu_domain abort field that will be set
upon guest stage1 configuration passing. If no guest stage1
config has been attached, it is ignored when writing the STE.

arm_smmu_write_strtab_ent() is modified to write both stage
fields in the STE and deal with the abort field.

In nested mode, only stage 2 is "finalized" as the host does
not own/configure the stage 1 context descriptor; the guest does.
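
The resulting STE configuration can be summarized as follows (a
restatement of the patch logic, for the reader's convenience):

	!smmu_domain && disable_bypass, or smmu_domain->abort -> CFG_ABORT
	no abort, neither s1_cfg->set nor s2_cfg->set         -> CFG_BYPASS
	only s1_cfg->set                                      -> S1 translate
	only s2_cfg->set                                      -> S2 translate
	both set                                              -> nested (S1 + S2)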

Signed-off-by: Eric Auger 

---
v13 -> v14:
- removed BUG_ON(ste_live && !nested) as this should never happen
- restored the old comment as there is always an abort in between
  S2 -> S1 + S2 and S1 + S2 -> S2
- remove sparse warning

v10 -> v11:
- Fix an issue reported by Shameer when switching from a config with
  vSMMU to one without vSMMU. Although the spec does not seem to
  mention it, it appears necessary to reset the two high 64b words
  when switching from an S1+S2 config to S1 only. In particular
  dst[3] (S2TTB) needs to be reset; on some implementations, if the
  S2TTB is not reset, this causes a C_BAD_STE error.
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 55 ++---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h |  2 +
 2 files changed, 49 insertions(+), 8 deletions(-)

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index b8384a834552..5e0917e1226b 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -1252,7 +1252,8 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid,
 * 3. Update Config, sync
 */
u64 val = le64_to_cpu(dst[0]);
-   bool ste_live = false;
+   bool s1_live = false, s2_live = false, ste_live;
+   bool abort, translate = false;
struct arm_smmu_device *smmu = NULL;
struct arm_smmu_s1_cfg *s1_cfg;
struct arm_smmu_s2_cfg *s2_cfg;
@@ -1292,6 +1293,7 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid,
default:
break;
}
+   translate = s1_cfg->set || s2_cfg->set;
}
 
if (val & STRTAB_STE_0_V) {
@@ -1299,23 +1301,36 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid,
case STRTAB_STE_0_CFG_BYPASS:
break;
case STRTAB_STE_0_CFG_S1_TRANS:
+   s1_live = true;
+   break;
case STRTAB_STE_0_CFG_S2_TRANS:
-   ste_live = true;
+   s2_live = true;
+   break;
+   case STRTAB_STE_0_CFG_NESTED:
+   s1_live = true;
+   s2_live = true;
break;
case STRTAB_STE_0_CFG_ABORT:
-   BUG_ON(!disable_bypass);
break;
default:
BUG(); /* STE corruption */
}
}
 
+   ste_live = s1_live || s2_live;
+
/* Nuke the existing STE_0 value, as we're going to rewrite it */
val = STRTAB_STE_0_V;
 
/* Bypass/fault */
-   if (!smmu_domain || !(s1_cfg->set || s2_cfg->set)) {
-   if (!smmu_domain && disable_bypass)
+
+   if (!smmu_domain)
+   abort = disable_bypass;
+   else
+   abort = smmu_domain->abort;
+
+   if (abort || !translate) {
+   if (abort)
			val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_ABORT);
		else
			val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_BYPASS);
@@ -1333,11 +1348,17 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid,
return;
}
 
+   if (ste_live) {
+   /* First invalidate the live STE */
+   dst[0] = cpu_to_le64(STRTAB_STE_0_CFG_ABORT);
+   arm_smmu_sync_ste_for_sid(smmu, sid);
+   }
+
if (s1_cfg->set) {
u64 strw = smmu->features & ARM_SMMU_FEAT_E2H ?
STRTAB_STE_1_STRW_EL2 : STRTAB_STE_1_STRW_NSEL1;
 
-   BUG_ON(ste_live);
+   BUG_ON(s1_live);
dst[1] = cpu_to_le64(
			 FIELD_PREP(STRTAB_STE_1_S1DSS, STRTAB_STE_1_S1DSS_SSID0) |
			 FIELD_PREP(STRTAB_STE_1_S1CIR, STRTAB_STE_1_S1C_CACHE_WBRA) |
@@ -1356,7 +1377,14 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid,
}
 
if (s2_cfg->set) {
-   BUG_ON(ste_live);
+   u64 vttbr = s2_cfg->vttbr & STRTAB_STE_3_S2TTB_MASK;
+
+   if (s2_live) {
+			u64 s2ttb = le64_to_cpu(dst[3]) & STRTAB_STE_3_S2TTB_MASK;
+
+   

[RFC v16 3/9] iommu/smmuv3: Allow s1 and s2 configs to coexist

2021-10-27 Thread Eric Auger
In true nested mode, both s1_cfg and s2_cfg will coexist.
Let's remove the union and add a "set" field in each
config structure telling whether the config is set and needs
to be applied when writing the STE. In legacy nested mode,
only the second stage is used. In true nested mode, both stages
are used and the S1 config is "set" when the guest passes
its pasid table.

No functional change intended.
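
Concretely, the domain now carries both configs side by side; a
sketch of the resulting layout (abridged):

	struct arm_smmu_domain {
		...
		struct arm_smmu_s1_cfg	s1_cfg;	/* .set when stage 1 applies */
		struct arm_smmu_s2_cfg	s2_cfg;	/* .set when stage 2 applies */
		...
	};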

Signed-off-by: Eric Auger 

---

v13 -> v14:
- slight reword of the commit message

v12 -> v13:
- does not dynamically allocate s1_cfg and s2_cfg anymore. Add
  the set field
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 43 +
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h |  8 ++--
 2 files changed, 31 insertions(+), 20 deletions(-)

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 61477853a536..b8384a834552 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -1254,8 +1254,8 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid,
u64 val = le64_to_cpu(dst[0]);
bool ste_live = false;
struct arm_smmu_device *smmu = NULL;
-   struct arm_smmu_s1_cfg *s1_cfg = NULL;
-   struct arm_smmu_s2_cfg *s2_cfg = NULL;
+   struct arm_smmu_s1_cfg *s1_cfg;
+   struct arm_smmu_s2_cfg *s2_cfg;
struct arm_smmu_domain *smmu_domain = NULL;
struct arm_smmu_cmdq_ent prefetch_cmd = {
.opcode = CMDQ_OP_PREFETCH_CFG,
@@ -1270,13 +1270,24 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid,
}
 
if (smmu_domain) {
+	s1_cfg = &smmu_domain->s1_cfg;
+	s2_cfg = &smmu_domain->s2_cfg;
+
switch (smmu_domain->stage) {
case ARM_SMMU_DOMAIN_S1:
-		s1_cfg = &smmu_domain->s1_cfg;
+   s1_cfg->set = true;
+   s2_cfg->set = false;
break;
case ARM_SMMU_DOMAIN_S2:
+   s1_cfg->set = false;
+   s2_cfg->set = true;
+   break;
case ARM_SMMU_DOMAIN_NESTED:
-		s2_cfg = &smmu_domain->s2_cfg;
+   /*
+* Actual usage of stage 1 depends on nested mode:
+* legacy (2d stage only) or true nested mode
+*/
+   s2_cfg->set = true;
break;
default:
break;
@@ -1303,7 +1314,7 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid,
val = STRTAB_STE_0_V;
 
/* Bypass/fault */
-   if (!smmu_domain || !(s1_cfg || s2_cfg)) {
+   if (!smmu_domain || !(s1_cfg->set || s2_cfg->set)) {
if (!smmu_domain && disable_bypass)
			val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_ABORT);
else
@@ -1322,7 +1333,7 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid,
return;
}
 
-   if (s1_cfg) {
+   if (s1_cfg->set) {
u64 strw = smmu->features & ARM_SMMU_FEAT_E2H ?
STRTAB_STE_1_STRW_EL2 : STRTAB_STE_1_STRW_NSEL1;
 
@@ -1344,7 +1355,7 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid,
FIELD_PREP(STRTAB_STE_0_S1FMT, s1_cfg->s1fmt);
}
 
-   if (s2_cfg) {
+   if (s2_cfg->set) {
BUG_ON(ste_live);
dst[2] = cpu_to_le64(
 FIELD_PREP(STRTAB_STE_2_S2VMID, s2_cfg->vmid) |
@@ -2036,23 +2047,23 @@ static void arm_smmu_domain_free(struct iommu_domain *domain)
 {
struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
struct arm_smmu_device *smmu = smmu_domain->smmu;
+	struct arm_smmu_s1_cfg *s1_cfg = &smmu_domain->s1_cfg;
+	struct arm_smmu_s2_cfg *s2_cfg = &smmu_domain->s2_cfg;
 
free_io_pgtable_ops(smmu_domain->pgtbl_ops);
 
/* Free the CD and ASID, if we allocated them */
-   if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
-		struct arm_smmu_s1_cfg *cfg = &smmu_domain->s1_cfg;
-
+   if (s1_cfg->set) {
/* Prevent SVA from touching the CD while we're freeing it */
mutex_lock(_smmu_asid_lock);
-   if (cfg->cdcfg.cdtab)
+   if (s1_cfg->cdcfg.cdtab)
arm_smmu_free_cd_tables(smmu_domain);
-		arm_smmu_free_asid(&cfg->cd);
+		arm_smmu_free_asid(&s1_cfg->cd);
mutex_unlock(_smmu_asid_lock);
-   } else {
-		struct arm_smmu_s2_cfg *cfg = &smmu_domain->s2_cfg;
-   

[RFC v16 2/9] iommu: Introduce iommu_get_nesting

2021-10-27 Thread Eric Auger
Add iommu_get_nesting(), which allows retrieving whether a domain
uses nested stages.
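
A hedged usage sketch (hypothetical caller, e.g. a passthrough layer
deciding whether the guest owns stage 1):

	if (iommu_get_nesting(domain)) {
		/* stage 1 is guest-owned: use the PASID table attach path */
	} else {
		/* single stage domain: regular map/unmap */
	}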

Signed-off-by: Eric Auger 
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c |  8 
 drivers/iommu/arm/arm-smmu/arm-smmu.c   |  8 
 drivers/iommu/intel/iommu.c | 13 +
 drivers/iommu/iommu.c   | 10 ++
 include/linux/iommu.h   |  8 
 5 files changed, 47 insertions(+)

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index a388e318f86e..61477853a536 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -2731,6 +2731,13 @@ static int arm_smmu_enable_nesting(struct iommu_domain *domain)
return ret;
 }
 
+static bool arm_smmu_get_nesting(struct iommu_domain *domain)
+{
+   struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+
+   return smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED;
+}
+
 static int arm_smmu_of_xlate(struct device *dev, struct of_phandle_args *args)
 {
return iommu_fwspec_add_ids(dev, args->args, 1);
@@ -2845,6 +2852,7 @@ static struct iommu_ops arm_smmu_ops = {
.release_device = arm_smmu_release_device,
.device_group   = arm_smmu_device_group,
.enable_nesting = arm_smmu_enable_nesting,
+   .get_nesting= arm_smmu_get_nesting,
.of_xlate   = arm_smmu_of_xlate,
.get_resv_regions   = arm_smmu_get_resv_regions,
.put_resv_regions   = generic_iommu_put_resv_regions,
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c
index 4bc75c4ce402..167cf1d51279 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
@@ -1522,6 +1522,13 @@ static int arm_smmu_enable_nesting(struct iommu_domain *domain)
return ret;
 }
 
+static bool arm_smmu_get_nesting(struct iommu_domain *domain)
+{
+   struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+
+   return smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED;
+}
+
 static int arm_smmu_set_pgtable_quirks(struct iommu_domain *domain,
unsigned long quirks)
 {
@@ -1595,6 +1602,7 @@ static struct iommu_ops arm_smmu_ops = {
.probe_finalize = arm_smmu_probe_finalize,
.device_group   = arm_smmu_device_group,
.enable_nesting = arm_smmu_enable_nesting,
+   .get_nesting= arm_smmu_get_nesting,
.set_pgtable_quirks = arm_smmu_set_pgtable_quirks,
.of_xlate   = arm_smmu_of_xlate,
.get_resv_regions   = arm_smmu_get_resv_regions,
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index d75f59ae28e6..e42767bd47f9 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -5524,6 +5524,18 @@ intel_iommu_enable_nesting(struct iommu_domain *domain)
return ret;
 }
 
+static bool intel_iommu_get_nesting(struct iommu_domain *domain)
+{
+   struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+	unsigned long flags;
+	bool nesting;
+
+	spin_lock_irqsave(&device_domain_lock, flags);
+	nesting = dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE &&
+		  !(dmar_domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL);
+	spin_unlock_irqrestore(&device_domain_lock, flags);
+   return nesting;
+}
+
 /*
  * Check that the device does not live on an external facing PCI port that is
  * marked as untrusted. Such devices should not be able to apply quirks and
@@ -5561,6 +5573,7 @@ const struct iommu_ops intel_iommu_ops = {
.domain_alloc   = intel_iommu_domain_alloc,
.domain_free= intel_iommu_domain_free,
.enable_nesting = intel_iommu_enable_nesting,
+   .get_nesting= intel_iommu_get_nesting,
.attach_dev = intel_iommu_attach_device,
.detach_dev = intel_iommu_detach_device,
.aux_attach_dev = intel_iommu_aux_attach_device,
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 6033c263c6e6..3e639c4e8015 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2843,6 +2843,16 @@ int iommu_enable_nesting(struct iommu_domain *domain)
 }
 EXPORT_SYMBOL_GPL(iommu_enable_nesting);
 
+bool iommu_get_nesting(struct iommu_domain *domain)
+{
+   if (domain->type != IOMMU_DOMAIN_UNMANAGED)
+   return false;
+	if (!domain->ops->get_nesting)
+		return false;
+   return domain->ops->get_nesting(domain);
+}
+EXPORT_SYMBOL_GPL(iommu_get_nesting);
+
 int iommu_set_pgtable_quirks(struct iommu_domain *domain,
unsigned long quirk)
 {
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index e34a1b1c805b..846e19151f40 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h

[RFC v16 1/9] iommu: Introduce attach/detach_pasid_table API

2021-10-27 Thread Eric Auger
In the virtualization use case, when a guest is assigned
a PCI host device, protected by a virtual IOMMU on the guest,
the physical IOMMU must be programmed to be consistent with
the guest mappings. If the physical IOMMU supports two
translation stages it makes sense to program guest mappings
onto the first stage/level (ARM/Intel terminology) while the host
owns the stage/level 2.

In that case, it is mandated to trap on guest configuration
settings and pass those to the physical iommu driver.

This patch adds a new API to the iommu subsystem that allows
setting/unsetting the PASID table information.

A generic iommu_pasid_table_config struct is introduced in
a new iommu.h uapi header. This is going to be used by the VFIO
user API.
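
As an illustration of the intended consumer (a sketch; the function
name and ioctl plumbing are hypothetical, only
iommu_uapi_attach_pasid_table() below is real):

	static int vfio_iommu_set_pasid_table(struct iommu_domain *domain,
					      void __user *uarg)
	{
		/* validates argsz/version and copies the config, see below */
		return iommu_uapi_attach_pasid_table(domain, uarg);
	}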

Signed-off-by: Jean-Philippe Brucker 
Signed-off-by: Liu, Yi L 
Signed-off-by: Ashok Raj 
Signed-off-by: Jacob Pan 
Signed-off-by: Eric Auger 

---

v13 -> v14:
- export iommu_attach_pasid_table
- add dummy iommu_uapi_attach_pasid_table
- swap base_ptr and format in iommu_pasid_table_config

v12 -> v13:
- Fix config check

v11 -> v12:
- add argsz, name the union
---
 drivers/iommu/iommu.c  | 69 ++
 include/linux/iommu.h  | 27 +++
 include/uapi/linux/iommu.h | 54 +
 3 files changed, 150 insertions(+)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 3303d707bab4..6033c263c6e6 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2236,6 +2236,75 @@ int iommu_uapi_sva_unbind_gpasid(struct iommu_domain *domain, struct device *dev
 }
 EXPORT_SYMBOL_GPL(iommu_uapi_sva_unbind_gpasid);
 
+int iommu_attach_pasid_table(struct iommu_domain *domain,
+struct iommu_pasid_table_config *cfg)
+{
+   if (unlikely(!domain->ops->attach_pasid_table))
+   return -ENODEV;
+
+   return domain->ops->attach_pasid_table(domain, cfg);
+}
+EXPORT_SYMBOL_GPL(iommu_attach_pasid_table);
+
+int iommu_uapi_attach_pasid_table(struct iommu_domain *domain,
+ void __user *uinfo)
+{
+   struct iommu_pasid_table_config pasid_table_data = { 0 };
+   u32 minsz;
+
+   if (unlikely(!domain->ops->attach_pasid_table))
+   return -ENODEV;
+
+   /*
+* No new spaces can be added before the variable sized union, the
+* minimum size is the offset to the union.
+*/
+   minsz = offsetof(struct iommu_pasid_table_config, vendor_data);
+
+   /* Copy minsz from user to get flags and argsz */
+	if (copy_from_user(&pasid_table_data, uinfo, minsz))
+   return -EFAULT;
+
+   /* Fields before the variable size union are mandatory */
+   if (pasid_table_data.argsz < minsz)
+   return -EINVAL;
+
+   /* PASID and address granu require additional info beyond minsz */
+   if (pasid_table_data.version != PASID_TABLE_CFG_VERSION_1)
+   return -EINVAL;
+   if (pasid_table_data.format == IOMMU_PASID_FORMAT_SMMUV3 &&
+	    pasid_table_data.argsz <
+	    offsetofend(struct iommu_pasid_table_config, vendor_data.smmuv3))
+   return -EINVAL;
+
+   /*
+* User might be using a newer UAPI header which has a larger data
+* size, we shall support the existing flags within the current
+* size. Copy the remaining user data _after_ minsz but not more
+* than the current kernel supported size.
+*/
+	if (copy_from_user((void *)&pasid_table_data + minsz, uinfo + minsz,
+			   min_t(u32, pasid_table_data.argsz,
+				 sizeof(pasid_table_data)) - minsz))
+   return -EFAULT;
+
+   /* Now the argsz is validated, check the content */
+   if (pasid_table_data.config < IOMMU_PASID_CONFIG_TRANSLATE ||
+   pasid_table_data.config > IOMMU_PASID_CONFIG_ABORT)
+   return -EINVAL;
+
+	return domain->ops->attach_pasid_table(domain, &pasid_table_data);
+}
+EXPORT_SYMBOL_GPL(iommu_uapi_attach_pasid_table);
+
+void iommu_detach_pasid_table(struct iommu_domain *domain)
+{
+   if (unlikely(!domain->ops->detach_pasid_table))
+   return;
+
+   domain->ops->detach_pasid_table(domain);
+}
+EXPORT_SYMBOL_GPL(iommu_detach_pasid_table);
+
 static void __iommu_detach_device(struct iommu_domain *domain,
  struct device *dev)
 {
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index d2f3435e7d17..e34a1b1c805b 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -232,6 +232,8 @@ struct iommu_iotlb_gather {
  * @cache_invalidate: invalidate translation caches
  * @sva_bind_gpasid: bind guest pasid and mm
  * @sva_unbind_gpasid: unbind guest pasid and mm
+ * @attach_pasid_table: attach a pasid table
+ * @detach_pasid_table: detach the pasid table
  * @def_domain_type: device default domain type, return value:
  *

[RFC v16 0/9] SMMUv3 Nested Stage Setup (IOMMU part)

2021-10-27 Thread Eric Auger
This series brings the IOMMU part of HW nested paging support
in the SMMUv3.

The SMMUv3 driver is adapted to support 2 nested stages.

The IOMMU API is extended to convey the guest stage 1
configuration and the hook is implemented in the SMMUv3 driver.

This allows the guest to own the stage 1 tables and context
descriptors (so-called PASID table) while the host owns the
stage 2 tables and main configuration structures (STE).

This work is mainly provided for test purposes, as the upper
layer integration is under rework and bound to be based on
/dev/iommu instead of VFIO tunneling. In this version we also get
rid of the MSI BINDING ioctl, assuming the guest enforces
flat mapping of host IOVAs used to bind physical MSI doorbells.
In the current QEMU integration this is achieved by exposing
RMRs to the guest, using Shameer's series [1]. This approach
is RFC as the IORT spec is not really meant to do that
(single mapping flag limitation).

Best Regards

Eric

This series (Host) can be found at:
https://github.com/eauger/linux/tree/v5.15-rc7-nested-v16
This includes a rebased VFIO integration (although not meant
to be upstreamed)

Guest kernel branch can be found at:
https://github.com/eauger/linux/tree/shameer_rmrr_v7
featuring [1]

QEMU integration (still based on VFIO and exposing RMRs)
can be found at:
https://github.com/eauger/qemu/tree/v6.1.0-rmr-v2-nested_smmuv3_v10
(use iommu=nested-smmuv3 ARM virt option)

Guest dependency:
[1] [PATCH v7 0/9] ACPI/IORT: Support for IORT RMR node

History:

v15 -> v16:
- guest must support RIL (range invalidation)
- additional checks in the cache invalidation hook
- removal of the MSI BINDING ioctl (tentative replacement
  by RMRs)


Eric Auger (9):
  iommu: Introduce attach/detach_pasid_table API
  iommu: Introduce iommu_get_nesting
  iommu/smmuv3: Allow s1 and s2 configs to coexist
  iommu/smmuv3: Get prepared for nested stage support
  iommu/smmuv3: Implement attach/detach_pasid_table
  iommu/smmuv3: Allow stage 1 invalidation with unmanaged ASIDs
  iommu/smmuv3: Implement cache_invalidate
  iommu/smmuv3: report additional recoverable faults
  iommu/smmuv3: Disallow nested mode in presence of HW MSI regions

 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 383 ++--
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h |  14 +-
 drivers/iommu/arm/arm-smmu/arm-smmu.c   |   8 +
 drivers/iommu/intel/iommu.c |  13 +
 drivers/iommu/iommu.c   |  79 
 include/linux/iommu.h   |  35 ++
 include/uapi/linux/iommu.h  |  54 +++
 7 files changed, 548 insertions(+), 38 deletions(-)

-- 
2.26.3



Re: [PATCH v7 0/9] ACPI/IORT: Support for IORT RMR node

2021-09-30 Thread Eric Auger
Hi Shameer,

On 8/5/21 10:07 AM, Shameer Kolothum wrote:
> Hi,
>
> The series adds support to IORT RMR nodes specified in IORT
> Revision E.b -ARM DEN 0049E[0]. RMR nodes are used to describe
> memory ranges that are used by endpoints and require a unity
> mapping in SMMU.

I used your series and RMRs to force a guest iommu (vSMMUv3 nested stage
use case) to have a flat mapping for IOVAs within [0x8000000, 0x8100000]
(matching MSI_IOVA_BASE and MSI_IOVA_LENGTH) used by the host to map MSI
physical doorbells.

That way when an assigned device protected by a vSMMUv3 implemented upon
nested stage issues an MSI transaction, let's say using IOVA=0x8000000,
we would get:
                    S1 (guest)           S2 (host)
0x8000000    ->     0x8000000     ->     Physical DB

This method was suggested by Jean-Philippe (added in CC) and it
simplifies the nested stage integration because we don't have to care
about nested stage MSI bindings.
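
For reference, the host window being flat-mapped is the SW MSI
reserved region exposed by the SMMU driver (the constants are those
of drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c; the call below is a
sketch of what the driver's get_resv_regions callback does):

	#define MSI_IOVA_BASE	0x8000000
	#define MSI_IOVA_LENGTH	0x100000

	region = iommu_alloc_resv_region(MSI_IOVA_BASE, MSI_IOVA_LENGTH,
					 IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO,
					 IOMMU_RESV_SW_MSI);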

However if I understand correctly we cannot define a range of SIDs using
the same RMR (due to the single mapping bit which must be set, Table 5
flags format). This is a spec restriction and not an issue with your series.

As VFIO devices can be hot-plugged we thus need to create as many RMR
nodes as potential BDFs, leading to 256 * 6 = 1536 RMR nodes if you have
5 PCIe root ports, as is usual in VMs. This then causes some trouble at
the QEMU level, for instance wrt migration. See "[RFC]
hw/arm/virt-acpi-build: Add IORT RMR regions to handle MSI nested binding".

Do you know if there is a plan to remove the single mapping limitation
in the spec?

Thanks

Eric
>
> We have faced issues with 3408iMR RAID controller cards which
> fail to boot when SMMU is enabled. This is because these
> controllers make use of host memory for various caching related
> purposes and when SMMU is enabled the iMR firmware fails to 
> access these memory regions as there is no mapping for them.
> IORT RMR provides a way for UEFI to describe and report these
> memory regions so that the kernel can make a unity mapping for
> these in SMMU.
>
> Change History: 
>
> v6 --> v7
>
> The only change from v6 is the fix pointed out by Steve to
> the SMMUv2 SMR bypass install in patch #8.
>
> Thanks to the Tested-by tags by Laurentiu with SMMUv2 and
> Hanjun/Huiqiang with SMMUv3 for v6. I haven't added the tags
> yet as the series still needs more review[1].
>
> Feedback and tests on this series is very much appreciated.
>
> v5 --> v6
> - Addressed comments from Robin & Lorenzo.
>   : Moved iort_parse_rmr() to acpi_iort_init() from
> iort_init_platform_devices().
>   : Removed use of struct iort_rmr_entry during the initial
> parse. Using struct iommu_resv_region instead.
>   : Report RMR address alignment and overlap errors, but continue.
>   : Reworked arm_smmu_init_bypass_stes() (patch # 6).
> - Updated SMMUv2 bypass SMR code. Thanks to Jon N (patch #8).
> - Set IOMMU protection flags(IOMMU_CACHE, IOMMU_MMIO) based
>   on Type of RMR region. Suggested by Jon N.
>
> Thanks,
> Shameer
> [0] https://developer.arm.com/documentation/den0049/latest/
> [1] 
> https://lore.kernel.org/linux-acpi/20210716083442.1708-1-shameerali.kolothum.th...@huawei.com/T/#m043c95b869973a834b2fd57f3e1ed0325c84f3b7
> --
> v4 --> v5
>  -Added a fw_data union to struct iommu_resv_region and removed
>   struct iommu_rmr (Based on comments from Joerg/Robin).
>  -Added iommu_put_rmrs() to release mem.
>  -Thanks to Steve for verifying on SMMUv2, but not added the Tested-by
>   yet because of the above changes.
>
> v3 -->v4
> -Included the SMMUv2 SMR bypass install changes suggested by
>  Steve(patch #7)
> -As per Robin's comments, RMR reserve implementation is now
>  more generic  (patch #8) and dropped v3 patches 8 and 10.
> -Rebase to 5.13-rc1
>
> RFC v2 --> v3
>  -Dropped RFC tag as the ACPICA header changes are now ready to be
>   part of 5.13[0]. But this series still has a dependency on that patch.
>  -Added IORT E.b related changes(node flags, _DSM function 5 checks for
>   PCIe).
>  -Changed RMR to stream id mapping from M:N to M:1 as per the spec and
>   discussion here[1].
>  -Last two patches add support for SMMUv2(Thanks to Jon Nettleton!)
> --
>
> Jon Nettleton (1):
>   iommu/arm-smmu: Get associated RMR info and install bypass SMR
>
> Shameer Kolothum (8):
>   iommu: Introduce a union to struct iommu_resv_region
>   ACPI/IORT: Add support for RMR node parsing
>   iommu/dma: Introduce generic helper to retrieve RMR info
>   ACPI/IORT: Add a helper to retrieve RMR memory regions
>   iommu/arm-smmu-v3: Introduce strtab init helper
>   iommu/arm-smmu-v3: Refactor arm_smmu_init_bypass_stes() to force
> bypass
>   iommu/arm-smmu-v3: Get associated RMR info and install bypass STE
>   iommu/dma: Reserve any RMR regions associated with a dev
>
>  drivers/acpi/arm64/iort.c   | 172 +++-
>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c |  76 +++--
>  drivers/iommu/arm/arm-smmu/arm-smmu.c   

Re: [PATCH v15 00/12] SMMUv3 Nested Stage Setup (IOMMU part)

2021-09-28 Thread Eric Auger
Hi Krishna,

On 9/27/21 11:17 PM, Krishna Reddy wrote:
> Hi Eric,
>> This is based on Jean-Philippe's
>> [PATCH v14 00/10] iommu: I/O page faults for SMMUv3
>> https://www.spinics.net/lists/arm-kernel/msg886518.html
>> (including the patches that were not pulled for 5.13)
>>
> Jean's patches have been merged to v5.14.
> Do you anticipate IOMMU/VFIO part patches getting into upstream kernel soon?

I am going to respin the smmu part rebased on v5.15. As for the VFIO
part, this needs to be totally redesigned based on /dev/iommu (see
[RFC 00/20] Introduce /dev/iommu for userspace I/O address space
management).

I will provide some updated kernel and qemu branches for testing purpose
only.

Thanks

Eric
>
> -KR
>



Re: [RFC 03/20] vfio: Add vfio_[un]register_device()

2021-09-23 Thread Eric Auger
Hi,

On 9/22/21 3:00 AM, Jason Gunthorpe wrote:
> On Wed, Sep 22, 2021 at 12:54:02AM +, Tian, Kevin wrote:
>>> From: Jason Gunthorpe 
>>> Sent: Wednesday, September 22, 2021 12:01 AM
>>>
>>>> One open about how to organize the device nodes under
>>>> /dev/vfio/devices/.
>>>> This RFC adopts a simple policy by keeping a flat layout with mixed
>>>> devname
>>>> from all kinds of devices. The prerequisite of this model is that devnames
>>>> from different bus types are unique formats:
>>> This isn't reliable, the devname should just be vfio0, vfio1, etc
>>>
>>> The userspace can learn the correct major/minor by inspecting the
>>> sysfs.
>>>
>>> This whole concept should disappear into the prior patch that adds the
>>> struct device in the first place, and I think most of the code here
>>> can be deleted once the struct device is used properly.
>>>
>> Can you help elaborate above flow? This is one area where we need
>> more guidance.
>>
>> When Qemu accepts an option "-device vfio-pci,host=:BB:DD.F",
>> how does Qemu identify which vfio0/1/... is associated with the specified 
>> :BB:DD.F? 
> When done properly in the kernel the file:
>
> /sys/bus/pci/devices/:BB:DD.F/vfio/vfioX/dev
>
> Will contain the major:minor of the VFIO device.
>
> Userspace then opens the /dev/vfio/devices/vfioX and checks with fstat
> that the major:minor matches.
>
> in the above pattern "pci" and ":BB:DD.FF" are the arguments passed
> to qemu.
I guess this would be the same for platform devices, for instance
/sys/bus/platform/devices/AMDI8001:01/vfio/vfioX/dev, right?

Thanks

Eric
>
> You can look at this for some general over engineered code to handle
> opening from a sysfs handle like above:
>
> https://github.com/linux-rdma/rdma-core/blob/master/util/open_cdev.c
>
> Jason
>



Re: [RFC 09/20] iommu: Add page size and address width attributes

2021-09-22 Thread Eric Auger
Hi,

On 9/19/21 8:38 AM, Liu Yi L wrote:
> From: Lu Baolu 
>
> This exposes PAGE_SIZE and ADDR_WIDTH attributes. The iommufd could use
> them to define the IOAS.
>
> Signed-off-by: Lu Baolu 
> ---
>  include/linux/iommu.h | 4 
>  1 file changed, 4 insertions(+)
>
> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
> index 943de6897f56..86d34e4ce05e 100644
> --- a/include/linux/iommu.h
> +++ b/include/linux/iommu.h
> @@ -153,9 +153,13 @@ enum iommu_dev_features {
>  /**
>   * enum iommu_devattr - Per device IOMMU attributes
>   * @IOMMU_DEV_INFO_FORCE_SNOOP [bool]: IOMMU can force DMA to be snooped.
> + * @IOMMU_DEV_INFO_PAGE_SIZE [u64]: Page sizes that iommu supports.
> + * @IOMMU_DEV_INFO_ADDR_WIDTH [u32]: Address width supported.
I think this deserves additional info. Which address width are we
talking about: input or output? And for which stage, if the IOMMU
supports multiple stages?

Thanks

Eric
>   */
>  enum iommu_devattr {
>   IOMMU_DEV_INFO_FORCE_SNOOP,
> + IOMMU_DEV_INFO_PAGE_SIZE,
> + IOMMU_DEV_INFO_ADDR_WIDTH,
>  };
>  
>  #define IOMMU_PASID_INVALID  (-1U)



Re: [RFC v2] /dev/iommu uAPI proposal

2021-08-10 Thread Eric Auger
Hi Kevin,

On 8/5/21 2:36 AM, Tian, Kevin wrote:
>> From: Eric Auger 
>> Sent: Wednesday, August 4, 2021 11:59 PM
>>
> [...] 
>>> 1.2. Attach Device to I/O address space
>>> +++
>>>
>>> Device attach/bind is initiated through passthrough framework uAPI.
>>>
>>> Device attaching is allowed only after a device is successfully bound to
>>> the IOMMU fd. User should provide a device cookie when binding the
>>> device through VFIO uAPI. This cookie is used when the user queries
>>> device capability/format, issues per-device iotlb invalidation and
>>> receives per-device I/O page fault data via IOMMU fd.
>>>
>>> Successful binding puts the device into a security context which isolates
>>> its DMA from the rest system. VFIO should not allow user to access the
>> s/from the rest system/from the rest of the system
>>> device before binding is completed. Similarly, VFIO should prevent the
>>> user from unbinding the device before user access is withdrawn.
>> With Intel scalable IOV, I understand you could assign an RID/PASID to
>> one VM and another one to another VM (which is not the case for ARM). Is
>> it a targetted use case?How would it be handled? Is it related to the
>> sub-groups evoked hereafter?
> Not related to sub-group. Each mdev is bound to the IOMMU fd respectively
> with the defPASID which represents the mdev.
But how does it work in terms of security? The device (RID) is bound to
an IOMMU fd, but then each SID/PASID may be working for a different VM.
How do you check this is safe, given that each SID can work safely for a
different VM, versus the ARM case where it is not possible?

1.3 says
"

1)  A successful binding call for the first device in the group creates
the security context for the entire group, by:
"
What does it mean for above scalable IOV use case?

>
>> Actually all devices bound to an IOMMU fd should have the same parent
>> I/O address space or root address space, am I correct? If so, maybe add
>> this comment explicitly?
> in most cases yes but it's not mandatory. multiple roots are allowed
> (e.g. with vIOMMU but no nesting).
OK, right, this corresponds to example 4.2. I misinterpreted the
notion of security context. The security context does not match the
IOMMU fd but is something implicit, created on first device binding.
>
> [...]
>>> The device in the /dev/iommu context always refers to a physical one
>>> (pdev) which is identifiable via RID. Physically each pdev can support
>>> one default I/O address space (routed via RID) and optionally multiple
>>> non-default I/O address spaces (via RID+PASID).
>>>
>>> The device in VFIO context is a logic concept, being either a physical
>>> device (pdev) or mediated device (mdev or subdev). Each vfio device
>>> is represented by RID+cookie in IOMMU fd. User is allowed to create
>>> one default I/O address space (routed by vRID from user p.o.v) per
>>> each vfio_device.
>> The concept of default address space is not fully clear for me. I
>> currently understand this is a
>> root address space (not nesting). Is that correct? This may need
>> clarification.
> w/o PASID there is only one address space (either GPA or GIOVA)
> per device. This one is called default. whether it's root is orthogonal
> (e.g. GIOVA could be also nested) to the device view of this space.
>
> w/ PASID additional address spaces can be targeted by the device.
> those are called non-default.
>
> I could also rename default to RID address space and non-default to 
> RID+PASID address space if doing so makes it clearer.
Yes, I think it is worth having a kind of glossary, defining root and
default the way you clearly defined child/parent.
>
>>> VFIO decides the routing information for this default
>>> space based on device type:
>>>
>>> 1)  pdev, routed via RID;
>>>
>>> 2)  mdev/subdev with IOMMU-enforced DMA isolation, routed via
>>> the parent's RID plus the PASID marking this mdev;
>>>
>>> 3)  a purely sw-mediated device (sw mdev), no routing required i.e. no
>>> need to install the I/O page table in the IOMMU. sw mdev just uses
>>> the metadata to assist its internal DMA isolation logic on top of
>>> the parent's IOMMU page table;
>> Maybe you should introduce this concept of SW mediated device earlier
>> because it seems to special case the way the attach behaves. I am
>> especially referring to
>>
>> "Successful attaching activates an I/O address space in the IOMMU, if the

Re: [RFC v2] /dev/iommu uAPI proposal

2021-08-04 Thread Eric Auger
Hi Kevin,

Few comments/questions below.

On 7/9/21 9:48 AM, Tian, Kevin wrote:
> /dev/iommu provides an unified interface for managing I/O page tables for 
> devices assigned to userspace. Device passthrough frameworks (VFIO, vDPA, 
> etc.) are expected to use this interface instead of creating their own logic 
> to 
> isolate untrusted device DMAs initiated by userspace. 
>
> This proposal describes the uAPI of /dev/iommu and also sample sequences 
> with VFIO as example in typical usages. The driver-facing kernel API provided 
> by the iommu layer is still TBD, which can be discussed after consensus is 
> made on this uAPI.
>
> It's based on a lengthy discussion starting from here:
>   
> https://lore.kernel.org/linux-iommu/20210330132830.go2356...@nvidia.com/ 
>
> v1 can be found here:
>   
> https://lore.kernel.org/linux-iommu/ph0pr12mb54811863b392c644e5365446dc...@ph0pr12mb5481.namprd12.prod.outlook.com/T/
>
> This doc is also tracked on github, though it's not very useful for v1->v2 
> given dramatic refactoring:
>   https://github.com/luxis1999/dev_iommu_uapi 
>
> Changelog (v1->v2):
> - Rename /dev/ioasid to /dev/iommu (Jason);
> - Add a section for device-centric vs. group-centric design (many);
> - Add a section for handling no-snoop DMA (Jason/Alex/Paolo);
> - Add definition of user/kernel/shared I/O page tables (Baolu/Jason);
> - Allow one device bound to multiple iommu fd's (Jason);
> - No need to track user I/O page tables in kernel on ARM/AMD (Jean/Jason);
> - Add a device cookie for iotlb invalidation and fault handling (Jean/Jason);
> - Add capability/format query interface per device cookie (Jason);
> - Specify format/attribute when creating an IOASID, leading to several v1
>   uAPI commands removed (Jason);
> - Explain the value of software nesting (Jean);
> - Replace IOASID_REGISTER_VIRTUAL_MEMORY with software nesting (David/Jason);
> - Cover software mdev usage (Jason);
> - No restriction on map/unmap vs. bind/invalidate (Jason/David);
> - Report permitted IOVA range instead of reserved range (David);
> - Refine the sample structures and helper functions (Jason);
> - Add definition of default and non-default I/O address spaces;
> - Expand and clarify the design for PASID virtualization;
> - and lots of subtle refinement according to above changes;
>
> TOC
> 
> 1. Terminologies and Concepts
> 1.1. Manage I/O address space
> 1.2. Attach device to I/O address space
> 1.3. Group isolation
> 1.4. PASID virtualization
> 1.4.1. Devices which don't support DMWr
> 1.4.2. Devices which support DMWr
> 1.4.3. Mix different types together
> 1.4.4. User sequence
> 1.5. No-snoop DMA
> 2. uAPI Proposal
> 2.1. /dev/iommu uAPI
> 2.2. /dev/vfio device uAPI
> 2.3. /dev/kvm uAPI
> 3. Sample Structures and Helper Functions
> 4. Use Cases and Flows
> 4.1. A simple example
> 4.2. Multiple IOASIDs (no nesting)
> 4.3. IOASID nesting (software)
> 4.4. IOASID nesting (hardware)
> 4.5. Guest SVA (vSVA)
> 4.6. I/O page fault
> 
>
> 1. Terminologies and Concepts
> -
>
> IOMMU fd is the container holding multiple I/O address spaces. User 
> manages those address spaces through fd operations. Multiple fd's are 
> allowed per process, but with this proposal one fd should be sufficient for 
> all intended usages.
>
> IOASID is the fd-local software handle representing an I/O address space. 
> Each IOASID is associated with a single I/O page table. IOASIDs can be 
> nested together, implying the output address from one I/O page table 
> (represented by child IOASID) must be further translated by another I/O 
> page table (represented by parent IOASID).
>
> An I/O address space takes effect only after it is attached by a device. 
> One device is allowed to attach to multiple I/O address spaces. One I/O 
> address space can be attached by multiple devices.
>
> Device must be bound to an IOMMU fd before attach operation can be
> conducted. Though not necessary, user could bind one device to multiple
> IOMMU FD's. But no cross-FD IOASID nesting is allowed.
>
> The format of an I/O page table must be compatible to the attached 
> devices (or more specifically to the IOMMU which serves the DMA from
> the attached devices). User is responsible for specifying the format
> when allocating an IOASID, according to one or multiple devices which
> will be attached right after. Attaching a device to an IOASID with 
> incompatible format is simply rejected.
>
> Relationship between IOMMU fd, VFIO fd and KVM fd:
>
> -   IOMMU fd provides uAPI for managing IOASIDs and I/O page tables. 
> It also provides an unified capability/format reporting interface for
> each bound device. 
>
> -   VFIO fd provides uAPI for device binding and attaching. In this proposal 
> VFIO is used as the example of device passthrough frameworks. The
> routing information that identifies an I/O 

Re: [PATCH v5 2/5] ACPI: Move IOMMU setup code out of IORT

2021-06-22 Thread Eric Auger
Hi Jean,
On 6/18/21 5:20 PM, Jean-Philippe Brucker wrote:
> Extract the code that sets up the IOMMU infrastructure from IORT, since
> it can be reused by VIOT. Move it one level up into a new
> acpi_iommu_configure_id() function, which calls the IORT parsing
> function which in turn calls the acpi_iommu_fwspec_init() helper.
>
> Signed-off-by: Jean-Philippe Brucker 
Reviewed-by: Eric Auger 

Thanks

Eric

> ---
>  include/acpi/acpi_bus.h   |  3 ++
>  include/linux/acpi_iort.h |  8 ++---
>  drivers/acpi/arm64/iort.c | 74 +--
>  drivers/acpi/scan.c   | 73 +-
>  4 files changed, 86 insertions(+), 72 deletions(-)
>
> diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
> index 3a82faac5767..41f092a269f6 100644
> --- a/include/acpi/acpi_bus.h
> +++ b/include/acpi/acpi_bus.h
> @@ -588,6 +588,9 @@ struct acpi_pci_root {
>  
>  bool acpi_dma_supported(struct acpi_device *adev);
>  enum dev_dma_attr acpi_get_dma_attr(struct acpi_device *adev);
> +int acpi_iommu_fwspec_init(struct device *dev, u32 id,
> +struct fwnode_handle *fwnode,
> +const struct iommu_ops *ops);
>  int acpi_dma_get_range(struct device *dev, u64 *dma_addr, u64 *offset,
>  u64 *size);
>  int acpi_dma_configure_id(struct device *dev, enum dev_dma_attr attr,
> diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h
> index f7f054833afd..f1f0842a2cb2 100644
> --- a/include/linux/acpi_iort.h
> +++ b/include/linux/acpi_iort.h
> @@ -35,8 +35,7 @@ void acpi_configure_pmsi_domain(struct device *dev);
>  int iort_pmsi_get_dev_id(struct device *dev, u32 *dev_id);
>  /* IOMMU interface */
>  int iort_dma_get_ranges(struct device *dev, u64 *size);
> -const struct iommu_ops *iort_iommu_configure_id(struct device *dev,
> - const u32 *id_in);
> +int iort_iommu_configure_id(struct device *dev, const u32 *id_in);
>  int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head);
>  phys_addr_t acpi_iort_dma_get_max_cpu_address(void);
>  #else
> @@ -50,9 +49,8 @@ static inline void acpi_configure_pmsi_domain(struct device 
> *dev) { }
>  /* IOMMU interface */
>  static inline int iort_dma_get_ranges(struct device *dev, u64 *size)
>  { return -ENODEV; }
> -static inline const struct iommu_ops *iort_iommu_configure_id(
> -   struct device *dev, const u32 *id_in)
> -{ return NULL; }
> +static inline int iort_iommu_configure_id(struct device *dev, const u32 *id_in)
> +{ return -ENODEV; }
>  static inline
>  int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head)
>  { return 0; }
> diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
> index a940be1cf2af..487d1095030d 100644
> --- a/drivers/acpi/arm64/iort.c
> +++ b/drivers/acpi/arm64/iort.c
> @@ -806,23 +806,6 @@ static struct acpi_iort_node *iort_get_msi_resv_iommu(struct device *dev)
>   return NULL;
>  }
>  
> -static inline const struct iommu_ops *iort_fwspec_iommu_ops(struct device *dev)
> -{
> - struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
> -
> - return (fwspec && fwspec->ops) ? fwspec->ops : NULL;
> -}
> -
> -static inline int iort_add_device_replay(struct device *dev)
> -{
> - int err = 0;
> -
> - if (dev->bus && !device_iommu_mapped(dev))
> - err = iommu_probe_device(dev);
> -
> - return err;
> -}
> -
>  /**
>   * iort_iommu_msi_get_resv_regions - Reserved region driver helper
>   * @dev: Device from iommu_get_resv_regions()
> @@ -900,18 +883,6 @@ static inline bool iort_iommu_driver_enabled(u8 type)
>   }
>  }
>  
> -static int arm_smmu_iort_xlate(struct device *dev, u32 streamid,
> -struct fwnode_handle *fwnode,
> -const struct iommu_ops *ops)
> -{
> - int ret = iommu_fwspec_init(dev, fwnode, ops);
> -
> - if (!ret)
> - ret = iommu_fwspec_add_ids(dev, &streamid, 1);
> -
> - return ret;
> -}
> -
>  static bool iort_pci_rc_supports_ats(struct acpi_iort_node *node)
>  {
>   struct acpi_iort_root_complex *pci_rc;
> @@ -946,7 +917,7 @@ static int iort_iommu_xlate(struct device *dev, struct acpi_iort_node *node,
>   return iort_iommu_driver_enabled(node->type) ?
>  -EPROBE_DEFER : -ENODEV;
>  
> - return arm_smmu_iort_xlate(dev, streamid, iort_fwnode, ops);
> + return acpi_iommu_fwspec_init(dev, streamid, iort_fwnode, ops);
>  }
>  
>  struct iort_pci_alias_info {

Re: [PATCH v5 4/5] iommu/dma: Pass address limit rather than size to iommu_setup_dma_ops()

2021-06-22 Thread Eric Auger
Hi Jean,

On 6/18/21 5:20 PM, Jean-Philippe Brucker wrote:
> Passing a 64-bit address width to iommu_setup_dma_ops() is valid on
> virtual platforms, but isn't currently possible. The overflow check in
> iommu_dma_init_domain() prevents this even when @dma_base isn't 0. Pass
> a limit address instead of a size, so callers don't have to fake a size
> to work around the check.
>
> The base and limit parameters are being phased out, because:
> * they are redundant for x86 callers. dma-iommu already reserves the
>   first page, and the upper limit is already in domain->geometry.
> * they can now be obtained from dev->dma_range_map on Arm.
> But removing them on Arm isn't completely straightforward so is left for
> future work. As an intermediate step, simplify the x86 callers by
> passing dummy limits.
>
> Signed-off-by: Jean-Philippe Brucker 
Reviewed-by: Eric Auger 

Eric
> ---
>  include/linux/dma-iommu.h   |  4 ++--
>  arch/arm64/mm/dma-mapping.c |  2 +-
>  drivers/iommu/amd/iommu.c   |  2 +-
>  drivers/iommu/dma-iommu.c   | 12 ++--
>  drivers/iommu/intel/iommu.c |  5 +
>  5 files changed, 11 insertions(+), 14 deletions(-)
>
> diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h
> index 6e75a2d689b4..758ca4694257 100644
> --- a/include/linux/dma-iommu.h
> +++ b/include/linux/dma-iommu.h
> @@ -19,7 +19,7 @@ int iommu_get_msi_cookie(struct iommu_domain *domain, 
> dma_addr_t base);
>  void iommu_put_dma_cookie(struct iommu_domain *domain);
>  
>  /* Setup call for arch DMA mapping code */
> -void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size);
> +void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 dma_limit);
>  
>  /* The DMA API isn't _quite_ the whole story, though... */
>  /*
> @@ -50,7 +50,7 @@ struct msi_msg;
>  struct device;
>  
>  static inline void iommu_setup_dma_ops(struct device *dev, u64 dma_base,
> - u64 size)
> +u64 dma_limit)
>  {
>  }
>  
> diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
> index 4bf1dd3eb041..6719f9efea09 100644
> --- a/arch/arm64/mm/dma-mapping.c
> +++ b/arch/arm64/mm/dma-mapping.c
> @@ -50,7 +50,7 @@ void arch_setup_dma_ops(struct device *dev, u64 dma_base, 
> u64 size,
>  
>   dev->dma_coherent = coherent;
>   if (iommu)
> - iommu_setup_dma_ops(dev, dma_base, size);
> + iommu_setup_dma_ops(dev, dma_base, dma_base + size - 1);
>  
>  #ifdef CONFIG_XEN
>   if (xen_swiotlb_detect())
> diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
> index 3ac42bbdefc6..216323fb27ef 100644
> --- a/drivers/iommu/amd/iommu.c
> +++ b/drivers/iommu/amd/iommu.c
> @@ -1713,7 +1713,7 @@ static void amd_iommu_probe_finalize(struct device *dev)
>   /* Domains are initialized for this device - have a look what we ended 
> up with */
>   domain = iommu_get_domain_for_dev(dev);
>   if (domain->type == IOMMU_DOMAIN_DMA)
> - iommu_setup_dma_ops(dev, IOVA_START_PFN << PAGE_SHIFT, 0);
> + iommu_setup_dma_ops(dev, 0, U64_MAX);
>   else
>   set_dma_ops(dev, NULL);
>  }
> diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
> index 7bcdd1205535..c62e19bed302 100644
> --- a/drivers/iommu/dma-iommu.c
> +++ b/drivers/iommu/dma-iommu.c
> @@ -319,16 +319,16 @@ static bool dev_is_untrusted(struct device *dev)
>   * iommu_dma_init_domain - Initialise a DMA mapping domain
>   * @domain: IOMMU domain previously prepared by iommu_get_dma_cookie()
>   * @base: IOVA at which the mappable address space starts
> - * @size: Size of IOVA space
> + * @limit: Last address of the IOVA space
>   * @dev: Device the domain is being initialised for
>   *
> - * @base and @size should be exact multiples of IOMMU page granularity to
> + * @base and @limit + 1 should be exact multiples of IOMMU page granularity 
> to
>   * avoid rounding surprises. If necessary, we reserve the page at address 0
>   * to ensure it is an invalid IOVA. It is safe to reinitialise a domain, but
>   * any change which could make prior IOVAs invalid will fail.
>   */
>  static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t 
> base,
> - u64 size, struct device *dev)
> +  dma_addr_t limit, struct device *dev)
>  {
>   struct iommu_dma_cookie *cookie = domain->iova_cookie;
>   unsigned long order, base_pfn;
> @@ -346,7 +346,7 @@ static int iommu_dma_init_domain(struct iommu_domain 
> *domain, dma_addr_t base,
>   /* Check the domain allows at least some access to the device... */
>   if (domain->geometry
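
The rationale for passing a limit is easiest to see with numbers: with a
zero base and a full 64-bit address width, a size-based API would need
size = 2^64, which a u64 cannot represent. An inclusive limit sidesteps
the wrap (a minimal illustration, not kernel code):

    u64 base  = 0;
    u64 limit = U64_MAX;           /* 64-bit aperture: representable */
    u64 size  = limit - base + 1;  /* wraps to 0: not representable  */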

Re: [PATCH v4 6/6] iommu/virtio: Enable x86 support

2021-06-16 Thread Eric Auger
Hi Jean,

On 6/10/21 9:51 AM, Jean-Philippe Brucker wrote:
> With the VIOT support in place, x86 platforms can now use the
> virtio-iommu.
>
> Because the other x86 IOMMU drivers aren't yet ready to use the
> acpi_dma_setup() path, x86 doesn't implement arch_setup_dma_ops() at the
> moment. Similarly to Vt-d and AMD IOMMU, call iommu_setup_dma_ops() from
> probe_finalize().
>
> Acked-by: Joerg Roedel 
> Acked-by: Michael S. Tsirkin 
> Signed-off-by: Jean-Philippe Brucker 
Reviewed-by: Eric Auger 

Eric
> ---
>  drivers/iommu/Kconfig| 3 ++-
>  drivers/iommu/dma-iommu.c| 1 +
>  drivers/iommu/virtio-iommu.c | 8 
>  3 files changed, 11 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
> index aff8a4830dd1..07b7c25cbed8 100644
> --- a/drivers/iommu/Kconfig
> +++ b/drivers/iommu/Kconfig
> @@ -400,8 +400,9 @@ config HYPERV_IOMMU
>  config VIRTIO_IOMMU
>   tristate "Virtio IOMMU driver"
>   depends on VIRTIO
> - depends on ARM64
> + depends on (ARM64 || X86)
>   select IOMMU_API
> + select IOMMU_DMA
>   select INTERVAL_TREE
>   select ACPI_VIOT if ACPI
>   help
> diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
> index 175f8eaeb5b3..46ed43c400cf 100644
> --- a/drivers/iommu/dma-iommu.c
> +++ b/drivers/iommu/dma-iommu.c
> @@ -1332,6 +1332,7 @@ void iommu_setup_dma_ops(struct device *dev, u64 
> dma_base, u64 dma_limit)
>pr_warn("Failed to set up IOMMU for device %s; retaining platform DMA 
> ops\n",
>dev_name(dev));
>  }
> +EXPORT_SYMBOL_GPL(iommu_setup_dma_ops);
>  
>  static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev,
>   phys_addr_t msi_addr, struct iommu_domain *domain)
> diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c
> index 218fe8560e8d..77aee1207ced 100644
> --- a/drivers/iommu/virtio-iommu.c
> +++ b/drivers/iommu/virtio-iommu.c
> @@ -1026,6 +1026,13 @@ static struct iommu_device *viommu_probe_device(struct 
> device *dev)
>   return ERR_PTR(ret);
>  }
>  
> +static void viommu_probe_finalize(struct device *dev)
> +{
> +#ifndef CONFIG_ARCH_HAS_SETUP_DMA_OPS
> + iommu_setup_dma_ops(dev, 0, U64_MAX);
> +#endif
> +}
> +
>  static void viommu_release_device(struct device *dev)
>  {
>   struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
> @@ -1062,6 +1069,7 @@ static struct iommu_ops viommu_ops = {
>   .iova_to_phys   = viommu_iova_to_phys,
>   .iotlb_sync = viommu_iotlb_sync,
>   .probe_device   = viommu_probe_device,
> + .probe_finalize = viommu_probe_finalize,
>   .release_device = viommu_release_device,
>   .device_group   = viommu_device_group,
>   .get_resv_regions   = viommu_get_resv_regions,



Re: [PATCH v4 5/6] iommu/dma: Simplify calls to iommu_setup_dma_ops()

2021-06-16 Thread Eric Auger
Hi Jean,

On 6/10/21 9:51 AM, Jean-Philippe Brucker wrote:
> dma-iommu uses the address bounds described in domain->geometry during
> IOVA allocation. The address size parameters of iommu_setup_dma_ops()
> are useful for describing additional limits set by the platform
> firmware, but aren't needed for drivers that call this function from
> probe_finalize(). The base parameter can be zero because dma-iommu
> already removes the first IOVA page, and the limit parameter can be
> U64_MAX because it's only checked against the domain geometry. Simplify
> calls to iommu_setup_dma_ops().
>
> Signed-off-by: Jean-Philippe Brucker 
Reviewed-by: Eric Auger 

Eric
> ---
>  drivers/iommu/amd/iommu.c   |  9 +
>  drivers/iommu/dma-iommu.c   |  4 +++-
>  drivers/iommu/intel/iommu.c | 10 +-
>  3 files changed, 5 insertions(+), 18 deletions(-)
>
> diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
> index 94b96d81fcfd..d3123bc05c08 100644
> --- a/drivers/iommu/amd/iommu.c
> +++ b/drivers/iommu/amd/iommu.c
> @@ -1708,14 +1708,7 @@ static struct iommu_device 
> *amd_iommu_probe_device(struct device *dev)
>  
>  static void amd_iommu_probe_finalize(struct device *dev)
>  {
> - struct iommu_domain *domain;
> -
> - /* Domains are initialized for this device - have a look what we ended 
> up with */
> - domain = iommu_get_domain_for_dev(dev);
> - if (domain->type == IOMMU_DOMAIN_DMA)
> - iommu_setup_dma_ops(dev, IOVA_START_PFN << PAGE_SHIFT, U64_MAX);
> - else
> - set_dma_ops(dev, NULL);
> + iommu_setup_dma_ops(dev, 0, U64_MAX);
>  }
>  
>  static void amd_iommu_release_device(struct device *dev)
> diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
> index c62e19bed302..175f8eaeb5b3 100644
> --- a/drivers/iommu/dma-iommu.c
> +++ b/drivers/iommu/dma-iommu.c
> @@ -1322,7 +1322,9 @@ void iommu_setup_dma_ops(struct device *dev, u64 
> dma_base, u64 dma_limit)
>   if (domain->type == IOMMU_DOMAIN_DMA) {
>   if (iommu_dma_init_domain(domain, dma_base, dma_limit, dev))
>   goto out_err;
> - dev->dma_ops = _dma_ops;
> + set_dma_ops(dev, _dma_ops);
> + } else {
> + set_dma_ops(dev, NULL);
>   }
>  
>   return;
> diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
> index 85f18342603c..8d866940692a 100644
> --- a/drivers/iommu/intel/iommu.c
> +++ b/drivers/iommu/intel/iommu.c
> @@ -5165,15 +5165,7 @@ static void intel_iommu_release_device(struct device 
> *dev)
>  
>  static void intel_iommu_probe_finalize(struct device *dev)
>  {
> - dma_addr_t base = IOVA_START_PFN << VTD_PAGE_SHIFT;
> - struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
> - struct dmar_domain *dmar_domain = to_dmar_domain(domain);
> -
> - if (domain && domain->type == IOMMU_DOMAIN_DMA)
> - iommu_setup_dma_ops(dev, base,
> - __DOMAIN_MAX_ADDR(dmar_domain->gaw));
> - else
> - set_dma_ops(dev, NULL);
> + iommu_setup_dma_ops(dev, 0, U64_MAX);
>  }
>  
>  static void intel_iommu_get_resv_regions(struct device *device,



Re: [PATCH v4 4/6] iommu/dma: Pass address limit rather than size to iommu_setup_dma_ops()

2021-06-16 Thread Eric Auger
Hi Jean,

On 6/10/21 9:51 AM, Jean-Philippe Brucker wrote:
> Passing a 64-bit address width to iommu_setup_dma_ops() is valid on
> virtual platforms, but isn't currently possible. The overflow check in
> iommu_dma_init_domain() prevents this even when @dma_base isn't 0. Pass
> a limit address instead of a size, so callers don't have to fake a size
> to work around the check.
>
> Signed-off-by: Jean-Philippe Brucker 
> ---
>  include/linux/dma-iommu.h   |  4 ++--
>  arch/arm64/mm/dma-mapping.c |  2 +-
>  drivers/iommu/amd/iommu.c   |  2 +-
>  drivers/iommu/dma-iommu.c   | 12 ++--
>  drivers/iommu/intel/iommu.c |  2 +-
>  5 files changed, 11 insertions(+), 11 deletions(-)
>
> diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h
> index 6e75a2d689b4..758ca4694257 100644
> --- a/include/linux/dma-iommu.h
> +++ b/include/linux/dma-iommu.h
> @@ -19,7 +19,7 @@ int iommu_get_msi_cookie(struct iommu_domain *domain, 
> dma_addr_t base);
>  void iommu_put_dma_cookie(struct iommu_domain *domain);
>  
>  /* Setup call for arch DMA mapping code */
> -void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size);
> +void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 dma_limit);
>  
>  /* The DMA API isn't _quite_ the whole story, though... */
>  /*
> @@ -50,7 +50,7 @@ struct msi_msg;
>  struct device;
>  
>  static inline void iommu_setup_dma_ops(struct device *dev, u64 dma_base,
> - u64 size)
> +u64 dma_limit)
>  {
>  }
>  
> diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
> index 4bf1dd3eb041..7bd1d2199141 100644
> --- a/arch/arm64/mm/dma-mapping.c
> +++ b/arch/arm64/mm/dma-mapping.c
> @@ -50,7 +50,7 @@ void arch_setup_dma_ops(struct device *dev, u64 dma_base, 
> u64 size,
>  
>   dev->dma_coherent = coherent;
>   if (iommu)
> - iommu_setup_dma_ops(dev, dma_base, size);
> + iommu_setup_dma_ops(dev, dma_base, size - dma_base - 1);
I don't get size - dma_base - 1?
>  
>  #ifdef CONFIG_XEN
>   if (xen_swiotlb_detect())
> diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
> index 3ac42bbdefc6..94b96d81fcfd 100644
> --- a/drivers/iommu/amd/iommu.c
> +++ b/drivers/iommu/amd/iommu.c
> @@ -1713,7 +1713,7 @@ static void amd_iommu_probe_finalize(struct device *dev)
>   /* Domains are initialized for this device - have a look what we ended 
> up with */
>   domain = iommu_get_domain_for_dev(dev);
>   if (domain->type == IOMMU_DOMAIN_DMA)
> - iommu_setup_dma_ops(dev, IOVA_START_PFN << PAGE_SHIFT, 0);
> + iommu_setup_dma_ops(dev, IOVA_START_PFN << PAGE_SHIFT, U64_MAX);
>   else
>   set_dma_ops(dev, NULL);
>  }
> diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
> index 7bcdd1205535..c62e19bed302 100644
> --- a/drivers/iommu/dma-iommu.c
> +++ b/drivers/iommu/dma-iommu.c
> @@ -319,16 +319,16 @@ static bool dev_is_untrusted(struct device *dev)
>   * iommu_dma_init_domain - Initialise a DMA mapping domain
>   * @domain: IOMMU domain previously prepared by iommu_get_dma_cookie()
>   * @base: IOVA at which the mappable address space starts
> - * @size: Size of IOVA space
> + * @limit: Last address of the IOVA space
>   * @dev: Device the domain is being initialised for
>   *
> - * @base and @size should be exact multiples of IOMMU page granularity to
> + * @base and @limit + 1 should be exact multiples of IOMMU page granularity 
> to
>   * avoid rounding surprises. If necessary, we reserve the page at address 0
>   * to ensure it is an invalid IOVA. It is safe to reinitialise a domain, but
>   * any change which could make prior IOVAs invalid will fail.
>   */
>  static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t 
> base,
> - u64 size, struct device *dev)
> +  dma_addr_t limit, struct device *dev)
>  {
>   struct iommu_dma_cookie *cookie = domain->iova_cookie;
>   unsigned long order, base_pfn;
> @@ -346,7 +346,7 @@ static int iommu_dma_init_domain(struct iommu_domain 
> *domain, dma_addr_t base,
>   /* Check the domain allows at least some access to the device... */
>   if (domain->geometry.force_aperture) {
>   if (base > domain->geometry.aperture_end ||
> - base + size <= domain->geometry.aperture_start) {
> + limit < domain->geometry.aperture_start) {
>   pr_warn("specified DMA range outside IOMMU 
> capability\n");
>   return -EFAULT;
>   }
> @@ -1308,7 +1308,7 @@ static const struct dma_map_ops iommu_dma_ops = {
>   * The IOMMU core code allocates the default DMA domain, which the underlying
>   * IOMMU driver needs to support via the dma-iommu layer.
>   */
> -void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size)
> +void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 dma_limit)
>  {
>   struct 

Re: [PATCH v4 3/6] ACPI: Add driver for the VIOT table

2021-06-16 Thread Eric Auger
> + status = acpi_get_table(ACPI_SIG_VIOT, 0, &hdr);
> + if (ACPI_FAILURE(status)) {
> + if (status != AE_NOT_FOUND) {
> + const char *msg = acpi_format_exception(status);
> +
> + pr_err("Failed to get table, %s\n", msg);
> + }
> + return;
> + }
> +
> + viot = (void *)hdr;
> +
> + node = ACPI_ADD_PTR(struct acpi_viot_header, viot, viot->node_offset);
> + for (i = 0; i < viot->node_count; i++) {
> + if (viot_parse_node(node))
> + return;
> +
> + node = ACPI_ADD_PTR(struct acpi_viot_header, node,
> + node->length);
> + }
> +}
> +
> +static int viot_dev_iommu_init(struct device *dev, struct viot_iommu *viommu,
> +u32 epid)
> +{
> + const struct iommu_ops *ops;
> +
> + if (!viommu)
> + return -ENODEV;
> +
> + /* We're not translating ourself */
> + if (viommu->fwnode == dev->fwnode)
> + return -EINVAL;
> +
> + ops = iommu_ops_from_fwnode(viommu->fwnode);
> + if (!ops)
> + return IS_ENABLED(CONFIG_VIRTIO_IOMMU) ?
> + -EPROBE_DEFER : -ENODEV;
> +
> + return acpi_iommu_fwspec_init(dev, epid, viommu->fwnode, ops);
> +}
> +
> +static int viot_pci_dev_iommu_init(struct pci_dev *pdev, u16 dev_id, void 
> *data)
> +{
> + u32 epid;
> + struct viot_endpoint *ep;
> + u32 domain_nr = pci_domain_nr(pdev->bus);
> +
> + list_for_each_entry(ep, &viot_pci_ranges, list) {
> + if (domain_nr >= ep->segment_start &&
> + domain_nr <= ep->segment_end &&
> + dev_id >= ep->bdf_start &&
> + dev_id <= ep->bdf_end) {
> + epid = ((domain_nr - ep->segment_start) << 16) +
> + dev_id - ep->bdf_start + ep->endpoint_id;
> +
> + /*
> +  * If we found a PCI range managed by the viommu, we're
> +  * the one that has to request ACS.
> +  */
> + pci_request_acs();
> +
> + return viot_dev_iommu_init(&pdev->dev, ep->viommu,
> +epid);
> + }
> + }
> + return -ENODEV;
> +}
> +
> +static int viot_mmio_dev_iommu_init(struct platform_device *pdev)
> +{
> + struct resource *mem;
> + struct viot_endpoint *ep;
> +
> + mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
> + if (!mem)
> + return -ENODEV;
> +
> + list_for_each_entry(ep, &viot_mmio_endpoints, list) {
> + if (ep->address == mem->start)
> + return viot_dev_iommu_init(&pdev->dev, ep->viommu,
> +ep->endpoint_id);
> + }
> + return -ENODEV;
> +}
> +
> +/**
> + * viot_iommu_configure - Setup IOMMU ops for an endpoint described by VIOT
> + * @dev: the endpoint
> + *
> + * Return: 0 on success, <0 on failure
> + */
> +int viot_iommu_configure(struct device *dev)
> +{
> + if (dev_is_pci(dev))
> + return pci_for_each_dma_alias(to_pci_dev(dev),
> +   viot_pci_dev_iommu_init, NULL);
> + else if (dev_is_platform(dev))
> + return viot_mmio_dev_iommu_init(to_platform_device(dev));
> + return -ENODEV;
> +}
> diff --git a/MAINTAINERS b/MAINTAINERS
> index b706dd20ff2b..8d71591f979a 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -431,6 +431,14 @@ W:   https://01.org/linux-acpi
>  B:   https://bugzilla.kernel.org
>  F:   drivers/acpi/acpi_video.c
>  
> +ACPI VIOT DRIVER
> +M:   Jean-Philippe Brucker 
> +L:   linux-a...@vger.kernel.org
> +L:   iommu@lists.linux-foundation.org
> +S:   Maintained
> +F:   drivers/acpi/viot.c
> +F:   include/linux/acpi_viot.h
> +
>  ACPI WMI DRIVER
>  L:   platform-driver-...@vger.kernel.org
>  S:   Orphan
Besides
Reviewed-by: Eric Auger 

Thanks

Eric
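
To make the endpoint ID computation in viot_pci_dev_iommu_init() above
concrete, here is a worked example; the range values are made up, only
the arithmetic mirrors the patch:

    /* Hypothetical VIOT PCI range node:
     *   segment_start = 0, bdf_start = 0x08 (00:01.0), endpoint_id = 0x100
     * Device in PCI domain 0 with dev_id (BDF) = 0x0a (00:01.2):
     */
    u32 epid = ((0 - 0) << 16) + 0x0a - 0x08 + 0x100;   /* = 0x102 */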



Re: [PATCH v4 0/6] Add support for ACPI VIOT

2021-06-16 Thread Eric Auger
Hi Jean,

On 6/10/21 9:51 AM, Jean-Philippe Brucker wrote:
> Add a driver for the ACPI VIOT table, which provides topology
> information for para-virtual IOMMUs. Enable virtio-iommu on
> non-devicetree platforms, including x86.
>
> Since v3 [1] I fixed a build bug for !CONFIG_IOMMU_API. Joerg offered to
> take this series through the IOMMU tree, which requires Acks for patches
> 1-3.
>
> You can find a QEMU implementation at [2], with extra support for
> testing all VIOT nodes including MMIO-based endpoints and IOMMU.
> This series is at [3].
>
> [1] 
> https://lore.kernel.org/linux-iommu/2021060215.1077006-1-jean-phili...@linaro.org/
> [2] https://jpbrucker.net/git/qemu/log/?h=virtio-iommu/acpi
> [3] https://jpbrucker.net/git/linux/log/?h=virtio-iommu/acpi

I tested the series on both aarch64 and x86_64 with qemu. It works for me.
Feel free to add my T-b.

Tested-by: Eric Auger 

Thanks

Eric

>
>
> Jean-Philippe Brucker (6):
>   ACPI: arm64: Move DMA setup operations out of IORT
>   ACPI: Move IOMMU setup code out of IORT
>   ACPI: Add driver for the VIOT table
>   iommu/dma: Pass address limit rather than size to
> iommu_setup_dma_ops()
>   iommu/dma: Simplify calls to iommu_setup_dma_ops()
>   iommu/virtio: Enable x86 support
>
>  drivers/acpi/Kconfig |   3 +
>  drivers/iommu/Kconfig|   4 +-
>  drivers/acpi/Makefile|   2 +
>  drivers/acpi/arm64/Makefile  |   1 +
>  include/acpi/acpi_bus.h  |   3 +
>  include/linux/acpi.h |   3 +
>  include/linux/acpi_iort.h|  14 +-
>  include/linux/acpi_viot.h|  19 ++
>  include/linux/dma-iommu.h|   4 +-
>  arch/arm64/mm/dma-mapping.c  |   2 +-
>  drivers/acpi/arm64/dma.c |  50 +
>  drivers/acpi/arm64/iort.c| 129 ++---
>  drivers/acpi/bus.c   |   2 +
>  drivers/acpi/scan.c  |  78 +++-
>  drivers/acpi/viot.c  | 364 +++
>  drivers/iommu/amd/iommu.c|   9 +-
>  drivers/iommu/dma-iommu.c|  17 +-
>  drivers/iommu/intel/iommu.c  |  10 +-
>  drivers/iommu/virtio-iommu.c |   8 +
>  MAINTAINERS  |   8 +
>  20 files changed, 580 insertions(+), 150 deletions(-)
>  create mode 100644 include/linux/acpi_viot.h
>  create mode 100644 drivers/acpi/arm64/dma.c
>  create mode 100644 drivers/acpi/viot.c
>



Re: [PATCH v4 1/6] ACPI: arm64: Move DMA setup operations out of IORT

2021-06-16 Thread Eric Auger
Hi Jean,

On 6/10/21 9:51 AM, Jean-Philippe Brucker wrote:
> Extract generic DMA setup code out of IORT, so it can be reused by VIOT.
> Keep it in drivers/acpi/arm64 for now, since it could break x86
> platforms that haven't run this code so far, if they have invalid
> tables.
>
> Signed-off-by: Jean-Philippe Brucker 
Reviewed-by: Eric Auger 


Eric

> ---
>  drivers/acpi/arm64/Makefile |  1 +
>  include/linux/acpi.h|  3 +++
>  include/linux/acpi_iort.h   |  6 ++---
>  drivers/acpi/arm64/dma.c| 50 ++
>  drivers/acpi/arm64/iort.c   | 54 ++---
>  drivers/acpi/scan.c |  2 +-
>  6 files changed, 66 insertions(+), 50 deletions(-)
>  create mode 100644 drivers/acpi/arm64/dma.c
>
> diff --git a/drivers/acpi/arm64/Makefile b/drivers/acpi/arm64/Makefile
> index 6ff50f4ed947..66acbe77f46e 100644
> --- a/drivers/acpi/arm64/Makefile
> +++ b/drivers/acpi/arm64/Makefile
> @@ -1,3 +1,4 @@
>  # SPDX-License-Identifier: GPL-2.0-only
>  obj-$(CONFIG_ACPI_IORT)  += iort.o
>  obj-$(CONFIG_ACPI_GTDT)  += gtdt.o
> +obj-y+= dma.o
> diff --git a/include/linux/acpi.h b/include/linux/acpi.h
> index c60745f657e9..7aaa9559cc19 100644
> --- a/include/linux/acpi.h
> +++ b/include/linux/acpi.h
> @@ -259,9 +259,12 @@ void acpi_numa_x2apic_affinity_init(struct 
> acpi_srat_x2apic_cpu_affinity *pa);
>  
>  #ifdef CONFIG_ARM64
>  void acpi_numa_gicc_affinity_init(struct acpi_srat_gicc_affinity *pa);
> +void acpi_arch_dma_setup(struct device *dev, u64 *dma_addr, u64 *dma_size);
>  #else
>  static inline void
>  acpi_numa_gicc_affinity_init(struct acpi_srat_gicc_affinity *pa) { }
> +static inline void
> +acpi_arch_dma_setup(struct device *dev, u64 *dma_addr, u64 *dma_size) { }
>  #endif
>  
>  int acpi_numa_memory_affinity_init (struct acpi_srat_mem_affinity *ma);
> diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h
> index 1a12baa58e40..f7f054833afd 100644
> --- a/include/linux/acpi_iort.h
> +++ b/include/linux/acpi_iort.h
> @@ -34,7 +34,7 @@ struct irq_domain *iort_get_device_domain(struct device 
> *dev, u32 id,
>  void acpi_configure_pmsi_domain(struct device *dev);
>  int iort_pmsi_get_dev_id(struct device *dev, u32 *dev_id);
>  /* IOMMU interface */
> -void iort_dma_setup(struct device *dev, u64 *dma_addr, u64 *size);
> +int iort_dma_get_ranges(struct device *dev, u64 *size);
>  const struct iommu_ops *iort_iommu_configure_id(struct device *dev,
>   const u32 *id_in);
>  int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head 
> *head);
> @@ -48,8 +48,8 @@ static inline struct irq_domain *iort_get_device_domain(
>  { return NULL; }
>  static inline void acpi_configure_pmsi_domain(struct device *dev) { }
>  /* IOMMU interface */
> -static inline void iort_dma_setup(struct device *dev, u64 *dma_addr,
> -   u64 *size) { }
> +static inline int iort_dma_get_ranges(struct device *dev, u64 *size)
> +{ return -ENODEV; }
>  static inline const struct iommu_ops *iort_iommu_configure_id(
> struct device *dev, const u32 *id_in)
>  { return NULL; }
> diff --git a/drivers/acpi/arm64/dma.c b/drivers/acpi/arm64/dma.c
> new file mode 100644
> index ..f16739ad3cc0
> --- /dev/null
> +++ b/drivers/acpi/arm64/dma.c
> @@ -0,0 +1,50 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +void acpi_arch_dma_setup(struct device *dev, u64 *dma_addr, u64 *dma_size)
> +{
> + int ret;
> + u64 end, mask;
> + u64 dmaaddr = 0, size = 0, offset = 0;
> +
> + /*
> +  * If @dev is expected to be DMA-capable then the bus code that created
> +  * it should have initialised its dma_mask pointer by this point. For
> +  * now, we'll continue the legacy behaviour of coercing it to the
> +  * coherent mask if not, but we'll no longer do so quietly.
> +  */
> + if (!dev->dma_mask) {
> + dev_warn(dev, "DMA mask not set\n");
> + dev->dma_mask = >coherent_dma_mask;
> + }
> +
> + if (dev->coherent_dma_mask)
> + size = max(dev->coherent_dma_mask, dev->coherent_dma_mask + 1);
> + else
> + size = 1ULL << 32;
> +
> + ret = acpi_dma_get_range(dev, &dmaaddr, &offset, &size);
> + if (ret == -ENODEV)
> + ret = iort_dma_get_ranges(dev, &size);
> + if (!ret) {
> + /*
> +  * Limit coherent and dma mask based on size retrieved from
> +  * firmware.
> +   
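
Judging by the comment above, the function presumably continues by
clamping the DMA masks to the firmware-provided window, roughly along
these lines (a reconstruction, not the quoted patch text):

    if (!ret) {
            /*
             * Limit coherent and dma mask based on size retrieved
             * from firmware.
             */
            end = dmaaddr + size - 1;
            mask = DMA_BIT_MASK(ilog2(end) + 1);
            dev->bus_dma_limit = end;
            dev->coherent_dma_mask = min(dev->coherent_dma_mask, mask);
            *dev->dma_mask = min(*dev->dma_mask, mask);
    }

    *dma_addr = dmaaddr;
    *dma_size = size;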

Re: [PATCH v4 2/6] ACPI: Move IOMMU setup code out of IORT

2021-06-16 Thread Eric Auger
Hi Jean,
On 6/10/21 9:51 AM, Jean-Philippe Brucker wrote:
> Extract the code that sets up the IOMMU infrastructure from IORT, since
> it can be reused by VIOT. Move it one level up into a new
> acpi_iommu_configure_id() function, which calls the IORT parsing
> function which in turn calls the acpi_iommu_fwspec_init() helper.
>
> Signed-off-by: Jean-Philippe Brucker 
> ---
>  include/acpi/acpi_bus.h   |  3 ++
>  include/linux/acpi_iort.h |  8 ++---
>  drivers/acpi/arm64/iort.c | 75 +--
>  drivers/acpi/scan.c   | 73 -
>  4 files changed, 87 insertions(+), 72 deletions(-)
>
> diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
> index 3a82faac5767..41f092a269f6 100644
> --- a/include/acpi/acpi_bus.h
> +++ b/include/acpi/acpi_bus.h
> @@ -588,6 +588,9 @@ struct acpi_pci_root {
>  
>  bool acpi_dma_supported(struct acpi_device *adev);
>  enum dev_dma_attr acpi_get_dma_attr(struct acpi_device *adev);
> +int acpi_iommu_fwspec_init(struct device *dev, u32 id,
> +struct fwnode_handle *fwnode,
> +const struct iommu_ops *ops);
>  int acpi_dma_get_range(struct device *dev, u64 *dma_addr, u64 *offset,
>  u64 *size);
>  int acpi_dma_configure_id(struct device *dev, enum dev_dma_attr attr,
> diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h
> index f7f054833afd..f1f0842a2cb2 100644
> --- a/include/linux/acpi_iort.h
> +++ b/include/linux/acpi_iort.h
> @@ -35,8 +35,7 @@ void acpi_configure_pmsi_domain(struct device *dev);
>  int iort_pmsi_get_dev_id(struct device *dev, u32 *dev_id);
>  /* IOMMU interface */
>  int iort_dma_get_ranges(struct device *dev, u64 *size);
> -const struct iommu_ops *iort_iommu_configure_id(struct device *dev,
> - const u32 *id_in);
> +int iort_iommu_configure_id(struct device *dev, const u32 *id_in);
>  int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head 
> *head);
>  phys_addr_t acpi_iort_dma_get_max_cpu_address(void);
>  #else
> @@ -50,9 +49,8 @@ static inline void acpi_configure_pmsi_domain(struct device 
> *dev) { }
>  /* IOMMU interface */
>  static inline int iort_dma_get_ranges(struct device *dev, u64 *size)
>  { return -ENODEV; }
> -static inline const struct iommu_ops *iort_iommu_configure_id(
> -   struct device *dev, const u32 *id_in)
> -{ return NULL; }
> +static inline int iort_iommu_configure_id(struct device *dev, const u32 
> *id_in)
> +{ return -ENODEV; }
>  static inline
>  int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head 
> *head)
>  { return 0; }
> diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
> index a940be1cf2af..b5b021e064b6 100644
> --- a/drivers/acpi/arm64/iort.c
> +++ b/drivers/acpi/arm64/iort.c
> @@ -806,23 +806,6 @@ static struct acpi_iort_node 
> *iort_get_msi_resv_iommu(struct device *dev)
>   return NULL;
>  }
>  
> -static inline const struct iommu_ops *iort_fwspec_iommu_ops(struct device 
> *dev)
> -{
> - struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
> -
> - return (fwspec && fwspec->ops) ? fwspec->ops : NULL;
> -}
> -
> -static inline int iort_add_device_replay(struct device *dev)
> -{
> - int err = 0;
> -
> - if (dev->bus && !device_iommu_mapped(dev))
> - err = iommu_probe_device(dev);
> -
> - return err;
> -}
> -
>  /**
>   * iort_iommu_msi_get_resv_regions - Reserved region driver helper
>   * @dev: Device from iommu_get_resv_regions()
> @@ -900,18 +883,6 @@ static inline bool iort_iommu_driver_enabled(u8 type)
>   }
>  }
>  
> -static int arm_smmu_iort_xlate(struct device *dev, u32 streamid,
> -struct fwnode_handle *fwnode,
> -const struct iommu_ops *ops)
> -{
> - int ret = iommu_fwspec_init(dev, fwnode, ops);
> -
> - if (!ret)
> - ret = iommu_fwspec_add_ids(dev, &streamid, 1);
> -
> - return ret;
> -}
> -
>  static bool iort_pci_rc_supports_ats(struct acpi_iort_node *node)
>  {
>   struct acpi_iort_root_complex *pci_rc;
> @@ -946,7 +917,7 @@ static int iort_iommu_xlate(struct device *dev, struct 
> acpi_iort_node *node,
>   return iort_iommu_driver_enabled(node->type) ?
>  -EPROBE_DEFER : -ENODEV;
>  
> - return arm_smmu_iort_xlate(dev, streamid, iort_fwnode, ops);
> + return acpi_iommu_fwspec_init(dev, streamid, iort_fwnode, ops);
>  }
>  
>  struct iort_pci_alias_info {
> @@ -1020,24 +991,14 @@ static int iort_nc_iommu_map_id(struct device *dev,
>   * @dev: device to configure
>   * @id_in: optional input id const value pointer
>   *
> - * Returns: iommu_ops pointer on configuration success
> - *  NULL on configuration failure
> + * Returns: 0 on success, <0 on failure
>   */
> -const struct iommu_ops *iort_iommu_configure_id(struct device *dev,
> -   

Re: Plan for /dev/ioasid RFC v2

2021-06-09 Thread Eric Auger
Hi Kevin,

On 6/9/21 11:37 AM, Tian, Kevin wrote:
>> From: Eric Auger 
>> Sent: Wednesday, June 9, 2021 4:15 PM
>>
>> Hi Kevin,
>>
>> On 6/7/21 4:58 AM, Tian, Kevin wrote:
>>> Hi, all,
>>>
>>> We plan to work on v2 now, given many good comments already received
>>> and substantial changes envisioned. This is a very complex topic with
>>> many sub-threads being discussed. To ensure that I didn't miss valuable
>>> suggestions (and also keep everyone on the same page), here I'd like to
>>> provide a list of planned changes in my mind. Please let me know if
>>> anything important is lost.  :)
>>>
>>> --
>>>
>>> (Remaining opens in v1)
>>>
>>> -   Protocol between kvm/vfio/ioasid for wbinvd/no-snoop. I'll see how
>>> much can be refined based on discussion progress when v2 is out;
>>>
>>> -   Device-centric (Jason) vs. group-centric (David) uAPI. David is not 
>>> fully
>>> convinced yet. Based on discussion v2 will continue to have ioasid uAPI
>>> being device-centric (but it's fine for vfio to be group-centric). A new
>>> section will be added to elaborate this part;
>>>
>>> -   PASID virtualization (section 4) has not been thoroughly discussed yet.
>>> Jason gave some suggestion on how to categorize intended usages.
>>> I will rephrase this section and hope more discussions can be held for
>>> it in v2;
>>>
>>> (Adopted suggestions)
>>>
>>> -   (Jason) Rename /dev/ioasid to /dev/iommu (so does uAPI e.g. IOASID
>>> _XXX to IOMMU_XXX). One suggestion (Jason) was to also rename
>>> RID+PASID to SID+SSID. But given the familiarity of the former, I will
>>> still use RID+PASID in v2 to ease the discussion;
>>>
>>> -   (Jason) v1 prevents one device from binding to multiple ioasid_fd's. 
>>> This
>>> will be fixed in v2;
>>>
>>> -   (Jean/Jason) No need to track guest I/O page tables on ARM/AMD.
>> When
>>> a pasid table is bound, it becomes a container for all guest I/O page
>> tables;
>> while I am totally in line with that change, I guess we need to revisit
>> the invalidate ioctl
>> to support PASID table invalidation.
> Yes, this is planned when doing this change.
OK
>
>>> -   (Jean/Jason) Accordingly a device label is required so iotlb 
>>> invalidation
>>> and fault handling can both support per-device operation. Per Jean's
>>> suggestion, this label will come from userspace (when VFIO_BIND_
>>> IOASID_FD);
>> what is not totally clear to me is the correspondance between this label
>> and the SID/SSID tuple.
>> My understanding is it rather maps to the SID because you can attach
>> several ioasids to the device.
>> So it is not clear to me how you reconstruct the SSID info
>>
> Yes, device handle maps to SID. The fault data reported to userspace
> will include {device_label, ioasid, vendor_fault_data}. In your case
> I believe SSID will be included in vendor_fault_data thus no reconstruct
> required. For Intel the user could figure out vPASID according to device_
> label and ioasid, i.e. no need to include PASID info in vendor_fault_data.
OK that works.

Thanks

Eric
>
> Thanks
> Kevin
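
As a purely illustrative aside, the fault report discussed here, the
{device_label, ioasid, vendor_fault_data} tuple, could be pictured as a
structure like the one below. Every field name is invented, since this
uAPI was still being designed at the time:

    struct iommu_fault_report {
            __u32 device_label;         /* set by userspace at bind time */
            __u32 ioasid;
            __u8  vendor_fault_data[];  /* carries e.g. the SSID on ARM  */
    };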



Re: Plan for /dev/ioasid RFC v2

2021-06-09 Thread Eric Auger
Hi Kevin,

On 6/7/21 4:58 AM, Tian, Kevin wrote:
> Hi, all,
>
> We plan to work on v2 now, given many good comments already received
> and substantial changes envisioned. This is a very complex topic with
> many sub-threads being discussed. To ensure that I didn't miss valuable 
> suggestions (and also keep everyone on the same page), here I'd like to 
> provide a list of planned changes in my mind. Please let me know if 
> anything important is lost.  :)
>
> --
>
> (Remaining opens in v1)
>
> -   Protocol between kvm/vfio/ioasid for wbinvd/no-snoop. I'll see how
> much can be refined based on discussion progress when v2 is out;
>
> -   Device-centric (Jason) vs. group-centric (David) uAPI. David is not fully
> convinced yet. Based on discussion v2 will continue to have ioasid uAPI
> being device-centric (but it's fine for vfio to be group-centric). A new
> section will be added to elaborate this part;
>
> -   PASID virtualization (section 4) has not been thoroughly discussed yet. 
> Jason gave some suggestion on how to categorize intended usages. 
> I will rephrase this section and hope more discussions can be held for 
> it in v2;
>
> (Adopted suggestions)
>
> -   (Jason) Rename /dev/ioasid to /dev/iommu (so does uAPI e.g. IOASID
> _XXX to IOMMU_XXX). One suggestion (Jason) was to also rename 
> RID+PASID to SID+SSID. But given the familiarity of the former, I will 
> still use RID+PASID in v2 to ease the discussion;
>
> -   (Jason) v1 prevents one device from binding to multiple ioasid_fd's. This 
> will be fixed in v2;
>
> -   (Jean/Jason) No need to track guest I/O page tables on ARM/AMD. When 
> a pasid table is bound, it becomes a container for all guest I/O page 
> tables;
while I am totally in line with that change, I guess we need to revisit
the invalidate ioctl
to support PASID table invalidation.
>
> -   (Jean/Jason) Accordingly a device label is required so iotlb invalidation 
> and fault handling can both support per-device operation. Per Jean's 
> suggestion, this label will come from userspace (when VFIO_BIND_
> IOASID_FD);

what is not totally clear to me is the correspondence between this label
and the SID/SSID tuple.
My understanding is it rather maps to the SID because you can attach
several ioasids to the device.
So it is not clear to me how you reconstruct the SSID info

Thanks

Eric
>
> -   (Jason) Addition of device label allows per-device capability/format 
> check before IOASIDs are created. This leads to another major uAPI 
> change in v2 - specify format info when creating an IOASID (mapping 
> protocol, nesting, coherent, etc.). User is expected to check per-device 
> format and then set proper format for IOASID upon to-be-attached 
> device;

> -   (Jason/David) No restriction on map/unmap vs. bind/invalidate. They
> can be used in either parent or child;
>
> -   (David) Change IOASID_GET_INFO to report permitted range instead of
> reserved IOVA ranges. This works better for PPC;
>
> -   (Jason) For helper functions, expect to have explicit bus-type wrappers
> e.g. ioasid_pci_device_attach;
>
> (Not adopted)
>
> -   (Parav) Make page pinning a syscall;
> -   (Jason. W/Enrico) one I/O page table per fd;
> -   (David) Replace IOASID_REGISTER_MEMORY through another ioasid
> nesting (sort of passthrough mode). Need more thinking. v2 will not 
> change this part;
>
> Thanks
> Kevin
>



[PATCH v13 13/13] vfio/pci: Inject page response upon response region fill

2021-04-11 Thread Eric Auger
When the userspace increments the head of the page response
ring buffer, let's push the responses into the IOMMU layer.
This is done through a workqueue that pops the responses from
the ring buffer and increments the tail.

Signed-off-by: Eric Auger 
---
 drivers/vfio/pci/vfio_pci.c | 40 +
 drivers/vfio/pci/vfio_pci_private.h |  7 +
 drivers/vfio/pci/vfio_pci_rdwr.c|  1 +
 3 files changed, 48 insertions(+)

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 560b1a830726..bb4a0e1e39bf 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -552,6 +552,32 @@ static int vfio_pci_dma_fault_init(struct vfio_pci_device 
*vdev)
return ret;
 }
 
+static void dma_response_inject(struct work_struct *work)
+{
+   struct vfio_pci_dma_fault_response_work *rwork =
+   container_of(work, struct vfio_pci_dma_fault_response_work, inject);
+   struct vfio_region_dma_fault_response *header = rwork->header;
+   struct vfio_pci_device *vdev = rwork->vdev;
+   struct iommu_page_response *resp;
+   u32 tail, head, size;
+
+   mutex_lock(&vdev->fault_response_queue_lock);
+
+   tail = header->tail;
+   head = header->head;
+   size = header->nb_entries;
+
+   while (CIRC_CNT(head, tail, size) >= 1) {
+   resp = (struct iommu_page_response *)(vdev->fault_response_pages +
+   header->offset + tail * header->entry_size);
+
+   /* TODO: properly handle the return value */
+   iommu_page_response(&vdev->pdev->dev, resp);
+   header->tail = tail = (tail + 1) % size;
+   }
+   mutex_unlock(&vdev->fault_response_queue_lock);
+}
+
 #define DMA_FAULT_RESPONSE_RING_LENGTH 512
 
 static int vfio_pci_dma_fault_response_init(struct vfio_pci_device *vdev)
@@ -597,8 +623,22 @@ static int vfio_pci_dma_fault_response_init(struct 
vfio_pci_device *vdev)
header->nb_entries = DMA_FAULT_RESPONSE_RING_LENGTH;
header->offset = PAGE_SIZE;
 
+   vdev->response_work = kzalloc(sizeof(*vdev->response_work), GFP_KERNEL);
+   if (!vdev->response_work)
+   goto out;
+   vdev->response_work->header = header;
+   vdev->response_work->vdev = vdev;
+
+   /* launch the thread that will extract the response */
+   INIT_WORK(&vdev->response_work->inject, dma_response_inject);
+   vdev->dma_fault_response_wq =
+   create_singlethread_workqueue("vfio-dma-fault-response");
+   if (!vdev->dma_fault_response_wq)
+   return -ENOMEM;
+
return 0;
 out:
+   kfree(vdev->fault_response_pages);
vdev->fault_response_pages = NULL;
return ret;
 }
diff --git a/drivers/vfio/pci/vfio_pci_private.h 
b/drivers/vfio/pci/vfio_pci_private.h
index f7b1e7fb86e5..835fbb221dea 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -52,6 +52,12 @@ struct vfio_pci_irq_ctx {
struct irq_bypass_producer  producer;
 };
 
+struct vfio_pci_dma_fault_response_work {
+   struct work_struct inject;
+   struct vfio_region_dma_fault_response *header;
+   struct vfio_pci_device *vdev;
+};
+
 struct vfio_pci_device;
 struct vfio_pci_region;
 
@@ -146,6 +152,7 @@ struct vfio_pci_device {
u8  *fault_pages;
u8  *fault_response_pages;
struct workqueue_struct *dma_fault_response_wq;
+   struct vfio_pci_dma_fault_response_work *response_work;
struct mutexfault_queue_lock;
struct mutexfault_response_queue_lock;
struct list_headdummy_resources_list;
diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c
index efde0793360b..78c494fe35cc 100644
--- a/drivers/vfio/pci/vfio_pci_rdwr.c
+++ b/drivers/vfio/pci/vfio_pci_rdwr.c
@@ -430,6 +430,7 @@ size_t vfio_pci_dma_fault_response_rw(struct 
vfio_pci_device *vdev, char __user
mutex_lock(&vdev->fault_response_queue_lock);
header->head = new_head;
mutex_unlock(&vdev->fault_response_queue_lock);
+   queue_work(vdev->dma_fault_response_wq,
+  &vdev->response_work->inject);
} else {
if (copy_to_user(buf, base + pos, count))
return -EFAULT;
-- 
2.26.3
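
The userspace producer this workqueue pairs with fills a response record
at the head index and then advances the head through the region write
path (vfio_pci_dma_fault_response_rw() above), which queues the work. A
sketch, where device_fd, region_offset, resp and the mmap'ed resp_ring
are assumptions:

    struct vfio_region_dma_fault_response hdr;
    struct iommu_page_response *ring = resp_ring;   /* mmap'ed at PAGE_SIZE */

    pread(device_fd, &hdr, sizeof(hdr), region_offset);
    ring[hdr.head % hdr.nb_entries] = *resp;        /* push one response */
    hdr.head = (hdr.head + 1) % hdr.nb_entries;
    pwrite(device_fd, &hdr.head, sizeof(hdr.head),  /* kicks the kernel side */
           region_offset +
           offsetof(struct vfio_region_dma_fault_response, head));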



[PATCH v13 12/13] vfio/pci: Register a DMA fault response region

2021-04-11 Thread Eric Auger
In preparation for vSVA, let's register a DMA fault response region,
where the userspace will push the page responses and increment the
head of the buffer. The kernel will pop those responses and inject them
on the IOMMU side.

Signed-off-by: Eric Auger 

---

v11 -> v12:
- use DMA_FAULT_RESPONSE cap [Shameer]
- struct vfio_pci_device dma_fault_response_wq field introduced in
  this patch
- return 0 if the domain is NULL
- pass an int pointer to iommu_domain_get_attr
---
 drivers/vfio/pci/vfio_pci.c | 125 ++--
 drivers/vfio/pci/vfio_pci_private.h |   6 ++
 drivers/vfio/pci/vfio_pci_rdwr.c|  39 +
 include/uapi/linux/vfio.h   |  32 +++
 4 files changed, 193 insertions(+), 9 deletions(-)

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 72d7c667b64c..560b1a830726 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -316,9 +316,20 @@ static void vfio_pci_dma_fault_release(struct 
vfio_pci_device *vdev,
kfree(vdev->fault_pages);
 }
 
-static int vfio_pci_dma_fault_mmap(struct vfio_pci_device *vdev,
-  struct vfio_pci_region *region,
-  struct vm_area_struct *vma)
+static void
+vfio_pci_dma_fault_response_release(struct vfio_pci_device *vdev,
+   struct vfio_pci_region *region)
+{
+   if (vdev->dma_fault_response_wq)
+   destroy_workqueue(vdev->dma_fault_response_wq);
+   kfree(vdev->fault_response_pages);
+   vdev->fault_response_pages = NULL;
+}
+
+static int __vfio_pci_dma_fault_mmap(struct vfio_pci_device *vdev,
+struct vfio_pci_region *region,
+struct vm_area_struct *vma,
+u8 *pages)
 {
u64 phys_len, req_len, pgoff, req_start;
unsigned long long addr;
@@ -331,14 +342,14 @@ static int vfio_pci_dma_fault_mmap(struct vfio_pci_device 
*vdev,
((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
req_start = pgoff << PAGE_SHIFT;
 
-   /* only the second page of the producer fault region is mmappable */
+   /* only the second page of the fault region is mmappable */
if (req_start < PAGE_SIZE)
return -EINVAL;
 
if (req_start + req_len > phys_len)
return -EINVAL;
 
-   addr = virt_to_phys(vdev->fault_pages);
+   addr = virt_to_phys(pages);
vma->vm_private_data = vdev;
vma->vm_pgoff = (addr >> PAGE_SHIFT) + pgoff;
 
@@ -347,13 +358,29 @@ static int vfio_pci_dma_fault_mmap(struct vfio_pci_device 
*vdev,
return ret;
 }
 
-static int vfio_pci_dma_fault_add_capability(struct vfio_pci_device *vdev,
-struct vfio_pci_region *region,
-struct vfio_info_cap *caps)
+static int vfio_pci_dma_fault_mmap(struct vfio_pci_device *vdev,
+  struct vfio_pci_region *region,
+  struct vm_area_struct *vma)
+{
+   return __vfio_pci_dma_fault_mmap(vdev, region, vma, vdev->fault_pages);
+}
+
+static int
+vfio_pci_dma_fault_response_mmap(struct vfio_pci_device *vdev,
+   struct vfio_pci_region *region,
+   struct vm_area_struct *vma)
+{
+   return __vfio_pci_dma_fault_mmap(vdev, region, vma, vdev->fault_response_pages);
+}
+
+static int __vfio_pci_dma_fault_add_capability(struct vfio_pci_device *vdev,
+  struct vfio_pci_region *region,
+  struct vfio_info_cap *caps,
+  u32 cap_id)
 {
struct vfio_region_info_cap_sparse_mmap *sparse = NULL;
struct vfio_region_info_cap_fault cap = {
-   .header.id = VFIO_REGION_INFO_CAP_DMA_FAULT,
+   .header.id = cap_id,
.header.version = 1,
.version = 1,
};
@@ -381,6 +408,23 @@ static int vfio_pci_dma_fault_add_capability(struct 
vfio_pci_device *vdev,
return ret;
 }
 
+static int vfio_pci_dma_fault_add_capability(struct vfio_pci_device *vdev,
+struct vfio_pci_region *region,
+struct vfio_info_cap *caps)
+{
+   return __vfio_pci_dma_fault_add_capability(vdev, region, caps,
+  VFIO_REGION_INFO_CAP_DMA_FAULT);
+}
+
+static int
+vfio_pci_dma_fault_response_add_capability(struct vfio_pci_device *vdev,
+  struct vfio_pci_region *region,
+  struct vfio_info_cap *caps)
+{
+   retu
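
Schematically, the region layout this patch establishes (and which the
req_start < PAGE_SIZE check in the mmap helper above enforces) is:

    /*
     * DMA fault response region layout (schematic):
     *
     *   offset 0        : header page, accessed through region read/write
     *                     (struct vfio_region_dma_fault_response: head,
     *                      tail, nb_entries, entry_size, offset, ...)
     *   offset PAGE_SIZE: mmappable ring of nb_entries records of
     *                     entry_size bytes each
     */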

[PATCH v13 11/13] vfio: Document nested stage control

2021-04-11 Thread Eric Auger
The VFIO API was enhanced to support nested stage control: a bunch of
new ioctls, one DMA FAULT region and an associated specific IRQ.

Let's document the process to follow to set up nested mode.

Signed-off-by: Eric Auger 

---

v11 -> v12:
s/VFIO_REGION_INFO_CAP_PRODUCER_FAULT/VFIO_REGION_INFO_CAP_DMA_FAULT

v8 -> v9:
- new names for SET_MSI_BINDING and SET_PASID_TABLE
- new layout for the DMA FAULT memory region and specific IRQ

v2 -> v3:
- document the new fault API

v1 -> v2:
- use the new ioctl names
- add doc related to fault handling
---
 Documentation/driver-api/vfio.rst | 77 +++
 1 file changed, 77 insertions(+)

diff --git a/Documentation/driver-api/vfio.rst 
b/Documentation/driver-api/vfio.rst
index f1a4d3c3ba0b..14e41324237d 100644
--- a/Documentation/driver-api/vfio.rst
+++ b/Documentation/driver-api/vfio.rst
@@ -239,6 +239,83 @@ group and can access them as follows::
/* Gratuitous device reset and go... */
ioctl(device, VFIO_DEVICE_RESET);
 
+IOMMU Dual Stage Control
+
+
+Some IOMMUs support 2 stages/levels of translation. "Stage" corresponds to
+the ARM terminology while "level" corresponds to Intel's VTD terminology. In
+the following text we use either without distinction.
+
+This is useful when the guest is exposed to a virtual IOMMU and some
+devices are assigned to the guest through VFIO. Then the guest OS can use
+stage 1 (IOVA -> GPA), while the hypervisor uses stage 2 for VM isolation
+(GPA -> HPA).
+
+The guest gets ownership of the stage 1 page tables and also owns stage 1
+configuration structures. The hypervisor owns the root configuration structure
+(for security reasons), including stage 2 configuration. This works as long
+as configuration structures and page table formats are compatible between the
+virtual IOMMU and the physical IOMMU.
+
+Assuming the HW supports it, this nested mode is selected by choosing the
+VFIO_TYPE1_NESTING_IOMMU type through:
+
+ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_NESTING_IOMMU);
+
+This forces the hypervisor to use the stage 2, leaving stage 1 available for
+guest usage.
+
+Once groups are attached to the container, the guest stage 1 translation
+configuration data can be passed to VFIO by using
+
+ioctl(container, VFIO_IOMMU_SET_PASID_TABLE, &pasid_table_info);
+
+This allows combining the guest stage 1 configuration structure with
+the hypervisor stage 2 configuration structure. Stage 1 configuration
+structures are dependent on the IOMMU type.
+
+As the stage 1 translation is fully delegated to the HW, translation faults
+encountered during the translation process need to be propagated up to
+the virtualizer and re-injected into the guest.
+
+The userspace must be prepared to receive faults. The VFIO-PCI device
+exposes one dedicated DMA FAULT region: it contains a ring buffer and
+its header that allows to manage the head/tail indices. The region is
+identified by the following index/subindex:
+- VFIO_REGION_TYPE_NESTED/VFIO_REGION_SUBTYPE_NESTED_DMA_FAULT
+
+The DMA FAULT region exposes a VFIO_REGION_INFO_CAP_DMA_FAULT
+region capability that allows the userspace to retrieve the ABI version
+of the fault records filled by the host.
+
+On top of that region, the userspace can be notified whenever a fault
+occurs at the physical level. It can use the VFIO_IRQ_TYPE_NESTED/
+VFIO_IRQ_SUBTYPE_DMA_FAULT specific IRQ to attach the eventfd to be
+signalled.
+
+The ring buffer containing the fault records can be mmapped. When
+the userspace consumes a fault in the queue, it should increment
+the consumer index to allow new fault records to replace the used ones.
+
+The queue size and the entry size can be retrieved in the header.
+The tail index should never overshoot the producer index as in any
+other circular buffer scheme. Also it must be less than the queue size
+otherwise the change fails.
+
+When the guest invalidates stage 1 related caches, invalidations must be
+forwarded to the host through
+ioctl(container, VFIO_IOMMU_CACHE_INVALIDATE, &inv_data);
+Those invalidations can happen at various granularity levels, page, context, ...
+
+The ARM SMMU specification introduces another challenge: MSIs are translated by
+both the virtual SMMU and the physical SMMU. To build a nested mapping for the
+IOVA programmed into the assigned device, the guest needs to pass its IOVA/MSI
+doorbell GPA binding to the host. Then the hypervisor can build a nested stage 2
+binding eventually translating into the physical MSI doorbell.
+
+This is achieved by calling
+ioctl(container, VFIO_IOMMU_SET_MSI_BINDING, &msi_binding);
+
 VFIO User API
 ---
 
-- 
2.26.3
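
Turning the fault ring description above into code, a minimal userspace
consumer loop could look like the following sketch; the helpers
wait_on_dma_fault_eventfd(), read_fault_header(), ring_entry(),
forward_to_virtual_iommu() and write_fault_tail() are hypothetical
stand-ins for region reads/writes:

    for (;;) {
            uint32_t head, tail, nb_entries;

            wait_on_dma_fault_eventfd();            /* NESTED/DMA_FAULT IRQ */
            read_fault_header(&head, &tail, &nb_entries);

            while (tail != head) {
                    struct iommu_fault *evt = ring_entry(tail);

                    forward_to_virtual_iommu(evt);
                    tail = (tail + 1) % nb_entries;
            }
            write_fault_tail(tail);                 /* free consumed slots */
    }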



[PATCH v13 10/13] vfio/pci: Register and allow DMA FAULT IRQ signaling

2021-04-11 Thread Eric Auger
Register the VFIO_IRQ_TYPE_NESTED/VFIO_IRQ_SUBTYPE_DMA_FAULT
IRQ that allows to signal a nested mode DMA fault.

Signed-off-by: Eric Auger 

---

v10 -> v11:
- the irq now is registered in vfio_pci_dma_fault_init()
  in case the domain is nested
---
 drivers/vfio/pci/vfio_pci.c | 21 -
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index b3fc6ed4ed7a..72d7c667b64c 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -396,6 +396,7 @@ vfio_pci_iommu_dev_fault_handler(struct iommu_fault *fault, 
void *data)
struct vfio_region_dma_fault *reg =
(struct vfio_region_dma_fault *)vdev->fault_pages;
struct iommu_fault *new;
u32 head, tail, size;
+   int ext_irq_index;
int ret = -EINVAL;
 
if (WARN_ON(!reg))
@@ -420,7 +421,19 @@ vfio_pci_iommu_dev_fault_handler(struct iommu_fault 
*fault, void *data)
ret = 0;
 unlock:
mutex_unlock(&vdev->fault_queue_lock);
-   return ret;
+   if (ret)
+   return ret;
+
+   ext_irq_index = vfio_pci_get_ext_irq_index(vdev, VFIO_IRQ_TYPE_NESTED,
+  VFIO_IRQ_SUBTYPE_DMA_FAULT);
+   if (ext_irq_index < 0)
+   return -EINVAL;
+
+   mutex_lock(&vdev->igate);
+   if (vdev->ext_irqs[ext_irq_index].trigger)
+   eventfd_signal(vdev->ext_irqs[ext_irq_index].trigger, 1);
+   mutex_unlock(&vdev->igate);
+   return 0;
 }
 
 #define DMA_FAULT_RING_LENGTH 512
@@ -475,6 +488,12 @@ static int vfio_pci_dma_fault_init(struct vfio_pci_device 
*vdev)
if (ret) /* the dma fault region is freed in vfio_pci_disable() */
goto out;
 
+   ret = vfio_pci_register_irq(vdev, VFIO_IRQ_TYPE_NESTED,
+   VFIO_IRQ_SUBTYPE_DMA_FAULT,
+   VFIO_IRQ_INFO_EVENTFD);
+   if (ret) /* the fault handler is also freed in vfio_pci_disable() */
+   goto out;
+
return 0;
 out:
kfree(vdev->fault_pages);
-- 
2.26.3
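
On the receiving end, userspace attaches an eventfd to this IRQ with the
usual VFIO_DEVICE_SET_IRQS flow. A sketch, where dma_fault_index is
assumed to have been discovered through the IRQ capability chain:

    int efd = eventfd(0, EFD_CLOEXEC);
    size_t sz = sizeof(struct vfio_irq_set) + sizeof(int32_t);
    struct vfio_irq_set *set = calloc(1, sz);

    set->argsz = sz;
    set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
    set->index = dma_fault_index;   /* >= VFIO_PCI_NUM_IRQS */
    set->start = 0;
    set->count = 1;
    memcpy(set->data, &efd, sizeof(int32_t));

    ioctl(device_fd, VFIO_DEVICE_SET_IRQS, set);
    /* then poll(efd) and drain the fault ring when it fires */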



[PATCH v13 09/13] vfio: Add new IRQ for DMA fault reporting

2021-04-11 Thread Eric Auger
Add a new IRQ type/subtype to get notification on nested
stage DMA faults.

Signed-off-by: Eric Auger 
---
 include/uapi/linux/vfio.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 0728b6f3f348..ad7c275b4074 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -729,6 +729,9 @@ struct vfio_irq_info_cap_type {
__u32 subtype;  /* type specific */
 };
 
+#define VFIO_IRQ_TYPE_NESTED   (1)
+#define VFIO_IRQ_SUBTYPE_DMA_FAULT (1)
+
 /**
  * VFIO_DEVICE_SET_IRQS - _IOW(VFIO_TYPE, VFIO_BASE + 10, struct vfio_irq_set)
  *
-- 
2.26.3



[PATCH v13 08/13] vfio/pci: Add framework for custom interrupt indices

2021-04-11 Thread Eric Auger
Implement IRQ capability chain infrastructure. All interrupt
indexes beyond VFIO_PCI_NUM_IRQS are handled as extended
interrupts. They are registered with a specific type/subtype
and supported flags.

Signed-off-by: Eric Auger 

---

v11 -> v12:
- check !vdev->num_ext_irqs in vfio_pci_set_ext_irq_trigger()
  [Shameer, Qubingbing]
---
 drivers/vfio/pci/vfio_pci.c | 99 +++--
 drivers/vfio/pci/vfio_pci_intrs.c   | 62 ++
 drivers/vfio/pci/vfio_pci_private.h | 14 
 3 files changed, 157 insertions(+), 18 deletions(-)

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index eef76560be55..b3fc6ed4ed7a 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -610,6 +610,14 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev)
ret = iommu_unregister_device_fault_handler(&vdev->pdev->dev);
WARN_ON(ret == -EBUSY);
 
+   for (i = 0; i < vdev->num_ext_irqs; i++)
+   vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE |
+   VFIO_IRQ_SET_ACTION_TRIGGER,
+   VFIO_PCI_NUM_IRQS + i, 0, 0, NULL);
+   vdev->num_ext_irqs = 0;
+   kfree(vdev->ext_irqs);
+   vdev->ext_irqs = NULL;
+
/* Device closed, don't need mutex here */
list_for_each_entry_safe(ioeventfd, ioeventfd_tmp,
&vdev->ioeventfds_list, next) {
@@ -825,6 +833,9 @@ static int vfio_pci_get_irq_count(struct vfio_pci_device 
*vdev, int irq_type)
return 1;
} else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) {
return 1;
+   } else if (irq_type >= VFIO_PCI_NUM_IRQS &&
+  irq_type < VFIO_PCI_NUM_IRQS + vdev->num_ext_irqs) {
+   return 1;
}
 
return 0;
@@ -1011,7 +1022,7 @@ static long vfio_pci_ioctl(void *device_data,
info.flags |= VFIO_DEVICE_FLAGS_RESET;
 
info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions;
-   info.num_irqs = VFIO_PCI_NUM_IRQS;
+   info.num_irqs = VFIO_PCI_NUM_IRQS + vdev->num_ext_irqs;
 
ret = vfio_pci_info_zdev_add_caps(vdev, &caps);
if (ret && ret != -ENODEV) {
@@ -1187,36 +1198,87 @@ static long vfio_pci_ioctl(void *device_data,
 
} else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
struct vfio_irq_info info;
+   struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
+   unsigned long capsz;
 
minsz = offsetofend(struct vfio_irq_info, count);
 
+   /* For backward compatibility, cannot require this */
+   capsz = offsetofend(struct vfio_irq_info, cap_offset);
+
if (copy_from_user(&info, (void __user *)arg, minsz))
return -EFAULT;
 
-   if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
+   if (info.argsz < minsz ||
+   info.index >= VFIO_PCI_NUM_IRQS + vdev->num_ext_irqs)
return -EINVAL;
 
-   switch (info.index) {
-   case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX:
-   case VFIO_PCI_REQ_IRQ_INDEX:
-   break;
-   case VFIO_PCI_ERR_IRQ_INDEX:
-   if (pci_is_pcie(vdev->pdev))
-   break;
-   fallthrough;
-   default:
-   return -EINVAL;
-   }
+   if (info.argsz >= capsz)
+   minsz = capsz;
 
info.flags = VFIO_IRQ_INFO_EVENTFD;
 
-   info.count = vfio_pci_get_irq_count(vdev, info.index);
-
-   if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
+   switch (info.index) {
+   case VFIO_PCI_INTX_IRQ_INDEX:
info.flags |= (VFIO_IRQ_INFO_MASKABLE |
   VFIO_IRQ_INFO_AUTOMASKED);
-   else
+   break;
+   case VFIO_PCI_MSI_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX:
+   case VFIO_PCI_REQ_IRQ_INDEX:
info.flags |= VFIO_IRQ_INFO_NORESIZE;
+   break;
+   case VFIO_PCI_ERR_IRQ_INDEX:
+   info.flags |= VFIO_IRQ_INFO_NORESIZE;
+   if (!pci_is_pcie(vdev->pdev))
+   return -EINVAL;
+   break;
+   default:
+   {
+   struct vfio_irq_info_cap_type cap_type = {
+   .header.id = VFIO_IRQ_INFO_CAP_TYPE,
+   .header.version = 1 };
+   int ret, i;
+
+   if (info.index >= VFIO_PCI_NUM_IRQS +
+ 

[PATCH v13 07/13] vfio: Use capability chains to handle device specific irq

2021-04-11 Thread Eric Auger
From: Tina Zhang 

Caps the number of irqs with fixed indexes and uses capability chains
to chain device-specific IRQs.

Signed-off-by: Tina Zhang 
Signed-off-by: Eric Auger 
[Eric: Put cap_offset at the end of the vfio_irq_info struct,
remove GFX IRQ at the moment and remove any reference to this latter
in the commit message]

---
---
 include/uapi/linux/vfio.h | 19 ++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index ca8cc796e254..0728b6f3f348 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -708,11 +708,27 @@ struct vfio_irq_info {
 #define VFIO_IRQ_INFO_MASKABLE (1 << 1)
 #define VFIO_IRQ_INFO_AUTOMASKED   (1 << 2)
 #define VFIO_IRQ_INFO_NORESIZE (1 << 3)
+#define VFIO_IRQ_INFO_FLAG_CAPS (1 << 4) /* Info supports caps */
__u32   index;  /* IRQ index */
__u32   count;  /* Number of IRQs within this index */
+   __u32   cap_offset; /* Offset within info struct of first cap */
 };
 #define VFIO_DEVICE_GET_IRQ_INFO   _IO(VFIO_TYPE, VFIO_BASE + 9)
 
+/*
+ * The irq type capability allows IRQs unique to a specific device or
+ * class of devices to be exposed.
+ *
+ * The structures below define version 1 of this capability.
+ */
+#define VFIO_IRQ_INFO_CAP_TYPE  3
+
+struct vfio_irq_info_cap_type {
+   struct vfio_info_cap_header header;
+   __u32 type; /* global per bus driver */
+   __u32 subtype;  /* type specific */
+};
+
 /**
  * VFIO_DEVICE_SET_IRQS - _IOW(VFIO_TYPE, VFIO_BASE + 10, struct vfio_irq_set)
  *
@@ -814,7 +830,8 @@ enum {
VFIO_PCI_MSIX_IRQ_INDEX,
VFIO_PCI_ERR_IRQ_INDEX,
VFIO_PCI_REQ_IRQ_INDEX,
-   VFIO_PCI_NUM_IRQS
+   VFIO_PCI_NUM_IRQS = 5   /* Fixed user ABI, IRQ indexes >=5 use   */
+   /* device specific cap to define content */
 };
 
 /*
-- 
2.26.3



[PATCH v13 06/13] vfio/pci: Allow to mmap the fault queue

2021-04-11 Thread Eric Auger
The DMA FAULT region contains the fault ring buffer.
There is a benefit in letting userspace mmap this area.
Expose this mmappable area through a sparse mmap entry
and implement the mmap operation.

Signed-off-by: Eric Auger 
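
As an illustration, userspace could map the ring with a sketch like
the one below. device_fd, region_offset and region_size stand for
values discovered through VFIO_DEVICE_GET_REGION_INFO and its sparse
mmap capability; they are assumptions, not part of this patch:

#include <unistd.h>
#include <sys/mman.h>

/*
 * Map the mmappable part of the DMA FAULT region. Per the sparse
 * mmap capability, only the area past the first (header) page is
 * exposed: offset PAGE_SIZE, size region_size - PAGE_SIZE.
 */
static void *map_fault_ring(int device_fd, off_t region_offset,
			    size_t region_size)
{
	long page = sysconf(_SC_PAGESIZE);

	return mmap(NULL, region_size - page, PROT_READ, MAP_SHARED,
		    device_fd, region_offset + page);
}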

---

v8 -> v9:
- remove unused index local variable in vfio_pci_fault_mmap
---
 drivers/vfio/pci/vfio_pci.c | 61 +++--
 1 file changed, 58 insertions(+), 3 deletions(-)

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 92840e0f46bf..eef76560be55 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -316,21 +316,75 @@ static void vfio_pci_dma_fault_release(struct vfio_pci_device *vdev,
kfree(vdev->fault_pages);
 }
 
+static int vfio_pci_dma_fault_mmap(struct vfio_pci_device *vdev,
+  struct vfio_pci_region *region,
+  struct vm_area_struct *vma)
+{
+   u64 phys_len, req_len, pgoff, req_start;
+   unsigned long long addr;
+   int ret;
+
+   phys_len = region->size;
+
+   req_len = vma->vm_end - vma->vm_start;
+   pgoff = vma->vm_pgoff &
+   ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
+   req_start = pgoff << PAGE_SHIFT;
+
+   /* only the second page of the producer fault region is mmappable */
+   if (req_start < PAGE_SIZE)
+   return -EINVAL;
+
+   if (req_start + req_len > phys_len)
+   return -EINVAL;
+
+   addr = virt_to_phys(vdev->fault_pages);
+   vma->vm_private_data = vdev;
+   vma->vm_pgoff = (addr >> PAGE_SHIFT) + pgoff;
+
+   ret = remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
+ req_len, vma->vm_page_prot);
+   return ret;
+}
+
 static int vfio_pci_dma_fault_add_capability(struct vfio_pci_device *vdev,
 struct vfio_pci_region *region,
 struct vfio_info_cap *caps)
 {
+   struct vfio_region_info_cap_sparse_mmap *sparse = NULL;
struct vfio_region_info_cap_fault cap = {
.header.id = VFIO_REGION_INFO_CAP_DMA_FAULT,
.header.version = 1,
.version = 1,
};
-   return vfio_info_add_capability(caps, &cap.header, sizeof(cap));
+   size_t size = sizeof(*sparse) + sizeof(*sparse->areas);
+   int ret;
+
+   ret = vfio_info_add_capability(caps, &cap.header, sizeof(cap));
+   if (ret)
+   return ret;
+
+   sparse = kzalloc(size, GFP_KERNEL);
+   if (!sparse)
+   return -ENOMEM;
+
+   sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
+   sparse->header.version = 1;
+   sparse->nr_areas = 1;
+   sparse->areas[0].offset = PAGE_SIZE;
+   sparse->areas[0].size = region->size - PAGE_SIZE;
+
+   ret = vfio_info_add_capability(caps, &sparse->header, size);
+   if (ret)
+   kfree(sparse);
+
+   return ret;
 }
 
 static const struct vfio_pci_regops vfio_pci_dma_fault_regops = {
.rw = vfio_pci_dma_fault_rw,
.release= vfio_pci_dma_fault_release,
+   .mmap   = vfio_pci_dma_fault_mmap,
.add_capability = vfio_pci_dma_fault_add_capability,
 };
 
@@ -404,7 +458,8 @@ static int vfio_pci_dma_fault_init(struct vfio_pci_device *vdev)
VFIO_REGION_TYPE_NESTED,
VFIO_REGION_SUBTYPE_NESTED_DMA_FAULT,
-   &vfio_pci_dma_fault_regops, size,
-   VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE,
+   VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE |
+   VFIO_REGION_INFO_FLAG_MMAP,
vdev->fault_pages);
if (ret)
goto out;
@@ -412,7 +467,7 @@ static int vfio_pci_dma_fault_init(struct vfio_pci_device *vdev)
header = (struct vfio_region_dma_fault *)vdev->fault_pages;
header->entry_size = sizeof(struct iommu_fault);
header->nb_entries = DMA_FAULT_RING_LENGTH;
-   header->offset = sizeof(struct vfio_region_dma_fault);
+   header->offset = PAGE_SIZE;
 
ret = iommu_register_device_fault_handler(>pdev->dev,
vfio_pci_iommu_dev_fault_handler,
-- 
2.26.3



[PATCH v13 05/13] vfio/pci: Register an iommu fault handler

2021-04-11 Thread Eric Auger
Register an IOMMU fault handler which records faults in
the DMA FAULT region ring buffer. In a subsequent patch, we
will add the signaling of a specific eventfd to allow
userspace to be notified whenever a new fault shows up.

Signed-off-by: Eric Auger 
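
The handler below is the producer side of a classic circular buffer;
a userspace consumer would do the converse. A sketch, assuming the
header snapshot is obtained through the region read interface and
the ring is mmapped; handle_fault() is a hypothetical callback:

/* Drain pending faults from the mmapped ring. */
static void drain_faults(struct vfio_region_dma_fault *hdr, char *ring)
{
	__u32 tail = hdr->tail;

	while (tail != hdr->head) {
		struct iommu_fault *flt =
			(struct iommu_fault *)(ring + tail * hdr->entry_size);

		handle_fault(flt);	/* hypothetical consumer hook */
		tail = (tail + 1) % hdr->nb_entries;
	}
	hdr->tail = tail;	/* written back via pwrite() in practice */
}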

---
v11 -> v12:
- take the fault_queue_lock before reading header (Zenghui)
- also record recoverable errors
- only WARN_ON if the unregistration returns -EBUSY
- make vfio_pci_iommu_dev_fault_handler static

v10 -> v11:
- move iommu_unregister_device_fault_handler into
  vfio_pci_disable
- check fault_pages != 0

v8 -> v9:
- handler now takes an iommu_fault handle
- eventfd signaling moved to a subsequent patch
- check the fault type and return an error if != UNRECOV
- still the fault handler registration can fail. We need to
  reach an agreement about how to deal with the situation

v3 -> v4:
- move iommu_unregister_device_fault_handler to vfio_pci_release
---
 drivers/vfio/pci/vfio_pci.c | 48 -
 1 file changed, 47 insertions(+), 1 deletion(-)

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index f37cabddf1d6..92840e0f46bf 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -27,6 +27,7 @@
 #include 
 #include 
 #include 
+#include <linux/circ_buf.h>
 
 #include "vfio_pci_private.h"
 
@@ -333,6 +334,41 @@ static const struct vfio_pci_regops vfio_pci_dma_fault_regops = {
.add_capability = vfio_pci_dma_fault_add_capability,
 };
 
+static int
+vfio_pci_iommu_dev_fault_handler(struct iommu_fault *fault, void *data)
+{
+   struct vfio_pci_device *vdev = (struct vfio_pci_device *)data;
+   struct vfio_region_dma_fault *reg =
+   (struct vfio_region_dma_fault *)vdev->fault_pages;
+   struct iommu_fault *new;
+   u32 head, tail, size;
+   int ret = -EINVAL;
+
+   if (WARN_ON(!reg))
+   return ret;
+
+   mutex_lock(&vdev->fault_queue_lock);
+
+   head = reg->head;
+   tail = reg->tail;
+   size = reg->nb_entries;
+
+   new = (struct iommu_fault *)(vdev->fault_pages + reg->offset +
+head * reg->entry_size);
+
+   if (CIRC_SPACE(head, tail, size) < 1) {
+   ret = -ENOSPC;
+   goto unlock;
+   }
+
+   *new = *fault;
+   reg->head = (head + 1) % size;
+   ret = 0;
+unlock:
+   mutex_unlock(&vdev->fault_queue_lock);
+   return ret;
+}
+
 #define DMA_FAULT_RING_LENGTH 512
 
 static int vfio_pci_dma_fault_init(struct vfio_pci_device *vdev)
@@ -377,6 +413,13 @@ static int vfio_pci_dma_fault_init(struct vfio_pci_device *vdev)
header->entry_size = sizeof(struct iommu_fault);
header->nb_entries = DMA_FAULT_RING_LENGTH;
header->offset = sizeof(struct vfio_region_dma_fault);
+
+   ret = iommu_register_device_fault_handler(&vdev->pdev->dev,
+   vfio_pci_iommu_dev_fault_handler,
+   vdev);
+   if (ret) /* the dma fault region is freed in vfio_pci_disable() */
+   goto out;
+
return 0;
 out:
kfree(vdev->fault_pages);
@@ -500,7 +543,7 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev)
struct pci_dev *pdev = vdev->pdev;
struct vfio_pci_dummy_resource *dummy_res, *tmp;
struct vfio_pci_ioeventfd *ioeventfd, *ioeventfd_tmp;
-   int i, bar;
+   int i, bar, ret;
 
/* Stop the device from further DMA */
pci_clear_master(pdev);
@@ -509,6 +552,9 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev)
VFIO_IRQ_SET_ACTION_TRIGGER,
vdev->irq_type, 0, 0, NULL);
 
+   ret = iommu_unregister_device_fault_handler(&vdev->pdev->dev);
+   WARN_ON(ret == -EBUSY);
+
/* Device closed, don't need mutex here */
list_for_each_entry_safe(ioeventfd, ioeventfd_tmp,
 &vdev->ioeventfds_list, next) {
-- 
2.26.3



[PATCH v13 04/13] vfio/pci: Add VFIO_REGION_TYPE_NESTED region type

2021-04-11 Thread Eric Auger
Add a new specific DMA_FAULT region aiming to expose nested mode
translation faults. This region is only exposed if the device
is attached to a nested domain.

The region has a ring buffer that contains the actual fault
records plus a header allowing userspace to handle it (tail/head
indices, max capacity, entry size). At the moment the region is
dimensioned for 512 fault records.

Signed-off-by: Eric Auger 
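
For reference, the ring geometry can be fetched with a plain region
read before any fault handling starts (a sketch; region_offset is
assumed to come from VFIO_DEVICE_GET_REGION_INFO):

#include <unistd.h>
#include <linux/vfio.h>

/* Read the DMA FAULT region header to learn the ring geometry. */
static int read_fault_header(int device_fd, off_t region_offset,
			     struct vfio_region_dma_fault *hdr)
{
	ssize_t n = pread(device_fd, hdr, sizeof(*hdr), region_offset);

	return n == (ssize_t)sizeof(*hdr) ? 0 : -1;
}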

---
v11 -> v12:
- set fault_pages to NULL after free
- check new_tail >= header->nb_entries (Zenghui)
- update value of VFIO_REGION_TYPE_NESTED
- handle the case where the domain is NULL
- pass an int pointer to iommu_domain_get_attr [Shenming]

v10 -> v11:
- rename vfio_pci_init_dma_fault_region into
  vfio_pci_dma_fault_init
- free fault_pages in vfio_pci_dma_fault_release
- only register the region if the device is attached
  to a nested domain

v8 -> v9:
- Use a single region instead of a prod/cons region

v4 -> v5
- check cons is not null in vfio_pci_check_cons_fault

v3 -> v4:
- use 2 separate regions, respectively in read and write modes
- add the version capability
---
 drivers/vfio/pci/vfio_pci.c | 79 +
 drivers/vfio/pci/vfio_pci_private.h |  6 +++
 drivers/vfio/pci/vfio_pci_rdwr.c| 44 
 include/uapi/linux/vfio.h   | 35 +
 4 files changed, 164 insertions(+)

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 65e7e6b44578..f37cabddf1d6 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -309,6 +309,81 @@ int vfio_pci_set_power_state(struct vfio_pci_device *vdev, pci_power_t state)
return ret;
 }
 
+static void vfio_pci_dma_fault_release(struct vfio_pci_device *vdev,
+  struct vfio_pci_region *region)
+{
+   kfree(vdev->fault_pages);
+}
+
+static int vfio_pci_dma_fault_add_capability(struct vfio_pci_device *vdev,
+struct vfio_pci_region *region,
+struct vfio_info_cap *caps)
+{
+   struct vfio_region_info_cap_fault cap = {
+   .header.id = VFIO_REGION_INFO_CAP_DMA_FAULT,
+   .header.version = 1,
+   .version = 1,
+   };
+   return vfio_info_add_capability(caps, &cap.header, sizeof(cap));
+}
+
+static const struct vfio_pci_regops vfio_pci_dma_fault_regops = {
+   .rw = vfio_pci_dma_fault_rw,
+   .release= vfio_pci_dma_fault_release,
+   .add_capability = vfio_pci_dma_fault_add_capability,
+};
+
+#define DMA_FAULT_RING_LENGTH 512
+
+static int vfio_pci_dma_fault_init(struct vfio_pci_device *vdev)
+{
+   struct vfio_region_dma_fault *header;
+   struct iommu_domain *domain;
+   size_t size;
+   int nested;
+   int ret;
+
+   domain = iommu_get_domain_for_dev(&vdev->pdev->dev);
+   if (!domain)
+   return 0;
+
+   ret = iommu_domain_get_attr(domain, DOMAIN_ATTR_NESTING, &nested);
+   if (ret || !nested)
+   return ret;
+
+   mutex_init(&vdev->fault_queue_lock);
+
+   /*
+* We provision 1 page for the header and space for
+* DMA_FAULT_RING_LENGTH fault records in the ring buffer.
+*/
+   size = ALIGN(sizeof(struct iommu_fault) *
+DMA_FAULT_RING_LENGTH, PAGE_SIZE) + PAGE_SIZE;
+
+   vdev->fault_pages = kzalloc(size, GFP_KERNEL);
+   if (!vdev->fault_pages)
+   return -ENOMEM;
+
+   ret = vfio_pci_register_dev_region(vdev,
+   VFIO_REGION_TYPE_NESTED,
+   VFIO_REGION_SUBTYPE_NESTED_DMA_FAULT,
+   &vfio_pci_dma_fault_regops, size,
+   VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE,
+   vdev->fault_pages);
+   if (ret)
+   goto out;
+
+   header = (struct vfio_region_dma_fault *)vdev->fault_pages;
+   header->entry_size = sizeof(struct iommu_fault);
+   header->nb_entries = DMA_FAULT_RING_LENGTH;
+   header->offset = sizeof(struct vfio_region_dma_fault);
+   return 0;
+out:
+   kfree(vdev->fault_pages);
+   vdev->fault_pages = NULL;
+   return ret;
+}
+
 static int vfio_pci_enable(struct vfio_pci_device *vdev)
 {
struct pci_dev *pdev = vdev->pdev;
@@ -407,6 +482,10 @@ static int vfio_pci_enable(struct vfio_pci_device *vdev)
}
}
 
+   ret = vfio_pci_dma_fault_init(vdev);
+   if (ret)
+   goto disable_exit;
+
vfio_pci_probe_mmaps(vdev);
 
return 0;
diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
index 9cd1882a05af..76cb3bd11375 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -134,6 +134,8 @@ struct vfio_pci_device {
int ioeventfds_nr;
struct eventfd_ctx  *err_trigger;
struct eventfd_ctx 

[PATCH v13 03/13] vfio: VFIO_IOMMU_SET_MSI_BINDING

2021-04-11 Thread Eric Auger
This patch adds the VFIO_IOMMU_SET_MSI_BINDING ioctl, which aims
to (un)register the guest MSI binding to the host. The host
can then use those stage 1 bindings to build a nested stage
binding targeting the physical MSIs.

Signed-off-by: Eric Auger 
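
A VMM would typically issue this ioctl when trapping the guest's MSI
doorbell setup, along these lines (a sketch; container_fd and the
gIOVA/gDB values are assumptions):

#include <stdbool.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Mirror (or tear down) a guest gIOVA -> gDB MSI mapping on the host. */
static int set_msi_binding(int container_fd, __u64 giova, __u64 gpa,
			   __u64 size, bool bind)
{
	struct vfio_iommu_type1_set_msi_binding msi = {
		.argsz = sizeof(msi),
		.flags = bind ? VFIO_IOMMU_BIND_MSI : VFIO_IOMMU_UNBIND_MSI,
		.iova  = giova,
		.gpa   = gpa,
		.size  = size,
	};

	return ioctl(container_fd, VFIO_IOMMU_SET_MSI_BINDING, &msi);
}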

---

v11 -> v12:
- Share VFIO_BASE + 20 with VFIO_IOMMU_SPAPR_TCE_REMOVE
- rework returned values

v10 -> v11:
- renamed ustruct into msi_binding
- return 0 on unbind

v8 -> v9:
- merge VFIO_IOMMU_BIND_MSI/VFIO_IOMMU_UNBIND_MSI into a single
  VFIO_IOMMU_SET_MSI_BINDING ioctl
- ioctl id changed

v6 -> v7:
- removed the dev arg

v3 -> v4:
- add UNBIND
- unwind on BIND error

v2 -> v3:
- adapt to new proto of bind_guest_msi
- directly use vfio_iommu_for_each_dev

v1 -> v2:
- s/vfio_iommu_type1_guest_msi_binding/vfio_iommu_type1_bind_guest_msi
---
 drivers/vfio/vfio_iommu_type1.c | 62 +
 include/uapi/linux/vfio.h   | 20 +++
 2 files changed, 82 insertions(+)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 34f8dca36ebe..5e9196ec9685 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -2902,6 +2902,41 @@ static int vfio_cache_inv_fn(struct device *dev, void *data)
return iommu_uapi_cache_invalidate(dc->domain, dev, (void __user *)arg);
 }
 
+static int
+vfio_bind_msi(struct vfio_iommu *iommu,
+ dma_addr_t giova, phys_addr_t gpa, size_t size)
+{
+   struct vfio_domain *d;
+   int ret = 0;
+
+   mutex_lock(&iommu->lock);
+
+   list_for_each_entry(d, &iommu->domain_list, next) {
+   ret = iommu_bind_guest_msi(d->domain, giova, gpa, size);
+   if (ret)
+   goto unwind;
+   }
+   goto unlock;
+unwind:
+   list_for_each_entry_continue_reverse(d, &iommu->domain_list, next) {
+   iommu_unbind_guest_msi(d->domain, giova);
+   }
+unlock:
+   mutex_unlock(&iommu->lock);
+   return ret;
+}
+
+static void
+vfio_unbind_msi(struct vfio_iommu *iommu, dma_addr_t giova)
+{
+   struct vfio_domain *d;
+
+   mutex_lock(&iommu->lock);
+   list_for_each_entry(d, &iommu->domain_list, next)
+   iommu_unbind_guest_msi(d->domain, giova);
+   mutex_unlock(&iommu->lock);
+}
+
 static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu,
   struct vfio_info_cap *caps)
 {
@@ -3114,6 +3149,31 @@ static int vfio_iommu_type1_cache_invalidate(struct vfio_iommu *iommu,
return ret;
 }
 
+static int vfio_iommu_type1_set_msi_binding(struct vfio_iommu *iommu,
+   unsigned long arg)
+{
+   struct vfio_iommu_type1_set_msi_binding msi_binding;
+   unsigned long minsz;
+
+   minsz = offsetofend(struct vfio_iommu_type1_set_msi_binding,
+   size);
+
+   if (copy_from_user(&msi_binding, (void __user *)arg, minsz))
+   return -EFAULT;
+
+   if (msi_binding.argsz < minsz)
+   return -EINVAL;
+
+   if (msi_binding.flags == VFIO_IOMMU_UNBIND_MSI) {
+   vfio_unbind_msi(iommu, msi_binding.iova);
+   return 0;
+   } else if (msi_binding.flags == VFIO_IOMMU_BIND_MSI) {
+   return vfio_bind_msi(iommu, msi_binding.iova,
+msi_binding.gpa, msi_binding.size);
+   }
+   return -EINVAL;
+}
+
 static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu,
unsigned long arg)
 {
@@ -3238,6 +3298,8 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
return vfio_iommu_type1_set_pasid_table(iommu, arg);
case VFIO_IOMMU_CACHE_INVALIDATE:
return vfio_iommu_type1_cache_invalidate(iommu, arg);
+   case VFIO_IOMMU_SET_MSI_BINDING:
+   return vfio_iommu_type1_set_msi_binding(iommu, arg);
default:
return -ENOTTY;
}
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index ada42121827a..afa8596c05d2 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -1241,6 +1241,26 @@ struct vfio_iommu_type1_cache_invalidate {
 };
 #define VFIO_IOMMU_CACHE_INVALIDATE  _IO(VFIO_TYPE, VFIO_BASE + 19)
 
+/**
+ * VFIO_IOMMU_SET_MSI_BINDING - _IOWR(VFIO_TYPE, VFIO_BASE + 20,
+ * struct vfio_iommu_type1_set_msi_binding)
+ *
+ * Pass a stage 1 MSI doorbell mapping to the host so that this
+ * latter can build a nested stage2 mapping. Or conversely tear
+ * down a previously bound stage 1 MSI binding.
+ */
+struct vfio_iommu_type1_set_msi_binding {
+   __u32   argsz;
+   __u32   flags;
+#define VFIO_IOMMU_BIND_MSI	(1 << 0)
+#define VFIO_IOMMU_UNBIND_MSI  (1 << 1)
+   __u64   iova;   /* MSI guest IOVA */
+   /* Fields below are used on BIND */
+   __u64   gpa;/* MSI guest physical address */
+   __u64   size;   /* size

[PATCH v13 02/13] vfio: VFIO_IOMMU_CACHE_INVALIDATE

2021-04-11 Thread Eric Auger
From: "Liu, Yi L" 

When the guest "owns" the stage 1 translation structures,  the host
IOMMU driver has no knowledge of caching structure updates unless
the guest invalidation requests are trapped and passed down to the
host.

This patch adds the VFIO_IOMMU_CACHE_INVALIDATE ioctl with aims
at propagating guest stage1 IOMMU cache invalidations to the host.

Signed-off-by: Liu, Yi L 
Signed-off-by: Eric Auger 
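
Typical usage from a VMM trapping a guest TLB invalidation looks like
the sketch below. container_fd is an assumption, and so is the name
of the embedded payload field (the uapi struct carries a struct
iommu_cache_invalidate_info right after the flags):

#include <sys/ioctl.h>
#include <linux/iommu.h>
#include <linux/vfio.h>

/* Forward one guest IOTLB invalidation to the host. */
static int cache_invalidate(int container_fd,
			    struct iommu_cache_invalidate_info *inv)
{
	struct vfio_iommu_type1_cache_invalidate req = {
		.argsz = sizeof(req),
		.flags = 0,
		.info  = *inv,	/* field name assumed */
	};

	return ioctl(container_fd, VFIO_IOMMU_CACHE_INVALIDATE, &req);
}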

---

v12 -> v13:
- resolve minor conflict due to
  898b9eaeb3fe ("vfio/type1: block on invalid vaddr")
v11 -> v12:
- share VFIO_BASE + 19 with VFIO_IOMMU_SPAPR_TCE_CREATE

v10 -> v11:
- renamed ustruct into cache_inv

v8 -> v9:
- change the ioctl ID

v6 -> v7:
- Use iommu_capsule struct
- renamed vfio_iommu_for_each_dev into vfio_iommu_lookup_dev
  due to checkpatch error related to for_each_dev suffix

v2 -> v3:
- introduce vfio_iommu_for_each_dev back in this patch

v1 -> v2:
- s/TLB/CACHE
- remove vfio_iommu_task usage
- commit message rewording
---
 drivers/vfio/vfio_iommu_type1.c | 58 +
 include/uapi/linux/vfio.h   | 13 
 2 files changed, 71 insertions(+)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index e575ef5dd6c6..34f8dca36ebe 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -159,6 +159,34 @@ struct vfio_regions {
 
 #define WAITED 1
 
+struct domain_capsule {
+   struct iommu_domain *domain;
+   void *data;
+};
+
+/* iommu->lock must be held */
+static int
+vfio_iommu_lookup_dev(struct vfio_iommu *iommu,
+ int (*fn)(struct device *dev, void *data),
+ unsigned long arg)
+{
+   struct domain_capsule dc = {.data = &arg};
+   struct vfio_domain *d;
+   struct vfio_group *g;
+   int ret = 0;
+
+   list_for_each_entry(d, &iommu->domain_list, next) {
+   dc.domain = d->domain;
+   list_for_each_entry(g, &d->group_list, next) {
+   ret = iommu_group_for_each_dev(g->iommu_group,
+  &dc, fn);
+   if (ret)
+   break;
+   }
+   }
+   return ret;
+}
+
 static int put_pfn(unsigned long pfn, int prot);
 
 static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
@@ -2866,6 +2894,13 @@ vfio_attach_pasid_table(struct vfio_iommu *iommu, unsigned long arg)
mutex_unlock(>lock);
return ret;
 }
+static int vfio_cache_inv_fn(struct device *dev, void *data)
+{
+   struct domain_capsule *dc = (struct domain_capsule *)data;
+   unsigned long arg = *(unsigned long *)dc->data;
+
+   return iommu_uapi_cache_invalidate(dc->domain, dev, (void __user *)arg);
+}
 
 static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu,
   struct vfio_info_cap *caps)
@@ -3058,6 +3093,27 @@ static int vfio_iommu_type1_set_pasid_table(struct vfio_iommu *iommu,
return -EINVAL;
 }
 
+static int vfio_iommu_type1_cache_invalidate(struct vfio_iommu *iommu,
+   unsigned long arg)
+{
+   struct vfio_iommu_type1_cache_invalidate cache_inv;
+   unsigned long minsz;
+   int ret;
+
+   minsz = offsetofend(struct vfio_iommu_type1_cache_invalidate, flags);
+
+   if (copy_from_user(&cache_inv, (void __user *)arg, minsz))
+   return -EFAULT;
+
+   if (cache_inv.argsz < minsz || cache_inv.flags)
+   return -EINVAL;
+
+   mutex_lock(&iommu->lock);
+   ret = vfio_iommu_lookup_dev(iommu, vfio_cache_inv_fn, arg + minsz);
+   mutex_unlock(&iommu->lock);
+   return ret;
+}
+
 static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu,
unsigned long arg)
 {
@@ -3180,6 +3236,8 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
return vfio_iommu_type1_dirty_pages(iommu, arg);
case VFIO_IOMMU_SET_PASID_TABLE:
return vfio_iommu_type1_set_pasid_table(iommu, arg);
+   case VFIO_IOMMU_CACHE_INVALIDATE:
+   return vfio_iommu_type1_cache_invalidate(iommu, arg);
default:
return -ENOTTY;
}
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index c86b7b4a5064..ada42121827a 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -1228,6 +1228,19 @@ struct vfio_iommu_type1_set_pasid_table {
 
 #define VFIO_IOMMU_SET_PASID_TABLE _IO(VFIO_TYPE, VFIO_BASE + 18)
 
+/**
+ * VFIO_IOMMU_CACHE_INVALIDATE - _IOWR(VFIO_TYPE, VFIO_BASE + 19,
+ * struct vfio_iommu_type1_cache_invalidate)
+ *
+ * Propagate guest IOMMU cache invalidation to the host.
+ */
+struct vfio_iommu_type1_cache_invalidate {
+   __u32   argsz;
+   __u32   flags;
+   struct iommu_cache_invalidate_in

[PATCH v13 01/13] vfio: VFIO_IOMMU_SET_PASID_TABLE

2021-04-11 Thread Eric Auger
From: "Liu, Yi L" 

This patch adds a VFIO_IOMMU_SET_PASID_TABLE ioctl
which aims to pass the guest's virtual IOMMU configuration
to the host. This configuration takes the form of the
so-called PASID table.

Signed-off-by: Jacob Pan 
Signed-off-by: Liu, Yi L 
Signed-off-by: Eric Auger 
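
From the VMM side the call boils down to the sketch below.
container_fd and the guest CD table address are assumptions; the
config fields follow the SMMUv3 uapi of the IOMMU part of the series:

#include <sys/ioctl.h>
#include <linux/iommu.h>
#include <linux/vfio.h>

/* Install the guest's CD (PASID) table into the physical SMMU. */
static int set_pasid_table(int container_fd, __u64 cd_table_gpa)
{
	struct vfio_iommu_type1_set_pasid_table spt = {
		.argsz = sizeof(spt),
		.flags = VFIO_PASID_TABLE_FLAG_SET,
		.config = {
			.version  = PASID_TABLE_CFG_VERSION_1,
			.format   = IOMMU_PASID_FORMAT_SMMUV3,
			.base_ptr = cd_table_gpa,
			.config   = IOMMU_PASID_CONFIG_TRANSLATE,
			.vendor_data.smmuv3.version =
				PASID_TABLE_SMMUV3_CFG_VERSION_1,
		},
	};

	return ioctl(container_fd, VFIO_IOMMU_SET_PASID_TABLE, &spt);
}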

---
v12 -> v13:
- reword the misleading doc comment of  VFIO_IOMMU_SET_PASID_TABLE

v11 -> v12:
- use iommu_uapi_set_pasid_table
- Rework the flags checks [Zenghui, Alex]
- use VFIO_BASE + 19 [Alex]
- rework the unwind in vfio_attach_pasid_table() [Alex]

v8 -> v9:
- Merge VFIO_IOMMU_ATTACH/DETACH_PASID_TABLE into a single
  VFIO_IOMMU_SET_PASID_TABLE ioctl.

v6 -> v7:
- add a comment related to VFIO_IOMMU_DETACH_PASID_TABLE

v3 -> v4:
- restore ATTACH/DETACH
- add unwind on failure

v2 -> v3:
- s/BIND_PASID_TABLE/SET_PASID_TABLE

v1 -> v2:
- s/BIND_GUEST_STAGE/BIND_PASID_TABLE
- remove the struct device arg
---
 drivers/vfio/vfio_iommu_type1.c | 58 +
 include/uapi/linux/vfio.h   | 20 
 2 files changed, 78 insertions(+)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 45cbfd4879a5..e575ef5dd6c6 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -2834,6 +2834,39 @@ static int vfio_iommu_iova_build_caps(struct vfio_iommu *iommu,
return ret;
 }
 
+static void
+vfio_detach_pasid_table(struct vfio_iommu *iommu)
+{
+   struct vfio_domain *d;
+
+   mutex_lock(&iommu->lock);
+   list_for_each_entry(d, &iommu->domain_list, next)
+   iommu_detach_pasid_table(d->domain);
+
+   mutex_unlock(&iommu->lock);
+}
+
+static int
+vfio_attach_pasid_table(struct vfio_iommu *iommu, unsigned long arg)
+{
+   struct vfio_domain *d;
+   int ret = 0;
+
+   mutex_lock(&iommu->lock);
+
+   list_for_each_entry(d, &iommu->domain_list, next) {
+   ret = iommu_uapi_attach_pasid_table(d->domain, (void __user *)arg);
+   if (ret) {
+   list_for_each_entry_continue_reverse(d, &iommu->domain_list, next)
+   iommu_detach_pasid_table(d->domain);
+   break;
+   }
+   }
+
+   mutex_unlock(&iommu->lock);
+   return ret;
+}
+
 static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu,
   struct vfio_info_cap *caps)
 {
@@ -3002,6 +3035,29 @@ static int vfio_iommu_type1_unmap_dma(struct vfio_iommu *iommu,
-EFAULT : 0;
 }
 
+static int vfio_iommu_type1_set_pasid_table(struct vfio_iommu *iommu,
+   unsigned long arg)
+{
+   struct vfio_iommu_type1_set_pasid_table spt;
+   unsigned long minsz;
+
+   minsz = offsetofend(struct vfio_iommu_type1_set_pasid_table, flags);
+
+   if (copy_from_user(&spt, (void __user *)arg, minsz))
+   return -EFAULT;
+
+   if (spt.argsz < minsz)
+   return -EINVAL;
+
+   if (spt.flags == VFIO_PASID_TABLE_FLAG_SET) {
+   return vfio_attach_pasid_table(iommu, arg + minsz);
+   } else if (spt.flags == VFIO_PASID_TABLE_FLAG_UNSET) {
+   vfio_detach_pasid_table(iommu);
+   return 0;
+   }
+   return -EINVAL;
+}
+
 static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu,
unsigned long arg)
 {
@@ -3122,6 +3178,8 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
return vfio_iommu_type1_unmap_dma(iommu, arg);
case VFIO_IOMMU_DIRTY_PAGES:
return vfio_iommu_type1_dirty_pages(iommu, arg);
+   case VFIO_IOMMU_SET_PASID_TABLE:
+   return vfio_iommu_type1_set_pasid_table(iommu, arg);
default:
return -ENOTTY;
}
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 8ce36c1d53ca..c86b7b4a5064 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -14,6 +14,7 @@
 
 #include <linux/types.h>
 #include <linux/ioctl.h>
+#include <linux/iommu.h>
 
 #define VFIO_API_VERSION   0
 
@@ -1208,6 +1209,25 @@ struct vfio_iommu_type1_dirty_bitmap_get {
 
 #define VFIO_IOMMU_DIRTY_PAGES _IO(VFIO_TYPE, VFIO_BASE + 17)
 
+/*
+ * VFIO_IOMMU_SET_PASID_TABLE - _IOWR(VFIO_TYPE, VFIO_BASE + 18,
+ * struct vfio_iommu_type1_set_pasid_table)
+ *
+ * The SET operation passes a PASID table to the host while the
+ * UNSET operation detaches the one currently programmed. It is
+ * allowed to "SET" the table several times without unsetting as
+ * long as the table config does not stay IOMMU_PASID_CONFIG_TRANSLATE.
+ */
+struct vfio_iommu_type1_set_pasid_table {
+   __u32   argsz;
+   __u32   flags;
+#define VFIO_PASID_TABLE_FLAG_SET  (1 << 0)
+#define VFIO_PASID_TABLE_FLAG_UNSET(1 << 1)
+   struct iommu_pasid_table_config config; /* used on SET */
+};
+
+#define VFIO_IOMMU_SET_PASI

[PATCH v13 00/13] SMMUv3 Nested Stage Setup (VFIO part)

2021-04-11 Thread Eric Auger
This series brings the VFIO part of HW nested paging support
in the SMMUv3.

The series depends on:
[PATCH v15 00/13] SMMUv3 Nested Stage Setup (IOMMU part)

3 new IOCTLs are introduced that allow the userspace to
1) pass the guest stage 1 configuration
2) pass stage 1 MSI bindings
3) invalidate stage 1 related caches

They map onto the related new IOMMU API functions.

We introduce the capability to register specific interrupt
indexes (see [1]). A new DMA_FAULT interrupt index allows registering
an eventfd to be signaled whenever a stage 1 related fault
is detected at the physical level. Also two specific regions allow to
- expose the fault records to the user space and
- inject page responses.

This latter functionality is not exercised in this series
but is provided as a POC for further vSVA activities (Shameer's input).

Best Regards

Eric

This series can be found at:
https://github.com/eauger/linux/tree/v5.12-rc6-jean-iopf-14-2stage-v15

The series includes Tina's patch stemming from
[1] "[RFC PATCH v2 1/3] vfio: Use capability chains to handle device
specific irq" plus patches originally contributed by Yi.

History:

v12 -> v13:
- Comment fix for VFIO_IOMMU_SET_PASID_TABLE (Zenghui)

v11 -> v12:
- numerous fixes following feedbacks. Many thanks to all of you
- See individual history log.

v10 -> v11:
- rebase on top of v5.10-rc4
- adapt to changes on the IOMMU API (compliant with the doc
  written by Jacob/Yi)
- addition of the page response region
- Took into account Zenghui's comments
- In this version I have kept the ioctl separate. Since
  Yi's series [2] is currently stalled, I've just rebased here.

[2] [PATCH v7 00/16] vfio: expose virtual Shared Virtual Addressing
to VMs

v9 -> v10
- rebase on top of 5.6.0-rc3 (no change versus v9)

Eric Auger (10):
  vfio: VFIO_IOMMU_SET_MSI_BINDING
  vfio/pci: Add VFIO_REGION_TYPE_NESTED region type
  vfio/pci: Register an iommu fault handler
  vfio/pci: Allow to mmap the fault queue
  vfio/pci: Add framework for custom interrupt indices
  vfio: Add new IRQ for DMA fault reporting
  vfio/pci: Register and allow DMA FAULT IRQ signaling
  vfio: Document nested stage control
  vfio/pci: Register a DMA fault response region
  vfio/pci: Inject page response upon response region fill

Liu, Yi L (2):
  vfio: VFIO_IOMMU_SET_PASID_TABLE
  vfio: VFIO_IOMMU_CACHE_INVALIDATE

Tina Zhang (1):
  vfio: Use capability chains to handle device specific irq

 Documentation/driver-api/vfio.rst   |  77 +
 drivers/vfio/pci/vfio_pci.c | 447 ++--
 drivers/vfio/pci/vfio_pci_intrs.c   |  62 
 drivers/vfio/pci/vfio_pci_private.h |  33 ++
 drivers/vfio/pci/vfio_pci_rdwr.c|  84 ++
 drivers/vfio/vfio_iommu_type1.c | 178 +++
 include/uapi/linux/vfio.h   | 142 -
 7 files changed, 1003 insertions(+), 20 deletions(-)

-- 
2.26.3



[PATCH v15 12/12] iommu/smmuv3: report additional recoverable faults

2021-04-11 Thread Eric Auger
Up to now we have only reported translation faults. Now that
the guest can induce some configuration faults, let's report them
too. Add propagation for BAD_SUBSTREAMID, CD_FETCH, BAD_CD, WALK_EABT.
We also fix the transcoding for some existing translation faults.

Signed-off-by: Eric Auger 

---

v14 -> v15:
- adapt to removal of IOMMU_FAULT_UNRECOV_FETCH_ADDR_VALID
  in [PATCH v13 10/10] iommu/arm-smmu-v3: Add stall support for
  platform devices
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 40 +++--
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h |  4 +++
 2 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c 
b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 3b0a67434f7d..d5c39c8e95b0 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -1473,6 +1473,7 @@ static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, u64 *evt)
u32 perm = 0;
struct arm_smmu_master *master;
bool ssid_valid = evt[0] & EVTQ_0_SSV;
+   u8 type = FIELD_GET(EVTQ_0_ID, evt[0]);
u32 sid = FIELD_GET(EVTQ_0_SID, evt[0]);
struct iommu_fault_event fault_evt = { };
struct iommu_fault *flt = &fault_evt.fault;
@@ -1525,8 +1526,6 @@ static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, u64 *evt)
} else {
flt->type = IOMMU_FAULT_DMA_UNRECOV;
flt->event = (struct iommu_fault_unrecoverable) {
-   .reason = reason,
-   .flags = IOMMU_FAULT_UNRECOV_ADDR_VALID,
.perm = perm,
.addr = FIELD_GET(EVTQ_2_ADDR, evt[2]),
};
@@ -1535,6 +1534,43 @@ static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, u64 *evt)
flt->event.flags |= IOMMU_FAULT_UNRECOV_PASID_VALID;
flt->event.pasid = FIELD_GET(EVTQ_0_SSID, evt[0]);
}
+
+   switch (type) {
+   case EVT_ID_TRANSLATION_FAULT:
+   flt->event.reason = IOMMU_FAULT_REASON_PTE_FETCH;
+   flt->event.flags |= IOMMU_FAULT_UNRECOV_ADDR_VALID;
+   break;
+   case EVT_ID_ADDR_SIZE_FAULT:
+   flt->event.reason = IOMMU_FAULT_REASON_OOR_ADDRESS;
+   flt->event.flags |= IOMMU_FAULT_UNRECOV_ADDR_VALID;
+   break;
+   case EVT_ID_ACCESS_FAULT:
+   flt->event.reason = IOMMU_FAULT_REASON_ACCESS;
+   flt->event.flags |= IOMMU_FAULT_UNRECOV_ADDR_VALID;
+   break;
+   case EVT_ID_PERMISSION_FAULT:
+   flt->event.reason = IOMMU_FAULT_REASON_PERMISSION;
+   flt->event.flags |= IOMMU_FAULT_UNRECOV_ADDR_VALID;
+   break;
+   case EVT_ID_BAD_SUBSTREAMID:
+   flt->event.reason = IOMMU_FAULT_REASON_PASID_INVALID;
+   break;
+   case EVT_ID_CD_FETCH:
+   flt->event.reason = IOMMU_FAULT_REASON_PASID_FETCH;
+   flt->event.flags |= IOMMU_FAULT_UNRECOV_FETCH_ADDR_VALID;
+   break;
+   case EVT_ID_BAD_CD:
+   flt->event.reason = IOMMU_FAULT_REASON_BAD_PASID_ENTRY;
+   break;
+   case EVT_ID_WALK_EABT:
+   flt->event.reason = IOMMU_FAULT_REASON_WALK_EABT;
+   flt->event.flags |= IOMMU_FAULT_UNRECOV_ADDR_VALID |
+   IOMMU_FAULT_UNRECOV_FETCH_ADDR_VALID;
+   break;
+   default:
+   /* TODO: report other unrecoverable faults. */
+   return -EFAULT;
+   }
}
 
mutex_lock(&smmu->streams_mutex);
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index eb0cc08e8240..9c37dbec75b2 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -378,6 +378,10 @@
 
 #define EVTQ_0_ID  GENMASK_ULL(7, 0)
 
+#define EVT_ID_BAD_SUBSTREAMID 0x08
+#define EVT_ID_CD_FETCH0x09
+#define EVT_ID_BAD_CD  0x0a
+#define EVT_ID_WALK_EABT   0x0b
 #define EVT_ID_TRANSLATION_FAULT   0x10
 #define EVT_ID_ADDR_SIZE_FAULT 0x11
 #define EVT_ID_ACCESS_FAULT0x12
-- 
2.26.3



[PATCH v15 11/12] iommu/smmuv3: Implement bind/unbind_guest_msi

2021-04-11 Thread Eric Auger
The bind/unbind_guest_msi() callbacks check the domain
is NESTED and redirect to the dma-iommu implementation.

Signed-off-by: Eric Auger 

---

v6 -> v7:
- remove device handle argument
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 43 +
 1 file changed, 43 insertions(+)

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index ec75219d6a52..3b0a67434f7d 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -2934,6 +2934,47 @@ static void arm_smmu_get_resv_regions(struct device *dev,
iommu_dma_get_resv_regions(dev, head);
 }
 
+static int
+arm_smmu_bind_guest_msi(struct iommu_domain *domain,
+   dma_addr_t giova, phys_addr_t gpa, size_t size)
+{
+   struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+   struct arm_smmu_device *smmu;
+   int ret = -EINVAL;
+
+   mutex_lock(&smmu_domain->init_mutex);
+   smmu = smmu_domain->smmu;
+   if (!smmu)
+   goto out;
+
+   if (smmu_domain->stage != ARM_SMMU_DOMAIN_NESTED)
+   goto out;
+
+   ret = iommu_dma_bind_guest_msi(domain, giova, gpa, size);
+out:
+   mutex_unlock(&smmu_domain->init_mutex);
+   return ret;
+}
+
+static void
+arm_smmu_unbind_guest_msi(struct iommu_domain *domain, dma_addr_t giova)
+{
+   struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+   struct arm_smmu_device *smmu;
+
+   mutex_lock(&smmu_domain->init_mutex);
+   smmu = smmu_domain->smmu;
+   if (!smmu)
+   goto unlock;
+
+   if (smmu_domain->stage != ARM_SMMU_DOMAIN_NESTED)
+   goto unlock;
+
+   iommu_dma_unbind_guest_msi(domain, giova);
+unlock:
+   mutex_unlock(&smmu_domain->init_mutex);
+}
+
 static int arm_smmu_attach_pasid_table(struct iommu_domain *domain,
   struct iommu_pasid_table_config *cfg)
 {
@@ -3209,6 +3250,8 @@ static struct iommu_ops arm_smmu_ops = {
.attach_pasid_table = arm_smmu_attach_pasid_table,
.detach_pasid_table = arm_smmu_detach_pasid_table,
.cache_invalidate   = arm_smmu_cache_invalidate,
+   .bind_guest_msi = arm_smmu_bind_guest_msi,
+   .unbind_guest_msi   = arm_smmu_unbind_guest_msi,
.dev_has_feat   = arm_smmu_dev_has_feature,
.dev_feat_enabled   = arm_smmu_dev_feature_enabled,
.dev_enable_feat= arm_smmu_dev_enable_feature,
-- 
2.26.3



[PATCH v15 10/12] iommu/smmuv3: Enforce incompatibility between nested mode and HW MSI regions

2021-04-11 Thread Eric Auger
Nested mode currently is not compatible with HW MSI reserved regions.
Indeed MSI transactions targeting this MSI doorbells bypass the SMMU.

Let's check nested mode is not attempted in such configuration.

Signed-off-by: Eric Auger 
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 23 +++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index c4794c21c35f..ec75219d6a52 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -2472,6 +2472,23 @@ static bool arm_smmu_share_msi_domain(struct iommu_domain *domain,
return share;
 }
 
+static bool arm_smmu_has_hw_msi_resv_region(struct device *dev)
+{
+   struct iommu_resv_region *region;
+   bool has_msi_resv_region = false;
+   LIST_HEAD(resv_regions);
+
+   iommu_get_resv_regions(dev, &resv_regions);
+   list_for_each_entry(region, &resv_regions, list) {
+   if (region->type == IOMMU_RESV_MSI) {
+   has_msi_resv_region = true;
+   break;
+   }
+   }
+   iommu_put_resv_regions(dev, &resv_regions);
+   return has_msi_resv_region;
+}
+
 static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
 {
int ret = 0;
@@ -2532,10 +2549,12 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
/*
 * In nested mode we must check all devices belonging to the
 * domain share the same physical MSI doorbell. Otherwise nested
-* stage MSI binding is not supported.
+* stage MSI binding is not supported. Also nested mode is not
+* compatible with MSI HW reserved regions.
 */
if (smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED &&
-   !arm_smmu_share_msi_domain(domain, dev)) {
+   (!arm_smmu_share_msi_domain(domain, dev) ||
+arm_smmu_has_hw_msi_resv_region(dev))) {
ret = -EINVAL;
goto out_unlock;
}
-- 
2.26.3



[PATCH v15 09/12] iommu/smmuv3: Nested mode single MSI doorbell per domain enforcement

2021-04-11 Thread Eric Auger
In nested mode we enforce the rule that all devices belonging
to the same iommu_domain share the same msi_domain.

Indeed, if several physical MSI doorbells were used within a
single iommu_domain, it would become really difficult to resolve
which nested stage mapping translates into the correct physical
doorbell. So let's forbid this situation.

Signed-off-by: Eric Auger 
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 41 +
 1 file changed, 41 insertions(+)

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index bfc112cc0d38..c4794c21c35f 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -2441,6 +2441,37 @@ static void arm_smmu_detach_dev(struct arm_smmu_master 
*master)
arm_smmu_install_ste_for_dev(master);
 }
 
+static bool arm_smmu_share_msi_domain(struct iommu_domain *domain,
+ struct device *dev)
+{
+   struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+   struct irq_domain *irqd = dev_get_msi_domain(dev);
+   struct arm_smmu_master *master;
+   unsigned long flags;
+   bool share = false;
+
+   if (!irqd)
+   return true;
+
+   spin_lock_irqsave(&smmu_domain->devices_lock, flags);
+   list_for_each_entry(master, &smmu_domain->devices, domain_head) {
+   struct irq_domain *d = dev_get_msi_domain(master->dev);
+
+   if (!d)
+   continue;
+   if (irqd != d) {
+   dev_info(dev, "Nested mode forbids to attach devices "
+"using different physical MSI doorbells "
+"to the same iommu_domain");
+   goto unlock;
+   }
+   }
+   share = true;
+unlock:
+   spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
+   return share;
+}
+
 static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
 {
int ret = 0;
@@ -2498,6 +2529,16 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
ret = -EINVAL;
goto out_unlock;
}
+   /*
+* In nested mode we must check all devices belonging to the
+* domain share the same physical MSI doorbell. Otherwise nested
+* stage MSI binding is not supported.
+*/
+   if (smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED &&
+   !arm_smmu_share_msi_domain(domain, dev)) {
+   ret = -EINVAL;
+   goto out_unlock;
+   }
 
master->domain = smmu_domain;
 
-- 
2.26.3



[PATCH v15 08/12] dma-iommu: Implement NESTED_MSI cookie

2021-04-11 Thread Eric Auger
Up to now, when the type was UNMANAGED, we used to
allocate IOVA pages within a reserved IOVA MSI range.

If both the host and the guest are exposed with SMMUs, each
would allocate an IOVA. The guest allocates an IOVA (gIOVA)
to map onto the guest MSI doorbell (gDB). The host allocates
another IOVA (hIOVA) to map onto the physical doorbell (hDB).

So we end up with 2 unrelated mappings, at S1 and S2:
         S1             S2
gIOVA -> gDB
               hIOVA -> hDB

The PCI device would be programmed with hIOVA.
No stage 1 mapping would exist, causing the MSIs to fault.

iommu_dma_bind_guest_msi() allows gIOVA/gDB to be passed
to the host so that the host can reuse gIOVA instead of
allocating a new hIOVA.

         S1         S2
gIOVA -> gDB -> hDB

This time, the PCI device can be programmed with the gIOVA MSI
doorbell, which is correctly mapped through both stages.

Nested mode is not compatible with HW MSI regions as in that
case gDB and hDB should have a 1-1 mapping. This check will
be done when attaching each device to the IOMMU domain.

Signed-off-by: Eric Auger 
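
A simplified view of the lookup this cookie type enables; the names
below are illustrative and do not match the exact helpers of this
patch:

/*
 * With a nested MSI cookie, the gIOVA registered by the guest is
 * reused once gIOVA -> hDB has been mapped at stage 2, instead of
 * allocating a fresh hIOVA.
 */
static dma_addr_t nested_msi_iova(struct iommu_dma_cookie *cookie,
				  phys_addr_t hdb)
{
	struct iommu_dma_msi_page *p;

	list_for_each_entry(p, &cookie->msi_page_list, list) {
		if (p->phys == hdb)	/* stage 2 mapping already built */
			return p->iova;	/* the guest's gIOVA */
	}
	return 0;	/* no guest binding yet: MSIs would fault */
}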

---

v14 -> v15:
Took into account Zenghui's comments
- remove duplicated mutex.h inclusion
- introduce iommu_dma_get_nested_msi_page(), take the spinlock there
- add a comment saying the msi_lock only is used in nested mode
- take the msi_lock in other places
- fix prot
- check the S1 granule is smaller than S2 one
- remove s2_unamp
- do not init msi_iova in nested mode

v10 -> v11:
- fix compilation if !CONFIG_IOMMU_DMA

v7 -> v8:
- correct iommu_dma_(un)bind_guest_msi when
  !CONFIG_IOMMU_DMA
- Mentioned nested mode is not compatible with HW MSI regions
  in commit message
- protect with msi_lock on unbind

v6 -> v7:
- removed device handle

v3 -> v4:
- change function names; add unregister
- protect with msi_lock

v2 -> v3:
- also store the device handle on S1 mapping registration.
  This garantees we associate the associated S2 mapping binds
  to the correct physical MSI controller.

v1 -> v2:
- unmap stage2 on put()
---
 drivers/iommu/dma-iommu.c | 180 +-
 include/linux/dma-iommu.h |  16 
 2 files changed, 192 insertions(+), 4 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index af765c813cc8..9d77c62208bd 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -29,12 +29,15 @@
 struct iommu_dma_msi_page {
struct list_headlist;
dma_addr_t  iova;
+   dma_addr_t  gpa;
phys_addr_t phys;
+   size_t  s1_granule;
 };
 
 enum iommu_dma_cookie_type {
IOMMU_DMA_IOVA_COOKIE,
IOMMU_DMA_MSI_COOKIE,
+   IOMMU_DMA_NESTED_MSI_COOKIE,
 };
 
 struct iommu_dma_cookie {
@@ -46,6 +49,8 @@ struct iommu_dma_cookie {
dma_addr_t  msi_iova;
};
struct list_headmsi_page_list;
+   /* used in nested mode only */
+   spinlock_t  msi_lock;
 
/* Domain for flush queue callback; NULL if flush queue not in use */
struct iommu_domain *fq_domain;
@@ -87,6 +92,7 @@ static struct iommu_dma_cookie *cookie_alloc(enum iommu_dma_cookie_type type)
 
cookie = kzalloc(sizeof(*cookie), GFP_KERNEL);
if (cookie) {
+   spin_lock_init(&cookie->msi_lock);
INIT_LIST_HEAD(&cookie->msi_page_list);
cookie->type = type;
}
@@ -120,14 +126,17 @@ EXPORT_SYMBOL(iommu_get_dma_cookie);
  *
  * Users who manage their own IOVA allocation and do not want DMA API support,
  * but would still like to take advantage of automatic MSI remapping, can use
- * this to initialise their own domain appropriately. Users should reserve a
+ * this to initialise their own domain appropriately. Users may reserve a
  * contiguous IOVA region, starting at @base, large enough to accommodate the
  * number of PAGE_SIZE mappings necessary to cover every MSI doorbell address
- * used by the devices attached to @domain.
+ * used by the devices attached to @domain. The other way round is to provide
+ * usable iova pages through the iommu_dma_bind_guest_msi API (nested stages
+ * use case)
  */
 int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base)
 {
struct iommu_dma_cookie *cookie;
+   int nesting, ret;
 
if (domain->type != IOMMU_DOMAIN_UNMANAGED)
return -EINVAL;
@@ -135,11 +144,17 @@ int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base)
if (domain->iova_cookie)
return -EEXIST;
 
-   cookie = cookie_alloc(IOMMU_DMA_MSI_COOKIE);
+   ret = iommu_domain_get_attr(domain, DOMAIN_ATTR_NESTING, &nesting);
+   if (!ret && nesting)
+   cookie = cookie_alloc(IOMMU_DMA_NESTED_MSI_COOKIE);
+   else
+   cookie = cookie_alloc(IOMMU_DMA_MSI_COOKIE

[PATCH v15 07/12] iommu/smmuv3: Implement cache_invalidate

2021-04-11 Thread Eric Auger
Implement domain-selective, PASID-selective and page-selective
IOTLB invalidations.

Signed-off-by: Eric Auger 

---
v4 -> v15:
- remove the redundant arm_smmu_cmdq_issue_sync(smmu)
  in IOMMU_INV_GRANU_ADDR case (Zenghui)
- if RIL is not supported by the host, make sure the granule_size
  that is passed by the userspace is supported or fix it
  (Chenxiang)

v13 -> v14:
- Add domain invalidation
- do global inval when asid is not provided with addr
  granularity

v7 -> v8:
- ASID based invalidation using iommu_inv_pasid_info
- check ARCHID/PASID flags in addr based invalidation
- use __arm_smmu_tlb_inv_context and __arm_smmu_tlb_inv_range_nosync

v6 -> v7
- check the uapi version

v3 -> v4:
- adapt to changes in the uapi
- add support for leaf parameter
- do not use arm_smmu_tlb_inv_range_nosync or arm_smmu_tlb_inv_context
  anymore

v2 -> v3:
- replace __arm_smmu_tlb_sync by arm_smmu_cmdq_issue_sync

v1 -> v2:
- properly pass the asid
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 89 +
 1 file changed, 89 insertions(+)

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 56a301fbe75a..bfc112cc0d38 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -2961,6 +2961,94 @@ static void arm_smmu_detach_pasid_table(struct iommu_domain *domain)
mutex_unlock(_domain->init_mutex);
 }
 
+static int
+arm_smmu_cache_invalidate(struct iommu_domain *domain, struct device *dev,
+ struct iommu_cache_invalidate_info *inv_info)
+{
+   struct arm_smmu_cmdq_ent cmd = {.opcode = CMDQ_OP_TLBI_NSNH_ALL};
+   struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+   struct arm_smmu_device *smmu = smmu_domain->smmu;
+
+   if (smmu_domain->stage != ARM_SMMU_DOMAIN_NESTED)
+   return -EINVAL;
+
+   if (!smmu)
+   return -EINVAL;
+
+   if (inv_info->version != IOMMU_CACHE_INVALIDATE_INFO_VERSION_1)
+   return -EINVAL;
+
+   if (inv_info->cache & IOMMU_CACHE_INV_TYPE_PASID ||
+   inv_info->cache & IOMMU_CACHE_INV_TYPE_DEV_IOTLB) {
+   return -ENOENT;
+   }
+
+   if (!(inv_info->cache & IOMMU_CACHE_INV_TYPE_IOTLB))
+   return -EINVAL;
+
+   /* IOTLB invalidation */
+
+   switch (inv_info->granularity) {
+   case IOMMU_INV_GRANU_PASID:
+   {
+   struct iommu_inv_pasid_info *info =
+   &inv_info->granu.pasid_info;
+
+   if (info->flags & IOMMU_INV_ADDR_FLAGS_PASID)
+   return -ENOENT;
+   if (!(info->flags & IOMMU_INV_PASID_FLAGS_ARCHID))
+   return -EINVAL;
+
+   __arm_smmu_tlb_inv_context(smmu_domain, info->archid);
+   return 0;
+   }
+   case IOMMU_INV_GRANU_ADDR:
+   {
+   struct iommu_inv_addr_info *info = &inv_info->granu.addr_info;
+   size_t granule_size  = info->granule_size;
+   size_t size = info->nb_granules * info->granule_size;
+   bool leaf = info->flags & IOMMU_INV_ADDR_FLAGS_LEAF;
+   int tg;
+
+   if (info->flags & IOMMU_INV_ADDR_FLAGS_PASID)
+   return -ENOENT;
+
+   if (!(info->flags & IOMMU_INV_ADDR_FLAGS_ARCHID))
+   break;
+
+   tg = __ffs(granule_size);
+   if (granule_size & ~(1 << tg))
+   return -EINVAL;
+   /*
+* When RIL is not supported, make sure the granule size that is
+* passed is supported. In RIL mode, this is enforced in
+* __arm_smmu_tlb_inv_range()
+*/
+   if (!(smmu->features & ARM_SMMU_FEAT_RANGE_INV) &&
+   !(granule_size & smmu_domain->domain.pgsize_bitmap)) {
+   tg = __ffs(smmu_domain->domain.pgsize_bitmap);
+   granule_size = 1 << tg;
+   size = size >> tg;
+   }
+
+   arm_smmu_tlb_inv_range_domain(info->addr, size,
+ granule_size, leaf,
+ info->archid, smmu_domain);
+   return 0;
+   }
+   case IOMMU_INV_GRANU_DOMAIN:
+   break;
+   default:
+   return -EINVAL;
+   }
+
+   /* Global S1 invalidation */
+   cmd.tlbi.vmid   = smmu_domain->s2_cfg.vmid;
+   arm_smmu_cmdq_issue_cmd(smmu, &cmd);
+   arm_smmu_cmdq_issue_sync(smmu);
+   return 0;
+}
+
 static bool arm_smmu_dev_has_feature(struct device *dev,
 enum iommu_dev_features feat)
 {
@@ -3060,6 +3148,7 @@ st

[PATCH v15 06/12] iommu/smmuv3: Allow stage 1 invalidation with unmanaged ASIDs

2021-04-11 Thread Eric Auger
With nested stage support, we will soon need to invalidate
S1 contexts and ranges tagged with an unmanaged ASID, the
latter being owned by the guest. So let's introduce two helpers
that allow invalidation with externally managed ASIDs.

Signed-off-by: Eric Auger 

---

v14 -> v15:
- Always send CMDQ_OP_TLBI_NH_VA and do not test
  smmu_domain->smmu->features & ARM_SMMU_FEAT_E2H as the guest does
  not run in hyp mode atm (Zenghui).

v13 -> v14
- Actually send the NH_ASID command (reported by Xingang Wang)
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 42 -
 1 file changed, 33 insertions(+), 9 deletions(-)

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 44cdc6df09c1..56a301fbe75a 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -1851,9 +1851,9 @@ int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, int ssid,
 }
 
 /* IO_PGTABLE API */
-static void arm_smmu_tlb_inv_context(void *cookie)
+static void __arm_smmu_tlb_inv_context(struct arm_smmu_domain *smmu_domain,
+  int ext_asid)
 {
-   struct arm_smmu_domain *smmu_domain = cookie;
struct arm_smmu_device *smmu = smmu_domain->smmu;
struct arm_smmu_cmdq_ent cmd;
 
@@ -1864,7 +1864,13 @@ static void arm_smmu_tlb_inv_context(void *cookie)
 * insertion to guarantee those are observed before the TLBI. Do be
 * careful, 007.
 */
-   if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
+   if (ext_asid >= 0) { /* guest stage 1 invalidation */
+   cmd.opcode  = CMDQ_OP_TLBI_NH_ASID;
+   cmd.tlbi.asid   = ext_asid;
+   cmd.tlbi.vmid   = smmu_domain->s2_cfg.vmid;
+   arm_smmu_cmdq_issue_cmd(smmu, &cmd);
+   arm_smmu_cmdq_issue_sync(smmu);
+   } else if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
arm_smmu_tlb_inv_asid(smmu, smmu_domain->s1_cfg.cd.asid);
} else {
cmd.opcode  = CMDQ_OP_TLBI_S12_VMALL;
@@ -1875,6 +1881,13 @@ static void arm_smmu_tlb_inv_context(void *cookie)
arm_smmu_atc_inv_domain(smmu_domain, 0, 0, 0);
 }
 
+static void arm_smmu_tlb_inv_context(void *cookie)
+{
+   struct arm_smmu_domain *smmu_domain = cookie;
+
+   __arm_smmu_tlb_inv_context(smmu_domain, -1);
+}
+
 static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd,
 unsigned long iova, size_t size,
 size_t granule,
@@ -1934,9 +1947,10 @@ static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd,
arm_smmu_cmdq_batch_submit(smmu, &cmds);
 }
 
-static void arm_smmu_tlb_inv_range_domain(unsigned long iova, size_t size,
- size_t granule, bool leaf,
- struct arm_smmu_domain *smmu_domain)
+static void
+arm_smmu_tlb_inv_range_domain(unsigned long iova, size_t size,
+ size_t granule, bool leaf, int ext_asid,
+ struct arm_smmu_domain *smmu_domain)
 {
struct arm_smmu_cmdq_ent cmd = {
.tlbi = {
@@ -1944,7 +1958,16 @@ static void arm_smmu_tlb_inv_range_domain(unsigned long iova, size_t size,
},
};
 
-   if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
+   if (ext_asid >= 0) {  /* guest stage 1 invalidation */
+   /*
+* At the moment the guest only uses NS-EL1, to be
+* revisited when nested virt gets supported with E2H
+* exposed.
+*/
+   cmd.opcode  = CMDQ_OP_TLBI_NH_VA;
+   cmd.tlbi.asid   = ext_asid;
+   cmd.tlbi.vmid   = smmu_domain->s2_cfg.vmid;
+   } else if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
cmd.opcode  = smmu_domain->smmu->features & ARM_SMMU_FEAT_E2H ?
  CMDQ_OP_TLBI_EL2_VA : CMDQ_OP_TLBI_NH_VA;
cmd.tlbi.asid   = smmu_domain->s1_cfg.cd.asid;
@@ -1952,6 +1975,7 @@ static void arm_smmu_tlb_inv_range_domain(unsigned long iova, size_t size,
cmd.opcode  = CMDQ_OP_TLBI_S2_IPA;
cmd.tlbi.vmid   = smmu_domain->s2_cfg.vmid;
}
+
__arm_smmu_tlb_inv_range(&cmd, iova, size, granule, smmu_domain);
 
/*
@@ -1990,7 +2014,7 @@ static void arm_smmu_tlb_inv_page_nosync(struct iommu_iotlb_gather *gather,
 static void arm_smmu_tlb_inv_walk(unsigned long iova, size_t size,
  size_t granule, void *cookie)
 {
-   arm_smmu_tlb_inv_range_domain(iova, size, granule, false, cookie);
+   arm_smmu_tlb_inv_range_domain(iova, size, granule, false, -1, cookie);
 }
 
 static const struct iommu_flush_ops arm_smmu_flush_ops = {
@@ -2531,7 

[PATCH v15 05/12] iommu/smmuv3: Implement attach/detach_pasid_table

2021-04-11 Thread Eric Auger
On attach_pasid_table() we program the STE stage 1 related info
set by the guest into the actual physical STEs. At minimum
we need to program the context descriptor GPA and compute
whether stage 1 is translated/bypassed or aborted.

On detach, both the stage 1 config and the abort flag are unset.

Signed-off-by: Eric Auger 

---
v14 -> v15:
- add a comment before arm_smmu_get_cd_ptr to warn the
  developper this function must not be used in case of nested
  (Keqian)

v13 -> v14:
- on PASID table detach, reset the abort flag (Keqian)

v7 -> v8:
- remove smmu->features check, now done on domain finalize

v6 -> v7:
- check versions and comment the fact we don't need to take
  into account s1dss and s1fmt
v3 -> v4:
- adapt to changes in iommu_pasid_table_config
- different programming convention at s1_cfg/s2_cfg/ste.abort

v2 -> v3:
- callback now is named set_pasid_table and struct fields
  are laid out differently.

v1 -> v2:
- invalidate the STE before changing them
- hold init_mutex
- handle new fields
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 93 +
 1 file changed, 93 insertions(+)

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 90763bc0c590..44cdc6df09c1 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -988,6 +988,10 @@ static void arm_smmu_write_cd_l1_desc(__le64 *dst,
WRITE_ONCE(*dst, cpu_to_le64(val));
 }
 
+/*
+ * Must not be used in case of nested mode where the CD table is owned
+ * by the guest
+ */
 static __le64 *arm_smmu_get_cd_ptr(struct arm_smmu_domain *smmu_domain,
   u32 ssid)
 {
@@ -2846,6 +2850,93 @@ static void arm_smmu_get_resv_regions(struct device *dev,
iommu_dma_get_resv_regions(dev, head);
 }
 
+static int arm_smmu_attach_pasid_table(struct iommu_domain *domain,
+  struct iommu_pasid_table_config *cfg)
+{
+   struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+   struct arm_smmu_master *master;
+   struct arm_smmu_device *smmu;
+   unsigned long flags;
+   int ret = -EINVAL;
+
+   if (cfg->format != IOMMU_PASID_FORMAT_SMMUV3)
+   return -EINVAL;
+
+   if (cfg->version != PASID_TABLE_CFG_VERSION_1 ||
+   cfg->vendor_data.smmuv3.version != PASID_TABLE_SMMUV3_CFG_VERSION_1)
+   return -EINVAL;
+
+   mutex_lock(&smmu_domain->init_mutex);
+
+   smmu = smmu_domain->smmu;
+
+   if (!smmu)
+   goto out;
+
+   if (smmu_domain->stage != ARM_SMMU_DOMAIN_NESTED)
+   goto out;
+
+   switch (cfg->config) {
+   case IOMMU_PASID_CONFIG_ABORT:
+   smmu_domain->s1_cfg.set = false;
+   smmu_domain->abort = true;
+   break;
+   case IOMMU_PASID_CONFIG_BYPASS:
+   smmu_domain->s1_cfg.set = false;
+   smmu_domain->abort = false;
+   break;
+   case IOMMU_PASID_CONFIG_TRANSLATE:
+   /* we do not support S1 <-> S1 transitions */
+   if (smmu_domain->s1_cfg.set)
+   goto out;
+
+   /*
+* we currently support a single CD so s1fmt and s1dss
+* fields are also ignored
+*/
+   if (cfg->pasid_bits)
+   goto out;
+
+   smmu_domain->s1_cfg.cdcfg.cdtab_dma = cfg->base_ptr;
+   smmu_domain->s1_cfg.set = true;
+   smmu_domain->abort = false;
+   break;
+   default:
+   goto out;
+   }
+   spin_lock_irqsave(&smmu_domain->devices_lock, flags);
+   list_for_each_entry(master, &smmu_domain->devices, domain_head)
+   arm_smmu_install_ste_for_dev(master);
+   spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
+   ret = 0;
+out:
+   mutex_unlock(&smmu_domain->init_mutex);
+   return ret;
+}
+
+static void arm_smmu_detach_pasid_table(struct iommu_domain *domain)
+{
+   struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+   struct arm_smmu_master *master;
+   unsigned long flags;
+
+   mutex_lock(&smmu_domain->init_mutex);
+
+   if (smmu_domain->stage != ARM_SMMU_DOMAIN_NESTED)
+   goto unlock;
+
+   smmu_domain->s1_cfg.set = false;
+   smmu_domain->abort = false;
+
+   spin_lock_irqsave(&smmu_domain->devices_lock, flags);
+   list_for_each_entry(master, &smmu_domain->devices, domain_head)
+   arm_smmu_install_ste_for_dev(master);
+   spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
+
+unlock:
+   mutex_unlock(&smmu_domain->init_mutex);
+}
+
 static bool arm_smmu_dev_has_feature(struct device *dev,
 enum iommu_dev_features feat)
 {
@@ -2943,6 +3034,8 @@ st

[PATCH v15 03/12] iommu/smmuv3: Allow s1 and s2 configs to coexist

2021-04-11 Thread Eric Auger
In true nested mode, both s1_cfg and s2_cfg will coexist.
Let's remove the union and add a "set" field in each
config structure telling whether the config is set and needs
to be applied when writing the STE. In legacy nested mode,
only the second stage is used. In true nested mode, both stages
are used and the S1 config is "set" when the guest passes
its pasid table.

No functional change intended.

Signed-off-by: Eric Auger 

---

v13 -> v14:
- slight reword of the commit message

v12 -> v13:
- does not dynamically allocate s1-cfg and s2_cfg anymore. Add
  the set field
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 43 +
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h |  8 ++--
 2 files changed, 31 insertions(+), 20 deletions(-)

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 53abad8fdd91..22fb39a9ef31 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -1239,8 +1239,8 @@ static void arm_smmu_write_strtab_ent(struct 
arm_smmu_master *master, u32 sid,
u64 val = le64_to_cpu(dst[0]);
bool ste_live = false;
struct arm_smmu_device *smmu = NULL;
-   struct arm_smmu_s1_cfg *s1_cfg = NULL;
-   struct arm_smmu_s2_cfg *s2_cfg = NULL;
+   struct arm_smmu_s1_cfg *s1_cfg;
+   struct arm_smmu_s2_cfg *s2_cfg;
struct arm_smmu_domain *smmu_domain = NULL;
struct arm_smmu_cmdq_ent prefetch_cmd = {
.opcode = CMDQ_OP_PREFETCH_CFG,
@@ -1255,13 +1255,24 @@ static void arm_smmu_write_strtab_ent(struct 
arm_smmu_master *master, u32 sid,
}
 
if (smmu_domain) {
+   s1_cfg = &smmu_domain->s1_cfg;
+   s2_cfg = &smmu_domain->s2_cfg;
+
switch (smmu_domain->stage) {
case ARM_SMMU_DOMAIN_S1:
-   s1_cfg = &smmu_domain->s1_cfg;
+   s1_cfg->set = true;
+   s2_cfg->set = false;
break;
case ARM_SMMU_DOMAIN_S2:
+   s1_cfg->set = false;
+   s2_cfg->set = true;
+   break;
case ARM_SMMU_DOMAIN_NESTED:
-   s2_cfg = &smmu_domain->s2_cfg;
+   /*
+* Actual usage of stage 1 depends on nested mode:
+* legacy (2nd stage only) or true nested mode
+*/
+   s2_cfg->set = true;
break;
default:
break;
@@ -1288,7 +1299,7 @@ static void arm_smmu_write_strtab_ent(struct 
arm_smmu_master *master, u32 sid,
val = STRTAB_STE_0_V;
 
/* Bypass/fault */
-   if (!smmu_domain || !(s1_cfg || s2_cfg)) {
+   if (!smmu_domain || !(s1_cfg->set || s2_cfg->set)) {
if (!smmu_domain && disable_bypass)
val |= FIELD_PREP(STRTAB_STE_0_CFG, 
STRTAB_STE_0_CFG_ABORT);
else
@@ -1307,7 +1318,7 @@ static void arm_smmu_write_strtab_ent(struct 
arm_smmu_master *master, u32 sid,
return;
}
 
-   if (s1_cfg) {
+   if (s1_cfg->set) {
u64 strw = smmu->features & ARM_SMMU_FEAT_E2H ?
STRTAB_STE_1_STRW_EL2 : STRTAB_STE_1_STRW_NSEL1;
 
@@ -1329,7 +1340,7 @@ static void arm_smmu_write_strtab_ent(struct 
arm_smmu_master *master, u32 sid,
FIELD_PREP(STRTAB_STE_0_S1FMT, s1_cfg->s1fmt);
}
 
-   if (s2_cfg) {
+   if (s2_cfg->set) {
BUG_ON(ste_live);
dst[2] = cpu_to_le64(
 FIELD_PREP(STRTAB_STE_2_S2VMID, s2_cfg->vmid) |
@@ -2020,24 +2031,24 @@ static void arm_smmu_domain_free(struct iommu_domain 
*domain)
 {
struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
struct arm_smmu_device *smmu = smmu_domain->smmu;
+   struct arm_smmu_s1_cfg *s1_cfg = &smmu_domain->s1_cfg;
+   struct arm_smmu_s2_cfg *s2_cfg = &smmu_domain->s2_cfg;
 
iommu_put_dma_cookie(domain);
free_io_pgtable_ops(smmu_domain->pgtbl_ops);
 
/* Free the CD and ASID, if we allocated them */
-   if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
-   struct arm_smmu_s1_cfg *cfg = &smmu_domain->s1_cfg;
-
+   if (s1_cfg->set) {
/* Prevent SVA from touching the CD while we're freeing it */
mutex_lock(&arm_smmu_asid_lock);
-   if (cfg->cdcfg.cdtab)
+   if (s1_cfg->cdcfg.cdtab)
arm_smmu_free_cd_tables(smmu_domain);
-   arm_smmu_free_asid(&cfg->cd);
+   arm_smmu_free_asid(&s1_cfg->cd);
mutex_unlock(&arm_smmu_asid_lock);
-   } else {
-   struct arm_smmu_s2_cfg *cf

[PATCH v15 02/12] iommu: Introduce bind/unbind_guest_msi

2021-04-11 Thread Eric Auger
On ARM, MSIs are translated by the SMMU. An IOVA is allocated
for each MSI doorbell. If both the host and the guest are exposed
to SMMUs, we end up with 2 different IOVAs, one allocated by each.
The guest allocates an IOVA (gIOVA) to map onto the guest MSI
doorbell (gDB). The host allocates another IOVA (hIOVA) to map
onto the physical doorbell (hDB).

So we end up with 2 untied mappings:
 S1S2
gIOVA->gDB
  hIOVA->hDB

Currently the PCI device is programmed by the host with hIOVA
as MSI doorbell. So this does not work.

This patch introduces an API to pass gIOVA/gDB to the host so
that gIOVA can be reused by the host instead of re-allocating
a new IOVA. So the goal is to create the following nested mapping:

 S1S2
gIOVA->gDB ->hDB

and program the PCI device with gIOVA MSI doorbell.

In case we have several devices attached to this nested domain
(devices belonging to the same group), they cannot be isolated
on guest side either. So they should also end up in the same domain
on guest side. We will enforce that all the devices attached to
the host iommu domain use the same physical doorbell and similarly
a single virtual doorbell mapping gets registered (1 single
virtual doorbell is used on guest as well).
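
For illustration, the intended calling convention boils down to the
following sketch (not part of this patch; the caller, typically VFIO,
is assumed to have trapped the guest binding):

    #include <linux/iommu.h>

    /*
     * Sketch only: forward a guest MSI doorbell binding (gIOVA -> gDB)
     * to the nested iommu_domain the assigned devices are attached to.
     */
    static int example_report_guest_msi_binding(struct iommu_domain *domain,
                                                dma_addr_t giova,
                                                phys_addr_t gdb,
                                                size_t granule)
    {
            int ret;

            ret = iommu_bind_guest_msi(domain, giova, gdb, granule);
            if (ret)
                    pr_warn("gIOVA/gDB MSI binding failed (%d)\n", ret);
            return ret;
    }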

Signed-off-by: Eric Auger 

---

v13 -> v14:
- s/iova/giova in iommu_unbind_guest_msi proto (Keqian)

v7 -> v8:
- dummy iommu_unbind_guest_msi turned into a void function

v6 -> v7:
- remove the device handle parameter.
- Add comments saying there can only be a single MSI binding
  registered per iommu_domain
v5 -> v6:
-fix compile issue when IOMMU_API is not set

v3 -> v4:
- add unbind

v2 -> v3:
- add a struct device handle
---
 drivers/iommu/iommu.c | 37 +
 include/linux/iommu.h | 20 
 2 files changed, 57 insertions(+)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 90bacf000789..1853279216eb 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2282,6 +2282,43 @@ static void __iommu_detach_device(struct iommu_domain 
*domain,
trace_detach_device_from_domain(dev);
 }
 
+/**
+ * iommu_bind_guest_msi - Passes the stage1 GIOVA/GPA mapping of a
+ * virtual doorbell
+ *
+ * @domain: iommu domain the stage 1 mapping will be attached to
+ * @iova: iova allocated by the guest
+ * @gpa: guest physical address of the virtual doorbell
+ * @size: granule size used for the mapping
+ *
+ * The associated IOVA can be reused by the host to create a nested
+ * stage2 binding mapping translating into the physical doorbell used
+ * by the devices attached to the domain.
+ *
+ * All devices within the domain must share the same physical doorbell.
+ * A single MSI GIOVA/GPA mapping can be attached to an iommu_domain.
+ */
+
+int iommu_bind_guest_msi(struct iommu_domain *domain,
+dma_addr_t giova, phys_addr_t gpa, size_t size)
+{
+   if (unlikely(!domain->ops->bind_guest_msi))
+   return -ENODEV;
+
+   return domain->ops->bind_guest_msi(domain, giova, gpa, size);
+}
+EXPORT_SYMBOL_GPL(iommu_bind_guest_msi);
+
+void iommu_unbind_guest_msi(struct iommu_domain *domain,
+   dma_addr_t giova)
+{
+   if (unlikely(!domain->ops->unbind_guest_msi))
+   return;
+
+   domain->ops->unbind_guest_msi(domain, giova);
+}
+EXPORT_SYMBOL_GPL(iommu_unbind_guest_msi);
+
 void iommu_detach_device(struct iommu_domain *domain, struct device *dev)
 {
struct iommu_group *group;
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index c4422975359e..72bda5d93951 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -241,6 +241,8 @@ struct iommu_iotlb_gather {
  * @sva_unbind_gpasid: unbind guest pasid and mm
  * @attach_pasid_table: attach a pasid table
  * @detach_pasid_table: detach the pasid table
+ * @bind_guest_msi: provides a stage1 giova/gpa MSI doorbell mapping
+ * @unbind_guest_msi: withdraw a stage1 giova/gpa MSI doorbell mapping
  * @def_domain_type: device default domain type, return value:
  * - IOMMU_DOMAIN_IDENTITY: must use an identity domain
  * - IOMMU_DOMAIN_DMA: must use a dma domain
@@ -322,6 +324,10 @@ struct iommu_ops {
 
int (*def_domain_type)(struct device *dev);
 
+   int (*bind_guest_msi)(struct iommu_domain *domain,
+ dma_addr_t giova, phys_addr_t gpa, size_t size);
+   void (*unbind_guest_msi)(struct iommu_domain *domain, dma_addr_t giova);
+
unsigned long pgsize_bitmap;
struct module *owner;
 };
@@ -464,6 +470,10 @@ extern int iommu_attach_pasid_table(struct iommu_domain 
*domain,
 extern int iommu_uapi_attach_pasid_table(struct iommu_domain *domain,
 void __user *udata);
 extern void iommu_detach_pasid_table(struct iommu_domain *domain);
+exter

[PATCH v15 01/12] iommu: Introduce attach/detach_pasid_table API

2021-04-11 Thread Eric Auger
In the virtualization use case, when a guest is assigned
a host PCI device, protected by a virtual IOMMU on the guest,
the physical IOMMU must be programmed to be consistent with
the guest mappings. If the physical IOMMU supports two
translation stages, it makes sense to program guest mappings
onto the first stage/level (ARM/Intel terminology) while the host
owns the stage/level 2.

In that case, guest configuration settings must be trapped
and passed to the physical IOMMU driver.

This patch adds a new API to the iommu subsystem that allows
setting/unsetting the PASID table information.

A generic iommu_pasid_table_config struct is introduced in
a new iommu.h uapi header. This is going to be used by the VFIO
user API.
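
For illustration, an in-kernel caller would look roughly as follows
(a sketch only; the SMMUv3 field values correspond to later patches
in this series, and the helper is hypothetical):

    #include <linux/iommu.h>

    /* Sketch: attach a guest SMMUv3 CD (PASID) table to a domain. */
    static int example_attach_pasid_table(struct iommu_domain *domain,
                                          u64 cd_table_gpa)
    {
            struct iommu_pasid_table_config cfg = {
                    .argsz = sizeof(cfg),
                    .version = PASID_TABLE_CFG_VERSION_1,
                    .format = IOMMU_PASID_FORMAT_SMMUV3,
                    .base_ptr = cd_table_gpa,
                    .config = IOMMU_PASID_CONFIG_TRANSLATE,
                    .vendor_data.smmuv3.version =
                            PASID_TABLE_SMMUV3_CFG_VERSION_1,
            };

            return iommu_attach_pasid_table(domain, &cfg);
    }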

Signed-off-by: Jean-Philippe Brucker 
Signed-off-by: Liu, Yi L 
Signed-off-by: Ashok Raj 
Signed-off-by: Jacob Pan 
Signed-off-by: Eric Auger 

---

v13 -> v14:
- export iommu_attach_pasid_table
- add dummy iommu_uapi_attach_pasid_table
- swap base_ptr and format in iommu_pasid_table_config

v12 -> v13:
- Fix config check

v11 -> v12:
- add argsz, name the union
---
 drivers/iommu/iommu.c  | 69 ++
 include/linux/iommu.h  | 27 +++
 include/uapi/linux/iommu.h | 54 +
 3 files changed, 150 insertions(+)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index d0b0a15dba84..90bacf000789 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2200,6 +2200,75 @@ int iommu_uapi_sva_unbind_gpasid(struct iommu_domain 
*domain, struct device *dev
 }
 EXPORT_SYMBOL_GPL(iommu_uapi_sva_unbind_gpasid);
 
+int iommu_attach_pasid_table(struct iommu_domain *domain,
+struct iommu_pasid_table_config *cfg)
+{
+   if (unlikely(!domain->ops->attach_pasid_table))
+   return -ENODEV;
+
+   return domain->ops->attach_pasid_table(domain, cfg);
+}
+EXPORT_SYMBOL_GPL(iommu_attach_pasid_table);
+
+int iommu_uapi_attach_pasid_table(struct iommu_domain *domain,
+ void __user *uinfo)
+{
+   struct iommu_pasid_table_config pasid_table_data = { 0 };
+   u32 minsz;
+
+   if (unlikely(!domain->ops->attach_pasid_table))
+   return -ENODEV;
+
+   /*
+* No new spaces can be added before the variable sized union, the
+* minimum size is the offset to the union.
+*/
+   minsz = offsetof(struct iommu_pasid_table_config, vendor_data);
+
+   /* Copy minsz from user to get flags and argsz */
+   if (copy_from_user(&pasid_table_data, uinfo, minsz))
+   return -EFAULT;
+
+   /* Fields before the variable size union are mandatory */
+   if (pasid_table_data.argsz < minsz)
+   return -EINVAL;
+
+   /* PASID and address granu require additional info beyond minsz */
+   if (pasid_table_data.version != PASID_TABLE_CFG_VERSION_1)
+   return -EINVAL;
+   if (pasid_table_data.format == IOMMU_PASID_FORMAT_SMMUV3 &&
+   pasid_table_data.argsz <
+   offsetofend(struct iommu_pasid_table_config, vendor_data.smmuv3))
+   return -EINVAL;
+
+   /*
+* User might be using a newer UAPI header which has a larger data
+* size, we shall support the existing flags within the current
+* size. Copy the remaining user data _after_ minsz but not more
+* than the current kernel supported size.
+*/
+   if (copy_from_user((void *)&pasid_table_data + minsz, uinfo + minsz,
+  min_t(u32, pasid_table_data.argsz, sizeof(pasid_table_data)) - minsz))
+   return -EFAULT;
+
+   /* Now the argsz is validated, check the content */
+   if (pasid_table_data.config < IOMMU_PASID_CONFIG_TRANSLATE ||
+   pasid_table_data.config > IOMMU_PASID_CONFIG_ABORT)
+   return -EINVAL;
+
+   return domain->ops->attach_pasid_table(domain, &pasid_table_data);
+}
+EXPORT_SYMBOL_GPL(iommu_uapi_attach_pasid_table);
+
+void iommu_detach_pasid_table(struct iommu_domain *domain)
+{
+   if (unlikely(!domain->ops->detach_pasid_table))
+   return;
+
+   domain->ops->detach_pasid_table(domain);
+}
+EXPORT_SYMBOL_GPL(iommu_detach_pasid_table);
+
 static void __iommu_detach_device(struct iommu_domain *domain,
  struct device *dev)
 {
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 86d688c4418f..c4422975359e 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -239,6 +239,8 @@ struct iommu_iotlb_gather {
  * @cache_invalidate: invalidate translation caches
  * @sva_bind_gpasid: bind guest pasid and mm
  * @sva_unbind_gpasid: unbind guest pasid and mm
+ * @attach_pasid_table: attach a pasid table
+ * @detach_pasid_table: detach the pasid table
  * @def_domain_type: device default domain type, return value:
  *

[PATCH v15 00/12] SMMUv3 Nested Stage Setup (IOMMU part)

2021-04-11 Thread Eric Auger
SMMUv3 Nested Stage Setup (IOMMU part)

This series brings the IOMMU part of HW nested paging support
in the SMMUv3. The VFIO part is submitted separately.

This is based on Jean-Philippe's
[PATCH v14 00/10] iommu: I/O page faults for SMMUv3
https://www.spinics.net/lists/arm-kernel/msg886518.html
(including the patches that were not pulled for 5.13)

The IOMMU API is extended to support 2 new API functionalities:
1) pass the guest stage 1 configuration
2) pass stage 1 MSI bindings

Then those capabilities get implemented in the SMMUv3 driver.

The virtualizer passes information through the VFIO user API
which cascades them to the iommu subsystem. This allows the guest
to own stage 1 tables and context descriptors (so-called PASID
table) while the host owns stage 2 tables and main configuration
structures (STE).

Best Regards

Eric

This series can be found at:
v5.12-rc6-jean-iopf-14-2stage-v15
(including the VFIO part in its last version: v13)

The VFIO series is sent separately.

History:

Previous version:
https://github.com/eauger/linux/tree/v5.11-stallv12-2stage-v14

v14 -> v15:
- on S1 invalidation, always use CMDQ_OP_TLBI_NH_VA
  independently on host ARM_SMMU_FEAT_E2H support (Zenghui)
- remove  iommu/smmuv3: Accept configs with more than one
  context descriptor
- Remove spurious arm_smmu_cmdq_issue_sync in
  IOMMU_INV_GRANU_ADDR cache invalidation (Zenghui)
- dma-iommu.c changes induced by Zenghui's comments
  including the locking rework
- fix cache invalidation when guest uses RIL
  and host does not support it (Chenxiang)
- removed iommu/smmuv3: Accept configs with more than one
  context descriptor (Zenghui, Shameer)
- At this point I have kept the MSI binding API.

v13 -> v14:
- Took into account all received comments I think. Great
  thanks to all the testers for their effort and sometimes
  tentative fixes. I am really grateful to you!
- numerous fixes including guest running in
  noiommu, iommu.strict=0, iommu.passthrough=on,
  enable_unsafe_noiommu_mode

v12 -> v13:
- fixed compilation issue with CONFIG_ARM_SMMU_V3_SVA
  reported by Shameer. This urged me to revisit patch 4 into
  iommu/smmuv3: Allow s1 and s2 configs to coexist where
  s1_cfg and s2_cfg are not dynamically allocated anymore.
  Instead I use a new set field in existing structs
- fixed 2 others config checks
- Updated "iommu/arm-smmu-v3: Maintain a SID->device structure"
  according to the last version

v11 -> v12:
- rebase on top of v5.10-rc4


Eric Auger (12):
  iommu: Introduce attach/detach_pasid_table API
  iommu: Introduce bind/unbind_guest_msi
  iommu/smmuv3: Allow s1 and s2 configs to coexist
  iommu/smmuv3: Get prepared for nested stage support
  iommu/smmuv3: Implement attach/detach_pasid_table
  iommu/smmuv3: Allow stage 1 invalidation with unmanaged ASIDs
  iommu/smmuv3: Implement cache_invalidate
  dma-iommu: Implement NESTED_MSI cookie
  iommu/smmuv3: Nested mode single MSI doorbell per domain enforcement
  iommu/smmuv3: Enforce incompatibility between nested mode and HW MSI
regions
  iommu/smmuv3: Implement bind/unbind_guest_msi
  iommu/smmuv3: report additional recoverable faults

 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 463 ++--
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h |  14 +-
 drivers/iommu/dma-iommu.c   | 180 +++-
 drivers/iommu/iommu.c   | 106 +
 include/linux/dma-iommu.h   |  16 +
 include/linux/iommu.h   |  47 ++
 include/uapi/linux/iommu.h  |  54 +++
 7 files changed, 838 insertions(+), 42 deletions(-)

-- 
2.26.3



[PATCH v12 10/13] vfio/pci: Register and allow DMA FAULT IRQ signaling

2021-02-23 Thread Eric Auger
Register the VFIO_IRQ_TYPE_NESTED/VFIO_IRQ_SUBTYPE_DMA_FAULT
IRQ that allows signaling a nested mode DMA fault.
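
For illustration, userspace wires up the eventfd roughly as follows
(a minimal sketch; the extended IRQ index is discovered through the
IRQ info capability chain introduced earlier in this series):

    #include <stdint.h>
    #include <sys/eventfd.h>
    #include <sys/ioctl.h>
    #include <linux/vfio.h>

    /* Sketch only: attach an eventfd to the DMA fault extended IRQ. */
    static int set_dma_fault_eventfd(int device, uint32_t dma_fault_index,
                                     int efd)
    {
            char buf[sizeof(struct vfio_irq_set) + sizeof(int32_t)];
            struct vfio_irq_set *irq_set = (struct vfio_irq_set *)buf;

            irq_set->argsz = sizeof(buf);
            irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
                             VFIO_IRQ_SET_ACTION_TRIGGER;
            irq_set->index = dma_fault_index;
            irq_set->start = 0;
            irq_set->count = 1;
            *(int32_t *)&irq_set->data = efd;

            return ioctl(device, VFIO_DEVICE_SET_IRQS, irq_set);
    }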

Signed-off-by: Eric Auger 

---

v10 -> v11:
- the irq now is registered in vfio_pci_dma_fault_init()
  in case the domain is nested
---
 drivers/vfio/pci/vfio_pci.c | 21 -
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index e6b94bad77da..f3fa6d4318ae 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -396,6 +396,7 @@ vfio_pci_iommu_dev_fault_handler(struct iommu_fault *fault, 
void *data)
(struct vfio_region_dma_fault *)vdev->fault_pages;
struct iommu_fault *new;
u32 head, tail, size;
+   int ext_irq_index;
int ret = -EINVAL;
 
if (WARN_ON(!reg))
@@ -420,7 +421,19 @@ vfio_pci_iommu_dev_fault_handler(struct iommu_fault 
*fault, void *data)
ret = 0;
 unlock:
mutex_unlock(&vdev->fault_queue_lock);
-   return ret;
+   if (ret)
+   return ret;
+
+   ext_irq_index = vfio_pci_get_ext_irq_index(vdev, VFIO_IRQ_TYPE_NESTED,
+  VFIO_IRQ_SUBTYPE_DMA_FAULT);
+   if (ext_irq_index < 0)
+   return -EINVAL;
+
+   mutex_lock(&vdev->igate);
+   if (vdev->ext_irqs[ext_irq_index].trigger)
+   eventfd_signal(vdev->ext_irqs[ext_irq_index].trigger, 1);
+   mutex_unlock(&vdev->igate);
+   return 0;
 }
 
 #define DMA_FAULT_RING_LENGTH 512
@@ -475,6 +488,12 @@ static int vfio_pci_dma_fault_init(struct vfio_pci_device 
*vdev)
if (ret) /* the dma fault region is freed in vfio_pci_disable() */
goto out;
 
+   ret = vfio_pci_register_irq(vdev, VFIO_IRQ_TYPE_NESTED,
+   VFIO_IRQ_SUBTYPE_DMA_FAULT,
+   VFIO_IRQ_INFO_EVENTFD);
+   if (ret) /* the fault handler is also freed in vfio_pci_disable() */
+   goto out;
+
return 0;
 out:
kfree(vdev->fault_pages);
-- 
2.26.2



[PATCH v12 13/13] vfio/pci: Inject page response upon response region fill

2021-02-23 Thread Eric Auger
When the userspace increments the head of the page response
buffer ring, let's push the response into the iommu layer.
This is done through a workqueue that pops the responses from
the ring buffer and increments the tail.

Signed-off-by: Eric Auger 
---
 drivers/vfio/pci/vfio_pci.c | 40 +
 drivers/vfio/pci/vfio_pci_private.h |  7 +
 drivers/vfio/pci/vfio_pci_rdwr.c|  1 +
 3 files changed, 48 insertions(+)

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 9f1f5008e556..a41497779a68 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -552,6 +552,32 @@ static int vfio_pci_dma_fault_init(struct vfio_pci_device 
*vdev)
return ret;
 }
 
+static void dma_response_inject(struct work_struct *work)
+{
+   struct vfio_pci_dma_fault_response_work *rwork =
+   container_of(work, struct vfio_pci_dma_fault_response_work, inject);
+   struct vfio_region_dma_fault_response *header = rwork->header;
+   struct vfio_pci_device *vdev = rwork->vdev;
+   struct iommu_page_response *resp;
+   u32 tail, head, size;
+
+   mutex_lock(&vdev->fault_response_queue_lock);
+
+   tail = header->tail;
+   head = header->head;
+   size = header->nb_entries;
+
+   while (CIRC_CNT(head, tail, size) >= 1) {
+   resp = (struct iommu_page_response *)(vdev->fault_response_pages +
+   header->offset + tail * header->entry_size);
+
+   /* TODO: properly handle the return value */
+   iommu_page_response(&vdev->pdev->dev, resp);
+   header->tail = tail = (tail + 1) % size;
+   }
+   mutex_unlock(&vdev->fault_response_queue_lock);
+}
+
 #define DMA_FAULT_RESPONSE_RING_LENGTH 512
 
 static int vfio_pci_dma_fault_response_init(struct vfio_pci_device *vdev)
@@ -597,8 +623,22 @@ static int vfio_pci_dma_fault_response_init(struct 
vfio_pci_device *vdev)
header->nb_entries = DMA_FAULT_RESPONSE_RING_LENGTH;
header->offset = PAGE_SIZE;
 
+   vdev->response_work = kzalloc(sizeof(*vdev->response_work), GFP_KERNEL);
+   if (!vdev->response_work) {
+   ret = -ENOMEM;
+   goto out;
+   }
+   vdev->response_work->header = header;
+   vdev->response_work->vdev = vdev;
+
+   /* launch the thread that will extract the response */
+   INIT_WORK(&vdev->response_work->inject, dma_response_inject);
+   vdev->dma_fault_response_wq =
+   create_singlethread_workqueue("vfio-dma-fault-response");
+   if (!vdev->dma_fault_response_wq) {
+   ret = -ENOMEM;
+   goto out;
+   }
+
return 0;
 out:
+   kfree(vdev->fault_response_pages);
vdev->fault_response_pages = NULL;
return ret;
 }
diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
index 82a883c101c9..5944f96ced0c 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -52,6 +52,12 @@ struct vfio_pci_irq_ctx {
struct irq_bypass_producer  producer;
 };
 
+struct vfio_pci_dma_fault_response_work {
+   struct work_struct inject;
+   struct vfio_region_dma_fault_response *header;
+   struct vfio_pci_device *vdev;
+};
+
 struct vfio_pci_device;
 struct vfio_pci_region;
 
@@ -146,6 +152,7 @@ struct vfio_pci_device {
u8  *fault_pages;
u8  *fault_response_pages;
struct workqueue_struct *dma_fault_response_wq;
+   struct vfio_pci_dma_fault_response_work *response_work;
struct mutexfault_queue_lock;
struct mutexfault_response_queue_lock;
struct list_headdummy_resources_list;
diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c
index efde0793360b..78c494fe35cc 100644
--- a/drivers/vfio/pci/vfio_pci_rdwr.c
+++ b/drivers/vfio/pci/vfio_pci_rdwr.c
@@ -430,6 +430,7 @@ size_t vfio_pci_dma_fault_response_rw(struct 
vfio_pci_device *vdev, char __user
mutex_lock(&vdev->fault_response_queue_lock);
header->head = new_head;
mutex_unlock(&vdev->fault_response_queue_lock);
+   queue_work(vdev->dma_fault_response_wq,
+  &vdev->response_work->inject);
} else {
if (copy_to_user(buf, base + pos, count))
return -EFAULT;
-- 
2.26.2



[PATCH v12 12/13] vfio/pci: Register a DMA fault response region

2021-02-23 Thread Eric Auger
In preparation for vSVA, let's register a DMA fault response region,
where the userspace will push the page responses and increment the
head of the buffer. The kernel will pop those responses and inject them
on the IOMMU side.
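
For illustration, the userspace producer side could look as follows
(a hedged sketch: the header fields follow this series' uapi, the
ring is the mmapped second page, and the head update is published
through a write to the region as implemented in the last patch):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>
    #include <unistd.h>
    #include <linux/iommu.h>
    #include <linux/vfio.h>

    /* Sketch only: push one page response and publish the new head. */
    static int push_page_response(int device, uint64_t region_offset,
                                  struct vfio_region_dma_fault_response *hdr,
                                  uint8_t *ring,
                                  struct iommu_page_response *resp)
    {
            uint32_t head = hdr->head;

            memcpy(ring + head * hdr->entry_size, resp, sizeof(*resp));
            head = (head + 1) % hdr->nb_entries;
            /* the kernel pops and injects once the head moves */
            if (pwrite(device, &head, sizeof(head), region_offset +
                       offsetof(struct vfio_region_dma_fault_response,
                                head)) != sizeof(head))
                    return -1;
            return 0;
    }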

Signed-off-by: Eric Auger 

---

v11 -> v12:
- use DMA_FAULT_RESPONSE cap [Shameer]
- struct vfio_pci_device dma_fault_response_wq field introduced in
  this patch
- return 0 if the domain is NULL
- pass an int pointer to iommu_domain_get_attr
---
 drivers/vfio/pci/vfio_pci.c | 125 ++--
 drivers/vfio/pci/vfio_pci_private.h |   6 ++
 drivers/vfio/pci/vfio_pci_rdwr.c|  39 +
 include/uapi/linux/vfio.h   |  32 +++
 4 files changed, 193 insertions(+), 9 deletions(-)

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index f3fa6d4318ae..9f1f5008e556 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -316,9 +316,20 @@ static void vfio_pci_dma_fault_release(struct 
vfio_pci_device *vdev,
kfree(vdev->fault_pages);
 }
 
-static int vfio_pci_dma_fault_mmap(struct vfio_pci_device *vdev,
-  struct vfio_pci_region *region,
-  struct vm_area_struct *vma)
+static void
+vfio_pci_dma_fault_response_release(struct vfio_pci_device *vdev,
+   struct vfio_pci_region *region)
+{
+   if (vdev->dma_fault_response_wq)
+   destroy_workqueue(vdev->dma_fault_response_wq);
+   kfree(vdev->fault_response_pages);
+   vdev->fault_response_pages = NULL;
+}
+
+static int __vfio_pci_dma_fault_mmap(struct vfio_pci_device *vdev,
+struct vfio_pci_region *region,
+struct vm_area_struct *vma,
+u8 *pages)
 {
u64 phys_len, req_len, pgoff, req_start;
unsigned long long addr;
@@ -331,14 +342,14 @@ static int vfio_pci_dma_fault_mmap(struct vfio_pci_device 
*vdev,
((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
req_start = pgoff << PAGE_SHIFT;
 
-   /* only the second page of the producer fault region is mmappable */
+   /* only the second page of the fault region is mmappable */
if (req_start < PAGE_SIZE)
return -EINVAL;
 
if (req_start + req_len > phys_len)
return -EINVAL;
 
-   addr = virt_to_phys(vdev->fault_pages);
+   addr = virt_to_phys(pages);
vma->vm_private_data = vdev;
vma->vm_pgoff = (addr >> PAGE_SHIFT) + pgoff;
 
@@ -347,13 +358,29 @@ static int vfio_pci_dma_fault_mmap(struct vfio_pci_device 
*vdev,
return ret;
 }
 
-static int vfio_pci_dma_fault_add_capability(struct vfio_pci_device *vdev,
-struct vfio_pci_region *region,
-struct vfio_info_cap *caps)
+static int vfio_pci_dma_fault_mmap(struct vfio_pci_device *vdev,
+  struct vfio_pci_region *region,
+  struct vm_area_struct *vma)
+{
+   return __vfio_pci_dma_fault_mmap(vdev, region, vma, vdev->fault_pages);
+}
+
+static int
+vfio_pci_dma_fault_response_mmap(struct vfio_pci_device *vdev,
+   struct vfio_pci_region *region,
+   struct vm_area_struct *vma)
+{
+   return __vfio_pci_dma_fault_mmap(vdev, region, vma, 
vdev->fault_response_pages);
+}
+
+static int __vfio_pci_dma_fault_add_capability(struct vfio_pci_device *vdev,
+  struct vfio_pci_region *region,
+  struct vfio_info_cap *caps,
+  u32 cap_id)
 {
struct vfio_region_info_cap_sparse_mmap *sparse = NULL;
struct vfio_region_info_cap_fault cap = {
-   .header.id = VFIO_REGION_INFO_CAP_DMA_FAULT,
+   .header.id = cap_id,
.header.version = 1,
.version = 1,
};
@@ -381,6 +408,23 @@ static int vfio_pci_dma_fault_add_capability(struct 
vfio_pci_device *vdev,
return ret;
 }
 
+static int vfio_pci_dma_fault_add_capability(struct vfio_pci_device *vdev,
+struct vfio_pci_region *region,
+struct vfio_info_cap *caps)
+{
+   return __vfio_pci_dma_fault_add_capability(vdev, region, caps,
+  VFIO_REGION_INFO_CAP_DMA_FAULT);
+}
+
+static int
+vfio_pci_dma_fault_response_add_capability(struct vfio_pci_device *vdev,
+  struct vfio_pci_region *region,
+  struct vfio_info_cap *caps)
+{
+   retu

[PATCH v12 08/13] vfio/pci: Add framework for custom interrupt indices

2021-02-23 Thread Eric Auger
Implement IRQ capability chain infrastructure. All interrupt
indexes beyond VFIO_PCI_NUM_IRQS are handled as extended
interrupts. They are registered with a specific type/subtype
and supported flags.
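
For illustration, userspace can locate an extended IRQ as follows (a
hedged sketch; error handling and the flags check are elided, and the
walk relies on the cap_offset field added by this patch):

    #include <stdint.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/vfio.h>

    /* Sketch only: find the extended IRQ matching type/subtype. */
    static int find_ext_irq_index(int device, uint32_t num_irqs,
                                  uint32_t type, uint32_t subtype)
    {
            char buf[4096];
            uint32_t i;

            for (i = VFIO_PCI_NUM_IRQS; i < num_irqs; i++) {
                    struct vfio_irq_info *info = (struct vfio_irq_info *)buf;
                    struct vfio_info_cap_header *hdr;

                    memset(buf, 0, sizeof(buf));
                    info->argsz = sizeof(buf);
                    info->index = i;
                    if (ioctl(device, VFIO_DEVICE_GET_IRQ_INFO, info) ||
                        !info->cap_offset)
                            continue;
                    hdr = (struct vfio_info_cap_header *)(buf + info->cap_offset);
                    for (;;) {
                            if (hdr->id == VFIO_IRQ_INFO_CAP_TYPE) {
                                    struct vfio_irq_info_cap_type *t =
                                            (struct vfio_irq_info_cap_type *)hdr;

                                    if (t->type == type && t->subtype == subtype)
                                            return i;
                            }
                            if (!hdr->next)
                                    break;
                            hdr = (struct vfio_info_cap_header *)(buf + hdr->next);
                    }
            }
            return -1;
    }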

Signed-off-by: Eric Auger 

---

v11 -> v12:
- check !vdev->num_ext_irqs in vfio_pci_set_ext_irq_trigger()
  [Shameer, Qubingbing]
---
 drivers/vfio/pci/vfio_pci.c | 99 +++--
 drivers/vfio/pci/vfio_pci_intrs.c   | 62 ++
 drivers/vfio/pci/vfio_pci_private.h | 14 
 3 files changed, 157 insertions(+), 18 deletions(-)

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index a528b72b57d2..e6b94bad77da 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -610,6 +610,14 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev)
ret = iommu_unregister_device_fault_handler(&vdev->pdev->dev);
WARN_ON(ret == -EBUSY);
 
+   for (i = 0; i < vdev->num_ext_irqs; i++)
+   vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE |
+   VFIO_IRQ_SET_ACTION_TRIGGER,
+   VFIO_PCI_NUM_IRQS + i, 0, 0, NULL);
+   vdev->num_ext_irqs = 0;
+   kfree(vdev->ext_irqs);
+   vdev->ext_irqs = NULL;
+
/* Device closed, don't need mutex here */
list_for_each_entry_safe(ioeventfd, ioeventfd_tmp,
 &vdev->ioeventfds_list, next) {
@@ -825,6 +833,9 @@ static int vfio_pci_get_irq_count(struct vfio_pci_device 
*vdev, int irq_type)
return 1;
} else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) {
return 1;
+   } else if (irq_type >= VFIO_PCI_NUM_IRQS &&
+  irq_type < VFIO_PCI_NUM_IRQS + vdev->num_ext_irqs) {
+   return 1;
}
 
return 0;
@@ -1010,7 +1021,7 @@ static long vfio_pci_ioctl(void *device_data,
info.flags |= VFIO_DEVICE_FLAGS_RESET;
 
info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions;
-   info.num_irqs = VFIO_PCI_NUM_IRQS;
+   info.num_irqs = VFIO_PCI_NUM_IRQS + vdev->num_ext_irqs;
 
if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV)) {
int ret = vfio_pci_info_zdev_add_caps(vdev, &caps);
@@ -1189,36 +1200,87 @@ static long vfio_pci_ioctl(void *device_data,
 
} else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
struct vfio_irq_info info;
+   struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
+   unsigned long capsz;
 
minsz = offsetofend(struct vfio_irq_info, count);
 
+   /* For backward compatibility, cannot require this */
+   capsz = offsetofend(struct vfio_irq_info, cap_offset);
+
if (copy_from_user(&info, (void __user *)arg, minsz))
return -EFAULT;
 
-   if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
+   if (info.argsz < minsz ||
+   info.index >= VFIO_PCI_NUM_IRQS + vdev->num_ext_irqs)
return -EINVAL;
 
-   switch (info.index) {
-   case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX:
-   case VFIO_PCI_REQ_IRQ_INDEX:
-   break;
-   case VFIO_PCI_ERR_IRQ_INDEX:
-   if (pci_is_pcie(vdev->pdev))
-   break;
-   fallthrough;
-   default:
-   return -EINVAL;
-   }
+   if (info.argsz >= capsz)
+   minsz = capsz;
 
info.flags = VFIO_IRQ_INFO_EVENTFD;
 
-   info.count = vfio_pci_get_irq_count(vdev, info.index);
-
-   if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
+   switch (info.index) {
+   case VFIO_PCI_INTX_IRQ_INDEX:
info.flags |= (VFIO_IRQ_INFO_MASKABLE |
   VFIO_IRQ_INFO_AUTOMASKED);
-   else
+   break;
+   case VFIO_PCI_MSI_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX:
+   case VFIO_PCI_REQ_IRQ_INDEX:
info.flags |= VFIO_IRQ_INFO_NORESIZE;
+   break;
+   case VFIO_PCI_ERR_IRQ_INDEX:
+   info.flags |= VFIO_IRQ_INFO_NORESIZE;
+   if (!pci_is_pcie(vdev->pdev))
+   return -EINVAL;
+   break;
+   default:
+   {
+   struct vfio_irq_info_cap_type cap_type = {
+   .header.id = VFIO_IRQ_INFO_CAP_TYPE,
+   .header.version = 1 };
+   int ret, i;
+
+   if (info.index >= VFIO_P

[PATCH v12 11/13] vfio: Document nested stage control

2021-02-23 Thread Eric Auger
The VFIO API was enhanced to support nested stage control: a bunch of
new ioctls, one DMA FAULT region and an associated specific IRQ.

Let's document the process to follow to set up nested mode.

Signed-off-by: Eric Auger 

---

v11 -> v12:
s/VFIO_REGION_INFO_CAP_PRODUCER_FAULT/VFIO_REGION_INFO_CAP_DMA_FAULT

v8 -> v9:
- new names for SET_MSI_BINDING and SET_PASID_TABLE
- new layout for the DMA FAULT memory region and specific IRQ

v2 -> v3:
- document the new fault API

v1 -> v2:
- use the new ioctl names
- add doc related to fault handling
---
 Documentation/driver-api/vfio.rst | 77 +++
 1 file changed, 77 insertions(+)

diff --git a/Documentation/driver-api/vfio.rst b/Documentation/driver-api/vfio.rst
index f1a4d3c3ba0b..14e41324237d 100644
--- a/Documentation/driver-api/vfio.rst
+++ b/Documentation/driver-api/vfio.rst
@@ -239,6 +239,83 @@ group and can access them as follows::
/* Gratuitous device reset and go... */
ioctl(device, VFIO_DEVICE_RESET);
 
+IOMMU Dual Stage Control
+
+
+Some IOMMUs support 2 stages/levels of translation. "Stage" corresponds to
+the ARM terminology while "level" corresponds to Intel VT-d terminology. In
+the following text we use either without distinction.
+
+This is useful when a virtual IOMMU is exposed to the guest and some
+devices are assigned to the guest through VFIO. Then the guest OS can use
+stage 1 (IOVA -> GPA), while the hypervisor uses stage 2 for VM isolation
+(GPA -> HPA).
+
+The guest gets ownership of the stage 1 page tables and also owns stage 1
+configuration structures. The hypervisor owns the root configuration structure
+(for security reasons), including the stage 2 configuration. This works as
+long as configuration structures and page table formats are compatible between the
+virtual IOMMU and the physical IOMMU.
+
+Assuming the HW supports it, this nested mode is selected by choosing the
+VFIO_TYPE1_NESTING_IOMMU type through:
+
+ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_NESTING_IOMMU);
+
+This forces the hypervisor to use the stage 2, leaving stage 1 available for
+guest usage.
+
+Once groups are attached to the container, the guest stage 1 translation
+configuration data can be passed to VFIO by using
+
+ioctl(container, VFIO_IOMMU_SET_PASID_TABLE, &pasid_table_info);
+
+This combines the guest stage 1 configuration structure with
+the hypervisor stage 2 configuration structure. Stage 1 configuration
+structures are dependent on the IOMMU type.
+
+As the stage 1 translation is fully delegated to the HW, translation faults
+encountered during the translation process need to be propagated up to
+the virtualizer and re-injected into the guest.
+
+The userspace must be prepared to receive faults. The VFIO-PCI device
+exposes one dedicated DMA FAULT region: it contains a ring buffer and
+a header that allows managing the head/tail indices. The region is
+identified by the following index/subindex:
+- VFIO_REGION_TYPE_NESTED/VFIO_REGION_SUBTYPE_NESTED_DMA_FAULT
+
+The DMA FAULT region exposes a VFIO_REGION_INFO_CAP_DMA_FAULT
+region capability that allows the userspace to retrieve the ABI version
+of the fault records filled by the host.
+
+On top of that region, the userspace can be notified whenever a fault
+occurs at the physical level. It can use the VFIO_IRQ_TYPE_NESTED/
+VFIO_IRQ_SUBTYPE_DMA_FAULT specific IRQ to attach the eventfd to be
+signalled.
+
+The ring buffer containing the fault records can be mmapped. When
+the userspace consumes a fault in the queue, it should increment
+the consumer index to allow new fault records to replace the used ones.
+
+The queue size and the entry size can be retrieved in the header.
+The tail index should never overshoot the producer index as in any
+other circular buffer scheme. Also it must be less than the queue size
+otherwise the change fails.
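+
+For illustration, a minimal consumer loop could look as follows (a
+sketch: "ring" is the mmapped second page of the region, "hdr" a copy
+of the header obtained with pread(), handle_fault() a hypothetical
+application handler, and entry_size is assumed to match
+sizeof(struct iommu_fault))::
+
+    struct iommu_fault *ring = mmapped_ring;
+    uint32_t tail = hdr.tail;
+
+    while (tail != hdr.head) {
+        handle_fault(&ring[tail]);
+        tail = (tail + 1) % hdr.nb_entries;
+    }
+    /* publish the new tail with a pwrite() into the region header */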
+
+When the guest invalidates stage 1 related caches, invalidations must be
+forwarded to the host through
+ioctl(container, VFIO_IOMMU_CACHE_INVALIDATE, &inv_data);
+Those invalidations can happen at various granularity levels: page, context, ...
+
+The ARM SMMU specification introduces another challenge: MSIs are translated by
+both the virtual SMMU and the physical SMMU. To build a nested mapping for the
+IOVA programmed into the assigned device, the guest needs to pass its IOVA/MSI
+doorbell GPA binding to the host. Then the hypervisor can build a nested stage 2
+binding eventually translating into the physical MSI doorbell.
+
+This is achieved by calling
+ioctl(container, VFIO_IOMMU_SET_MSI_BINDING, &msi_binding);
+
 VFIO User API
 ---
 
-- 
2.26.2



[PATCH v12 06/13] vfio/pci: Allow to mmap the fault queue

2021-02-23 Thread Eric Auger
The DMA FAULT region contains the fault ring buffer.
There is benefit in letting the userspace mmap this area.
Expose this mmappable area through a sparse mmap entry
and implement the mmap operation.
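
For illustration, the userspace mapping then becomes (a minimal
sketch; region_info was returned by VFIO_DEVICE_GET_REGION_INFO for
the nested DMA fault region):

    #include <sys/mman.h>
    #include <unistd.h>
    #include <linux/vfio.h>

    /* Sketch only: map the ring, which starts after the header page. */
    static void *mmap_fault_ring(int device, struct vfio_region_info *info)
    {
            long psz = sysconf(_SC_PAGESIZE);

            return mmap(NULL, info->size - psz, PROT_READ | PROT_WRITE,
                        MAP_SHARED, device, info->offset + psz);
    }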

Signed-off-by: Eric Auger 

---

v8 -> v9:
- remove unused index local variable in vfio_pci_fault_mmap
---
 drivers/vfio/pci/vfio_pci.c | 61 +++--
 1 file changed, 58 insertions(+), 3 deletions(-)

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index ad3fe0ce2e64..a528b72b57d2 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -316,21 +316,75 @@ static void vfio_pci_dma_fault_release(struct 
vfio_pci_device *vdev,
kfree(vdev->fault_pages);
 }
 
+static int vfio_pci_dma_fault_mmap(struct vfio_pci_device *vdev,
+  struct vfio_pci_region *region,
+  struct vm_area_struct *vma)
+{
+   u64 phys_len, req_len, pgoff, req_start;
+   unsigned long long addr;
+   int ret;
+
+   phys_len = region->size;
+
+   req_len = vma->vm_end - vma->vm_start;
+   pgoff = vma->vm_pgoff &
+   ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
+   req_start = pgoff << PAGE_SHIFT;
+
+   /* only the second page of the producer fault region is mmappable */
+   if (req_start < PAGE_SIZE)
+   return -EINVAL;
+
+   if (req_start + req_len > phys_len)
+   return -EINVAL;
+
+   addr = virt_to_phys(vdev->fault_pages);
+   vma->vm_private_data = vdev;
+   vma->vm_pgoff = (addr >> PAGE_SHIFT) + pgoff;
+
+   ret = remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
+ req_len, vma->vm_page_prot);
+   return ret;
+}
+
 static int vfio_pci_dma_fault_add_capability(struct vfio_pci_device *vdev,
 struct vfio_pci_region *region,
 struct vfio_info_cap *caps)
 {
+   struct vfio_region_info_cap_sparse_mmap *sparse = NULL;
struct vfio_region_info_cap_fault cap = {
.header.id = VFIO_REGION_INFO_CAP_DMA_FAULT,
.header.version = 1,
.version = 1,
};
-   return vfio_info_add_capability(caps, &cap, sizeof(cap));
+   size_t size = sizeof(*sparse) + sizeof(*sparse->areas);
+   int ret;
+
+   ret = vfio_info_add_capability(caps, &cap, sizeof(cap));
+   if (ret)
+   return ret;
+
+   sparse = kzalloc(size, GFP_KERNEL);
+   if (!sparse)
+   return -ENOMEM;
+
+   sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
+   sparse->header.version = 1;
+   sparse->nr_areas = 1;
+   sparse->areas[0].offset = PAGE_SIZE;
+   sparse->areas[0].size = region->size - PAGE_SIZE;
+
+   ret = vfio_info_add_capability(caps, &sparse->header, size);
+   if (ret)
+   kfree(sparse);
+
+   return ret;
 }
 
 static const struct vfio_pci_regops vfio_pci_dma_fault_regops = {
.rw = vfio_pci_dma_fault_rw,
.release= vfio_pci_dma_fault_release,
+   .mmap   = vfio_pci_dma_fault_mmap,
.add_capability = vfio_pci_dma_fault_add_capability,
 };
 
@@ -404,7 +458,8 @@ static int vfio_pci_dma_fault_init(struct vfio_pci_device 
*vdev)
VFIO_REGION_TYPE_NESTED,
VFIO_REGION_SUBTYPE_NESTED_DMA_FAULT,
&vfio_pci_dma_fault_regops, size,
-   VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE,
+   VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE |
+   VFIO_REGION_INFO_FLAG_MMAP,
vdev->fault_pages);
if (ret)
goto out;
@@ -412,7 +467,7 @@ static int vfio_pci_dma_fault_init(struct vfio_pci_device 
*vdev)
header = (struct vfio_region_dma_fault *)vdev->fault_pages;
header->entry_size = sizeof(struct iommu_fault);
header->nb_entries = DMA_FAULT_RING_LENGTH;
-   header->offset = sizeof(struct vfio_region_dma_fault);
+   header->offset = PAGE_SIZE;
 
ret = iommu_register_device_fault_handler(>pdev->dev,
vfio_pci_iommu_dev_fault_handler,
-- 
2.26.2



[PATCH v12 05/13] vfio/pci: Register an iommu fault handler

2021-02-23 Thread Eric Auger
Register an IOMMU fault handler which records faults in
the DMA FAULT region ring buffer. In a subsequent patch, we
will add the signaling of a specific eventfd to allow the
userspace to be notified whenever a new fault has shown up.
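
For illustration, the matching userspace side reads the ring geometry
back with a plain pread() on the region (a minimal sketch; struct
vfio_region_dma_fault is the header introduced in the previous patch):

    #include <stdint.h>
    #include <unistd.h>
    #include <linux/vfio.h>

    /* Sketch only: fetch the header (entry_size, nb_entries, offset,
     * head/tail) placed at the beginning of the DMA FAULT region. */
    static int read_fault_header(int device, uint64_t region_offset,
                                 struct vfio_region_dma_fault *header)
    {
            ssize_t n = pread(device, header, sizeof(*header),
                              region_offset);

            return n == (ssize_t)sizeof(*header) ? 0 : -1;
    }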

Signed-off-by: Eric Auger 

---
v11 -> v12:
- take the fault_queue_lock before reading header (Zenghui)
- also record recoverable errors
- only WARN_ON if the unregistration returns -EBUSY
- make vfio_pci_iommu_dev_fault_handler static

v10 -> v11:
- move iommu_unregister_device_fault_handler into
  vfio_pci_disable
- check fault_pages != 0

v8 -> v9:
- handler now takes an iommu_fault handle
- eventfd signaling moved to a subsequent patch
- check the fault type and return an error if != UNRECOV
- still the fault handler registration can fail. We need to
  reach an agreement about how to deal with the situation

v3 -> v4:
- move iommu_unregister_device_fault_handler to vfio_pci_release
---
 drivers/vfio/pci/vfio_pci.c | 48 -
 1 file changed, 47 insertions(+), 1 deletion(-)

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index e9a4a1c502c7..ad3fe0ce2e64 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -27,6 +27,7 @@
 #include 
 #include 
 #include 
+#include <linux/circ_buf.h>
 
 #include "vfio_pci_private.h"
 
@@ -333,6 +334,41 @@ static const struct vfio_pci_regops 
vfio_pci_dma_fault_regops = {
.add_capability = vfio_pci_dma_fault_add_capability,
 };
 
+static int
+vfio_pci_iommu_dev_fault_handler(struct iommu_fault *fault, void *data)
+{
+   struct vfio_pci_device *vdev = (struct vfio_pci_device *)data;
+   struct vfio_region_dma_fault *reg =
+   (struct vfio_region_dma_fault *)vdev->fault_pages;
+   struct iommu_fault *new;
+   u32 head, tail, size;
+   int ret = -EINVAL;
+
+   if (WARN_ON(!reg))
+   return ret;
+
+   mutex_lock(&vdev->fault_queue_lock);
+
+   head = reg->head;
+   tail = reg->tail;
+   size = reg->nb_entries;
+
+   new = (struct iommu_fault *)(vdev->fault_pages + reg->offset +
+head * reg->entry_size);
+
+   if (CIRC_SPACE(head, tail, size) < 1) {
+   ret = -ENOSPC;
+   goto unlock;
+   }
+
+   *new = *fault;
+   reg->head = (head + 1) % size;
+   ret = 0;
+unlock:
+   mutex_unlock(&vdev->fault_queue_lock);
+   return ret;
+}
+
 #define DMA_FAULT_RING_LENGTH 512
 
 static int vfio_pci_dma_fault_init(struct vfio_pci_device *vdev)
@@ -377,6 +413,13 @@ static int vfio_pci_dma_fault_init(struct vfio_pci_device 
*vdev)
header->entry_size = sizeof(struct iommu_fault);
header->nb_entries = DMA_FAULT_RING_LENGTH;
header->offset = sizeof(struct vfio_region_dma_fault);
+
+   ret = iommu_register_device_fault_handler(&vdev->pdev->dev,
+   vfio_pci_iommu_dev_fault_handler,
+   vdev);
+   if (ret) /* the dma fault region is freed in vfio_pci_disable() */
+   goto out;
+
return 0;
 out:
kfree(vdev->fault_pages);
@@ -500,7 +543,7 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev)
struct pci_dev *pdev = vdev->pdev;
struct vfio_pci_dummy_resource *dummy_res, *tmp;
struct vfio_pci_ioeventfd *ioeventfd, *ioeventfd_tmp;
-   int i, bar;
+   int i, bar, ret;
 
/* Stop the device from further DMA */
pci_clear_master(pdev);
@@ -509,6 +552,9 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev)
VFIO_IRQ_SET_ACTION_TRIGGER,
vdev->irq_type, 0, 0, NULL);
 
+   ret = iommu_unregister_device_fault_handler(&vdev->pdev->dev);
+   WARN_ON(ret == -EBUSY);
+
/* Device closed, don't need mutex here */
list_for_each_entry_safe(ioeventfd, ioeventfd_tmp,
 >ioeventfds_list, next) {
-- 
2.26.2



[PATCH v12 09/13] vfio: Add new IRQ for DMA fault reporting

2021-02-23 Thread Eric Auger
Add a new IRQ type/subtype to get notification on nested
stage DMA faults.

Signed-off-by: Eric Auger 
---
 include/uapi/linux/vfio.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 3688215843fe..f0eea0f9305a 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -723,6 +723,9 @@ struct vfio_irq_info_cap_type {
__u32 subtype;  /* type specific */
 };
 
+#define VFIO_IRQ_TYPE_NESTED   (1)
+#define VFIO_IRQ_SUBTYPE_DMA_FAULT (1)
+
 /**
  * VFIO_DEVICE_SET_IRQS - _IOW(VFIO_TYPE, VFIO_BASE + 10, struct vfio_irq_set)
  *
-- 
2.26.2



[PATCH v12 00/13] SMMUv3 Nested Stage Setup (VFIO part)

2021-02-23 Thread Eric Auger
This series brings the VFIO part of HW nested paging support
in the SMMUv3.

This is a rebase on top of v5.11

The series depends on:
[PATCH v14 00/13] SMMUv3 Nested Stage Setup (IOMMU part)

3 new IOCTLs are introduced that allow the userspace to
1) pass the guest stage 1 configuration
2) pass stage 1 MSI bindings
3) invalidate stage 1 related caches

They map onto the related new IOMMU API functions.

We introduce the capability to register specific interrupt
indexes (see [1]). A new DMA_FAULT interrupt index allows to register
an eventfd to be signaled whenever a stage 1 related fault
is detected at physical level. Also two specific regions allow to
- expose the fault records to the user space and
- inject page responses.

This latter functionality is not exercised in this series
but is provided as a POC for further vSVA activities (Shameer's input).

Best Regards

Eric

This series can be found at:
https://github.com/eauger/linux/tree/v5.11-stallv12-2stage-v14

The series includes Tina's patch stemming from
[1] "[RFC PATCH v2 1/3] vfio: Use capability chains to handle device
specific irq" plus patches originally contributed by Yi.

History:

v11 -> v12:
- numerous fixes following feedbacks. Many thanks to all of you
- See individual history log.

v10 -> v11:
- rebase on top of v5.10-rc4
- adapt to changes on the IOMMU API (compliant with the doc
  written by Jacob/Yi)
- addition of the page response region
- Took into account Zenghui's comments
- In this version I have kept the ioctl separate. Since
  Yi's series [2] is currently stalled, I've just rebased here.

[2] [PATCH v7 00/16] vfio: expose virtual Shared Virtual Addressing
to VMs

v9 -> v10
- rebase on top of 5.6.0-rc3 (no change versus v9)

v8 -> v9:
- introduce specific irq framework
- single fault region
- iommu_unregister_device_fault_handler failure case not handled
  yet.

v7 -> v8:
- rebase on top of v5.2-rc1 and especially
  8be39a1a04c1  iommu/arm-smmu-v3: Add a master->domain pointer
- dynamic alloc of s1_cfg/s2_cfg
- __arm_smmu_tlb_inv_asid/s1_range_nosync
- check there is no HW MSI regions
- asid invalidation using pasid extended struct (change in the uapi)
- add s1_live/s2_live checks
- move check about support of nested stages in domain finalise
- fixes in error reporting according to the discussion with Robin
- reordered the patches to have first iommu/smmuv3 patches and then
  VFIO patches

v6 -> v7:
- removed device handle from bind/unbind_guest_msi
- added "iommu/smmuv3: Nested mode single MSI doorbell per domain
  enforcement"
- added a few uapi comments as suggested by Jean, Jacob and Alex

v5 -> v6:
- Fix compilation issue when CONFIG_IOMMU_API is unset

v4 -> v5:
- fix bug reported by Vincent: fault handler unregistration now happens in
  vfio_pci_release
- IOMMU_FAULT_PERM_* moved outside of struct definition + small
uapi changes suggested by Jean-Philippe (except fetch_addr)
- iommu: introduce device fault report API: removed the PRI part.
- see individual logs for more details
- reset the ste abort flag on detach

v3 -> v4:
- took into account Alex, jean-Philippe and Robin's comments on v3
- rework of the smmuv3 driver integration
- add tear down ops for msi binding and PASID table binding
- fix S1 fault propagation
- put fault reporting patches at the beginning of the series following
  Jean-Philippe's request
- update of the cache invalidate and fault API uapis
- VFIO fault reporting rework with 2 separate regions and one mmappable
  segment for the fault queue
- moved to PATCH

v2 -> v3:
- When registering the S1 MSI binding we now store the device handle. This
addresses Robin's comment about discrimination of devices belonging to
  different S1 groups and using different physical MSI doorbells.
- Change the fault reporting API: use VFIO_PCI_DMA_FAULT_IRQ_INDEX to
  set the eventfd and expose the faults through an mmappable fault region

v1 -> v2:
- Added the fault reporting capability
- asid properly passed on invalidation (fix assignment of multiple
  devices)
- see individual change logs for more info


Eric Auger (10):
  vfio: VFIO_IOMMU_SET_MSI_BINDING
  vfio/pci: Add VFIO_REGION_TYPE_NESTED region type
  vfio/pci: Register an iommu fault handler
  vfio/pci: Allow to mmap the fault queue
  vfio/pci: Add framework for custom interrupt indices
  vfio: Add new IRQ for DMA fault reporting
  vfio/pci: Register and allow DMA FAULT IRQ signaling
  vfio: Document nested stage control
  vfio/pci: Register a DMA fault response region
  vfio/pci: Inject page response upon response region fill

Liu, Yi L (2):
  vfio: VFIO_IOMMU_SET_PASID_TABLE
  vfio: VFIO_IOMMU_CACHE_INVALIDATE

Tina Zhang (1):
  vfio: Use capability chains to handle device specific irq

 Documentation/driver-api/vfio.rst   |  77 +
 drivers/vfio/pci/vfio_pci.c | 447 ++--
 drivers/vfio/pci/vfio_pci_intrs.c   |  62 
 drivers/vfio/pci/vfio_pci_priva

[PATCH v12 01/13] vfio: VFIO_IOMMU_SET_PASID_TABLE

2021-02-23 Thread Eric Auger
From: "Liu, Yi L" 

This patch adds a VFIO_IOMMU_SET_PASID_TABLE ioctl
which aims to pass the guest virtual IOMMU configuration
to the host. The latter takes the form of the so-called
PASID table.
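
For illustration, a minimal userspace sketch against this uapi
(container setup and the building of the iommu_pasid_table_config
are assumed):

    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/vfio.h>

    /* Sketch only: propagate the guest PASID table to the container. */
    static int set_pasid_table(int container,
                               struct iommu_pasid_table_config *cfg)
    {
            struct vfio_iommu_type1_set_pasid_table spt;

            memset(&spt, 0, sizeof(spt));
            spt.argsz = sizeof(spt);
            spt.flags = VFIO_PASID_TABLE_FLAG_SET;
            spt.config = *cfg;

            return ioctl(container, VFIO_IOMMU_SET_PASID_TABLE, &spt);
    }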

Signed-off-by: Jacob Pan 
Signed-off-by: Liu, Yi L 
Signed-off-by: Eric Auger 

---
v11 -> v12:
- use iommu_uapi_set_pasid_table
- Rework the flags checks [Zenghui, Alex]
- use VFIO_BASE + 19 [Alex]
- rework the unwind in vfio_attach_pasid_table() [Alex]

v8 -> v9:
- Merge VFIO_IOMMU_ATTACH/DETACH_PASID_TABLE into a single
  VFIO_IOMMU_SET_PASID_TABLE ioctl.

v6 -> v7:
- add a comment related to VFIO_IOMMU_DETACH_PASID_TABLE

v3 -> v4:
- restore ATTACH/DETACH
- add unwind on failure

v2 -> v3:
- s/BIND_PASID_TABLE/SET_PASID_TABLE

v1 -> v2:
- s/BIND_GUEST_STAGE/BIND_PASID_TABLE
- remove the struct device arg
---
 drivers/vfio/vfio_iommu_type1.c | 58 +
 include/uapi/linux/vfio.h   | 19 +++
 2 files changed, 77 insertions(+)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 0b4dedaa9128..50c4f8ae5b03 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -2587,6 +2587,39 @@ static int vfio_iommu_iova_build_caps(struct vfio_iommu 
*iommu,
return ret;
 }
 
+static void
+vfio_detach_pasid_table(struct vfio_iommu *iommu)
+{
+   struct vfio_domain *d;
+
+   mutex_lock(&iommu->lock);
+   list_for_each_entry(d, &iommu->domain_list, next)
+   iommu_detach_pasid_table(d->domain);
+
+   mutex_unlock(&iommu->lock);
+}
+
+static int
+vfio_attach_pasid_table(struct vfio_iommu *iommu, unsigned long arg)
+{
+   struct vfio_domain *d;
+   int ret = 0;
+
+   mutex_lock(&iommu->lock);
+
+   list_for_each_entry(d, &iommu->domain_list, next) {
+   ret = iommu_uapi_attach_pasid_table(d->domain, (void __user *)arg);
+   if (ret) {
+   list_for_each_entry_continue_reverse(d, &iommu->domain_list, next)
+   iommu_detach_pasid_table(d->domain);
+   break;
+   }
+   }
+
+   mutex_unlock(&iommu->lock);
+   return ret;
+}
+
 static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu,
   struct vfio_info_cap *caps)
 {
@@ -2747,6 +2780,29 @@ static int vfio_iommu_type1_unmap_dma(struct vfio_iommu 
*iommu,
-EFAULT : 0;
 }
 
+static int vfio_iommu_type1_set_pasid_table(struct vfio_iommu *iommu,
+   unsigned long arg)
+{
+   struct vfio_iommu_type1_set_pasid_table spt;
+   unsigned long minsz;
+
+   minsz = offsetofend(struct vfio_iommu_type1_set_pasid_table, flags);
+
+   if (copy_from_user(&spt, (void __user *)arg, minsz))
+   return -EFAULT;
+
+   if (spt.argsz < minsz)
+   return -EINVAL;
+
+   if (spt.flags == VFIO_PASID_TABLE_FLAG_SET) {
+   return vfio_attach_pasid_table(iommu, arg + minsz);
+   } else if (spt.flags == VFIO_PASID_TABLE_FLAG_UNSET) {
+   vfio_detach_pasid_table(iommu);
+   return 0;
+   }
+   return -EINVAL;
+}
+
 static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu,
unsigned long arg)
 {
@@ -2867,6 +2923,8 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
return vfio_iommu_type1_unmap_dma(iommu, arg);
case VFIO_IOMMU_DIRTY_PAGES:
return vfio_iommu_type1_dirty_pages(iommu, arg);
+   case VFIO_IOMMU_SET_PASID_TABLE:
+   return vfio_iommu_type1_set_pasid_table(iommu, arg);
default:
return -ENOTTY;
}
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index d1812777139f..6d9c941d 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -14,6 +14,7 @@
 
#include <linux/types.h>
#include <linux/ioctl.h>
+#include <linux/iommu.h>
 
 #define VFIO_API_VERSION   0
 
@@ -1181,6 +1182,24 @@ struct vfio_iommu_type1_dirty_bitmap_get {
 
 #define VFIO_IOMMU_DIRTY_PAGES _IO(VFIO_TYPE, VFIO_BASE + 17)
 
+/*
+ * VFIO_IOMMU_SET_PASID_TABLE - _IOWR(VFIO_TYPE, VFIO_BASE + 18,
+ * struct vfio_iommu_type1_set_pasid_table)
+ *
+ * The SET operation passes a PASID table to the host while the
+ * UNSET operation detaches the one currently programmed. Setting
+ * a table while another is already programmed replaces the old table.
+ */
+struct vfio_iommu_type1_set_pasid_table {
+   __u32   argsz;
+   __u32   flags;
+#define VFIO_PASID_TABLE_FLAG_SET  (1 << 0)
+#define VFIO_PASID_TABLE_FLAG_UNSET(1 << 1)
+   struct iommu_pasid_table_config config; /* used on SET */
+};
+
+#define VFIO_IOMMU_SET_PASID_TABLE _IO(VFIO_TYPE, VFIO_BASE + 18)
+
 /*  Additional API for SPAPR TCE (Server POWERPC) IOMMU  */
 
 /*
-- 
2.26.2


[PATCH v12 03/13] vfio: VFIO_IOMMU_SET_MSI_BINDING

2021-02-23 Thread Eric Auger
This patch adds the VFIO_IOMMU_SET_MSI_BINDING ioctl which aims
to (un)register the guest MSI binding to the host. The host
can then use those stage 1 bindings to build a nested stage
binding targeting the physical MSIs.
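
For illustration, a minimal userspace sketch against this uapi (the
gIOVA/gDB values are assumed to come from the trapped guest MSI
configuration):

    #include <stdint.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/vfio.h>

    /* Sketch only: bind the guest gIOVA/gDB MSI mapping in the host. */
    static int bind_guest_msi(int container, uint64_t giova, uint64_t gdb,
                              uint64_t granule)
    {
            struct vfio_iommu_type1_set_msi_binding binding;

            memset(&binding, 0, sizeof(binding));
            binding.argsz = sizeof(binding);
            binding.flags = VFIO_IOMMU_BIND_MSI;
            binding.iova = giova;
            binding.gpa = gdb;
            binding.size = granule;

            return ioctl(container, VFIO_IOMMU_SET_MSI_BINDING, &binding);
    }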

Signed-off-by: Eric Auger 

---

v11 -> v12:
- Share VFIO_BASE + 20 with VFIO_IOMMU_SPAPR_TCE_REMOVE
- rework returned values

v10 -> v11:
- renamed ustruct into msi_binding
- return 0 on unbind

v8 -> v9:
- merge VFIO_IOMMU_BIND_MSI/VFIO_IOMMU_UNBIND_MSI into a single
  VFIO_IOMMU_SET_MSI_BINDING ioctl
- ioctl id changed

v6 -> v7:
- removed the dev arg

v3 -> v4:
- add UNBIND
- unwind on BIND error

v2 -> v3:
- adapt to new proto of bind_guest_msi
- directly use vfio_iommu_for_each_dev

v1 -> v2:
- s/vfio_iommu_type1_guest_msi_binding/vfio_iommu_type1_bind_guest_msi
---
 drivers/vfio/vfio_iommu_type1.c | 62 +
 include/uapi/linux/vfio.h   | 20 +++
 2 files changed, 82 insertions(+)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index b4057ce809b0..0e6af4fe8c3d 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -2655,6 +2655,41 @@ static int vfio_cache_inv_fn(struct device *dev, void 
*data)
return iommu_uapi_cache_invalidate(dc->domain, dev, (void __user *)arg);
 }
 
+static int
+vfio_bind_msi(struct vfio_iommu *iommu,
+ dma_addr_t giova, phys_addr_t gpa, size_t size)
+{
+   struct vfio_domain *d;
+   int ret = 0;
+
+   mutex_lock(&iommu->lock);
+
+   list_for_each_entry(d, &iommu->domain_list, next) {
+   ret = iommu_bind_guest_msi(d->domain, giova, gpa, size);
+   if (ret)
+   goto unwind;
+   }
+   goto unlock;
+unwind:
+   list_for_each_entry_continue_reverse(d, &iommu->domain_list, next) {
+   iommu_unbind_guest_msi(d->domain, giova);
+   }
+unlock:
+   mutex_unlock(&iommu->lock);
+   return ret;
+}
+
+static void
+vfio_unbind_msi(struct vfio_iommu *iommu, dma_addr_t giova)
+{
+   struct vfio_domain *d;
+
+   mutex_lock(&iommu->lock);
+   list_for_each_entry(d, &iommu->domain_list, next)
+   iommu_unbind_guest_msi(d->domain, giova);
+   mutex_unlock(&iommu->lock);
+}
+
 static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu,
   struct vfio_info_cap *caps)
 {
@@ -2859,6 +2894,31 @@ static int vfio_iommu_type1_cache_invalidate(struct 
vfio_iommu *iommu,
return ret;
 }
 
+static int vfio_iommu_type1_set_msi_binding(struct vfio_iommu *iommu,
+   unsigned long arg)
+{
+   struct vfio_iommu_type1_set_msi_binding msi_binding;
+   unsigned long minsz;
+
+   minsz = offsetofend(struct vfio_iommu_type1_set_msi_binding,
+   size);
+
+   if (copy_from_user(&msi_binding, (void __user *)arg, minsz))
+   return -EFAULT;
+
+   if (msi_binding.argsz < minsz)
+   return -EINVAL;
+
+   if (msi_binding.flags == VFIO_IOMMU_UNBIND_MSI) {
+   vfio_unbind_msi(iommu, msi_binding.iova);
+   return 0;
+   } else if (msi_binding.flags == VFIO_IOMMU_BIND_MSI) {
+   return vfio_bind_msi(iommu, msi_binding.iova,
+msi_binding.gpa, msi_binding.size);
+   }
+   return -EINVAL;
+}
+
 static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu,
unsigned long arg)
 {
@@ -2983,6 +3043,8 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
return vfio_iommu_type1_set_pasid_table(iommu, arg);
case VFIO_IOMMU_CACHE_INVALIDATE:
return vfio_iommu_type1_cache_invalidate(iommu, arg);
+   case VFIO_IOMMU_SET_MSI_BINDING:
+   return vfio_iommu_type1_set_msi_binding(iommu, arg);
default:
return -ENOTTY;
}
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index ee6747ff8006..4c82e8f21618 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -1213,6 +1213,26 @@ struct vfio_iommu_type1_cache_invalidate {
 };
 #define VFIO_IOMMU_CACHE_INVALIDATE  _IO(VFIO_TYPE, VFIO_BASE + 19)
 
+/**
+ * VFIO_IOMMU_SET_MSI_BINDING - _IOWR(VFIO_TYPE, VFIO_BASE + 20,
+ * struct vfio_iommu_type1_set_msi_binding)
+ *
+ * Pass a stage 1 MSI doorbell mapping to the host so that this
+ * latter can build a nested stage2 mapping. Or conversely tear
+ * down a previously bound stage 1 MSI binding.
+ */
+struct vfio_iommu_type1_set_msi_binding {
+   __u32   argsz;
+   __u32   flags;
+#define VFIO_IOMMU_BIND_MSI(1 << 0)
+#define VFIO_IOMMU_UNBIND_MSI  (1 << 1)
+   __u64   iova;   /* MSI guest IOVA */
+   /* Fields below are used on BIND */
+   __u64   gpa;/* MSI guest physical address */
+   __u64   size;   /* size

[PATCH v12 04/13] vfio/pci: Add VFIO_REGION_TYPE_NESTED region type

2021-02-23 Thread Eric Auger
Add a new specific DMA_FAULT region aiming to exposed nested mode
translation faults. This region only is exposed if the device
is attached to a nested domain.

The region has a ring buffer that contains the actual fault
records plus a header allowing to handle it (tail/head indices,
max capacity, entry size). At the moment the region is dimensionned
for 512 fault records.
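
As a rough picture of how userspace would consume the region, here is a sketch of a fault drain loop. It assumes the layout described above (a vfio_region_dma_fault header carrying head/tail indices, entry_size, nb_entries and the records' offset, per this series' uapi) and hypothetical device_fd/region_offset values:

#include <stddef.h>
#include <stdint.h>
#include <sys/types.h>
#include <unistd.h>
#include <linux/vfio.h>
#include <linux/iommu.h>

static int drain_dma_faults(int device_fd, off_t region_offset)
{
        struct vfio_region_dma_fault header;
        struct iommu_fault fault;
        uint32_t tail;

        if (pread(device_fd, &header, sizeof(header), region_offset) < 0)
                return -1;

        tail = header.tail;
        while (tail != header.head) {
                off_t off = region_offset + header.offset +
                            (off_t)tail * header.entry_size;

                if (pread(device_fd, &fault, sizeof(fault), off) < 0)
                        return -1;
                /* inject the fault record into the guest here */
                tail = (tail + 1) % header.nb_entries;
        }

        /* publish the new tail so the kernel can reclaim the slots */
        if (pwrite(device_fd, &tail, sizeof(tail), region_offset +
                   offsetof(struct vfio_region_dma_fault, tail)) < 0)
                return -1;
        return 0;
}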

Signed-off-by: Eric Auger 

---
v11 -> v12:
- set fault_pages to NULL after free
- check new_tail >= header->nb_entries (Zenghui)
- update value of VFIO_REGION_TYPE_NESTED
- handle the case where the domain is NULL
- pass an int pointer to iommu_domain_get_attr [Shenming]

v10 -> v11:
- rename vfio_pci_init_dma_fault_region into
  vfio_pci_dma_fault_init
- free fault_pages in vfio_pci_dma_fault_release
- only register the region if the device is attached
  to a nested domain

v8 -> v9:
- Use a single region instead of a prod/cons region

v4 -> v5
- check cons is not null in vfio_pci_check_cons_fault

v3 -> v4:
- use 2 separate regions, respectively in read and write modes
- add the version capability
---
 drivers/vfio/pci/vfio_pci.c | 79 +
 drivers/vfio/pci/vfio_pci_private.h |  6 +++
 drivers/vfio/pci/vfio_pci_rdwr.c| 44 
 include/uapi/linux/vfio.h   | 35 +
 4 files changed, 164 insertions(+)

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 706de3ef94bb..e9a4a1c502c7 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -309,6 +309,81 @@ int vfio_pci_set_power_state(struct vfio_pci_device *vdev, pci_power_t state)
return ret;
 }
 
+static void vfio_pci_dma_fault_release(struct vfio_pci_device *vdev,
+  struct vfio_pci_region *region)
+{
+   kfree(vdev->fault_pages);
+}
+
+static int vfio_pci_dma_fault_add_capability(struct vfio_pci_device *vdev,
+struct vfio_pci_region *region,
+struct vfio_info_cap *caps)
+{
+   struct vfio_region_info_cap_fault cap = {
+   .header.id = VFIO_REGION_INFO_CAP_DMA_FAULT,
+   .header.version = 1,
+   .version = 1,
+   };
+   return vfio_info_add_capability(caps, &cap.header, sizeof(cap));
+}
+
+static const struct vfio_pci_regops vfio_pci_dma_fault_regops = {
+   .rw = vfio_pci_dma_fault_rw,
+   .release= vfio_pci_dma_fault_release,
+   .add_capability = vfio_pci_dma_fault_add_capability,
+};
+
+#define DMA_FAULT_RING_LENGTH 512
+
+static int vfio_pci_dma_fault_init(struct vfio_pci_device *vdev)
+{
+   struct vfio_region_dma_fault *header;
+   struct iommu_domain *domain;
+   size_t size;
+   int nested;
+   int ret;
+
+   domain = iommu_get_domain_for_dev(&vdev->pdev->dev);
+   if (!domain)
+   return 0;
+
+   ret = iommu_domain_get_attr(domain, DOMAIN_ATTR_NESTING, &nested);
+   if (ret || !nested)
+   return ret;
+
+   mutex_init(&vdev->fault_queue_lock);
+
+   /*
+* We provision 1 page for the header and space for
+* DMA_FAULT_RING_LENGTH fault records in the ring buffer.
+*/
+   size = ALIGN(sizeof(struct iommu_fault) *
+DMA_FAULT_RING_LENGTH, PAGE_SIZE) + PAGE_SIZE;
+
+   vdev->fault_pages = kzalloc(size, GFP_KERNEL);
+   if (!vdev->fault_pages)
+   return -ENOMEM;
+
+   ret = vfio_pci_register_dev_region(vdev,
+   VFIO_REGION_TYPE_NESTED,
+   VFIO_REGION_SUBTYPE_NESTED_DMA_FAULT,
+   &vfio_pci_dma_fault_regops, size,
+   VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE,
+   vdev->fault_pages);
+   if (ret)
+   goto out;
+
+   header = (struct vfio_region_dma_fault *)vdev->fault_pages;
+   header->entry_size = sizeof(struct iommu_fault);
+   header->nb_entries = DMA_FAULT_RING_LENGTH;
+   header->offset = sizeof(struct vfio_region_dma_fault);
+   return 0;
+out:
+   kfree(vdev->fault_pages);
+   vdev->fault_pages = NULL;
+   return ret;
+}
+
 static int vfio_pci_enable(struct vfio_pci_device *vdev)
 {
struct pci_dev *pdev = vdev->pdev;
@@ -407,6 +482,10 @@ static int vfio_pci_enable(struct vfio_pci_device *vdev)
}
}
 
+   ret = vfio_pci_dma_fault_init(vdev);
+   if (ret)
+   goto disable_exit;
+
vfio_pci_probe_mmaps(vdev);
 
return 0;
diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
index 5c90e560c5c7..1d9b0f648133 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -134,6 +134,8 @@ struct vfio_pci_device {
int ioeventfds_nr;
struct eventfd_ctx  *err_trigger;
struct eventfd_ctx  *req_trigger;

[PATCH v12 02/13] vfio: VFIO_IOMMU_CACHE_INVALIDATE

2021-02-23 Thread Eric Auger
From: "Liu, Yi L" 

When the guest "owns" the stage 1 translation structures, the host
IOMMU driver has no knowledge of caching structure updates unless
the guest invalidation requests are trapped and passed down to the
host.

This patch adds the VFIO_IOMMU_CACHE_INVALIDATE ioctl, which aims
at propagating guest stage 1 IOMMU cache invalidations to the host.
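
For illustration, here is a sketch of how a VMM might forward a guest-issued, domain-wide IOTLB invalidation. It assumes the cache invalidation uapi this series builds on (struct iommu_cache_invalidate_info from linux/iommu.h); the container fd is hypothetical:

#include <string.h>
#include <sys/ioctl.h>
#include <linux/iommu.h>
#include <linux/vfio.h>

static int forward_iotlb_inv_all(int container_fd)
{
        struct vfio_iommu_type1_cache_invalidate cache_inv;

        memset(&cache_inv, 0, sizeof(cache_inv));
        cache_inv.argsz = sizeof(cache_inv);
        cache_inv.flags = 0;

        /* invalidate the whole IOTLB for the domain */
        cache_inv.info.argsz = sizeof(cache_inv.info);
        cache_inv.info.version = IOMMU_CACHE_INVALIDATE_INFO_VERSION_1;
        cache_inv.info.cache = IOMMU_CACHE_INV_TYPE_IOTLB;
        cache_inv.info.granularity = IOMMU_INV_GRANU_DOMAIN;

        return ioctl(container_fd, VFIO_IOMMU_CACHE_INVALIDATE, &cache_inv);
}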

Signed-off-by: Liu, Yi L 
Signed-off-by: Eric Auger 

---
v11 -> v12:
- share VFIO_BASE + 19 with VFIO_IOMMU_SPAPR_TCE_CREATE

v10 -> v11:
- renamed ustruct into cache_inv

v8 -> v9:
- change the ioctl ID

v6 -> v7:
- Use iommu_capsule struct
- renamed vfio_iommu_for_each_dev into vfio_iommu_lookup_dev
  due to checkpatch error related to for_each_dev suffix

v2 -> v3:
- introduce vfio_iommu_for_each_dev back in this patch

v1 -> v2:
- s/TLB/CACHE
- remove vfio_iommu_task usage
- commit message rewording
---
 drivers/vfio/vfio_iommu_type1.c | 58 +
 include/uapi/linux/vfio.h   | 13 
 2 files changed, 71 insertions(+)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 50c4f8ae5b03..b4057ce809b0 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -143,6 +143,34 @@ struct vfio_regions {
 #define DIRTY_BITMAP_PAGES_MAX  ((u64)INT_MAX)
 #define DIRTY_BITMAP_SIZE_MAX   DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
 
+struct domain_capsule {
+   struct iommu_domain *domain;
+   void *data;
+};
+
+/* iommu->lock must be held */
+static int
+vfio_iommu_lookup_dev(struct vfio_iommu *iommu,
+ int (*fn)(struct device *dev, void *data),
+ unsigned long arg)
+{
+   struct domain_capsule dc = {.data = &arg};
+   struct vfio_domain *d;
+   struct vfio_group *g;
+   int ret = 0;
+
+   list_for_each_entry(d, &iommu->domain_list, next) {
+   dc.domain = d->domain;
+   list_for_each_entry(g, &d->group_list, next) {
+   ret = iommu_group_for_each_dev(g->iommu_group,
+  &dc, fn);
+   if (ret)
+   break;
+   }
+   }
+   return ret;
+}
+
 static int put_pfn(unsigned long pfn, int prot);
 
 static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
@@ -2619,6 +2647,13 @@ vfio_attach_pasid_table(struct vfio_iommu *iommu, unsigned long arg)
	mutex_unlock(&iommu->lock);
return ret;
 }
+static int vfio_cache_inv_fn(struct device *dev, void *data)
+{
+   struct domain_capsule *dc = (struct domain_capsule *)data;
+   unsigned long arg = *(unsigned long *)dc->data;
+
+   return iommu_uapi_cache_invalidate(dc->domain, dev, (void __user *)arg);
+}
 
 static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu,
   struct vfio_info_cap *caps)
@@ -2803,6 +2838,27 @@ static int vfio_iommu_type1_set_pasid_table(struct vfio_iommu *iommu,
return -EINVAL;
 }
 
+static int vfio_iommu_type1_cache_invalidate(struct vfio_iommu *iommu,
+   unsigned long arg)
+{
+   struct vfio_iommu_type1_cache_invalidate cache_inv;
+   unsigned long minsz;
+   int ret;
+
+   minsz = offsetofend(struct vfio_iommu_type1_cache_invalidate, flags);
+
+   if (copy_from_user(&cache_inv, (void __user *)arg, minsz))
+   return -EFAULT;
+
+   if (cache_inv.argsz < minsz || cache_inv.flags)
+   return -EINVAL;
+
+   mutex_lock(&iommu->lock);
+   ret = vfio_iommu_lookup_dev(iommu, vfio_cache_inv_fn, arg + minsz);
+   mutex_unlock(&iommu->lock);
+   return ret;
+}
+
 static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu,
unsigned long arg)
 {
@@ -2925,6 +2981,8 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
return vfio_iommu_type1_dirty_pages(iommu, arg);
case VFIO_IOMMU_SET_PASID_TABLE:
return vfio_iommu_type1_set_pasid_table(iommu, arg);
+   case VFIO_IOMMU_CACHE_INVALIDATE:
+   return vfio_iommu_type1_cache_invalidate(iommu, arg);
default:
return -ENOTTY;
}
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 6d9c941d..ee6747ff8006 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -1200,6 +1200,19 @@ struct vfio_iommu_type1_set_pasid_table {
 
 #define VFIO_IOMMU_SET_PASID_TABLE _IO(VFIO_TYPE, VFIO_BASE + 18)
 
+/**
+ * VFIO_IOMMU_CACHE_INVALIDATE - _IOWR(VFIO_TYPE, VFIO_BASE + 19,
+ * struct vfio_iommu_type1_cache_invalidate)
+ *
+ * Propagate guest IOMMU cache invalidation to the host.
+ */
+struct vfio_iommu_type1_cache_invalidate {
+   __u32   argsz;
+   __u32   flags;
+   struct iommu_cache_invalidate_info info;
+};

[PATCH v14 13/13] iommu/smmuv3: Accept configs with more than one context descriptor

2021-02-23 Thread Eric Auger
In preparation for vSVA, let's accept userspace provided configs
with more than one CD. We check the max CD against the host iommu
capability and also the format (linear versus 2 level).

Signed-off-by: Eric Auger 
Signed-off-by: Shameer Kolothum 
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 13 -
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 332d31c0680f..ab74a0289893 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -3038,14 +3038,17 @@ static int arm_smmu_attach_pasid_table(struct iommu_domain *domain,
if (smmu_domain->s1_cfg.set)
goto out;
 
-   /*
-* we currently support a single CD so s1fmt and s1dss
-* fields are also ignored
-*/
-   if (cfg->pasid_bits)
+   list_for_each_entry(master, &smmu_domain->devices, domain_head) {
+   if (cfg->pasid_bits > master->ssid_bits)
+   goto out;
+   }
+   if (cfg->vendor_data.smmuv3.s1fmt == STRTAB_STE_0_S1FMT_64K_L2 &&
+   !(smmu->features & ARM_SMMU_FEAT_2_LVL_CDTAB))
goto out;
 
smmu_domain->s1_cfg.cdcfg.cdtab_dma = cfg->base_ptr;
+   smmu_domain->s1_cfg.s1cdmax = cfg->pasid_bits;
+   smmu_domain->s1_cfg.s1fmt = cfg->vendor_data.smmuv3.s1fmt;
smmu_domain->s1_cfg.set = true;
smmu_domain->abort = false;
break;
-- 
2.26.2

[PATCH v14 08/13] dma-iommu: Implement NESTED_MSI cookie

2021-02-23 Thread Eric Auger
Up to now, when the type was UNMANAGED, we used to
allocate IOVA pages within a reserved IOVA MSI range.

If both the host and the guest are exposed with SMMUs, each
would allocate an IOVA. The guest allocates an IOVA (gIOVA)
to map onto the guest MSI doorbell (gDB). The host allocates
another IOVA (hIOVA) to map onto the physical doorbell (hDB).

So we end up with 2 unrelated mappings, at S1 and S2:

         S1             S2
 gIOVA    ->    gDB
                hIOVA    ->    hDB

The PCI device would be programmed with hIOVA.
No stage 1 mapping would exist, causing the MSIs to fault.

iommu_dma_bind_guest_msi() allows passing gIOVA/gDB
to the host so that gIOVA can be used by the host instead of
re-allocating a new hIOVA.

         S1           S2
 gIOVA    ->    gDB    ->    hDB

This time, the PCI device can be programmed with the gIOVA MSI
doorbell, which is correctly mapped through both stages.

Nested mode is not compatible with HW MSI regions as in that
case gDB and hDB should have a 1-1 mapping. This check will
be done when attaching each device to the IOMMU domain.
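
To make the resulting flow concrete, here is a conceptual kernel-side sketch, not the patch's exact code (struct nested_msi_binding and the helper name are hypothetical): on bind the host records the gIOVA/gDB pair, and when it later composes an MSI it installs the S2 mapping gDB -> hDB instead of allocating a fresh hIOVA:

#include <linux/iommu.h>

/* Hypothetical record of a guest-provided stage 1 MSI binding */
struct nested_msi_binding {
        dma_addr_t      giova;  /* stage 1 IOVA programmed by the guest */
        phys_addr_t     gdb;    /* guest doorbell GPA (input to stage 2) */
        size_t          size;
};

static int nested_msi_map_doorbell(struct iommu_domain *s2_domain,
                                   struct nested_msi_binding *b,
                                   phys_addr_t hdb)
{
        /*
         * Stage 2 maps gDB -> hDB, so the guest's stage 1 mapping
         * gIOVA -> gDB resolves end-to-end to the physical doorbell.
         * The device can then be programmed with gIOVA directly.
         */
        return iommu_map(s2_domain, b->gdb, hdb, b->size,
                         IOMMU_WRITE | IOMMU_MMIO);
}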

Signed-off-by: Eric Auger 

---

v10 -> v11:
- fix compilation if !CONFIG_IOMMU_DMA

v7 -> v8:
- correct iommu_dma_(un)bind_guest_msi when
  !CONFIG_IOMMU_DMA
- Mentioned nested mode is not compatible with HW MSI regions
  in commit message
- protect with msi_lock on unbind

v6 -> v7:
- removed device handle

v3 -> v4:
- change function names; add unregister
- protect with msi_lock

v2 -> v3:
- also store the device handle on S1 mapping registration.
  This guarantees the associated S2 mapping binds
  to the correct physical MSI controller.

v1 -> v2:
- unmap stage2 on put()
---
 drivers/iommu/dma-iommu.c | 142 +-
 include/linux/dma-iommu.h |  16 +
 2 files changed, 155 insertions(+), 3 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index f659395e7959..d25eb7cecaa7 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -19,6 +19,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -29,12 +30,15 @@
 struct iommu_dma_msi_page {
struct list_headlist;
dma_addr_t  iova;
+   dma_addr_t  gpa;
phys_addr_t phys;
+   size_t  s1_granule;
 };
 
 enum iommu_dma_cookie_type {
IOMMU_DMA_IOVA_COOKIE,
IOMMU_DMA_MSI_COOKIE,
+   IOMMU_DMA_NESTED_MSI_COOKIE,
 };
 
 struct iommu_dma_cookie {
@@ -46,6 +50,7 @@ struct iommu_dma_cookie {
dma_addr_t  msi_iova;
};
struct list_headmsi_page_list;
+   spinlock_t  msi_lock;
 
/* Domain for flush queue callback; NULL if flush queue not in use */
struct iommu_domain *fq_domain;
@@ -87,6 +92,7 @@ static struct iommu_dma_cookie *cookie_alloc(enum iommu_dma_cookie_type type)
 
cookie = kzalloc(sizeof(*cookie), GFP_KERNEL);
if (cookie) {
+   spin_lock_init(&cookie->msi_lock);
INIT_LIST_HEAD(&cookie->msi_page_list);
cookie->type = type;
}
@@ -120,14 +126,17 @@ EXPORT_SYMBOL(iommu_get_dma_cookie);
  *
  * Users who manage their own IOVA allocation and do not want DMA API support,
  * but would still like to take advantage of automatic MSI remapping, can use
- * this to initialise their own domain appropriately. Users should reserve a
+ * this to initialise their own domain appropriately. Users may reserve a
  * contiguous IOVA region, starting at @base, large enough to accommodate the
  * number of PAGE_SIZE mappings necessary to cover every MSI doorbell address
- * used by the devices attached to @domain.
+ * used by the devices attached to @domain. The other way round is to provide
+ * usable iova pages through the iommu_dma_bind_guest_msi API (nested stages
+ * use case)
  */
 int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base)
 {
struct iommu_dma_cookie *cookie;
+   int nesting, ret;
 
if (domain->type != IOMMU_DOMAIN_UNMANAGED)
return -EINVAL;
@@ -135,7 +144,12 @@ int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base)
if (domain->iova_cookie)
return -EEXIST;
 
-   cookie = cookie_alloc(IOMMU_DMA_MSI_COOKIE);
+   ret = iommu_domain_get_attr(domain, DOMAIN_ATTR_NESTING, &nesting);
+   if (!ret && nesting)
+   cookie = cookie_alloc(IOMMU_DMA_NESTED_MSI_COOKIE);
+   else
+   cookie = cookie_alloc(IOMMU_DMA_MSI_COOKIE);
+
if (!cookie)
return -ENOMEM;
 
@@ -156,6 +170,7 @@ void iommu_put_dma_cookie(struct iommu_domain *domain)
 {
struct iommu_dma_cookie *cookie = domain->iova_cookie;
struct iommu_dma_msi_page *msi, *tmp;
+   bool s2_unmap = false;
 
if (!cookie)
return;
