Re: [PATCH RFC 09/19] iommu/amd: Access/Dirty bit support in IOPTEs

2022-05-31 Thread Joao Martins
On 5/31/22 12:34, Suravee Suthikulpanit wrote:
> Joao,
> 
> On 4/29/22 4:09 AM, Joao Martins wrote:
>> .
>> +static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain,
>> +					bool enable)
>> +{
>> +	struct protection_domain *pdomain = to_pdomain(domain);
>> +	struct iommu_dev_data *dev_data;
>> +	bool dom_flush = false;
>> +
>> +	if (!amd_iommu_had_support)
>> +		return -EOPNOTSUPP;
>> +
>> +	list_for_each_entry(dev_data, &pdomain->dev_list, list) {
> 
> Since we iterate through the device list for the domain, we would need to
> call spin_lock_irqsave(&pdomain->lock, flags) here.
> 
Ugh, yes. Will fix.

>> +		struct amd_iommu *iommu;
>> +		u64 pte_root;
>> +
>> +		iommu = amd_iommu_rlookup_table[dev_data->devid];
>> +		pte_root = amd_iommu_dev_table[dev_data->devid].data[0];
>> +
>> +		/* No change? */
>> +		if (!(enable ^ !!(pte_root & DTE_FLAG_HAD)))
>> +			continue;
>> +
>> +		pte_root = (enable ?
>> +			pte_root | DTE_FLAG_HAD : pte_root & ~DTE_FLAG_HAD);
>> +
>> +		/* Flush device DTE */
>> +		amd_iommu_dev_table[dev_data->devid].data[0] = pte_root;
>> +		device_flush_dte(dev_data);
>> +		dom_flush = true;
>> +	}
>> +
>> +	/* Flush IOTLB to mark IOPTE dirty on the next translation(s) */
>> +	if (dom_flush) {
>> +		unsigned long flags;
>> +
>> +		spin_lock_irqsave(&pdomain->lock, flags);
>> +		amd_iommu_domain_flush_tlb_pde(pdomain);
>> +		amd_iommu_domain_flush_complete(pdomain);
>> +		spin_unlock_irqrestore(&pdomain->lock, flags);
>> +	}
> 
> And call spin_unlock_irqrestore(&pdomain->lock, flags); here.

ack

Additionally, something I am thinking of doing for v2 is to have a
@had bool field in iommu_dev_data. That would align better with the
rest of the AMD IOMMU code, rather than introducing this pattern of
using the hardware location of the PTE roots. Let me know if you disagree.
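
To make that concrete, here is a rough sketch of what I have in mind,
with the locking you point out held over the device-list walk and the
@had field kept in software instead of re-reading the DTE. Treat the
field name and placement as hypothetical, not the final patch:

static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain,
					bool enable)
{
	struct protection_domain *pdomain = to_pdomain(domain);
	struct iommu_dev_data *dev_data;
	bool dom_flush = false;
	unsigned long flags;

	if (!amd_iommu_had_support)
		return -EOPNOTSUPP;

	/* Protect the device-list walk, per your comment */
	spin_lock_irqsave(&pdomain->lock, flags);

	list_for_each_entry(dev_data, &pdomain->dev_list, list) {
		u64 pte_root;

		/* No change? (tracked via the hypothetical @had field) */
		if (dev_data->had == enable)
			continue;

		pte_root = amd_iommu_dev_table[dev_data->devid].data[0];
		if (enable)
			pte_root |= DTE_FLAG_HAD;
		else
			pte_root &= ~DTE_FLAG_HAD;

		/* Update the DTE and flush the device DTE cache */
		amd_iommu_dev_table[dev_data->devid].data[0] = pte_root;
		device_flush_dte(dev_data);
		dev_data->had = enable;
		dom_flush = true;
	}

	/* Flush IOTLB to mark IOPTEs dirty on the next translation(s) */
	if (dom_flush) {
		amd_iommu_domain_flush_tlb_pde(pdomain);
		amd_iommu_domain_flush_complete(pdomain);
	}

	spin_unlock_irqrestore(&pdomain->lock, flags);

	return 0;
}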

>> +
>> +	return 0;
>> +}
>> +
>> +static bool amd_iommu_get_dirty_tracking(struct iommu_domain *domain)
>> +{
>> +	struct protection_domain *pdomain = to_pdomain(domain);
>> +	struct iommu_dev_data *dev_data;
>> +	u64 dte;
>> +
> 
> Also call spin_lock_irqsave(&pdomain->lock, flags) here
> 
ack
>> +	list_for_each_entry(dev_data, &pdomain->dev_list, list) {
>> +		dte = amd_iommu_dev_table[dev_data->devid].data[0];
>> +		if (!(dte & DTE_FLAG_HAD))
>> +			return false;
>> +	}
>> +
> 
> And call spin_unlock_irqrestore(&pdomain->lock, flags) here
> 
ack

Same comment as above applies here: replace the @dte check with a check
of this new variable instead.
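
Roughly, under the same assumptions (lock held over the walk, plus the
hypothetical @had field):

static bool amd_iommu_get_dirty_tracking(struct iommu_domain *domain)
{
	struct protection_domain *pdomain = to_pdomain(domain);
	struct iommu_dev_data *dev_data;
	unsigned long flags;
	bool ret = true;

	spin_lock_irqsave(&pdomain->lock, flags);

	list_for_each_entry(dev_data, &pdomain->dev_list, list) {
		/* Tracking only counts as enabled if every device has HAD set */
		if (!dev_data->had) {
			ret = false;
			break;
		}
	}

	spin_unlock_irqrestore(&pdomain->lock, flags);

	return ret;
}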

>> +	return true;
>> +}
>> +
>> +static int amd_iommu_read_and_clear_dirty(struct iommu_domain *domain,
>> +					  unsigned long iova, size_t size,
>> +					  struct iommu_dirty_bitmap *dirty)
>> +{
>> +	struct protection_domain *pdomain = to_pdomain(domain);
>> +	struct io_pgtable_ops *ops = &pdomain->iop.iop.ops;
>> +
>> +	if (!amd_iommu_get_dirty_tracking(domain))
>> +		return -EOPNOTSUPP;
>> +
>> +	if (!ops || !ops->read_and_clear_dirty)
>> +		return -ENODEV;
> 
> We should move this check to before the amd_iommu_get_dirty_tracking() call.
> 

Yeap, better fail earlier.
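
For clarity, the reordering would look roughly like this (same function
body as above, only the two checks swapped so the missing-ops case fails
first):

static int amd_iommu_read_and_clear_dirty(struct iommu_domain *domain,
					  unsigned long iova, size_t size,
					  struct iommu_dirty_bitmap *dirty)
{
	struct protection_domain *pdomain = to_pdomain(domain);
	struct io_pgtable_ops *ops = &pdomain->iop.iop.ops;

	/* Fail early if the page-table code has no dirty support at all */
	if (!ops || !ops->read_and_clear_dirty)
		return -ENODEV;

	/* Then check whether dirty tracking is actually enabled */
	if (!amd_iommu_get_dirty_tracking(domain))
		return -EOPNOTSUPP;

	return ops->read_and_clear_dirty(ops, iova, size, dirty);
}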

> Best Regards,
> Suravee
> 
>> +
>> +	return ops->read_and_clear_dirty(ops, iova, size, dirty);
>> +}
>> +
>> +
>>   static void amd_iommu_get_resv_regions(struct device *dev,
>> struct list_head *head)
>>   {
>> @@ -2293,6 +2368,8 @@ const struct iommu_ops amd_iommu_ops = {
>>  .flush_iotlb_all = amd_iommu_flush_iotlb_all,
>>  .iotlb_sync = amd_iommu_iotlb_sync,
>>  .free   = amd_iommu_domain_free,
>> +.set_dirty_tracking = amd_iommu_set_dirty_tracking,
>> +.read_and_clear_dirty = amd_iommu_read_and_clear_dirty,
>>  }
>>   };
>>   


Re: [PATCH RFC 09/19] iommu/amd: Access/Dirty bit support in IOPTEs

2022-05-31 Thread Baolu Lu

Hi Suravee,

On 2022/5/31 19:34, Suravee Suthikulpanit wrote:

On 4/29/22 4:09 AM, Joao Martins wrote:

.
+static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain,
+    bool enable)
+{
+    struct protection_domain *pdomain = to_pdomain(domain);
+    struct iommu_dev_data *dev_data;
+    bool dom_flush = false;
+
+    if (!amd_iommu_had_support)
+    return -EOPNOTSUPP;
+
+    list_for_each_entry(dev_data, &pdomain->dev_list, list) {


Since we iterate through the device list for the domain, we would need to
call spin_lock_irqsave(&pdomain->lock, flags) here.


Not related, just out of curiosity. Does it really need to disable
interrupts while holding this lock? Is there any case where this list
would be traversed in interrupt context? Perhaps I missed something?

Best regards,
baolu

Re: [PATCH RFC 09/19] iommu/amd: Access/Dirty bit support in IOPTEs

2022-05-31 Thread Suravee Suthikulpanit via iommu

Joao,

On 4/29/22 4:09 AM, Joao Martins wrote:

.
+static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain,
+   bool enable)
+{
+   struct protection_domain *pdomain = to_pdomain(domain);
+   struct iommu_dev_data *dev_data;
+   bool dom_flush = false;
+
+   if (!amd_iommu_had_support)
+   return -EOPNOTSUPP;
+
+   list_for_each_entry(dev_data, &pdomain->dev_list, list) {


Since we iterate through the device list for the domain, we would need to
call spin_lock_irqsave(&pdomain->lock, flags) here.


+   struct amd_iommu *iommu;
+   u64 pte_root;
+
+   iommu = amd_iommu_rlookup_table[dev_data->devid];
+   pte_root = amd_iommu_dev_table[dev_data->devid].data[0];
+
+   /* No change? */
+   if (!(enable ^ !!(pte_root & DTE_FLAG_HAD)))
+   continue;
+
+   pte_root = (enable ?
+   pte_root | DTE_FLAG_HAD : pte_root & ~DTE_FLAG_HAD);
+
+   /* Flush device DTE */
+   amd_iommu_dev_table[dev_data->devid].data[0] = pte_root;
+   device_flush_dte(dev_data);
+   dom_flush = true;
+   }
+
+   /* Flush IOTLB to mark IOPTE dirty on the next translation(s) */
+   if (dom_flush) {
+   unsigned long flags;
+
+   spin_lock_irqsave(&pdomain->lock, flags);
+   amd_iommu_domain_flush_tlb_pde(pdomain);
+   amd_iommu_domain_flush_complete(pdomain);
+   spin_unlock_irqrestore(&pdomain->lock, flags);
+   }


And call spin_unlock_irqrestore(&pdomain->lock, flags); here.

+
+   return 0;
+}
+
+static bool amd_iommu_get_dirty_tracking(struct iommu_domain *domain)
+{
+   struct protection_domain *pdomain = to_pdomain(domain);
+   struct iommu_dev_data *dev_data;
+   u64 dte;
+


Also call spin_lock_irqsave(&pdomain->lock, flags) here


+   list_for_each_entry(dev_data, &pdomain->dev_list, list) {
+   dte = amd_iommu_dev_table[dev_data->devid].data[0];
+   if (!(dte & DTE_FLAG_HAD))
+   return false;
+   }
+


And call spin_unlock_irqrestore(&pdomain->lock, flags) here


+   return true;
+}
+
+static int amd_iommu_read_and_clear_dirty(struct iommu_domain *domain,
+ unsigned long iova, size_t size,
+ struct iommu_dirty_bitmap *dirty)
+{
+   struct protection_domain *pdomain = to_pdomain(domain);
+   struct io_pgtable_ops *ops = &pdomain->iop.iop.ops;
+
+   if (!amd_iommu_get_dirty_tracking(domain))
+   return -EOPNOTSUPP;
+
+   if (!ops || !ops->read_and_clear_dirty)
+   return -ENODEV;


We should move this check to before the amd_iommu_get_dirty_tracking() call.

Best Regards,
Suravee


+
+   return ops->read_and_clear_dirty(ops, iova, size, dirty);
+}
+
+
  static void amd_iommu_get_resv_regions(struct device *dev,
   struct list_head *head)
  {
@@ -2293,6 +2368,8 @@ const struct iommu_ops amd_iommu_ops = {
.flush_iotlb_all = amd_iommu_flush_iotlb_all,
.iotlb_sync = amd_iommu_iotlb_sync,
.free   = amd_iommu_domain_free,
+   .set_dirty_tracking = amd_iommu_set_dirty_tracking,
+   .read_and_clear_dirty = amd_iommu_read_and_clear_dirty,
}
  };
  



[PATCH RFC 09/19] iommu/amd: Access/Dirty bit support in IOPTEs

2022-04-28 Thread Joao Martins
The IOMMU advertises Access/Dirty bits if the extended feature register
reports it. Relevant AMD IOMMU SDM ref [0],
"1.3.8 Enhanced Support for Access and Dirty Bits".

To enable it, set the DTE flags in bits 7 and 8 to enable access, or
access+dirty. With that, the IOMMU starts marking the D and A flags on
every Memory Request or ATS translation request. It is up to the VMM to
decide whether to enable dirty tracking, rather than the IOMMU driver
doing it unconditionally. Relevant AMD IOMMU SDM ref [0], "Table 7.
Device Table Entry (DTE) Field Definitions", particularly the entry "HAD".

Toggling it on and off is relatively simple, as it amounts to setting
two bits in the DTE and flushing the device DTE cache.

To read out what has been dirtied, use the existing AMD io-pgtable
support, walking the pagetables over each IOVA with fetch_pte(). IOTLB
flushing is left to the caller (much like unmap), and
iommu_dirty_bitmap_record() is the one adding page ranges to invalidate.
This allows the caller to batch the flush over a big span of IOVA space,
without the IOMMU having to decide when to flush.
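
As an illustration of that flow only (the actual io_pgtable.c change in
this patch is the authoritative version; the function name here is made
up, the replicated-PTE case discussed further down is ignored, and only
the existing fetch_pte() walker plus the iommu_dirty_bitmap_record()
helper introduced earlier in this series are assumed):

static int iommu_v1_read_and_clear_dirty(struct io_pgtable_ops *ops,
					 unsigned long iova, size_t size,
					 struct iommu_dirty_bitmap *dirty)
{
	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
	unsigned long end = iova + size;

	do {
		unsigned long pgsize = 0;
		u64 *ptep, pte = 0;

		/* Walk the page table down to the PTE backing this IOVA */
		ptep = fetch_pte(pgtable, iova, &pgsize);
		if (ptep)
			pte = READ_ONCE(*ptep);
		if (!ptep || !IOMMU_PTE_PRESENT(pte)) {
			pgsize = pgsize ?: PTE_LEVEL_PAGE_SIZE(0);
			iova += pgsize;
			continue;
		}

		/* Record the dirty range; IOTLB flushing is batched by the caller */
		if (pte & IOMMU_PTE_HD) {
			*ptep = pte & ~IOMMU_PTE_HD;
			iommu_dirty_bitmap_record(dirty, iova, pgsize);
		}

		iova += pgsize;
	} while (iova < end);

	return 0;
}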

Worthwhile sections from AMD IOMMU SDM:

"2.2.3.1 Host Access Support"
"2.2.3.2 Host Dirty Support"

For details on how the IOMMU hardware updates the dirty bit, and what it
expects from its subsequent clearing by the CPU, see:

"2.2.7.4 Updating Accessed and Dirty Bits in the Guest Address Tables"
"2.2.7.5 Clearing Accessed and Dirty Bits"

Quoting the SDM:

"The setting of accessed and dirty status bits in the page tables is
visible to both the CPU and the peripheral when sharing guest page
tables. The IOMMU interlocked operations to update A and D bits must be
64-bit operations and naturally aligned on a 64-bit boundary"

... and for the IOMMU update sequence for the Dirty bit, it essentially
states (a software analogue is sketched after the list):

1. Decodes the read and write intent from the memory access.
2. If P=0 in the page descriptor, fail the access.
3. Compare the A & D bits in the descriptor with the read and write
intent in the request.
4. If the A or D bits need to be updated in the descriptor:
* Start atomic operation.
* Read the descriptor as a 64-bit access.
* If the descriptor no longer appears to require an update, release the
atomic lock with no further action and continue to step 5.
* Calculate the new A & D bits.
* Write the descriptor as a 64-bit access.
* End atomic operation.
5. Continue to the next stage of translation or to the memory access.
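
A purely illustrative software analogue of that sequence, shown only for
the Dirty bit this patch defines (IOMMU_PTE_HD). This is what the IOMMU
does internally in hardware, not code from the patch:

static void illustrate_iommu_dirty_update(u64 *ptep, bool is_write)
{
	u64 old, new;

	do {
		/* Read the descriptor as a single aligned 64-bit access */
		old = READ_ONCE(*ptep);

		/* If P=0, the access fails rather than updating any bits */
		if (!(old & IOMMU_PTE_PR))
			return;

		/* If no update is needed, stop with no further action */
		if (!is_write || (old & IOMMU_PTE_HD))
			return;

		/* Calculate the new D bit and write it back atomically */
		new = old | IOMMU_PTE_HD;
	} while (cmpxchg64(ptep, old, new) != old);
}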

Access/Dirty bit readout also needs to consider the non-default page
sizes (aka replicated PTEs, as mentioned by the manual), as AMD supports
all power-of-two page sizes (except 512G) even though the underlying
IOTLB mappings are restricted to the same ones as supported by the CPU
(4K, 2M, 1G). It makes one wonder whether AMD_IOMMU_PGSIZES ought to
avoid advertising non-default page sizes at all when creating an
UNMANAGED domain, or when dirty tracking is toggled on.
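
As an illustration of what that means for readout (not the actual patch
code): a non-default page size is backed by several replicated PTEs, so
the dirty bit has to be OR-ed and cleared across all of them, e.g. using
the existing PAGE_SIZE_PTE_COUNT() helper:

static bool pte_test_and_clear_dirty(u64 *ptep, unsigned long page_size)
{
	bool dirty = false;
	int i, count;

	/* Number of replicated PTEs backing this (power-of-two) page size */
	count = PAGE_SIZE_PTE_COUNT(page_size);

	for (i = 0; i < count; i++)
		if (test_and_clear_bit(IOMMU_PTE_HD_BIT,
				       (unsigned long *)&ptep[i]))
			dirty = true;

	return dirty;
}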

Signed-off-by: Joao Martins 
---
 drivers/iommu/amd/amd_iommu.h   |  1 +
 drivers/iommu/amd/amd_iommu_types.h | 11 +
 drivers/iommu/amd/init.c|  8 ++-
 drivers/iommu/amd/io_pgtable.c  | 56 +
 drivers/iommu/amd/iommu.c   | 77 +
 5 files changed, 152 insertions(+), 1 deletion(-)

diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h
index 1ab31074f5b3..2f16ad8f7514 100644
--- a/drivers/iommu/amd/amd_iommu.h
+++ b/drivers/iommu/amd/amd_iommu.h
@@ -34,6 +34,7 @@ extern int amd_iommu_reenable(int);
 extern int amd_iommu_enable_faulting(void);
 extern int amd_iommu_guest_ir;
 extern enum io_pgtable_fmt amd_iommu_pgtable;
+extern bool amd_iommu_had_support;
 
 /* IOMMUv2 specific functions */
 struct iommu_domain;
diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h
index 47108ed44fbb..c1eba8fce4bb 100644
--- a/drivers/iommu/amd/amd_iommu_types.h
+++ b/drivers/iommu/amd/amd_iommu_types.h
@@ -93,7 +93,9 @@
 #define FEATURE_HE (1ULL<<8)
 #define FEATURE_PC (1ULL<<9)
 #define FEATURE_GAM_VAPIC  (1ULL<<21)
+#define FEATURE_HASUP  (1ULL<<49)
 #define FEATURE_EPHSUP (1ULL<<50)
+#define FEATURE_HDSUP  (1ULL<<52)
 #define FEATURE_SNP(1ULL<<63)
 
#define FEATURE_PASID_SHIFT	32
@@ -197,6 +199,7 @@
 /* macros and definitions for device table entries */
 #define DEV_ENTRY_VALID 0x00
 #define DEV_ENTRY_TRANSLATION   0x01
+#define DEV_ENTRY_HAD   0x07
 #define DEV_ENTRY_PPR   0x34
 #define DEV_ENTRY_IR0x3d
 #define DEV_ENTRY_IW0x3e
@@ -350,10 +353,16 @@
 #define PTE_LEVEL_PAGE_SIZE(level) \
	(1ULL << (12 + (9 * (level))))
 
+/*
+ * The IOPTE dirty bit
+ */
+#define IOMMU_PTE_HD_BIT (6)
+
 /*
  * Bit value definition for I/O PTE fields
  */
 #define IOMMU_PTE_PR (1ULL << 0)
+#define IOMMU_PTE_HD (1ULL << IOMMU_PTE_HD_BIT)
 #define IOMMU_PTE_U  (1ULL << 59)
 #define IOMMU_PTE_FC (1ULL << 60)
 #define IOMMU_PTE_IR (1ULL << 61)
@@ -364,6 +373,7 @@
  */