> On Apr 8, 2020, at 10:19 AM, Joerg Roedel <j...@8bytes.org> wrote:
> 
> Hi Qian,
> 
> On Tue, Apr 07, 2020 at 11:36:05AM -0400, Qian Cai wrote:
>> After further testing, the change alone is insufficient. What I am chasing
>> right now is that the swap device will go offline after heavy memory
>> pressure below. The symptom is similar to what we have in the commit,
>> 
754265bcab78 ("iommu/amd: Fix race in increase_address_space()")
>> 
>> Apparently, it is not possible to take the domain->lock in fetch_pte()
>> because it could sleep.
> 
> Thanks a lot for finding and tracking down another race in the AMD IOMMU
> page-table code.  The domain->lock is a spin-lock, so taking it can't
> sleep. But fetch_pte() is a fast path and must not take any locks.
> 
> I think the best fix is to update the pt_root and mode of the domain
> atomically by storing the mode in the lower 12 bits of pt_root. This way
> they are stored together and can be read/written atomically.

Like this?
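
Before the full patch, here is a standalone user-space sketch of just the
encoding, to make the bit layout explicit. Everything in it is illustrative
(the sk_* names are made up for the sketch; the patch itself uses
iommu_get_mode()/iommu_get_root()/iommu_set_mode()), and it assumes 4K
pages, so the low 12 bits of a page-aligned root pointer are free to carry
the paging mode (0-6):

#include <stdio.h>
#include <stdint.h>

#define SK_PAGE_SHIFT	12
#define SK_PAGE_MASK	(~(((uintptr_t)1 << SK_PAGE_SHIFT) - 1))

/* The mode lives in the low, page-offset bits of the pointer. */
static inline int sk_get_mode(void *pt_root)
{
	return (uintptr_t)pt_root & ~SK_PAGE_MASK;
}

/* Mask the mode off to recover the page-aligned root address. */
static inline uintptr_t sk_get_root(void *pt_root)
{
	return (uintptr_t)pt_root & SK_PAGE_MASK;
}

/* Pack a (root, mode) pair into one pointer-sized value. */
static inline void *sk_set_mode(void *pt_root, int mode)
{
	return (void *)(sk_get_root(pt_root) + mode);
}

int main(void)
{
	/* Stand-in for a freshly allocated, page-aligned table. */
	void *root   = (void *)(uintptr_t)0x7f0000001000;
	void *packed = sk_set_mode(root, 3);	/* PAGE_MODE_3_LEVEL */

	/*
	 * Root and mode now travel in one word, so a single
	 * READ_ONCE()/WRITE_ONCE() on pt_root always observes a
	 * consistent pair without taking domain->lock.
	 */
	printf("root=%#lx mode=%d\n",
	       (unsigned long)sk_get_root(packed), sk_get_mode(packed));
	return 0;
}

And the patch itself: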

diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 20cce366e951..b36c6b07cbfd 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -1397,13 +1397,13 @@ static struct page *free_sub_pt(unsigned long root, int mode,
 
 static void free_pagetable(struct protection_domain *domain)
 {
-       unsigned long root = (unsigned long)domain->pt_root;
+       int level = iommu_get_mode(domain->pt_root);
+       unsigned long root = iommu_get_root(domain->pt_root);
        struct page *freelist = NULL;
 
-       BUG_ON(domain->mode < PAGE_MODE_NONE ||
-              domain->mode > PAGE_MODE_6_LEVEL);
+       BUG_ON(level < PAGE_MODE_NONE || level > PAGE_MODE_6_LEVEL);
 
-       freelist = free_sub_pt(root, domain->mode, freelist);
+       freelist = free_sub_pt(root, level, freelist);
 
        free_page_list(freelist);
 }
@@ -1417,24 +1417,27 @@ static bool increase_address_space(struct protection_domain *domain,
                                   unsigned long address,
                                   gfp_t gfp)
 {
+       int level;
        unsigned long flags;
        bool ret = false;
        u64 *pte;
 
        spin_lock_irqsave(&domain->lock, flags);
 
-       if (address <= PM_LEVEL_SIZE(domain->mode) ||
-           WARN_ON_ONCE(domain->mode == PAGE_MODE_6_LEVEL))
+       level = iommu_get_mode(domain->pt_root);
+
+       if (address <= PM_LEVEL_SIZE(level) ||
+           WARN_ON_ONCE(level == PAGE_MODE_6_LEVEL))
                goto out;
 
        pte = (void *)get_zeroed_page(gfp);
        if (!pte)
                goto out;
 
-       *pte             = PM_LEVEL_PDE(domain->mode,
-                                       iommu_virt_to_phys(domain->pt_root));
-       domain->pt_root  = pte;
-       domain->mode    += 1;
+       *pte = PM_LEVEL_PDE(level,
+               iommu_virt_to_phys((void *)iommu_get_root(domain->pt_root)));
+
+       WRITE_ONCE(domain->pt_root, iommu_set_mode(pte, level + 1));
 
        ret = true;
 
@@ -1452,15 +1455,17 @@ static u64 *alloc_pte(struct protection_domain *domain,
                      bool *updated)
 {
        int level, end_lvl;
-       u64 *pte, *page;
+       u64 *pte, *page, *pt_root, *root;
 
        BUG_ON(!is_power_of_2(page_size));
 
-       while (address > PM_LEVEL_SIZE(domain->mode))
+       while (address > PM_LEVEL_SIZE(iommu_get_mode(domain->pt_root)))
                *updated = increase_address_space(domain, address, gfp) || *updated;
 
-       level   = domain->mode - 1;
-       pte     = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
+       pt_root = READ_ONCE(domain->pt_root);
+       root    = (void *)iommu_get_root(pt_root);
+       level   = iommu_get_mode(pt_root) - 1;
+       pte     = &root[PM_LEVEL_INDEX(level, address)];
        address = PAGE_SIZE_ALIGN(address, page_size);
        end_lvl = PAGE_SIZE_LEVEL(page_size);
 
@@ -1536,16 +1541,18 @@ static u64 *fetch_pte(struct protection_domain *domain,
                      unsigned long address,
                      unsigned long *page_size)
 {
-       int level;
        u64 *pte;
+       u64 *pt_root = READ_ONCE(domain->pt_root);
+       u64 *root    = (void *)iommu_get_root(pt_root);
+       int level    = iommu_get_mode(pt_root);
 
        *page_size = 0;
 
-       if (address > PM_LEVEL_SIZE(domain->mode))
+       if (address > PM_LEVEL_SIZE(level))
                return NULL;
 
-       level      =  domain->mode - 1;
-       pte        = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
+       level--;
+       pte        = &root[PM_LEVEL_INDEX(level, address)];
        *page_size =  PTE_LEVEL_PAGE_SIZE(level);
 
        while (level > 0) {
@@ -1814,12 +1821,13 @@ static struct protection_domain *dma_ops_domain_alloc(void)
        if (protection_domain_init(domain))
                goto free_domain;
 
-       domain->mode = PAGE_MODE_3_LEVEL;
        domain->pt_root = (void *)get_zeroed_page(GFP_KERNEL);
        domain->flags = PD_DMA_OPS_MASK;
        if (!domain->pt_root)
                goto free_domain;
 
+       domain->pt_root = iommu_set_mode(domain->pt_root, PAGE_MODE_3_LEVEL);
+
        if (iommu_get_dma_cookie(&domain->domain) == -ENOMEM)
                goto free_domain;
 
@@ -1847,10 +1855,10 @@ static void set_dte_entry(u16 devid, struct protection_domain *domain,
        u64 flags = 0;
        u32 old_domid;
 
-       if (domain->mode != PAGE_MODE_NONE)
-               pte_root = iommu_virt_to_phys(domain->pt_root);
+       if (iommu_get_mode(domain->pt_root) != PAGE_MODE_NONE)
+               pte_root = iommu_virt_to_phys((void *)iommu_get_root(domain->pt_root));
 
-       pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
+       pte_root |= (iommu_get_mode(domain->pt_root) & DEV_ENTRY_MODE_MASK)
                    << DEV_ENTRY_MODE_SHIFT;
        pte_root |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_V | DTE_FLAG_TV;
 
@@ -2382,13 +2390,14 @@ static struct iommu_domain *amd_iommu_domain_alloc(unsigned type)
                if (!pdomain)
                        return NULL;
 
-               pdomain->mode    = PAGE_MODE_3_LEVEL;
                pdomain->pt_root = (void *)get_zeroed_page(GFP_KERNEL);
                if (!pdomain->pt_root) {
                        protection_domain_free(pdomain);
                        return NULL;
                }
 
+               pdomain->pt_root = iommu_set_mode(pdomain->pt_root,
+                                                 PAGE_MODE_3_LEVEL);
                pdomain->domain.geometry.aperture_start = 0;
                pdomain->domain.geometry.aperture_end   = ~0ULL;
                pdomain->domain.geometry.force_aperture = true;
@@ -2406,7 +2415,8 @@ static struct iommu_domain *amd_iommu_domain_alloc(unsigned type)
                if (!pdomain)
                        return NULL;
 
-               pdomain->mode = PAGE_MODE_NONE;
+               pdomain->pt_root = iommu_set_mode(pdomain->pt_root,
+                                                 PAGE_MODE_NONE);
                break;
        default:
                return NULL;
@@ -2435,7 +2445,7 @@ static void amd_iommu_domain_free(struct iommu_domain *dom)
                dma_ops_domain_free(domain);
                break;
        default:
-               if (domain->mode != PAGE_MODE_NONE)
+               if (iommu_get_mode(domain->pt_root) != PAGE_MODE_NONE)
                        free_pagetable(domain);
 
                if (domain->flags & PD_IOMMUV2_MASK)
@@ -2521,7 +2531,7 @@ static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova,
        int prot = 0;
        int ret;
 
-       if (domain->mode == PAGE_MODE_NONE)
+       if (iommu_get_mode(domain->pt_root) == PAGE_MODE_NONE)
                return -EINVAL;
 
        if (iommu_prot & IOMMU_READ)
@@ -2542,7 +2552,7 @@ static size_t amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
 {
        struct protection_domain *domain = to_pdomain(dom);
 
-       if (domain->mode == PAGE_MODE_NONE)
+       if (iommu_get_mode(domain->pt_root) == PAGE_MODE_NONE)
                return 0;
 
        return iommu_unmap_page(domain, iova, page_size);
@@ -2555,7 +2565,7 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
        unsigned long offset_mask, pte_pgsize;
        u64 *pte, __pte;
 
-       if (domain->mode == PAGE_MODE_NONE)
+       if (iommu_get_mode(domain->pt_root) == PAGE_MODE_NONE)
                return iova;
 
        pte = fetch_pte(domain, iova, &pte_pgsize);
@@ -2713,7 +2723,7 @@ void amd_iommu_domain_direct_map(struct iommu_domain *dom)
        spin_lock_irqsave(&domain->lock, flags);
 
        /* Update data structure */
-       domain->mode    = PAGE_MODE_NONE;
+       domain->pt_root = iommu_set_mode(domain->pt_root, PAGE_MODE_NONE);
 
        /* Make changes visible to IOMMUs */
        update_domain(domain);
@@ -2910,7 +2920,7 @@ static int __set_gcr3(struct protection_domain *domain, int pasid,
 {
        u64 *pte;
 
-       if (domain->mode != PAGE_MODE_NONE)
+       if (iommu_get_mode(domain->pt_root) != PAGE_MODE_NONE)
                return -EINVAL;
 
        pte = __get_gcr3_pte(domain->gcr3_tbl, domain->glx, pasid, true);
@@ -2926,7 +2936,7 @@ static int __clear_gcr3(struct protection_domain *domain, int pasid)
 {
        u64 *pte;
 
-       if (domain->mode != PAGE_MODE_NONE)
+       if (iommu_get_mode(domain->pt_root) != PAGE_MODE_NONE)
                return -EINVAL;
 
        pte = __get_gcr3_pte(domain->gcr3_tbl, domain->glx, pasid, false);
diff --git a/drivers/iommu/amd_iommu_proto.h b/drivers/iommu/amd_iommu_proto.h
index 92c2ba6468a0..34d4dd66cf9b 100644
--- a/drivers/iommu/amd_iommu_proto.h
+++ b/drivers/iommu/amd_iommu_proto.h
@@ -67,6 +67,21 @@ static inline int amd_iommu_create_irq_domain(struct amd_iommu *iommu)
 extern int amd_iommu_complete_ppr(struct pci_dev *pdev, int pasid,
                                  int status, int tag);
 
+static inline int iommu_get_mode(void *pt_root)
+{
+       return (unsigned long)pt_root & ~PAGE_MASK;
+}
+
+static inline unsigned long iommu_get_root(void *pt_root)
+{
+       return (unsigned long)pt_root & PAGE_MASK;
+}
+
+static inline void *iommu_set_mode(void *pt_root, int mode)
+{
+       return (void *)(iommu_get_root(pt_root) + mode);
+}
+
 static inline bool is_rd890_iommu(struct pci_dev *pdev)
 {
        return (pdev->vendor == PCI_VENDOR_ID_ATI) &&
diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h
index ca8c4522045b..221adefa56a0 100644
--- a/drivers/iommu/amd_iommu_types.h
+++ b/drivers/iommu/amd_iommu_types.h
@@ -468,8 +468,8 @@ struct protection_domain {
                                       iommu core code */
        spinlock_t lock;        /* mostly used to lock the page table*/
        u16 id;                 /* the domain id written to the device table */
-       int mode;               /* paging mode (0-6 levels) */
-       u64 *pt_root;           /* page table root pointer */
+       u64 *pt_root;           /* page table root pointer and paging mode (0-6
+                                  levels) */
        int glx;                /* Number of levels for GCR3 table */
        u64 *gcr3_tbl;          /* Guest CR3 table */
        unsigned long flags;    /* flags to find out type of domain */
-- 
2.21.0 (Apple Git-122.2)