On Mon, 2013-09-02 at 02:24 +0000, Shankar, Hari wrote:
> Patch Description: The patch is for Intel IOMMU driver. It forces
> IOMMU TLB flush for the particular domain and return correct unmap
> size when intel_iommu_unmap() is called

I *hate* the bizarre calling convention for iommu_unmap(). Is it
actually clearly documented anywhere? Why on earth is it not just
returning void, and expected to unmap what it was *asked* to unmap?

I keep getting patches which "fix" the return value, with no clear
explanation of what it's actually supposed to do. I ask sometimes, but
the answer is always so weird that I end up forgetting it again.

The main user seems to be kvm_iommu_put_pages() in virt/kvm/iommu.c,
which appears to be *designed* to get poor performance: it makes a
separate call (with the associated clflush and IOTLB flush) for every
individual *page* instead of just unmapping a whole virtual range at a time.
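
From memory, that loop looks roughly like this (a paraphrased sketch, not a
verbatim copy of the kvm code, and the function name here is made up):

static void kvm_put_pages_one_at_a_time(struct iommu_domain *domain,
                                        gfn_t base_gfn, unsigned long npages)
{
        gfn_t gfn = base_gfn, end_gfn = base_gfn + npages;

        while (gfn < end_gfn) {
                size_t unmapped;

                /* Each call ends up doing its own clflush and IOTLB flush... */
                unmapped = iommu_unmap(domain, gfn_to_gpa(gfn), PAGE_SIZE);

                /* ...and the return value is there so the caller can discover
                   that it hit a superpage and actually unmapped more than it
                   asked for. */
                gfn += 1UL << get_order(unmapped);
        }
}

A single ranged unmap of the whole region would let the driver do one flush at
the end instead of one per page.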

One thing this patch doesn't address is that intel_iommu_unmap() never
calls dma_pte_free_pagetable() after dma_pte_clear_range(), so the
page-table pages are never freed. For reasons which aren't entirely clear
to me, adding that call fixes a significant performance problem: without
it, the time taken to do unmaps grows exponentially with the size of the
region.

And in fact, once we fix that, this path suffers from the same bug that
exists everywhere *else* we do unmaps: we actually need to flush the IOTLB
(with the IH bit cleared) *before* we free the page-table pages, because
the hardware may have cached them and could still be doing a page walk
after we've returned the pages to the pool.
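
In other words the ordering has to be: unlink the PTEs and collect the
orphaned page-table pages, flush, and only *then* free. Using the helpers the
patch below introduces, that looks roughly like this (the wrapper function is
purely illustrative; the real callers are intel_unmap_page() and
intel_unmap_sg()):

static void unmap_flush_then_free(struct intel_iommu *iommu,
                                  struct dmar_domain *domain,
                                  unsigned long start_pfn,
                                  unsigned long last_pfn)
{
        /* Clear the PTEs and collect the now-unreachable page-table
           pages on a list, without freeing anything yet. */
        struct page *freelist = domain_unmap(domain, start_pfn, last_pfn);

        /* Flush the IOTLB. Pass IH=0 (!freelist) when page tables were
           actually freed, so cached non-leaf entries are invalidated too. */
        iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
                              last_pfn - start_pfn + 1, !freelist, 0);

        /* Only after the flush is the hardware guaranteed not to walk the
           old tables any more, so now it's safe to give the pages back. */
        dma_free_pagelist(freelist);
}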

Here's a completely untested work-in-progress that attempts to fix that.
I'll be able to test it myself around Tuesday, once I'm home from New
Orleans and awake...

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 15e9b57..69980a1 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -413,6 +413,7 @@ struct deferred_flush_tables {
        int next;
        struct iova *iova[HIGH_WATER_MARK];
        struct dmar_domain *domain[HIGH_WATER_MARK];
+       struct page *freelist[HIGH_WATER_MARK];
 };
 
 static struct deferred_flush_tables *deferred_flush;
@@ -945,6 +946,114 @@ static void dma_pte_free_pagetable(struct dmar_domain *domain,
        }
 }
 
+/* When a page at a given level is being unlinked from its parent, we don't
+   need to *modify* it at all. All we need to do is make a list of all the
+   pages which can be freed just as soon as we've flushed the IOTLB and we
+   know the hardware page-walk will no longer touch them.
+   The 'pte' argument is the *parent* PTE, pointing to the page that is to
+   be freed. */
+static struct page *dma_pte_list_pagetables(struct dmar_domain *domain, int level,
+                                           struct dma_pte *pte, struct page *freelist)
+{
+       struct page *pg;
+
+       pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
+       pg->freelist = freelist;
+       freelist = pg;
+
+       if (level > 1) {
+               pte = page_address(pg);
+
+               do {
+                       if (dma_pte_present(pte) && !dma_pte_superpage(pte))
+                               freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
+
+               } while (!first_pte_in_page(++pte));
+       }
+
+       return freelist;
+}
+
+static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
+                                       struct dma_pte *pte, unsigned long pfn,
+                                       unsigned long start_pfn, unsigned long last_pfn,
+                                       struct page *freelist)
+{
+       struct dma_pte *first_pte = NULL, *last_pte = NULL;
+
+       pfn = max(start_pfn, pfn);
+       pte = &pte[pfn_level_offset(pfn, level)];
+
+       do {
+               unsigned long level_pfn;
+               struct dma_pte *level_pte;
+
+               if (!dma_pte_present(pte))
+                       goto next;
+
+               level_pfn = pfn & level_mask(level - 1);
+               level_pte = phys_to_virt(dma_pte_addr(pte));
+
+               /* If range covers entire pagetable, free it */
+               if (start_pfn <= level_pfn &&
+                   last_pfn >= level_pfn + level_size(level)) {
+
+                       /* These subordinate page tables are going away entirely. Don't
+                          bother to clear them; we're just going to *free* them. */
+                       if (level > 1 && !dma_pte_superpage(pte))
+                               freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
+
+                       dma_clear_pte(pte);
+                       if (!first_pte)
+                               first_pte = pte;
+                       last_pte = pte;
+               } else {
+                       /* Recurse down into a level that isn't *entirely* obsolete */
+                       freelist = dma_pte_clear_level(domain, level - 1, level_pte,
+                                                      level_pfn, start_pfn, last_pfn, freelist);
+               }
+next:
+               pfn += level_size(level);
+       } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
+
+       if (first_pte)
+               domain_flush_cache(domain, first_pte,
+                                  (void *)++last_pte - (void *)first_pte);
+
+       return freelist;
+}
+
+/* We can't just free the pages because the IOMMU may still be walking
+   the page tables, and may have cached the intermediate levels. The
+   pages can only be freed after the IOTLB flush has been done. */
+struct page *domain_unmap(struct dmar_domain *domain,
+                         unsigned long start_pfn,
+                         unsigned long last_pfn)
+{
+       int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
+       struct page *freelist = NULL;
+
+       BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
+       BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
+       BUG_ON(start_pfn > last_pfn);
+
+       /* we don't need lock here; nobody else touches the iova range */
+       freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
+                                      domain->pgd, 0, start_pfn, last_pfn, NULL);
+
+       return freelist;
+}
+
+void dma_free_pagelist(struct page *freelist)
+{
+       struct page *pg;
+
+       while ((pg = freelist)) {
+               freelist = pg->freelist;
+               free_pgtable_page(pg);
+       }
+}
+
 /* iommu handling */
 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
 {
@@ -1054,7 +1163,7 @@ static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
                break;
        case DMA_TLB_PSI_FLUSH:
                val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
-               /* Note: always flush non-leaf currently */
+               /* IH bit is passed in as part of address */
                val_iva = size_order | addr;
                break;
        default:
@@ -1165,13 +1274,15 @@ static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
 }
 
 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
-                                 unsigned long pfn, unsigned int pages, int map)
+                                 unsigned long pfn, unsigned int pages, int ih, int map)
 {
        unsigned int mask = ilog2(__roundup_pow_of_two(pages));
        uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
 
        BUG_ON(pages == 0);
 
+       if (ih)
+               ih = 1 << 6;
        /*
         * Fallback to domain selective flush if no PSI support or the size is
         * too big.
@@ -1182,7 +1293,7 @@ static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
                iommu->flush.flush_iotlb(iommu, did, 0, 0,
                                                DMA_TLB_DSI_FLUSH);
        else
-               iommu->flush.flush_iotlb(iommu, did, addr, mask,
+               iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
                                                DMA_TLB_PSI_FLUSH);
 
        /*
@@ -2850,7 +2961,7 @@ static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
 
        /* it's a non-present to present mapping. Only flush if caching mode */
        if (cap_caching_mode(iommu->cap))
-               iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
+               iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 0, 1);
        else
                iommu_flush_write_buffer(iommu);
 
@@ -2902,13 +3013,16 @@ static void flush_unmaps(void)
                        /* On real hardware multiple invalidations are expensive */
                        if (cap_caching_mode(iommu->cap))
                                iommu_flush_iotlb_psi(iommu, domain->id,
-                               iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
+                                       iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1,
+                                       !deferred_flush[i].freelist[j], 0);
                        else {
                                mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
                                iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
                                                (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
                        }
                        __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
+                       if (deferred_flush[i].freelist[j])
+                               dma_free_pagelist(deferred_flush[i].freelist[j]);
                }
                deferred_flush[i].next = 0;
        }
@@ -2925,7 +3039,7 @@ static void flush_unmaps_timeout(unsigned long data)
        spin_unlock_irqrestore(&async_umap_flush_lock, flags);
 }
 
-static void add_unmap(struct dmar_domain *dom, struct iova *iova)
+static void add_unmap(struct dmar_domain *dom, struct iova *iova, struct page *freelist)
 {
        unsigned long flags;
        int next, iommu_id;
@@ -2941,6 +3055,7 @@ static void add_unmap(struct dmar_domain *dom, struct iova *iova)
        next = deferred_flush[iommu_id].next;
        deferred_flush[iommu_id].domain[next] = dom;
        deferred_flush[iommu_id].iova[next] = iova;
+       deferred_flush[iommu_id].freelist[next] = freelist;
        deferred_flush[iommu_id].next++;
 
        if (!timer_on) {
@@ -2960,6 +3075,7 @@ static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
        unsigned long start_pfn, last_pfn;
        struct iova *iova;
        struct intel_iommu *iommu;
+       struct page *freelist;
 
        if (iommu_no_mapping(dev))
                return;
@@ -2980,19 +3096,16 @@ static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
        pr_debug("Device %s unmapping: pfn %lx-%lx\n",
                 pci_name(pdev), start_pfn, last_pfn);
 
-       /*  clear the whole page */
-       dma_pte_clear_range(domain, start_pfn, last_pfn);
-
-       /* free page tables */
-       dma_pte_free_pagetable(domain, start_pfn, last_pfn);
+       freelist = domain_unmap(domain, start_pfn, last_pfn);
 
        if (intel_iommu_strict) {
                iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
-                                     last_pfn - start_pfn + 1, 0);
+                                     last_pfn - start_pfn + 1, !freelist, 0);
                /* free iova */
                __free_iova(&domain->iovad, iova);
+               dma_free_pagelist(freelist);
        } else {
-               add_unmap(domain, iova);
+               add_unmap(domain, iova, freelist);
                /*
                 * queue up the release of the unmap to save the 1/6th of the
                 * cpu used up by the iotlb flush operation...
@@ -3054,6 +3167,7 @@ static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
        unsigned long start_pfn, last_pfn;
        struct iova *iova;
        struct intel_iommu *iommu;
+       struct page *freelist;
 
        if (iommu_no_mapping(hwdev))
                return;
@@ -3071,19 +3185,16 @@ static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
        start_pfn = mm_to_dma_pfn(iova->pfn_lo);
        last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
 
-       /*  clear the whole page */
-       dma_pte_clear_range(domain, start_pfn, last_pfn);
-
-       /* free page tables */
-       dma_pte_free_pagetable(domain, start_pfn, last_pfn);
+       freelist = domain_unmap(domain, start_pfn, last_pfn);
 
        if (intel_iommu_strict) {
                iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
-                                     last_pfn - start_pfn + 1, 0);
+                                     last_pfn - start_pfn + 1, !freelist, 0);
                /* free iova */
                __free_iova(&domain->iovad, iova);
+               dma_free_pagelist(freelist);
        } else {
-               add_unmap(domain, iova);
+               add_unmap(domain, iova, freelist);
                /*
                 * queue up the release of the unmap to save the 1/6th of the
                 * cpu used up by the iotlb flush operation...
@@ -3166,7 +3277,7 @@ static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int ne
 
        /* it's a non-present to present mapping. Only flush if caching mode */
        if (cap_caching_mode(iommu->cap))
-               iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
+               iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 0, 1);
        else
                iommu_flush_write_buffer(iommu);
 
@@ -4116,7 +4227,10 @@ static size_t intel_iommu_unmap(struct iommu_domain *domain,
        int order;
 
        order = dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
-                           (iova + size - 1) >> VTD_PAGE_SHIFT);
+                                   (iova + size - 1) >> VTD_PAGE_SHIFT);
+
+       dma_pte_free_pagetable(dmar_domain, iova >> VTD_PAGE_SHIFT,
+                              (iova + size - 1) >> VTD_PAGE_SHIFT);
 
        if (dmar_domain->max_addr == iova + size)
                dmar_domain->max_addr = iova;



-- 
David Woodhouse                            Open Source Technology Centre
david.woodho...@intel.com                              Intel Corporation
