Re: [PATCH 3/5] perf/core: Fix arch_perf_get_page_size()

2020-11-16 Thread Liang, Kan

On 11/13/2020 6:19 AM, Peter Zijlstra wrote:

> The (new) page-table walker in arch_perf_get_page_size() is broken in
> various ways. Specifically, while it is used in a lockless manner, it
> doesn't depend on CONFIG_HAVE_FAST_GUP, doesn't use the proper _lockless
> offset methods, and isn't careful to read each entry only once.
>
> Also, the hugetlb support is broken due to calling pte_page() without
> first checking pte_special().
>
> Rewrite the whole thing to be a proper lockless page-table walker and
> employ the new pXX_leaf_size() pgtable functions to determine the TLB
> size without looking at the page-frames.
>
> Fixes: 51b646b2d9f8 ("perf,mm: Handle non-page-table-aligned hugetlbfs")
> Fixes: 8d97e71811aa ("perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE")

The issue 
(https://lkml.kernel.org/r/8e88ba79-7c40-ea32-a7ed-bdc4fc04b...@linux.intel.com) 
has been fixed by this patch set.


Tested-by: Kan Liang 
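
For reference, a minimal way to exercise this path from userspace looks
roughly like the sketch below. It is an illustration, not the reproducer
from the report above, and it assumes a uapi linux/perf_event.h new
enough to define PERF_SAMPLE_DATA_PAGE_SIZE:

    #include <linux/perf_event.h>
    #include <string.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    int main(void)
    {
        struct perf_event_attr attr;
        int fd;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        /* software page-fault events sample a data address */
        attr.type = PERF_TYPE_SOFTWARE;
        attr.config = PERF_COUNT_SW_PAGE_FAULTS;
        attr.sample_period = 1;
        /* ask for the data page size next to each sampled address */
        attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_ADDR |
                           PERF_SAMPLE_DATA_PAGE_SIZE;

        fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
        if (fd < 0) {
            perror("perf_event_open");
            return 1;
        }
        /* Each PERF_RECORD_SAMPLE in the mmap ring buffer now carries
         * a u64 page-size field filled in by the walker this patch
         * fixes; buffer setup and parsing are omitted from this sketch. */
        close(fd);
        return 0;
    }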


[PATCH 3/5] perf/core: Fix arch_perf_get_page_size()

2020-11-13 Thread Peter Zijlstra
The (new) page-table walker in arch_perf_get_page_size() is broken in
various ways. Specifically, while it is used in a lockless manner, it
doesn't depend on CONFIG_HAVE_FAST_GUP, doesn't use the proper _lockless
offset methods, and isn't careful to read each entry only once.

Also, the hugetlb support is broken due to calling pte_page() without
first checking pte_special().

Rewrite the whole thing to be a proper lockless page-table walker and
employ the new pXX_leaf_size() pgtable functions to determine the TLB
size without looking at the page-frames.

Fixes: 51b646b2d9f8 ("perf,mm: Handle non-page-table-aligned hugetlbfs")
Fixes: 8d97e71811aa ("perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE")
Signed-off-by: Peter Zijlstra (Intel) 
---
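
Note: the include/linux/pgtable.h hunk from the diffstat below is not
reproduced in this excerpt. The generic pXX_leaf_size() fallbacks the
new walker relies on presumably look like the following sketch
(illustrative, not the literal hunk), with each architecture free to
override a level where a leaf entry can map something other than the
page-table-aligned size:

    /* sketch of the generic fallbacks; treat as illustrative */
    #ifndef pgd_leaf_size
    #define pgd_leaf_size(x)    (1ULL << PGDIR_SHIFT)
    #endif
    #ifndef p4d_leaf_size
    #define p4d_leaf_size(x)    P4D_SIZE
    #endif
    #ifndef pud_leaf_size
    #define pud_leaf_size(x)    PUD_SIZE
    #endif
    #ifndef pmd_leaf_size
    #define pmd_leaf_size(x)    PMD_SIZE
    #endif
    #ifndef pte_leaf_size
    #define pte_leaf_size(x)    PAGE_SIZE
    #endif

Because the size is now derived from the entry itself rather than from a
struct page, the old pte_page()/PageHuge() probing (and with it the
missing pte_special() check) disappears entirely.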
 arch/arm64/include/asm/pgtable.h    |    3 +
 arch/sparc/include/asm/pgtable_64.h |   13 
 arch/sparc/mm/hugetlbpage.c         |   19 --
 include/linux/pgtable.h             |   16 +
 kernel/events/core.c                |  102 +---
 5 files changed, 82 insertions(+), 71 deletions(-)

--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7001,90 +7001,62 @@ static u64 perf_virt_to_phys(u64 virt)
return phys_addr;
 }
 
-#ifdef CONFIG_MMU
-
 /*
- * Return the MMU page size of a given virtual address.
- *
- * This generic implementation handles page-table aligned huge pages, as well
- * as non-page-table aligned hugetlbfs compound pages.
- *
- * If an architecture supports and uses non-page-table aligned pages in their
- * kernel mapping it will need to provide it's own implementation of this
- * function.
+ * Return the MMU/TLB page size of a given virtual address.
  */
-__weak u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr)
+static u64 perf_get_tlb_page_size(struct mm_struct *mm, unsigned long addr)
 {
-   struct page *page;
-   pgd_t *pgd;
-   p4d_t *p4d;
-   pud_t *pud;
-   pmd_t *pmd;
-   pte_t *pte;
+   u64 size = 0;
 
-   pgd = pgd_offset(mm, addr);
-   if (pgd_none(*pgd))
-   return 0;
+#ifdef CONFIG_HAVE_FAST_GUP
+   pgd_t *pgdp, pgd;
+   p4d_t *p4dp, p4d;
+   pud_t *pudp, pud;
+   pmd_t *pmdp, pmd;
+   pte_t *ptep, pte;
 
-   p4d = p4d_offset(pgd, addr);
-   if (!p4d_present(*p4d))
+   pgdp = pgd_offset(mm, addr);
+   pgd = READ_ONCE(*pgdp);
+   if (pgd_none(pgd))
return 0;
 
-   if (p4d_leaf(*p4d))
-   return 1ULL << P4D_SHIFT;
+   if (pgd_leaf(pgd))
+   return pgd_leaf_size(pgd);
 
-   pud = pud_offset(p4d, addr);
-   if (!pud_present(*pud))
+   p4dp = p4d_offset_lockless(pgdp, pgd, addr);
+   p4d = READ_ONCE(*p4dp);
+   if (!p4d_present(p4d))
return 0;
 
-   if (pud_leaf(*pud)) {
-#ifdef pud_page
-   page = pud_page(*pud);
-   if (PageHuge(page))
-   return page_size(compound_head(page));
-#endif
-   return 1ULL << PUD_SHIFT;
-   }
+   if (p4d_leaf(p4d))
+   return p4d_leaf_size(p4d);
 
-   pmd = pmd_offset(pud, addr);
-   if (!pmd_present(*pmd))
+   pudp = pud_offset_lockless(p4dp, p4d, addr);
+   pud = READ_ONCE(*pudp);
+   if (!pud_present(pud))
return 0;
 
-   if (pmd_leaf(*pmd)) {
-#ifdef pmd_page
-   page = pmd_page(*pmd);
-   if (PageHuge(page))
-   return page_size(compound_head(page));
-#endif
-   return 1ULL << PMD_SHIFT;
-   }
+   if (pud_leaf(pud))
+   return pud_leaf_size(pud);
 
-   pte = pte_offset_map(pmd, addr);
-   if (!pte_present(*pte)) {
-   pte_unmap(pte);
+   pmdp = pmd_offset_lockless(pudp, pud, addr);
+   pmd = READ_ONCE(*pmdp);
+   if (!pmd_present(pmd))
return 0;
-   }
 
-   page = pte_page(*pte);
-   if (PageHuge(page)) {
-   u64 size = page_size(compound_head(page));
-   pte_unmap(pte);
-   return size;
-   }
-
-   pte_unmap(pte);
-   return PAGE_SIZE;
-}
+   if (pmd_leaf(pmd))
+   return pmd_leaf_size(pmd);
 
-#else
+   ptep = pte_offset_map(&pmd, addr);
+   pte = ptep_get_lockless(ptep);
+   if (pte_present(pte))
+   size = pte_leaf_size(pte);
+   pte_unmap(ptep);
+#endif /* CONFIG_HAVE_FAST_GUP */
 
-static u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr)
-{
-   return 0;
+   return size;
 }
 
-#endif
-
 static u64 perf_get_page_size(unsigned long addr)
 {
struct mm_struct *mm;
@@ -7109,7 +7081,7 @@ static u64 perf_get_page_size(unsigned l
	mm = &init_mm;
}
 
-   size = arch_perf_get_page_size(mm, addr);
+   size = perf_get_tlb_page_size(mm, addr);
 
local_irq_restore(flags);
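
The arch/arm64 and arch/sparc hunks from the diffstat are likewise not
shown above. For arm64 (3 lines changed), the override presumably
reports contiguous-bit mappings at their real TLB size, roughly:

    /* sketch of the arm64 override; illustrative only */
    #define pmd_leaf_size(pmd)  (pmd_cont(pmd) ? CONT_PMD_SIZE : PMD_SIZE)
    #define pte_leaf_size(pte)  (pte_cont(pte) ? CONT_PTE_SIZE : PAGE_SIZE)

This is the case the page-table-aligned fallbacks cannot express: a
single PTE or PMD entry whose TLB footprint is larger than its own
level's mapping size.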