Currently, perf can report both virtual and physical addresses, but not
the size of the page backing the sampled address. Without the page size
of the accessed memory, users cannot decide whether to promote or demote
large pages to optimize memory usage.

Add a new sample type, PERF_SAMPLE_DATA_PAGE_SIZE, to report the data
page size.
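
For example, a user-space tool could request the new field roughly as
follows. This is only an illustrative sketch, not part of this patch;
the event choice, sample period and precise_ip setting are assumptions
and may need adjusting per platform:

    #include <linux/perf_event.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static int open_data_page_size_event(void)
    {
            struct perf_event_attr attr = {
                    .type           = PERF_TYPE_HARDWARE,
                    .config         = PERF_COUNT_HW_CPU_CYCLES,
                    .size           = sizeof(attr),
                    .sample_period  = 100003,
                    /* sample the data address and the size of its backing page */
                    .sample_type    = PERF_SAMPLE_IP | PERF_SAMPLE_ADDR |
                                      PERF_SAMPLE_DATA_PAGE_SIZE,
                    /* data addresses typically require a precise event */
                    .precise_ip     = 2,
            };

            /* monitor the calling task on any CPU */
            return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
    }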

Perf already has a facility to collect data virtual addresses. A page
walker is required to walk the page tables and calculate the page size
for a given virtual address.

On some platforms, e.g., x86, the page walker is invoked in an NMI
handler, so it must be IRQ-safe and low overhead. It also has to work
for both user and kernel virtual addresses. The existing generic walkers
do not fit: walk_page_range_novma() is fairly complex and is not
guaranteed to be IRQ-safe, and follow_page() only handles user virtual
addresses.

Add a new function, perf_get_page_size(), to walk the page tables and
calculate the page size; a brief sketch of the size decision follows the
list below. In the function:
- Interrupts have to be disabled to prevent any teardown of the page
  tables.
- The size of a normal page is from the pre-defined page size macros.
- The size of a compound page is retrieved from the helper function,
  page_size().
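
In other words, the size decision at a leaf entry boils down to the
following (illustrative sketch only; the actual walker is
__perf_get_page_size() in the diff below, and leaf_page_size() is a
made-up helper name used only for this example):

    /*
     * Given the page mapped by a leaf entry and the size of the paging
     * level it was found at, report the page size.
     */
    static u64 leaf_page_size(struct page *page, u64 level_size)
    {
            /* Compound (hugetlb/THP) pages know their own size ... */
            if (PageCompound(page))
                    return page_size(compound_head(page));

            /*
             * ... otherwise fall back to the size of the paging level,
             * e.g. PMD_SIZE or PUD_SIZE.
             */
            return level_size;
    }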

Suggested-by: Peter Zijlstra <[email protected]>
Signed-off-by: Kan Liang <[email protected]>
---
 include/linux/perf_event.h      |   1 +
 include/uapi/linux/perf_event.h |   4 +-
 kernel/events/core.c            | 121 ++++++++++++++++++++++++++++++++
 3 files changed, 125 insertions(+), 1 deletion(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 3737e653f47e..5de95f36d7a8 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1035,6 +1035,7 @@ struct perf_sample_data {
 
        u64                             phys_addr;
        u64                             cgroup;
+       u64                             data_page_size;
 } ____cacheline_aligned;
 
 /* default value for data source */
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 52ca2093831c..32484accc7a3 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -143,8 +143,9 @@ enum perf_event_sample_format {
        PERF_SAMPLE_PHYS_ADDR                   = 1U << 19,
        PERF_SAMPLE_AUX                         = 1U << 20,
        PERF_SAMPLE_CGROUP                      = 1U << 21,
+       PERF_SAMPLE_DATA_PAGE_SIZE              = 1U << 22,
 
-       PERF_SAMPLE_MAX = 1U << 22,             /* non-ABI */
+       PERF_SAMPLE_MAX = 1U << 23,             /* non-ABI */
 
        __PERF_SAMPLE_CALLCHAIN_EARLY           = 1ULL << 63, /* non-ABI; internal use */
 };
@@ -879,6 +880,7 @@ enum perf_event_type {
         *      { u64                   phys_addr;} && PERF_SAMPLE_PHYS_ADDR
         *      { u64                   size;
         *        char                  data[size]; } && PERF_SAMPLE_AUX
+        *      { u64                   data_page_size;} && PERF_SAMPLE_DATA_PAGE_SIZE
         * };
         */
        PERF_RECORD_SAMPLE                      = 9,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0fbf17ff7cc5..00becacfd15e 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -51,6 +51,7 @@
 #include <linux/proc_ns.h>
 #include <linux/mount.h>
 #include <linux/min_heap.h>
+#include <linux/highmem.h>
 
 #include "internal.h"
 
@@ -1895,6 +1896,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
        if (sample_type & PERF_SAMPLE_CGROUP)
                size += sizeof(data->cgroup);
 
+       if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
+               size += sizeof(data->data_page_size);
+
        event->header_size = size;
 }
 
@@ -6939,6 +6943,9 @@ void perf_output_sample(struct perf_output_handle *handle,
        if (sample_type & PERF_SAMPLE_CGROUP)
                perf_output_put(handle, data->cgroup);
 
+       if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
+               perf_output_put(handle, data->data_page_size);
+
        if (sample_type & PERF_SAMPLE_AUX) {
                perf_output_put(handle, data->aux_size);
 
@@ -6996,6 +7003,117 @@ static u64 perf_virt_to_phys(u64 virt)
        return phys_addr;
 }
 
+#ifdef CONFIG_MMU
+
+static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr)
+{
+       struct page *page;
+       pgd_t *pgd;
+       p4d_t *p4d;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+
+       pgd = pgd_offset(mm, addr);
+       if (pgd_none(*pgd))
+               return 0;
+
+       p4d = p4d_offset(pgd, addr);
+       if (!p4d_present(*p4d))
+               return 0;
+
+#if (defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE))
+       if (p4d_leaf(*p4d)) {
+               page = p4d_page(*p4d);
+
+               if (PageCompound(page))
+                       return page_size(compound_head(page));
+
+               return P4D_SIZE;
+       }
+#endif
+
+       pud = pud_offset(p4d, addr);
+       if (!pud_present(*pud))
+               return 0;
+
+#if (defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE))
+       if (pud_leaf(*pud)) {
+               page = pud_page(*pud);
+
+               if (PageCompound(page))
+                       return page_size(compound_head(page));
+
+               return PUD_SIZE;
+       }
+#endif
+
+       pmd = pmd_offset(pud, addr);
+       if (!pmd_present(*pmd))
+               return 0;
+
+#if (defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE))
+       if (pmd_leaf(*pmd)) {
+               page = pmd_page(*pmd);
+
+               if (PageCompound(page))
+                       return page_size(compound_head(page));
+
+               return PMD_SIZE;
+       }
+#endif
+
+       pte = pte_offset_map(pmd, addr);
+       if (!pte_present(*pte)) {
+               pte_unmap(pte);
+               return 0;
+       }
+
+       pte_unmap(pte);
+       return PAGE_SIZE;
+}
+
+#else
+
+static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr)
+{
+       return 0;
+}
+
+#endif
+
+/* Return the page size of a given virtual address. */
+static u64 perf_get_page_size(unsigned long addr)
+{
+       struct mm_struct *mm;
+       unsigned long flags;
+       u64 size;
+
+       if (!addr)
+               return 0;
+
+       /*
+        * Software page-table walkers must disable IRQs,
+        * which prevents any tear down of the page tables.
+        */
+       local_irq_save(flags);
+
+       mm = current->mm;
+       if (!mm) {
+               /*
+                * For kernel threads and the like, use init_mm so that
+                * we can find kernel memory.
+                */
+               mm = &init_mm;
+       }
+
+       size = __perf_get_page_size(mm, addr);
+
+       local_irq_restore(flags);
+
+       return size;
+}
+
 static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
 
 struct perf_callchain_entry *
@@ -7151,6 +7269,9 @@ void perf_prepare_sample(struct perf_event_header *header,
        }
 #endif
 
+       if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
+               data->data_page_size = perf_get_page_size(data->addr);
+
        if (sample_type & PERF_SAMPLE_AUX) {
                u64 size;
 
-- 
2.17.1
