[PATCH v5 18/18] mm: Remove CONFIG_ARCH_HAS_HUGEPD

2024-06-10 Thread Christophe Leroy
powerpc was the only user of CONFIG_ARCH_HAS_HUGEPD and doesn't
use it anymore, so remove all related code.
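
For context: without CONFIG_ARCH_HAS_HUGEPD, generic code saw the stub
is_hugepd() that always evaluates to 0 (visible in the hunk below), so
every hugepd branch outside powerpc was already dead code. A simplified
sketch of the now-removed call pattern in gup-fast:

	if (unlikely(is_hugepd(__hugepd(pgd_val(pgd)))))	/* constant 0 without ARCH_HAS_HUGEPD */
		return gup_hugepd(NULL, __hugepd(pgd_val(pgd)), addr,
				  PGDIR_SHIFT, next, flags, pages, nr);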

Signed-off-by: Christophe Leroy 
Acked-by: Oscar Salvador 
---
v4: Rebased on v6.10-rc1
---
 include/linux/hugetlb.h |   6 --
 mm/Kconfig  |  10 ---
 mm/gup.c| 183 ++--
 mm/pagewalk.c   |  57 +
 4 files changed, 9 insertions(+), 247 deletions(-)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 2b3c3a404769..58daf7d14bf4 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -20,12 +20,6 @@ struct user_struct;
 struct mmu_gather;
 struct node;
 
-#ifndef CONFIG_ARCH_HAS_HUGEPD
-typedef struct { unsigned long pd; } hugepd_t;
-#define is_hugepd(hugepd) (0)
-#define __hugepd(x) ((hugepd_t) { (x) })
-#endif
-
 void free_huge_folio(struct folio *folio);
 
 #ifdef CONFIG_HUGETLB_PAGE
diff --git a/mm/Kconfig b/mm/Kconfig
index b4cb45255a54..049d29ec6e20 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1119,16 +1119,6 @@ config DMAPOOL_TEST
 config ARCH_HAS_PTE_SPECIAL
bool
 
-#
-# Some architectures require a special hugepage directory format that is
-# required to support multiple hugepage sizes. For example a4fe3ce76
-# "powerpc/mm: Allow more flexible layouts for hugepage pagetables"
-# introduced it on powerpc.  This allows for a more flexible hugepage
-# pagetable layouts.
-#
-config ARCH_HAS_HUGEPD
-   bool
-
 config MAPPING_DIRTY_HELPERS
 bool
 
diff --git a/mm/gup.c b/mm/gup.c
index 43491246f39d..f8e982a42bba 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -501,7 +501,7 @@ static inline void mm_set_has_pinned_flag(unsigned long *mm_flags)
 
 #ifdef CONFIG_MMU
 
-#if defined(CONFIG_ARCH_HAS_HUGEPD) || defined(CONFIG_HAVE_GUP_FAST)
+#ifdef CONFIG_HAVE_GUP_FAST
 static int record_subpages(struct page *page, unsigned long sz,
   unsigned long addr, unsigned long end,
   struct page **pages)
@@ -515,147 +515,7 @@ static int record_subpages(struct page *page, unsigned long sz,
 
return nr;
 }
-#endif /* CONFIG_ARCH_HAS_HUGEPD || CONFIG_HAVE_GUP_FAST */
-
-#ifdef CONFIG_ARCH_HAS_HUGEPD
-static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
- unsigned long sz)
-{
-   unsigned long __boundary = (addr + sz) & ~(sz-1);
-   return (__boundary - 1 < end - 1) ? __boundary : end;
-}
-
-/*
- * Returns 1 if succeeded, 0 if failed, -EMLINK if unshare needed.
- *
- * NOTE: for the same entry, gup-fast and gup-slow can return different
- * results (0 v.s. -EMLINK) depending on whether vma is available.  This is
- * the expected behavior, where we simply want gup-fast to fallback to
- * gup-slow to take the vma reference first.
- */
-static int gup_hugepte(struct vm_area_struct *vma, pte_t *ptep, unsigned long sz,
-		       unsigned long addr, unsigned long end, unsigned int flags,
-		       struct page **pages, int *nr)
-{
-   unsigned long pte_end;
-   struct page *page;
-   struct folio *folio;
-   pte_t pte;
-   int refs;
-
-   pte_end = (addr + sz) & ~(sz-1);
-   if (pte_end < end)
-   end = pte_end;
-
-   pte = huge_ptep_get(vma->vm_mm, addr, ptep);
-
-   if (!pte_access_permitted(pte, flags & FOLL_WRITE))
-   return 0;
-
-   /* hugepages are never "special" */
-   VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
-
-   page = pte_page(pte);
-   refs = record_subpages(page, sz, addr, end, pages + *nr);
-
-   folio = try_grab_folio(page, refs, flags);
-   if (!folio)
-   return 0;
-
-   if (unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) {
-   gup_put_folio(folio, refs, flags);
-   return 0;
-   }
-
-   if (!pte_write(pte) && gup_must_unshare(vma, flags, &folio->page)) {
-   gup_put_folio(folio, refs, flags);
-   return -EMLINK;
-   }
-
-   *nr += refs;
-   folio_set_referenced(folio);
-   return 1;
-}
-
-/*
- * NOTE: currently GUP for a hugepd is only possible on hugetlbfs file
- * systems on Power, which does not have issue with folio writeback against
- * GUP updates.  When hugepd will be extended to support non-hugetlbfs or
- * even anonymous memory, we need to do extra check as what we do with most
- * of the other folios. See writable_file_mapping_allowed() and
- * gup_fast_folio_allowed() for more information.
- */
-static int gup_hugepd(struct vm_area_struct *vma, hugepd_t hugepd,
- unsigned long addr, unsigned int pdshift,
- unsigned long end, unsigned int flags,
- struct page **pages, int *nr)
-{
-   pte_t *ptep;
-   unsigned long sz = 1UL << hugepd_shift(hugepd);
-   unsigned long next;
-   int ret;
-
-   ptep = hugepte_offset

[PATCH v5 17/18] powerpc/mm: Remove hugepd leftovers

2024-06-10 Thread Christophe Leroy
All targets have now opted out of CONFIG_ARCH_HAS_HUGEPD so
remove the leftover code.

Signed-off-by: Christophe Leroy 
Acked-by: Oscar Salvador 
---
v5: Fix a forgotten #endif which ended up in following patch
---
 arch/powerpc/include/asm/hugetlb.h  |   7 -
 arch/powerpc/include/asm/page.h |   6 -
 arch/powerpc/include/asm/pgtable-be-types.h |  10 -
 arch/powerpc/include/asm/pgtable-types.h|   9 -
 arch/powerpc/mm/hugetlbpage.c   | 413 
 arch/powerpc/mm/init-common.c   |   8 +-
 arch/powerpc/mm/pgtable.c   |  27 +-
 7 files changed, 3 insertions(+), 477 deletions(-)

diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h
index e959c26c0b52..18a3028ac3b6 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -30,13 +30,6 @@ static inline int is_hugepage_only_range(struct mm_struct *mm,
 }
 #define is_hugepage_only_range is_hugepage_only_range
 
-#ifdef CONFIG_ARCH_HAS_HUGEPD
-#define __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE
-void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
-   unsigned long end, unsigned long floor,
-   unsigned long ceiling);
-#endif
-
 #define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT
 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
 pte_t pte, unsigned long sz);
diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
index c0af246a64ff..83d0a4fc5f75 100644
--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -269,12 +269,6 @@ static inline const void *pfn_to_kaddr(unsigned long pfn)
 #define is_kernel_addr(x)  ((x) >= TASK_SIZE)
 #endif
 
-/*
- * Some number of bits at the level of the page table that points to
- * a hugepte are used to encode the size.  This masks those bits.
- */
-#define HUGEPD_SHIFT_MASK 0x3f
-
 #ifndef __ASSEMBLY__
 
 #ifdef CONFIG_PPC_BOOK3S_64
diff --git a/arch/powerpc/include/asm/pgtable-be-types.h b/arch/powerpc/include/asm/pgtable-be-types.h
index 82633200b500..6bd8f89b25dc 100644
--- a/arch/powerpc/include/asm/pgtable-be-types.h
+++ b/arch/powerpc/include/asm/pgtable-be-types.h
@@ -101,14 +101,4 @@ static inline bool pmd_xchg(pmd_t *pmdp, pmd_t old, pmd_t new)
return pmd_raw(old) == prev;
 }
 
-#ifdef CONFIG_ARCH_HAS_HUGEPD
-typedef struct { __be64 pdbe; } hugepd_t;
-#define __hugepd(x) ((hugepd_t) { cpu_to_be64(x) })
-
-static inline unsigned long hpd_val(hugepd_t x)
-{
-   return be64_to_cpu(x.pdbe);
-}
-#endif
-
 #endif /* _ASM_POWERPC_PGTABLE_BE_TYPES_H */
diff --git a/arch/powerpc/include/asm/pgtable-types.h b/arch/powerpc/include/asm/pgtable-types.h
index db965d98e0ae..7b3d4c592a10 100644
--- a/arch/powerpc/include/asm/pgtable-types.h
+++ b/arch/powerpc/include/asm/pgtable-types.h
@@ -87,13 +87,4 @@ static inline bool pte_xchg(pte_t *ptep, pte_t old, pte_t new)
 }
 #endif
 
-#ifdef CONFIG_ARCH_HAS_HUGEPD
-typedef struct { unsigned long pd; } hugepd_t;
-#define __hugepd(x) ((hugepd_t) { (x) })
-static inline unsigned long hpd_val(hugepd_t x)
-{
-   return x.pd;
-}
-#endif
-
 #endif /* _ASM_POWERPC_PGTABLE_TYPES_H */
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 1fe2843f5b12..6b043180220a 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -28,8 +28,6 @@
 
 bool hugetlb_disabled = false;
 
-#define hugepd_none(hpd)   (hpd_val(hpd) == 0)
-
 #define PTE_T_ORDER(__builtin_ffs(sizeof(pte_basic_t)) - \
 __builtin_ffs(sizeof(void *)))
 
@@ -42,156 +40,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz)
return __find_linux_pte(mm->pgd, addr, NULL, NULL);
 }
 
-#ifdef CONFIG_ARCH_HAS_HUGEPD
-static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
-  unsigned long address, unsigned int pdshift,
-  unsigned int pshift, spinlock_t *ptl)
-{
-   struct kmem_cache *cachep;
-   pte_t *new;
-   int i;
-   int num_hugepd;
-
-   if (pshift >= pdshift) {
-   cachep = PGT_CACHE(PTE_T_ORDER);
-   num_hugepd = 1 << (pshift - pdshift);
-   } else {
-   cachep = PGT_CACHE(pdshift - pshift);
-   num_hugepd = 1;
-   }
-
-   if (!cachep) {
-   WARN_ONCE(1, "No page table cache created for hugetlb tables");
-   return -ENOMEM;
-   }
-
-   new = kmem_cache_alloc(cachep, pgtable_gfp_flags(mm, GFP_KERNEL));
-
-   BUG_ON(pshift > HUGEPD_SHIFT_MASK);
-   BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);
-
-   if (!new)
-   return -ENOMEM;
-
-   /*
-* Make sure other cpus find the hugepd set only after a
-* properly initialized page table is visible to them.
-* For more details

[PATCH v5 16/18] powerpc/64s: Use contiguous PMD/PUD instead of HUGEPD

2024-06-10 Thread Christophe Leroy
On book3s/64, the only user of hugepd is hash in 4k mode.

All other setups (hash-64, radix-4, radix-64) use leaf PMD/PUD.

Rework hash-4k to use contiguous PMD and PUD instead.

In that setup there are only two huge page sizes: 16M and 16G.

16M sits at PMD level and 16G at PUD level.

pte_update() doesn't know the page size, so let's use the same trick as
hpte_need_flush() to get the page size from the segment properties. That's
not the most efficient way, but let's do that until callers of
pte_update() provide the page size instead of just a huge flag.
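
The hunk below is cut short by this archive; a condensed sketch of the
resulting update path, assuming get_slice_psize() returns the page size
index for the address (the 16G/PUD case is analogous):

	old = hash__pte_update_one(ptep, clr, set);

	if (IS_ENABLED(CONFIG_PPC_4K_PAGES) && huge) {
		unsigned int psize = get_slice_psize(mm, addr);
		int nb = (psize == MMU_PAGE_16M) ? SZ_16M / PMD_SIZE : 1;
		int i;

		/* propagate the update to the other contiguous entries */
		for (i = 1; i < nb; i++)
			hash__pte_update_one(ptep + i, clr, set);
	}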

Signed-off-by: Christophe Leroy 
---
v3:
- Add missing pmd_leaf_size() and pud_leaf_size()
- More cleanup in hugetlbpage_init()
- Take a page fault when DIRTY or ACCESSED is missing on hash-4k hugepage

v4: Rebased on v6.10-rc1
---
 arch/powerpc/include/asm/book3s/64/hash-4k.h  | 15 --
 arch/powerpc/include/asm/book3s/64/hash.h | 38 ---
 arch/powerpc/include/asm/book3s/64/hugetlb.h  | 38 ---
 .../include/asm/book3s/64/pgtable-4k.h| 47 ---
 .../include/asm/book3s/64/pgtable-64k.h   | 20 
 arch/powerpc/include/asm/book3s/64/pgtable.h  | 22 +++--
 arch/powerpc/include/asm/hugetlb.h|  4 ++
 .../powerpc/include/asm/nohash/hugetlb-e500.h |  4 --
 arch/powerpc/include/asm/page.h   |  8 
 arch/powerpc/mm/book3s64/hash_utils.c | 11 +++--
 arch/powerpc/mm/book3s64/hugetlbpage.c| 10 
 arch/powerpc/mm/book3s64/pgtable.c| 12 -
 arch/powerpc/mm/hugetlbpage.c | 26 --
 arch/powerpc/mm/pgtable.c |  2 +-
 arch/powerpc/platforms/Kconfig.cputype|  1 -
 15 files changed, 72 insertions(+), 186 deletions(-)
 delete mode 100644 arch/powerpc/include/asm/book3s/64/pgtable-4k.h

diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h
index 6472b08fa1b0..c654c376ef8b 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
@@ -74,21 +74,6 @@
 #define remap_4k_pfn(vma, addr, pfn, prot) \
remap_pfn_range((vma), (addr), (pfn), PAGE_SIZE, (prot))
 
-#ifdef CONFIG_HUGETLB_PAGE
-static inline int hash__hugepd_ok(hugepd_t hpd)
-{
-   unsigned long hpdval = hpd_val(hpd);
-   /*
-* if it is not a pte and have hugepd shift mask
-* set, then it is a hugepd directory pointer
-*/
-   if (!(hpdval & _PAGE_PTE) && (hpdval & _PAGE_PRESENT) &&
-   ((hpdval & HUGEPD_SHIFT_MASK) != 0))
-   return true;
-   return false;
-}
-#endif
-
 /*
  * 4K PTE format is different from 64K PTE format. Saving the hash_slot is just
  * a matter of returning the PTE bits that need to be modified. On 64K PTE,
diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h
index faf3e3b4e4b2..8202c27afe23 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -4,6 +4,7 @@
 #ifdef __KERNEL__
 
 #include 
+#include 
 
 /*
  * Common bits between 4K and 64K pages in a linux-style PTE.
@@ -161,14 +162,10 @@ extern void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, unsigned long pte, int huge);
 unsigned long htab_convert_pte_flags(unsigned long pteflags, unsigned long flags);
 /* Atomic PTE updates */
-static inline unsigned long hash__pte_update(struct mm_struct *mm,
-unsigned long addr,
-pte_t *ptep, unsigned long clr,
-unsigned long set,
-int huge)
+static inline unsigned long hash__pte_update_one(pte_t *ptep, unsigned long clr,
+						 unsigned long set)
 {
__be64 old_be, tmp_be;
-   unsigned long old;
 
__asm__ __volatile__(
"1: ldarx   %0,0,%3 # pte_update\n\
@@ -182,11 +179,38 @@ static inline unsigned long hash__pte_update(struct mm_struct *mm,
: "r" (ptep), "r" (cpu_to_be64(clr)), "m" (*ptep),
  "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set))
: "cc" );
+
+   return be64_to_cpu(old_be);
+}
+
+static inline unsigned long hash__pte_update(struct mm_struct *mm,
+unsigned long addr,
+pte_t *ptep, unsigned long clr,
+unsigned long set,
+int huge)
+{
+   unsigned long old;
+
+   old = hash__pte_update_one(ptep, clr, set);
+
+   if (IS_ENABLED(CONFIG_PPC_4K_PAGES) && huge) {
+   unsigned int psize = get_slice_psize(mm, addr);
+   int nb, i;
+
+   if (psize == MMU_PAG

[PATCH v5 15/18] powerpc/e500: Use contiguous PMD instead of hugepd

2024-06-10 Thread Christophe Leroy
e500 supports many page sizes, among which the following sizes are
currently implemented in the kernel: 4M, 16M, 64M, 256M, 1G.

On e500, TLB miss for hugepages is exclusively handled by SW even
on e6500 which has HW assistance for 4k pages, so there are no
constraints like on the 8xx.

On e500/32, all are at PGD/PMD level and can be handled as
cont-PMD.

On e500/64, smaller ones are on PMD while bigger ones are on PUD.
Again, they can easily be handled as cont-PMD and cont-PUD instead
of hugepd.

On e500/32, use the pagesize bits in PTE to know if it is a PMD or
a leaf entry. This works because the pagesize bits are in the last
12 bits and page tables are 4k aligned.

On e500/64, use the highest bit, which is always 1 on a PxD (because
a PxD contains the virtual address of kernel memory) and always 0 on
PTEs because not all bits of the RPN are used/possible.
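
A minimal sketch of the resulting leaf test (the helper's shape is an
assumption, not necessarily the exact code of this patch):

	static inline bool pmd_leaf(pmd_t pmd)
	{
		if (IS_ENABLED(CONFIG_PPC64))
			/* table pointers are kernel addresses: top bit set */
			return (long)pmd_val(pmd) > 0;
		/* e500/32: page tables are 4k aligned, so a table pointer
		 * never has page size bits set in its low 12 bits */
		return pmd_val(pmd) & _PAGE_PSIZE_MSK;
	}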

Signed-off-by: Christophe Leroy 
---
v3: Add missing pmd_leaf_size() and pud_leaf_size()
v4: Rebased of v6.10-rc1 : pmd_huge() and pud_huge() are gone
v5:
- Define pte_huge_size() to reduce redundancy and handle the case of mpc85xx with 32 bits PTE.
- Replace hard-coded page shift offset (10) with new macro _PAGE_PSIZE_SHIFT_OFFSET
- Use TSIZE encoding in SIZE field in PTE
- On 64 bits, use highest bit to know if it is a PUD/PMD or leaf entry. Avoids problem with page table alignment.
---
 .../powerpc/include/asm/nohash/hugetlb-e500.h | 32 ++
 arch/powerpc/include/asm/nohash/pgalloc.h |  2 -
 arch/powerpc/include/asm/nohash/pgtable.h | 42 ++-
 arch/powerpc/include/asm/nohash/pte-e500.h| 33 +++
 arch/powerpc/include/asm/page.h   | 15 +--
 arch/powerpc/kernel/head_85xx.S   | 21 --
 arch/powerpc/mm/hugetlbpage.c |  2 -
 arch/powerpc/mm/nohash/tlb_low_64e.S  |  7 +---
 arch/powerpc/mm/pgtable.c | 31 ++
 arch/powerpc/platforms/Kconfig.cputype|  1 -
 10 files changed, 107 insertions(+), 79 deletions(-)

diff --git a/arch/powerpc/include/asm/nohash/hugetlb-e500.h b/arch/powerpc/include/asm/nohash/hugetlb-e500.h
index c6a0938e86fd..812c71fc5eb1 100644
--- a/arch/powerpc/include/asm/nohash/hugetlb-e500.h
+++ b/arch/powerpc/include/asm/nohash/hugetlb-e500.h
@@ -2,38 +2,12 @@
 #ifndef _ASM_POWERPC_NOHASH_HUGETLB_E500_H
 #define _ASM_POWERPC_NOHASH_HUGETLB_E500_H
 
-static inline pte_t *hugepd_page(hugepd_t hpd)
-{
-   if (WARN_ON(!hugepd_ok(hpd)))
-   return NULL;
-
-   return (pte_t *)((hpd_val(hpd) & ~HUGEPD_SHIFT_MASK) | PD_HUGE);
-}
-
-static inline unsigned int hugepd_shift(hugepd_t hpd)
-{
-   return hpd_val(hpd) & HUGEPD_SHIFT_MASK;
-}
-
-static inline pte_t *hugepte_offset(hugepd_t hpd, unsigned long addr,
-   unsigned int pdshift)
-{
-   /*
-* On FSL BookE, we have multiple higher-level table entries that
-* point to the same hugepte.  Just use the first one since they're all
-* identical.  So for that case, idx=0.
-*/
-   return hugepd_page(hpd);
-}
+#define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT
+void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
+pte_t pte, unsigned long sz);
 
 void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr);
 
-static inline void hugepd_populate(hugepd_t *hpdp, pte_t *new, unsigned int pshift)
-{
-   /* We use the old format for PPC_E500 */
-   *hpdp = __hugepd(((unsigned long)new & ~PD_HUGE) | pshift);
-}
-
 static inline int check_and_get_huge_psize(int shift)
 {
if (shift & 1)  /* Not a power of 4 */
diff --git a/arch/powerpc/include/asm/nohash/pgalloc.h b/arch/powerpc/include/asm/nohash/pgalloc.h
index 4b62376318e1..d06efac6d7aa 100644
--- a/arch/powerpc/include/asm/nohash/pgalloc.h
+++ b/arch/powerpc/include/asm/nohash/pgalloc.h
@@ -44,8 +44,6 @@ static inline void pgtable_free(void *table, int shift)
}
 }
 
-#define get_hugepd_cache_index(x)  (x)
-
 static inline void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
 {
unsigned long pgf = (unsigned long)table;
diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h
index 90d6a0943b35..8d1f0b7062eb 100644
--- a/arch/powerpc/include/asm/nohash/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/pgtable.h
@@ -31,6 +31,13 @@ static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, p
 
 extern int icache_44x_need_flush;
 
+#ifndef pte_huge_size
+static inline unsigned long pte_huge_size(pte_t pte)
+{
+   return PAGE_SIZE;
+}
+#endif
+
 /*
  * PTE updates. This function is called whenever an existing
  * valid PTE is updated. This does -not- include set_pte_at()
@@ -52,11 +59,34 @@ static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, p
 {
pte_basic_t old = pte_val(*p);
	pte_basic_t new = (old & ~(pte_basic_t)clr) | set;

[PATCH v5 14/18] powerpc/e500: Free r10 for FIND_PTE

2024-06-10 Thread Christophe Leroy
Move r13 load after the call to FIND_PTE, and use r13 instead of
r10 for storing fault address. This will allow using r10 freely
in FIND_PTE in following patch to handle hugepage size.

Signed-off-by: Christophe Leroy 
---
v5: New
---
 arch/powerpc/kernel/head_85xx.S | 30 --
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/kernel/head_85xx.S b/arch/powerpc/kernel/head_85xx.S
index 282e49c51deb..226f88e77d6d 100644
--- a/arch/powerpc/kernel/head_85xx.S
+++ b/arch/powerpc/kernel/head_85xx.S
@@ -294,9 +294,10 @@ set_ivor:
 /* Macros to hide the PTE size differences
  *
  * FIND_PTE -- walks the page tables given EA & pgdir pointer
- *   r10 -- EA of fault
+ *   r10 -- free
  *   r11 -- PGDIR pointer
  *   r12 -- free
+ *   r13 -- EA of fault
  *   label 2: is the bailout case
  *
  * if we find the pte (fall through):
@@ -307,7 +308,7 @@ set_ivor:
 #ifdef CONFIG_PTE_64BIT
 #ifdef CONFIG_HUGETLB_PAGE
 #define FIND_PTE   \
-   rlwinm  r12, r10, 14, 18, 28;   /* Compute pgdir/pmd offset */  \
+   rlwinm  r12, r13, 14, 18, 28;   /* Compute pgdir/pmd offset */  \
add r12, r11, r12;  \
lwz r11, 4(r12);/* Get pgd/pmd entry */ \
rlwinm. r12, r11, 0, 0, 20; /* Extract pt base address */   \
@@ -317,26 +318,26 @@ set_ivor:
andi.   r10, r11, HUGEPD_SHIFT_MASK@l; /* extract size field */ \
xor r12, r10, r11;  /* drop size bits from pointer */ \
b   1001f;  \
-1000:  rlwimi  r12, r10, 23, 20, 28;   /* Compute pte address */   \
+1000:  rlwimi  r12, r13, 23, 20, 28;   /* Compute pte address */   \
li  r10, 0; /* clear r10 */ \
 1001:  lwz r11, 4(r12);/* Get pte entry */
 #else
 #define FIND_PTE   \
-   rlwinm  r12, r10, 14, 18, 28;   /* Compute pgdir/pmd offset */  \
+   rlwinm  r12, r13, 14, 18, 28;   /* Compute pgdir/pmd offset */  \
add r12, r11, r12;  \
lwz r11, 4(r12);/* Get pgd/pmd entry */ \
rlwinm. r12, r11, 0, 0, 20; /* Extract pt base address */   \
beq 2f; /* Bail if no table */  \
-   rlwimi  r12, r10, 23, 20, 28;   /* Compute pte address */   \
+   rlwimi  r12, r13, 23, 20, 28;   /* Compute pte address */   \
lwz r11, 4(r12);/* Get pte entry */
 #endif /* HUGEPAGE */
 #else /* !PTE_64BIT */
 #define FIND_PTE   \
-   rlwimi  r11, r10, 12, 20, 29;   /* Create L1 (pgdir/pmd) address */ \
+   rlwimi  r11, r13, 12, 20, 29;   /* Create L1 (pgdir/pmd) address */ \
    lwz r11, 0(r11);/* Get L1 entry */ \
    rlwinm. r12, r11, 0, 0, 19; /* Extract L2 (pte) base address */ \
    beq 2f; /* Bail if no table */ \
-   rlwimi  r12, r10, 22, 20, 29;   /* Compute PTE address */ \
+   rlwimi  r12, r13, 22, 20, 29;   /* Compute PTE address */ \
    lwz r11, 0(r12);/* Get Linux PTE */
 #endif
 
@@ -443,13 +444,13 @@ START_BTB_FLUSH_SECTION
BTB_FLUSH(r10)
 1:
 END_BTB_FLUSH_SECTION
-   mfspr   r10, SPRN_DEAR  /* Get faulting address */
+   mfspr   r13, SPRN_DEAR  /* Get faulting address */
 
/* If we are faulting a kernel address, we have to use the
 * kernel page tables.
 */
lis r11, PAGE_OFFSET@h
-   cmplw   5, r10, r11
+   cmplw   5, r13, r11
blt 5, 3f
lis r11, swapper_pg_dir@h
ori r11, r11, swapper_pg_dir@l
@@ -472,14 +473,14 @@ END_BTB_FLUSH_SECTION
 #endif
 
 4:
+   FIND_PTE
+
 #ifdef CONFIG_PTE_64BIT
li  r13,_PAGE_PRESENT|_PAGE_BAP_SR
orisr13,r13,_PAGE_ACCESSED@h
 #else
li  r13,_PAGE_PRESENT|_PAGE_READ|_PAGE_ACCESSED
 #endif
-
-   FIND_PTE
andc.   r13,r13,r11 /* Check permission */
 
 #ifdef CONFIG_PTE_64BIT
@@ -536,13 +537,13 @@ START_BTB_FLUSH_SECTION
 1:
 END_BTB_FLUSH_SECTION
 
-   mfspr   r10, SPRN_SRR0  /* Get faulting address */
+   mfspr   r13, SPRN_SRR0  /* Get faulting address */
 
/* If we are faulting a kernel address, we have to use the
 * kernel page tables.
 */
lis r11, PAGE_OFFSET@h
-   cmplw   5, r10, r11
+   cmplw   5, r13, r11
blt 5, 3f
lis r11, swapper_pg_dir@h
ori r11, r11, swapper_pg_dir@l
@@ -551,6 +552,7 @@ END_BTB_FLUSH_SECTION
rlwinm  r12,r12,0,16,1
mtspr   SPRN_MAS1,r12
 
+   FIND_PTE
/* Make up the required permissions for kernel code */
 #ifdef CONFIG_PTE_64BIT
li  r13,_PAGE_PRESENT | _PAGE_BA

[PATCH v5 13/18] powerpc/e500: Don't pre-check write access on data TLB error

2024-06-10 Thread Christophe Leroy
Don't pre-check write access on read-only pages on data TLB error.

Load the TLB anyway and take a DSI exception when it happens. This
avoids reading SPRN_ESR at every data TLB error exception.

Signed-off-by: Christophe Leroy 
---
v5: New
---
 arch/powerpc/kernel/head_85xx.S | 15 ---
 1 file changed, 15 deletions(-)

diff --git a/arch/powerpc/kernel/head_85xx.S b/arch/powerpc/kernel/head_85xx.S
index a305244afc9f..282e49c51deb 100644
--- a/arch/powerpc/kernel/head_85xx.S
+++ b/arch/powerpc/kernel/head_85xx.S
@@ -472,27 +472,12 @@ END_BTB_FLUSH_SECTION
 #endif
 
 4:
-   /* Mask of required permission bits. Note that while we
-* do copy ESR:ST to _PAGE_WRITE position as trying to write
-* to an RO page is pretty common, we don't do it with
-* _PAGE_DIRTY. We could do it, but it's a fairly rare
-* event so I'd rather take the overhead when it happens
-* rather than adding an instruction here. We should measure
-* whether the whole thing is worth it in the first place
-* as we could avoid loading SPRN_ESR completely in the first
-* place...
-*
-* TODO: Is it worth doing that mfspr & rlwimi in the first
-*   place or can we save a couple of instructions here ?
-*/
-   mfspr   r12,SPRN_ESR
 #ifdef CONFIG_PTE_64BIT
li  r13,_PAGE_PRESENT|_PAGE_BAP_SR
orisr13,r13,_PAGE_ACCESSED@h
 #else
li  r13,_PAGE_PRESENT|_PAGE_READ|_PAGE_ACCESSED
 #endif
-   rlwimi  r13,r12,11,29,29
 
FIND_PTE
andc.   r13,r13,r11 /* Check permission */
-- 
2.44.0



[PATCH v5 12/18] powerpc/e500: Encode hugepage size in PTE bits

2024-06-10 Thread Christophe Leroy
Use PTE page size bits to encode hugepage size with the following
format corresponding to the values expected in bits 52-55 in MAS1
register. Those bits are called TSIZE:
0001  4 Kbyte
0010  16 Kbyte
0011  64 Kbyte
0100  256 Kbyte
0101  1 Mbyte
0110  4 Mbyte
0111  16 Mbyte
1000  64 Mbyte
1001  256 Mbyte
1010  1 Gbyte
1011  4 Gbyte
1100  16 Gbyte
1101  64 Gbyte
1110  256 Gbyte
1111  1 Tbyte

It corresponds to the shift value minus 10, with the lowest bit
removed (i.e. (shift - 10) >> 1).

It is not the value expected in the PTE in that field, but only
e6500 performs HW-based TLB loading, and the e6500 reference manual
explicitly says that this field is ignored.

Also add pte_huge_size() which will be used later.
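
A worked round trip with the definitions from the hunk below (16M page,
shift = 24); shifting by _PAGE_PSIZE_SHIFT = 7 instead of 8 before
masking is what drops the lowest bit of (shift - 10):

	tsize = 24 - _PAGE_PSIZE_SHIFT_OFFSET;	/* 14 = 0b1110 */
	val = (tsize << 7) & _PAGE_PSIZE_MSK;	/* 0x700: TSIZE field = 0b0111 */
	/* and back: pte_huge_size() gives 1UL << ((0x700 >> 7) + 10)
	 * = 1UL << 24 = SZ_16M */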

Signed-off-by: Christophe Leroy 
---
v5: Use PAGE SIZE field instead of U0-U3 because on some HW U2-U3 are used for something else.
---
 .../powerpc/include/asm/nohash/hugetlb-e500.h |  9 ++
 arch/powerpc/include/asm/nohash/pte-e500.h| 28 +--
 2 files changed, 22 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/include/asm/nohash/hugetlb-e500.h b/arch/powerpc/include/asm/nohash/hugetlb-e500.h
index 8f04ad20e040..c6a0938e86fd 100644
--- a/arch/powerpc/include/asm/nohash/hugetlb-e500.h
+++ b/arch/powerpc/include/asm/nohash/hugetlb-e500.h
@@ -42,4 +42,13 @@ static inline int check_and_get_huge_psize(int shift)
return shift_to_mmu_psize(shift);
 }
 
+static inline pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
+{
+   unsigned int tsize = shift - _PAGE_PSIZE_SHIFT_OFFSET;
+   pte_basic_t val = (tsize << _PAGE_PSIZE_SHIFT) & _PAGE_PSIZE_MSK;
+
+   return __pte((pte_val(entry) & ~(pte_basic_t)_PAGE_PSIZE_MSK) | val);
+}
+#define arch_make_huge_pte arch_make_huge_pte
+
 #endif /* _ASM_POWERPC_NOHASH_HUGETLB_E500_H */
diff --git a/arch/powerpc/include/asm/nohash/pte-e500.h b/arch/powerpc/include/asm/nohash/pte-e500.h
index 975facc7e38e..6dac1c0a6937 100644
--- a/arch/powerpc/include/asm/nohash/pte-e500.h
+++ b/arch/powerpc/include/asm/nohash/pte-e500.h
@@ -19,20 +19,7 @@
#define _PAGE_BAP_SX   0x000040
#define _PAGE_BAP_UX   0x000080
#define _PAGE_PSIZE_MSK        0x000f00
-#define _PAGE_PSIZE_4K 0x000200
-#define _PAGE_PSIZE_8K 0x000300
-#define _PAGE_PSIZE_16K        0x000400
-#define _PAGE_PSIZE_32K        0x000500
-#define _PAGE_PSIZE_64K        0x000600
-#define _PAGE_PSIZE_128K       0x000700
-#define _PAGE_PSIZE_256K       0x000800
-#define _PAGE_PSIZE_512K       0x000900
-#define _PAGE_PSIZE_1M 0x000a00
-#define _PAGE_PSIZE_2M 0x000b00
-#define _PAGE_PSIZE_4M 0x000c00
-#define _PAGE_PSIZE_8M 0x000d00
-#define _PAGE_PSIZE_16M        0x000e00
-#define _PAGE_PSIZE_32M        0x000f00
+#define _PAGE_TSIZE_4K 0x000100
 #define _PAGE_DIRTY    0x001000 /* C: page changed */
 #define _PAGE_SW0      0x002000
 #define _PAGE_U3       0x004000
@@ -46,6 +33,9 @@
 #define _PAGE_NO_CACHE 0x400000 /* I: cache inhibit */
 #define _PAGE_WRITETHRU        0x800000 /* W: cache write-through */
 
+#define _PAGE_PSIZE_SHIFT  7
+#define _PAGE_PSIZE_SHIFT_OFFSET   10
+
 /* "Higher level" linux bit combinations */
 #define _PAGE_EXEC     (_PAGE_BAP_SX | _PAGE_BAP_UX) /* .. and was cache cleaned */
 #define _PAGE_READ     (_PAGE_BAP_SR | _PAGE_BAP_UR) /* User read permission */
@@ -87,7 +77,7 @@
  * pages. We always set _PAGE_COHERENT when SMP is enabled or
  * the processor might need it for DMA coherency.
  */
-#define _PAGE_BASE_NC  (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_PSIZE_4K)
+#define _PAGE_BASE_NC  (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_TSIZE_4K)
 #if defined(CONFIG_SMP)
 #define _PAGE_BASE (_PAGE_BASE_NC | _PAGE_COHERENT)
 #else
@@ -103,6 +93,14 @@ static inline pte_t pte_mkexec(pte_t pte)
 }
 #define pte_mkexec pte_mkexec
 
+static inline unsigned long pte_huge_size(pte_t pte)
+{
+   pte_basic_t val = pte_val(pte);
+
+   return 1UL << (((val & _PAGE_PSIZE_MSK) >> _PAGE_PSIZE_SHIFT) + _PAGE_PSIZE_SHIFT_OFFSET);
+}
+#define pte_huge_size pte_huge_size
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* __KERNEL__ */
-- 
2.44.0



[PATCH v5 11/18] powerpc/e500: Switch to 64 bits PGD on 85xx (32 bits)

2024-06-09 Thread Christophe Leroy
Currently, when CONFIG_PTE_64BIT is selected, PTE entries are
64 bits but PGD entries are still 32 bits.

In order to allow leaf PMD entries, switch the PGD to 64 bits entries.
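
Concretely, each PGD slot grows from 4 to 8 bytes, so the TLB miss code
must scale the index by 8 and, the CPU being big-endian, load the 32-bit
table pointer from the low word of the entry. A C-level sketch of what
the reworked FIND_PTE does (hypothetical helper, for illustration only):

	static inline unsigned long pgd_entry_lo(unsigned long long *pgdir,
						 unsigned long ea, unsigned int shift)
	{
		unsigned long long entry = pgdir[ea >> shift];	/* 8-byte slot */

		return (unsigned long)entry;	/* on BE, the word at offset 4 */
	}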

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/pgtable-types.h |  4 
 arch/powerpc/kernel/head_85xx.S  | 10 ++
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/pgtable-types.h b/arch/powerpc/include/asm/pgtable-types.h
index 082c85cc09b1..db965d98e0ae 100644
--- a/arch/powerpc/include/asm/pgtable-types.h
+++ b/arch/powerpc/include/asm/pgtable-types.h
@@ -49,7 +49,11 @@ static inline unsigned long pud_val(pud_t x)
 #endif /* CONFIG_PPC64 */
 
 /* PGD level */
+#if defined(CONFIG_PPC_E500) && defined(CONFIG_PTE_64BIT)
+typedef struct { unsigned long long pgd; } pgd_t;
+#else
 typedef struct { unsigned long pgd; } pgd_t;
+#endif
 #define __pgd(x)   ((pgd_t) { (x) })
 static inline unsigned long pgd_val(pgd_t x)
 {
diff --git a/arch/powerpc/kernel/head_85xx.S b/arch/powerpc/kernel/head_85xx.S
index 39724ff5ae1f..a305244afc9f 100644
--- a/arch/powerpc/kernel/head_85xx.S
+++ b/arch/powerpc/kernel/head_85xx.S
@@ -307,8 +307,9 @@ set_ivor:
 #ifdef CONFIG_PTE_64BIT
 #ifdef CONFIG_HUGETLB_PAGE
 #define FIND_PTE   \
-   rlwinm  r12, r10, 13, 19, 29;   /* Compute pgdir/pmd offset */  \
-   lwzxr11, r12, r11;  /* Get pgd/pmd entry */ \
+   rlwinm  r12, r10, 14, 18, 28;   /* Compute pgdir/pmd offset */  \
+   add r12, r11, r12;  \
+   lwz r11, 4(r12);/* Get pgd/pmd entry */ \
rlwinm. r12, r11, 0, 0, 20; /* Extract pt base address */   \
blt 1000f;  /* Normal non-huge page */  \
beq 2f; /* Bail if no table */  \
@@ -321,8 +322,9 @@ set_ivor:
 1001:  lwz r11, 4(r12);/* Get pte entry */
 #else
 #define FIND_PTE   \
-   rlwinm  r12, r10, 13, 19, 29;   /* Compute pgdir/pmd offset */  \
-   lwzxr11, r12, r11;  /* Get pgd/pmd entry */ \
+   rlwinm  r12, r10, 14, 18, 28;   /* Compute pgdir/pmd offset */  \
+   add r12, r11, r12;  \
+   lwz r11, 4(r12);/* Get pgd/pmd entry */ \
rlwinm. r12, r11, 0, 0, 20; /* Extract pt base address */   \
beq 2f; /* Bail if no table */  \
rlwimi  r12, r10, 23, 20, 28;   /* Compute pte address */   \
-- 
2.44.0



[PATCH v5 10/18] powerpc/e500: Remove enc and ind fields from struct mmu_psize_def

2024-06-09 Thread Christophe Leroy
enc field is hidden behind BOOK3E_PAGESZ_XX macros, and when you look
closer you realise that this field is nothing else than the value of
shift minus ten.

So remove enc field and calculate tsize from shift field.

Also remove the ind field, which is unused.
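
As a sanity check of the replacement, assuming the BOOK3E_PAGESZ_*
constants follow the power-of-two TSIZE encoding:

	mmu_get_tsize(MMU_PAGE_4K);	/* 12 - 10 = 2  == BOOK3E_PAGESZ_4K  */
	mmu_get_tsize(MMU_PAGE_1G);	/* 30 - 10 = 20 == BOOK3E_PAGESZ_1GB */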

Signed-off-by: Christophe Leroy 
Reviewed-by: Oscar Salvador 
---
 arch/powerpc/include/asm/nohash/mmu-e500.h | 3 ---
 arch/powerpc/mm/nohash/book3e_pgtable.c| 4 ++--
 arch/powerpc/mm/nohash/tlb.c   | 9 +
 arch/powerpc/mm/nohash/tlb_64e.c   | 2 +-
 4 files changed, 4 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/include/asm/nohash/mmu-e500.h b/arch/powerpc/include/asm/nohash/mmu-e500.h
index 7dc24b8632d7..b281d9eeaf1e 100644
--- a/arch/powerpc/include/asm/nohash/mmu-e500.h
+++ b/arch/powerpc/include/asm/nohash/mmu-e500.h
@@ -244,14 +244,11 @@ typedef struct {
 /* Page size definitions, common between 32 and 64-bit
  *
  *shift : is the "PAGE_SHIFT" value for that page size
- *penc  : is the pte encoding mask
  *
  */
 struct mmu_psize_def
 {
unsigned intshift;  /* number of bits */
-   unsigned intenc;/* PTE encoding */
-   unsigned intind;/* Corresponding indirect page size shift */
unsigned intflags;
 #define MMU_PAGE_SIZE_DIRECT   0x1 /* Supported as a direct size */
 #define MMU_PAGE_SIZE_INDIRECT 0x2 /* Supported as an indirect size */
diff --git a/arch/powerpc/mm/nohash/book3e_pgtable.c b/arch/powerpc/mm/nohash/book3e_pgtable.c
index 1c5e4ecbebeb..ad2a7c26f2a0 100644
--- a/arch/powerpc/mm/nohash/book3e_pgtable.c
+++ b/arch/powerpc/mm/nohash/book3e_pgtable.c
@@ -29,10 +29,10 @@ int __meminit vmemmap_create_mapping(unsigned long start,
_PAGE_KERNEL_RW;
 
/* PTEs only contain page size encodings up to 32M */
-   BUG_ON(mmu_psize_defs[mmu_vmemmap_psize].enc > 0xf);
+   BUG_ON(mmu_psize_defs[mmu_vmemmap_psize].shift - 10 > 0xf);
 
/* Encode the size in the PTE */
-   flags |= mmu_psize_defs[mmu_vmemmap_psize].enc << 8;
+   flags |= (mmu_psize_defs[mmu_vmemmap_psize].shift - 10) << 8;
 
/* For each PTE for that area, map things. Note that we don't
 * increment phys because all PTEs are of the large size and
diff --git a/arch/powerpc/mm/nohash/tlb.c b/arch/powerpc/mm/nohash/tlb.c
index f57dc721d063..b653a7be4cb1 100644
--- a/arch/powerpc/mm/nohash/tlb.c
+++ b/arch/powerpc/mm/nohash/tlb.c
@@ -53,37 +53,30 @@
 struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
[MMU_PAGE_4K] = {
.shift  = 12,
-   .enc= BOOK3E_PAGESZ_4K,
},
[MMU_PAGE_2M] = {
.shift  = 21,
-   .enc= BOOK3E_PAGESZ_2M,
},
[MMU_PAGE_4M] = {
.shift  = 22,
-   .enc= BOOK3E_PAGESZ_4M,
},
[MMU_PAGE_16M] = {
.shift  = 24,
-   .enc= BOOK3E_PAGESZ_16M,
},
[MMU_PAGE_64M] = {
.shift  = 26,
-   .enc= BOOK3E_PAGESZ_64M,
},
[MMU_PAGE_256M] = {
.shift  = 28,
-   .enc= BOOK3E_PAGESZ_256M,
},
[MMU_PAGE_1G] = {
.shift  = 30,
-   .enc= BOOK3E_PAGESZ_1GB,
},
 };
 
 static inline int mmu_get_tsize(int psize)
 {
-   return mmu_psize_defs[psize].enc;
+   return mmu_psize_defs[psize].shift - 10;
 }
 #else
 static inline int mmu_get_tsize(int psize)
diff --git a/arch/powerpc/mm/nohash/tlb_64e.c b/arch/powerpc/mm/nohash/tlb_64e.c
index 053128a5636c..7988238496d7 100644
--- a/arch/powerpc/mm/nohash/tlb_64e.c
+++ b/arch/powerpc/mm/nohash/tlb_64e.c
@@ -53,7 +53,7 @@ int extlb_level_exc;
  */
 void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address)
 {
-   int tsize = mmu_psize_defs[mmu_pte_psize].enc;
+   int tsize = mmu_psize_defs[mmu_pte_psize].shift - 10;
 
if (book3e_htw_mode != PPC_HTW_NONE) {
unsigned long start = address & PMD_MASK;
-- 
2.44.0



[PATCH v5 09/18] powerpc/8xx: Simplify struct mmu_psize_def

2024-06-09 Thread Christophe Leroy
On 8xx, only the shift field is used in struct mmu_psize_def

Remove other fields and related macros.

Signed-off-by: Christophe Leroy 
Reviewed-by: Oscar Salvador 
---
 arch/powerpc/include/asm/nohash/32/mmu-8xx.h | 9 ++---
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h
index 141d82e249a8..a756a1e59c54 100644
--- a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h
@@ -189,19 +189,14 @@ typedef struct {
 
 #define PHYS_IMMR_BASE (mfspr(SPRN_IMMR) & 0xfff8)
 
-/* Page size definitions, common between 32 and 64-bit
+/*
+ * Page size definitions for 8xx
  *
  *shift : is the "PAGE_SHIFT" value for that page size
- *penc  : is the pte encoding mask
  *
  */
 struct mmu_psize_def {
unsigned intshift;  /* number of bits */
-   unsigned intenc;/* PTE encoding */
-   unsigned intind;/* Corresponding indirect page size shift */
-   unsigned intflags;
-#define MMU_PAGE_SIZE_DIRECT   0x1 /* Supported as a direct size */
-#define MMU_PAGE_SIZE_INDIRECT 0x2 /* Supported as an indirect size */
 };
 
 extern struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
-- 
2.44.0



[PATCH v5 08/18] powerpc/8xx: Rework support for 8M pages using contiguous PTE entries

2024-06-09 Thread Christophe Leroy
In order to fit better with standard Linux page tables layout, add
support for 8M pages using contiguous PTE entries in a standard
page table. Page tables will then be populated with 1024 similar
entries and two PMD entries will point to that page table.

The PMD entries also get a flag to tell it is addressing an 8M page,
this is required for the HW tablewalk assistance.
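
The arithmetic behind that layout, for 4k base pages:

	/* one page table: 1024 PTEs * 4k = 4M of address space,
	 * so an 8M page spans two PMD slots; both slots are made to
	 * point to a single table of 1024 identical entries, and both
	 * carry _PMD_PAGE_8M for the HW tablewalk. */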

Signed-off-by: Christophe Leroy 
Reviewed-by: Oscar Salvador 
---
v3:
- Move huge_ptep_get() for a more readable commit diff
- Flag PMD as 8Mbytes in set_huge_pte_at()
- Define __pte_leaf_size()
- Change pte_update() instead of all huge callers of pte_update()
- Added ptep_is_8m_pmdp() helper
- Fixed kasan early memory 8M allocation

v5:
- In huge_ptep_get(), change pte_offset_kernel((pmd_t *)ptep, 0) to pte_offset_kernel((pmd_t *)ptep, ALIGN_DOWN(addr, SZ_8M)), which is more correct although not different in practice.
---
 .../include/asm/nohash/32/hugetlb-8xx.h   | 38 +++--
 arch/powerpc/include/asm/nohash/32/pte-8xx.h  | 53 ---
 arch/powerpc/include/asm/nohash/pgtable.h |  4 --
 arch/powerpc/include/asm/page.h   |  5 --
 arch/powerpc/include/asm/pgtable.h|  3 ++
 arch/powerpc/kernel/head_8xx.S| 10 +---
 arch/powerpc/mm/hugetlbpage.c | 21 +---
 arch/powerpc/mm/kasan/8xx.c   | 21 +---
 arch/powerpc/mm/nohash/8xx.c  | 40 +++---
 arch/powerpc/mm/pgtable.c | 27 +++---
 arch/powerpc/mm/pgtable_32.c  |  2 +-
 arch/powerpc/platforms/Kconfig.cputype|  1 -
 12 files changed, 111 insertions(+), 114 deletions(-)

diff --git a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h
index 92df40c6cc6b..014799557f60 100644
--- a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h
@@ -4,42 +4,12 @@
 
 #define PAGE_SHIFT_8M  23
 
-static inline pte_t *hugepd_page(hugepd_t hpd)
-{
-   BUG_ON(!hugepd_ok(hpd));
-
-   return (pte_t *)__va(hpd_val(hpd) & ~HUGEPD_SHIFT_MASK);
-}
-
-static inline unsigned int hugepd_shift(hugepd_t hpd)
-{
-   return PAGE_SHIFT_8M;
-}
-
-static inline pte_t *hugepte_offset(hugepd_t hpd, unsigned long addr,
-   unsigned int pdshift)
-{
-   unsigned long idx = (addr & (SZ_4M - 1)) >> PAGE_SHIFT;
-
-   return hugepd_page(hpd) + idx;
-}
-
 static inline void flush_hugetlb_page(struct vm_area_struct *vma,
  unsigned long vmaddr)
 {
flush_tlb_page(vma, vmaddr);
 }
 
-static inline void hugepd_populate(hugepd_t *hpdp, pte_t *new, unsigned int pshift)
-{
-   *hpdp = __hugepd(__pa(new) | _PMD_USER | _PMD_PRESENT | _PMD_PAGE_8M);
-}
-
-static inline void hugepd_populate_kernel(hugepd_t *hpdp, pte_t *new, unsigned int pshift)
-{
-   *hpdp = __hugepd(__pa(new) | _PMD_PRESENT | _PMD_PAGE_8M);
-}
-
 static inline int check_and_get_huge_psize(int shift)
 {
return shift_to_mmu_psize(shift);
@@ -49,6 +19,14 @@ static inline int check_and_get_huge_psize(int shift)
 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
 pte_t pte, unsigned long sz);
 
+#define __HAVE_ARCH_HUGE_PTEP_GET
+static inline pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+   if (ptep_is_8m_pmdp(mm, addr, ptep))
+       ptep = pte_offset_kernel((pmd_t *)ptep, ALIGN_DOWN(addr, SZ_8M));
+   return ptep_get(ptep);
+}
+
 #define __HAVE_ARCH_HUGE_PTE_CLEAR
 static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
  pte_t *ptep, unsigned long sz)
diff --git a/arch/powerpc/include/asm/nohash/32/pte-8xx.h b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
index 625c31d6ce5c..54ebb91dbdcf 100644
--- a/arch/powerpc/include/asm/nohash/32/pte-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
@@ -119,7 +119,7 @@ static inline pte_t pte_mkhuge(pte_t pte)
 
 #define pte_mkhuge pte_mkhuge
 
-static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, pte_t *p,
+static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
                                     unsigned long clr, unsigned long set, int huge);
 
 static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
@@ -141,19 +141,12 @@ static inline void __ptep_set_access_flags(struct vm_area_struct *vma, pte_t *pt
 }
 #define __ptep_set_access_flags __ptep_set_access_flags
 
-static inline unsigned long pgd_leaf_size(pgd_t pgd)
-{
-   if (pgd_val(pgd) & _PMD_PAGE_8M)
-   return SZ_8M;
-   return SZ_4M;
-}
-
-#define pgd_leaf_size pgd_leaf_size
-
-static inline unsigned long pte_leaf_size(pte_t pte)
+static inline unsigned long __pte_leaf_size(pmd_t pmd, pte_t pte)
 {
 

[PATCH v5 07/18] powerpc/8xx: Fix size given to set_huge_pte_at()

2024-06-09 Thread Christophe Leroy
set_huge_pte_at() expects the size of the hugepage as an int, not the
psize which is the index of the page definition in table mmu_psize_defs[]
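
For example, for an 8M page the caller holds psize == MMU_PAGE_8M, a
small index into mmu_psize_defs[], while set_huge_pte_at() expects the
byte size, hence the conversion:

	unsigned long sz = 1UL << mmu_psize_to_shift(psize);	/* 1UL << 23 == SZ_8M */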

Fixes: 935d4f0c6dc8 ("mm: hugetlb: add huge page size param to set_huge_pte_at()")
Signed-off-by: Christophe Leroy 
Reviewed-by: Oscar Salvador 
---
 arch/powerpc/mm/nohash/8xx.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/nohash/8xx.c b/arch/powerpc/mm/nohash/8xx.c
index 43d4842bb1c7..d93433e26ded 100644
--- a/arch/powerpc/mm/nohash/8xx.c
+++ b/arch/powerpc/mm/nohash/8xx.c
@@ -94,7 +94,8 @@ static int __ref __early_map_kernel_hugepage(unsigned long 
va, phys_addr_t pa,
return -EINVAL;
 
set_huge_pte_at(_mm, va, ptep,
-   pte_mkhuge(pfn_pte(pa >> PAGE_SHIFT, prot)), psize);
+   pte_mkhuge(pfn_pte(pa >> PAGE_SHIFT, prot)),
+   1UL << mmu_psize_to_shift(psize));
 
return 0;
 }
-- 
2.44.0



[PATCH v5 06/18] powerpc/mm: Allow hugepages without hugepd

2024-06-09 Thread Christophe Leroy
In preparation for implementing huge pages on powerpc 8xx
without hugepd, enclose hugepd related code inside an
ifdef CONFIG_ARCH_HAS_HUGEPD.

This also allows removing some stubs.
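
The replacement huge_pte_alloc() (its tail is truncated in this archive)
follows the standard topology: at each level, if the requested size
covers that level, return the entry itself as a pte_t *, otherwise
allocate the next level down. Schematically it ends along these lines
(sketch, assuming the generic pte_alloc_huge() helper):

	if (sz >= PMD_SIZE)
		return (pte_t *)pmd;

	return pte_alloc_huge(mm, pmd, addr);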

Signed-off-by: Christophe Leroy 
Reviewed-by: Oscar Salvador 
---
v3:
- Prepare huge_pte_alloc() for full standard topology, not only for 2-level
- Reordered last part of huge_pte_alloc()

v4:
- Rebased of v6.10-rc1

v5:
- Moved the Kconfig split in this patch.
---
 arch/powerpc/Kconfig |  1 -
 arch/powerpc/include/asm/book3s/32/pgalloc.h |  2 --
 arch/powerpc/include/asm/hugetlb.h   | 10 ++
 arch/powerpc/include/asm/nohash/pgtable.h|  2 +-
 arch/powerpc/mm/hugetlbpage.c| 33 
 arch/powerpc/mm/pgtable.c|  2 ++
 arch/powerpc/platforms/Kconfig.cputype   |  3 ++
 7 files changed, 41 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index c88c6d46a5bc..b60b6e991227 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -135,7 +135,6 @@ config PPC
select ARCH_HAS_DMA_MAP_DIRECT  if PPC_PSERIES
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_GCOV_PROFILE_ALL
-   select ARCH_HAS_HUGEPD  if HUGETLB_PAGE
select ARCH_HAS_KCOV
select ARCH_HAS_KERNEL_FPU_SUPPORT  if PPC64 && PPC_FPU
select ARCH_HAS_MEMBARRIER_CALLBACKS
diff --git a/arch/powerpc/include/asm/book3s/32/pgalloc.h b/arch/powerpc/include/asm/book3s/32/pgalloc.h
index dc5c039eb28e..dd4eb3063175 100644
--- a/arch/powerpc/include/asm/book3s/32/pgalloc.h
+++ b/arch/powerpc/include/asm/book3s/32/pgalloc.h
@@ -47,8 +47,6 @@ static inline void pgtable_free(void *table, unsigned index_size)
}
 }
 
-#define get_hugepd_cache_index(x)  (x)
-
 static inline void pgtable_free_tlb(struct mmu_gather *tlb,
void *table, int shift)
 {
diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h
index ea71f7245a63..79176a499763 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -30,10 +30,12 @@ static inline int is_hugepage_only_range(struct mm_struct 
*mm,
 }
 #define is_hugepage_only_range is_hugepage_only_range
 
+#ifdef CONFIG_ARCH_HAS_HUGEPD
 #define __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE
 void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
unsigned long end, unsigned long floor,
unsigned long ceiling);
+#endif
 
 #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR
 static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
@@ -67,14 +69,6 @@ static inline void flush_hugetlb_page(struct vm_area_struct *vma,
 {
 }
 
-#define hugepd_shift(x) 0
-static inline pte_t *hugepte_offset(hugepd_t hpd, unsigned long addr,
-   unsigned pdshift)
-{
-   return NULL;
-}
-
-
 static inline void __init gigantic_hugetlb_cma_reserve(void)
 {
 }
diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h
index f5f39d4f03c8..e7fc1314c23e 100644
--- a/arch/powerpc/include/asm/nohash/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/pgtable.h
@@ -340,7 +340,7 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
 
 #define pgprot_writecombine pgprot_noncached_wc
 
-#ifdef CONFIG_HUGETLB_PAGE
+#ifdef CONFIG_ARCH_HAS_HUGEPD
 static inline int hugepd_ok(hugepd_t hpd)
 {
 #ifdef CONFIG_PPC_8xx
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 594a4b7b2ca2..20fad59ff9f5 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -42,6 +42,7 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz)
return __find_linux_pte(mm->pgd, addr, NULL, NULL);
 }
 
+#ifdef CONFIG_ARCH_HAS_HUGEPD
 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
   unsigned long address, unsigned int pdshift,
   unsigned int pshift, spinlock_t *ptl)
@@ -193,6 +194,36 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
 
return hugepte_offset(*hpdp, addr, pdshift);
 }
+#else
+pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, unsigned long sz)
+{
+   p4d_t *p4d;
+   pud_t *pud;
+   pmd_t *pmd;
+
+   addr &= ~(sz - 1);
+
+   p4d = p4d_offset(pgd_offset(mm, addr), addr);
+   if (!mm_pud_folded(mm) && sz >= P4D_SIZE)
+   return (pte_t *)p4d;
+
+   pud = pud_alloc(mm, p4d, addr);
+   if (!pud)
+   return NULL;
+   if (!mm_pmd_folded(mm) && sz >= PUD_SIZE)
+   return (pte_t *)pud;
+
+   pmd = pmd_alloc(mm, pud, addr);
+   if (!pmd)
+   return NULL;
+
+   if (sz >= PMD_S

[PATCH v5 05/18] powerpc/mm: Fix __find_linux_pte() on 32 bits with PMD leaf entries

2024-06-09 Thread Christophe Leroy
Building on 32 bits with pmd_leaf() no longer always returning false
leads to the following error:

  CC  arch/powerpc/mm/pgtable.o
arch/powerpc/mm/pgtable.c: In function '__find_linux_pte':
arch/powerpc/mm/pgtable.c:506:1: error: function may return address of local variable [-Werror=return-local-addr]
  506 | }
  | ^
arch/powerpc/mm/pgtable.c:394:15: note: declared here
  394 | pud_t pud, *pudp;
  |   ^~~
arch/powerpc/mm/pgtable.c:394:15: note: declared here

This is due to pmd_offset() being a no-op in that case.
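
A minimal illustration (not kernel code) of why gcc complains: with a
folded PMD, pmd_offset() just returns its input pointer, so walking an
on-stack copy ends up returning the address of a local:

	pte_t *walk(pud_t *pudp, unsigned long addr)
	{
		pud_t pud = *pudp;			/* on-stack copy */
		pmd_t *pmdp = pmd_offset(&pud, addr);	/* no-op: == (pmd_t *)&pud */

		return (pte_t *)pmdp;	/* address of a local -> the warning above */
	}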

So rework it for powerpc/32 so that pXd_offset() is used on real
pointers and not on on-stack copies.

Beyond fixing the problem, it also has the advantage of simplifying
__find_linux_pte(), including removal of the stack frame:

After this patch:

0018 <__find_linux_pte>:
  18:   2c 06 00 00 cmpwi   r6,0
  1c:   41 82 00 0c beq 28 <__find_linux_pte+0x10>
  20:   39 20 00 00 li  r9,0
  24:   91 26 00 00 stw r9,0(r6)
  28:   2f 85 00 00 cmpwi   cr7,r5,0
  2c:   41 9e 00 0c beq cr7,38 <__find_linux_pte+0x20>
  30:   39 20 00 00 li  r9,0
  34:   99 25 00 00 stb r9,0(r5)
  38:   54 89 65 3a rlwinm  r9,r4,12,20,29
  3c:   7c 63 48 2e lwzx    r3,r3,r9
  40:   2f 83 00 00 cmpwi   cr7,r3,0
  44:   41 9e 00 30 beq cr7,74 <__find_linux_pte+0x5c>
  48:   54 69 07 3a rlwinm  r9,r3,0,28,29
  4c:   2f 89 00 0c cmpwi   cr7,r9,12
  50:   54 63 00 26 clrrwi  r3,r3,12
  54:   54 84 b5 36 rlwinm  r4,r4,22,20,27
  58:   3c 63 c0 00 addis   r3,r3,-16384
  5c:   7c 63 22 14 add r3,r3,r4
  60:   4c be 00 20 bnelr+  cr7
  64:   4d 82 00 20 beqlr
  68:   39 20 00 17 li  r9,23
  6c:   91 26 00 00 stw r9,0(r6)
  70:   4e 80 00 20 blr
  74:   38 60 00 00 li  r3,0
  78:   4e 80 00 20 blr

Before this patch:

0018 <__find_linux_pte>:
  18:   2c 06 00 00 cmpwi   r6,0
  1c:   94 21 ff e0 stwu    r1,-32(r1)
  20:   41 82 00 0c beq 2c <__find_linux_pte+0x14>
  24:   39 20 00 00 li  r9,0
  28:   91 26 00 00 stw r9,0(r6)
  2c:   2f 85 00 00 cmpwi   cr7,r5,0
  30:   41 9e 00 0c beq cr7,3c <__find_linux_pte+0x24>
  34:   39 20 00 00 li  r9,0
  38:   99 25 00 00 stb r9,0(r5)
  3c:   54 89 65 3a rlwinm  r9,r4,12,20,29
  40:   7c 63 48 2e lwzx    r3,r3,r9
  44:   54 69 07 3a rlwinm  r9,r3,0,28,29
  48:   2f 89 00 0c cmpwi   cr7,r9,12
  4c:   90 61 00 0c stw r3,12(r1)
  50:   41 9e 00 4c beq cr7,9c <__find_linux_pte+0x84>
  54:   80 61 00 0c lwz r3,12(r1)
  58:   54 69 07 3a rlwinm  r9,r3,0,28,29
  5c:   2f 89 00 0c cmpwi   cr7,r9,12
  60:   90 61 00 08 stw r3,8(r1)
  64:   41 9e 00 38 beq cr7,9c <__find_linux_pte+0x84>
  68:   80 61 00 08 lwz r3,8(r1)
  6c:   2f 83 00 00 cmpwi   cr7,r3,0
  70:   41 9e 00 54 beq cr7,c4 <__find_linux_pte+0xac>
  74:   54 69 07 3a rlwinm  r9,r3,0,28,29
  78:   2f 89 00 0c cmpwi   cr7,r9,12
  7c:   54 69 00 26 clrrwi  r9,r3,12
  80:   54 8a b5 36 rlwinm  r10,r4,22,20,27
  84:   3c 69 c0 00 addis   r3,r9,-16384
  88:   7c 63 52 14 add r3,r3,r10
  8c:   54 84 93 be srwi    r4,r4,14
  90:   41 9e 00 14 beq cr7,a4 <__find_linux_pte+0x8c>
  94:   38 21 00 20 addi    r1,r1,32
  98:   4e 80 00 20 blr
  9c:   54 69 00 26 clrrwi  r9,r3,12
  a0:   54 84 93 be srwi    r4,r4,14
  a4:   3c 69 c0 00 addis   r3,r9,-16384
  a8:   54 84 25 36 rlwinm  r4,r4,4,20,27
  ac:   7c 63 22 14 add r3,r3,r4
  b0:   41 a2 ff e4 beq 94 <__find_linux_pte+0x7c>
  b4:   39 20 00 17 li  r9,23
  b8:   91 26 00 00 stw r9,0(r6)
  bc:   38 21 00 20 addi    r1,r1,32
  c0:   4e 80 00 20 blr
  c4:   38 60 00 00 li  r3,0
  c8:   38 21 00 20     addi    r1,r1,32
  cc:   4e 80 00 20 blr

Signed-off-by: Christophe Leroy 
Reviewed-by: Oscar Salvador 
---
v3: Removed p4dp and pudp locals for PPC32 and add a comment.
v4: Properly set pdshift on PPC32 case
v5: Enhanced commit message
---
 arch/powerpc/mm/pgtable.c | 11 ++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 9e7ba9c3851f..bce8a861958

[PATCH v5 04/18] powerpc/mm: Remove _PAGE_PSIZE

2024-06-09 Thread Christophe Leroy
_PAGE_PSIZE macro is never used outside the place it is defined
and is used only on 8xx and e500.

Remove the indirection: drop the macro and use its content directly.

Signed-off-by: Christophe Leroy 
Reviewed-by: Oscar Salvador 
---
 arch/powerpc/include/asm/nohash/32/pte-40x.h  | 3 ---
 arch/powerpc/include/asm/nohash/32/pte-44x.h  | 3 ---
 arch/powerpc/include/asm/nohash/32/pte-85xx.h | 3 ---
 arch/powerpc/include/asm/nohash/32/pte-8xx.h  | 5 ++---
 arch/powerpc/include/asm/nohash/pte-e500.h| 4 +---
 5 files changed, 3 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/include/asm/nohash/32/pte-40x.h b/arch/powerpc/include/asm/nohash/32/pte-40x.h
index d759cfd74754..52ed58516fa4 100644
--- a/arch/powerpc/include/asm/nohash/32/pte-40x.h
+++ b/arch/powerpc/include/asm/nohash/32/pte-40x.h
@@ -49,9 +49,6 @@
 #define _PAGE_EXEC 0x200   /* hardware: EX permission */
 #define _PAGE_ACCESSED 0x400   /* software: R: page referenced */
 
-/* No page size encoding in the linux PTE */
-#define _PAGE_PSIZE    0
-
 /* cache related flags non existing on 40x */
 #define _PAGE_COHERENT 0
 
diff --git a/arch/powerpc/include/asm/nohash/32/pte-44x.h b/arch/powerpc/include/asm/nohash/32/pte-44x.h
index 851813725237..da0469928273 100644
--- a/arch/powerpc/include/asm/nohash/32/pte-44x.h
+++ b/arch/powerpc/include/asm/nohash/32/pte-44x.h
@@ -75,9 +75,6 @@
 #define _PAGE_NO_CACHE 0x0400  /* H: I bit */
 #define _PAGE_WRITETHRU0x0800  /* H: W bit */
 
-/* No page size encoding in the linux PTE */
-#define _PAGE_PSIZE    0
-
 /* TODO: Add large page lowmem mapping support */
 #define _PMD_PRESENT   0
 #define _PMD_PRESENT_MASK (PAGE_MASK)
diff --git a/arch/powerpc/include/asm/nohash/32/pte-85xx.h b/arch/powerpc/include/asm/nohash/32/pte-85xx.h
index 653a342d3b25..14d64b4f3f14 100644
--- a/arch/powerpc/include/asm/nohash/32/pte-85xx.h
+++ b/arch/powerpc/include/asm/nohash/32/pte-85xx.h
@@ -31,9 +31,6 @@
 #define _PAGE_WRITETHRU0x00400 /* H: W bit */
 #define _PAGE_SPECIAL  0x00800 /* S: Special page */
 
-/* No page size encoding in the linux PTE */
-#define _PAGE_PSIZE    0
-
 #define _PMD_PRESENT   0
 #define _PMD_PRESENT_MASK (PAGE_MASK)
 #define _PMD_BAD   (~PAGE_MASK)
diff --git a/arch/powerpc/include/asm/nohash/32/pte-8xx.h b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
index 137dc3c84e45..625c31d6ce5c 100644
--- a/arch/powerpc/include/asm/nohash/32/pte-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
@@ -74,12 +74,11 @@
 #define _PTE_NONE_MASK 0
 
 #ifdef CONFIG_PPC_16K_PAGES
-#define _PAGE_PSIZE    _PAGE_SPS
+#define _PAGE_BASE_NC  (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_SPS)
 #else
-#define _PAGE_PSIZE    0
+#define _PAGE_BASE_NC  (_PAGE_PRESENT | _PAGE_ACCESSED)
 #endif
 
-#define _PAGE_BASE_NC  (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_PSIZE)
 #define _PAGE_BASE (_PAGE_BASE_NC)
 
 #include 
diff --git a/arch/powerpc/include/asm/nohash/pte-e500.h b/arch/powerpc/include/asm/nohash/pte-e500.h
index f516f0b5b7a8..975facc7e38e 100644
--- a/arch/powerpc/include/asm/nohash/pte-e500.h
+++ b/arch/powerpc/include/asm/nohash/pte-e500.h
@@ -65,8 +65,6 @@
 
 #define _PAGE_SPECIAL  _PAGE_SW0
 
-/* Base page size */
-#define _PAGE_PSIZE    _PAGE_PSIZE_4K
 #define PTE_RPN_SHIFT  (24)
 
 #define PTE_WIMGE_SHIFT (19)
@@ -89,7 +87,7 @@
  * pages. We always set _PAGE_COHERENT when SMP is enabled or
  * the processor might need it for DMA coherency.
  */
-#define _PAGE_BASE_NC  (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_PSIZE)
+#define _PAGE_BASE_NC  (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_PSIZE_4K)
 #if defined(CONFIG_SMP)
 #define _PAGE_BASE (_PAGE_BASE_NC | _PAGE_COHERENT)
 #else
-- 
2.44.0



[PATCH v5 03/18] mm: Provide mm_struct and address to huge_ptep_get()

2024-06-09 Thread Christophe Leroy
On powerpc 8xx huge_ptep_get() will need to know whether the given
ptep is a PTE entry or a PMD entry. This cannot be known with the
PMD entry itself because there is no easy way to know it from the
content of the entry.

So huge_ptep_get() will need to know either the size of the page
or get the pmd.

In order to be consistent with huge_ptep_get_and_clear(), give
mm and address to huge_ptep_get().
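
At call sites the change is mechanical, e.g.:

	pte = huge_ptep_get(ptep);			/* before */
	pte = huge_ptep_get(vma->vm_mm, addr, ptep);	/* after  */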

Signed-off-by: Christophe Leroy 
Reviewed-by: Oscar Salvador 
---
v2: Add missing changes in arch implementations
v3: Fixed a comment in ARM and missing changes in S390
v4: Fix missing or bad changes in mm/hugetlb.c and rebased on v6.10-rc1
v5: Fix vma->mm to vma->vm_mm
---
 arch/arm/include/asm/hugetlb-3level.h |  4 +--
 arch/arm64/include/asm/hugetlb.h  |  2 +-
 arch/arm64/mm/hugetlbpage.c   |  2 +-
 arch/riscv/include/asm/hugetlb.h  |  2 +-
 arch/riscv/mm/hugetlbpage.c   |  2 +-
 arch/s390/include/asm/hugetlb.h   |  4 +--
 arch/s390/mm/hugetlbpage.c|  4 +--
 fs/hugetlbfs/inode.c  |  2 +-
 fs/proc/task_mmu.c| 10 +++---
 fs/userfaultfd.c  |  2 +-
 include/asm-generic/hugetlb.h |  2 +-
 include/linux/swapops.h   |  4 +--
 mm/damon/vaddr.c  |  6 ++--
 mm/gup.c  |  2 +-
 mm/hmm.c  |  2 +-
 mm/hugetlb.c  | 44 +--
 mm/memory-failure.c   |  2 +-
 mm/mempolicy.c|  2 +-
 mm/migrate.c  |  4 +--
 mm/mincore.c  |  2 +-
 mm/userfaultfd.c  |  2 +-
 21 files changed, 53 insertions(+), 53 deletions(-)

diff --git a/arch/arm/include/asm/hugetlb-3level.h b/arch/arm/include/asm/hugetlb-3level.h
index a30be5505793..87d48e2d90ad 100644
--- a/arch/arm/include/asm/hugetlb-3level.h
+++ b/arch/arm/include/asm/hugetlb-3level.h
@@ -13,12 +13,12 @@
 
 /*
  * If our huge pte is non-zero then mark the valid bit.
- * This allows pte_present(huge_ptep_get(ptep)) to return true for non-zero
+ * This allows pte_present(huge_ptep_get(mm,addr,ptep)) to return true for non-zero
  * ptes.
  * (The valid bit is automatically cleared by set_pte_at for PROT_NONE ptes).
  */
 #define __HAVE_ARCH_HUGE_PTEP_GET
-static inline pte_t huge_ptep_get(pte_t *ptep)
+static inline pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 {
pte_t retval = *ptep;
if (pte_val(retval))
diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h
index 3954cbd2ff56..293f880865e8 100644
--- a/arch/arm64/include/asm/hugetlb.h
+++ b/arch/arm64/include/asm/hugetlb.h
@@ -46,7 +46,7 @@ extern pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
 extern void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
   pte_t *ptep, unsigned long sz);
 #define __HAVE_ARCH_HUGE_PTEP_GET
-extern pte_t huge_ptep_get(pte_t *ptep);
+extern pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
 
 void __init arm64_hugetlb_cma_reserve(void);
 
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 3f09ac73cce3..5f1e2103888b 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -127,7 +127,7 @@ static inline int num_contig_ptes(unsigned long size, size_t *pgsize)
return contig_ptes;
 }
 
-pte_t huge_ptep_get(pte_t *ptep)
+pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 {
int ncontig, i;
size_t pgsize;
diff --git a/arch/riscv/include/asm/hugetlb.h b/arch/riscv/include/asm/hugetlb.h
index b1ce97a9dbfc..faf3624d8057 100644
--- a/arch/riscv/include/asm/hugetlb.h
+++ b/arch/riscv/include/asm/hugetlb.h
@@ -44,7 +44,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
   pte_t pte, int dirty);
 
 #define __HAVE_ARCH_HUGE_PTEP_GET
-pte_t huge_ptep_get(pte_t *ptep);
+pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
 
 pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags);
 #define arch_make_huge_pte arch_make_huge_pte
diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
index 0ebd968b33c9..42314f093922 100644
--- a/arch/riscv/mm/hugetlbpage.c
+++ b/arch/riscv/mm/hugetlbpage.c
@@ -3,7 +3,7 @@
 #include 
 
 #ifdef CONFIG_RISCV_ISA_SVNAPOT
-pte_t huge_ptep_get(pte_t *ptep)
+pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 {
unsigned long pte_num;
int i;
diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h
index ce5f4fe8be4d..cf1b5d6fb1a6 100644
--- a/arch/s390/include/asm/hugetlb.h
+++ b/arch/s390/include/asm/hugetlb.h
@@ -19,7 +19,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 pte_t *ptep, pte_t pte, unsigned long sz);

[PATCH v5 01/18] powerpc/64e: Remove unused IBM HTW code [SQUASHED]

2024-06-09 Thread Christophe Leroy
From: Michael Ellerman 

This is a squash of the series from Michael:
https://patchwork.ozlabs.org/project/linuxppc-dev/patch/20240524073141.1637736-1-...@ellerman.id.au/

The nohash HTW_IBM (Hardware Table Walk) code is unused since support
for A2 was removed in commit fb5a515704d7 ("powerpc: Remove platforms/
wsp and associated pieces") (2014).

The remaining supported CPUs use either no HTW (data_tlb_miss_bolted),
or the e6500 HTW (data_tlb_miss_e6500).

Signed-off-by: Michael Ellerman 

powerpc/64e: Split out nohash Book3E 64-bit code

A reasonable chunk of nohash/tlb.c is 64-bit only code, split it out
into a separate file.

Signed-off-by: Michael Ellerman 

powerpc/64e: Drop E500 ifdefs in 64-bit code

All 64-bit Book3E have E500=y, so drop the unneeded ifdefs.

Signed-off-by: Michael Ellerman 

powerpc/64e: Drop MMU_FTR_TYPE_FSL_E checks in 64-bit code

All 64-bit Book3E have MMU_FTR_TYPE_FSL_E, since A2 was removed, so
remove checks for it in 64-bit only code.

Signed-off-by: Michael Ellerman 

powerpc/64e: Consolidate TLB miss handler patching

The 64e TLB miss handler patching is done in setup_mmu_htw(), and then
again immediately afterward in early_init_mmu_global(). Consolidate it
into a single location.

Signed-off-by: Michael Ellerman 

powerpc/64e: Drop unused TLB miss handlers

There are two possibilities for book3e_htw_mode, PPC_HTW_E6500 or
PPC_HTW_NONE.

The TLB miss handlers are patched to use, respectively:
  - exc_[data|instruction]_tlb_miss_e6500_book3e
  - exc_[data|instruction]_tlb_miss_bolted_book3e

Which means the default handlers are never used. Remove those, and use
the bolted handlers (PPC_HTW_NONE) by default.

Signed-off-by: Michael Ellerman 
Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/nohash/mmu-e500.h |   3 +-
 arch/powerpc/kernel/exceptions-64e.S   |   4 +-
 arch/powerpc/kernel/setup_64.c |   6 +-
 arch/powerpc/mm/nohash/Makefile|   2 +-
 arch/powerpc/mm/nohash/tlb.c   | 398 +--
 arch/powerpc/mm/nohash/tlb_64e.c   | 314 +++
 arch/powerpc/mm/nohash/tlb_low_64e.S   | 421 -
 7 files changed, 320 insertions(+), 828 deletions(-)
 create mode 100644 arch/powerpc/mm/nohash/tlb_64e.c

diff --git a/arch/powerpc/include/asm/nohash/mmu-e500.h 
b/arch/powerpc/include/asm/nohash/mmu-e500.h
index 6ddced0415cb..7dc24b8632d7 100644
--- a/arch/powerpc/include/asm/nohash/mmu-e500.h
+++ b/arch/powerpc/include/asm/nohash/mmu-e500.h
@@ -303,8 +303,7 @@ extern unsigned long linear_map_top;
 extern int book3e_htw_mode;
 
 #define PPC_HTW_NONE   0
-#define PPC_HTW_IBM1
-#define PPC_HTW_E6500  2
+#define PPC_HTW_E6500  1
 
 /*
  * 64-bit booke platforms don't load the tlb in the tlb miss handler code.
diff --git a/arch/powerpc/kernel/exceptions-64e.S 
b/arch/powerpc/kernel/exceptions-64e.S
index dcf0591ad3c2..63f6b9f513a4 100644
--- a/arch/powerpc/kernel/exceptions-64e.S
+++ b/arch/powerpc/kernel/exceptions-64e.S
@@ -485,8 +485,8 @@ interrupt_base_book3e:  
/* fake trap */
EXCEPTION_STUB(0x160, decrementer)  /* 0x0900 */
EXCEPTION_STUB(0x180, fixed_interval)   /* 0x0980 */
EXCEPTION_STUB(0x1a0, watchdog) /* 0x09f0 */
-   EXCEPTION_STUB(0x1c0, data_tlb_miss)
-   EXCEPTION_STUB(0x1e0, instruction_tlb_miss)
+   EXCEPTION_STUB(0x1c0, data_tlb_miss_bolted)
+   EXCEPTION_STUB(0x1e0, instruction_tlb_miss_bolted)
EXCEPTION_STUB(0x200, altivec_unavailable)
EXCEPTION_STUB(0x220, altivec_assist)
EXCEPTION_STUB(0x260, perfmon)
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index ae36a129789f..22f83fbbc762 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -696,11 +696,7 @@ __init u64 ppc64_bolted_size(void)
 {
 #ifdef CONFIG_PPC_BOOK3E_64
/* Freescale BookE bolts the entire linear mapping */
-   /* XXX: BookE ppc64_rma_limit setup seems to disagree? */
-   if (early_mmu_has_feature(MMU_FTR_TYPE_FSL_E))
-   return linear_map_top;
-   /* Other BookE, we assume the first GB is bolted */
-   return 1ul << 30;
+   return linear_map_top;
 #else
/* BookS radix, does not take faults on linear mapping */
if (early_radix_enabled())
diff --git a/arch/powerpc/mm/nohash/Makefile b/arch/powerpc/mm/nohash/Makefile
index b3f0498dd42f..90e846f0c46c 100644
--- a/arch/powerpc/mm/nohash/Makefile
+++ b/arch/powerpc/mm/nohash/Makefile
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 
 obj-y  += mmu_context.o tlb.o tlb_low.o kup.o
-obj-$(CONFIG_PPC_BOOK3E_64)+= tlb_low_64e.o book3e_pgtable.o
+obj-$(CONFIG_PPC_BOOK3E_64)+= tlb_64e.o tlb_low_64e.o book3e_pgtable.o
 obj-$(CONFIG_40x)  += 40x.o
 obj-$(CONFIG_44x)  += 44x.o
 obj-$(CONFIG_PPC_8xx)  += 8xx.o
diff --git a/arch/powerpc

[PATCH v5 02/18] mm: Define __pte_leaf_size() to also take a PMD entry

2024-06-09 Thread Christophe Leroy
On powerpc 8xx, when a page is 8M in size, the information is in the PMD
entry. So allow architectures to provide __pte_leaf_size() instead of
pte_leaf_size() and provide the PMD entry to that function.

When __pte_leaf_size() is not defined, define it as pte_leaf_size()
so that architectures not interested in the PMD argument are not
impacted.

Only define a default pte_leaf_size() when __pte_leaf_size() is not
defined to make sure nobody adds new calls to pte_leaf_size() in the
core.
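
For illustration, an architecture that keeps the leaf size in the PMD
entry could then provide something like this (pmd_is_8m() is a
hypothetical helper, not part of this patch):

#define __pte_leaf_size(pmd, pte)	(pmd_is_8m(pmd) ? SZ_8M : pte_leaf_size(pte))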

Signed-off-by: Christophe Leroy 
Reviewed-by: Oscar Salvador 
---
v3: Don't change pte_leaf_size() to not impact other architectures
---
 include/linux/pgtable.h | 3 +++
 kernel/events/core.c| 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 18019f037bae..3080e7cde3de 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1888,9 +1888,12 @@ typedef unsigned int pgtbl_mod_mask;
 #ifndef pmd_leaf_size
 #define pmd_leaf_size(x) PMD_SIZE
 #endif
+#ifndef __pte_leaf_size
 #ifndef pte_leaf_size
 #define pte_leaf_size(x) PAGE_SIZE
 #endif
+#define __pte_leaf_size(x,y) pte_leaf_size(y)
+#endif
 
 /*
  * We always define pmd_pfn for all archs as it's used in lots of generic
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f0128c5ff278..880df84ce07c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7596,7 +7596,7 @@ static u64 perf_get_pgtable_size(struct mm_struct *mm, 
unsigned long addr)
 
pte = ptep_get_lockless(ptep);
if (pte_present(pte))
-   size = pte_leaf_size(pte);
+   size = __pte_leaf_size(pmd, pte);
pte_unmap(ptep);
 #endif /* CONFIG_HAVE_GUP_FAST */
 
-- 
2.44.0



[PATCH v5 00/18] Reimplement huge pages without hugepd on powerpc (8xx, e500, book3s/64)

2024-06-09 Thread Christophe Leroy
This series has reached enough maturity to not be sent as an RFC anymore.
Only the book3s/64 part maybe needs more attention. Alternatively
we could simply disable HUGE pages on book3s/64 in hash-4k mode if
we want to be on the safe side.

Also see https://github.com/linuxppc/issues/issues/483

Unlike most architectures, powerpc 8xx HW requires a two-level
pagetable topology for all page sizes. So a leaf PMD-contig approach
is not feasible as such.

Possible sizes on 8xx are 4k, 16k, 512k and 8M.

First level (PGD/PMD) covers 4M per entry. For 8M pages, two PMD entries
must point to a single entry level-2 page table. Until now that was
done using hugepd. This series changes it to use standard page tables
where the entry is replicated 1024 times on each of the two pagetables
referred to by the two associated PMD entries for that 8M page.
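
As a conceptual sketch (not the actual patch code), populating one 8M page
then amounts to:

	/* Illustrative only: both PMD entries reference the same page table,
	 * and the same PTE is replicated in each of its 1024 slots. */
	pmd_populate_kernel(mm, pmdp, ptbl);
	pmd_populate_kernel(mm, pmdp + 1, ptbl);
	for (i = 0; i < 1024; i++)
		ptbl[i] = pte;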

For e500 and book3s/64 there are fewer constraints because it is not
tied to the HW assisted tablewalk like on 8xx, so it is easier to use
leaf PMDs (and PUDs).

On e500 the supported page sizes are 4M, 16M, 64M, 256M and 1G. All at
PMD level on e500/32 (mpc85xx) and mix of PMD and PUD for e500/64. We
encode the page size with 4 available bits in PTE entries. On e500/32, PGD
entry size is increased to 64 bits in order to allow leaf-PMD entries,
because PTEs are 64 bits on e500.

On book3s/64 only the hash-4k mode is concerned. It supports 16M pages
as cont-PMD and 16G pages as cont-PUD. In other modes (radix-4k, radix-64k
and hash-64k) the sizes match the PMD and PUD sizes so those are just leaf
entries. The hash processing makes things a bit more complex. To ease
things, __hash_page_huge() is modified to bail out when DIRTY or ACCESSED
bits are missing, leaving it to mm core to fix it.

Global changes in v5:
- Now use PAGE SIZE field in e500's PTE to store TSIZE instead of using U0-U3
- On e500/64, use the highest bit to discriminate leaf entries because PUD entries
are not guaranteed to be 4k aligned, so the PAGE SIZE field is not guaranteed to be 0
on a non-leaf entry (see the sketch below).
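
A sketch of that leaf test (helper name and bit illustrative, not the
actual patch code):

	static inline bool pud_is_leaf(pud_t pud)
	{
		/* highest bit discriminates leaf entries on e500/64 */
		return pud_val(pud) & (1UL << 63);
	}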

Global changes in v4:
- Fixed a few issues reported privately by robots
- Rebased on top of v6.10-rc1

Global changes in v3:
- Removed patches 1 and 2
- Squashed patch 11 into patch 5
- Replaced patches 12 and 13 with a series from Michael
- Reordered patches a bit to have more general patches up front

For more details on changes, see in each patch.

Christophe Leroy (17):
  mm: Define __pte_leaf_size() to also take a PMD entry
  mm: Provide mm_struct and address to huge_ptep_get()
  powerpc/mm: Remove _PAGE_PSIZE
  powerpc/mm: Fix __find_linux_pte() on 32 bits with PMD leaf entries
  powerpc/mm: Allow hugepages without hugepd
  powerpc/8xx: Fix size given to set_huge_pte_at()
  powerpc/8xx: Rework support for 8M pages using contiguous PTE entries
  powerpc/8xx: Simplify struct mmu_psize_def
  powerpc/e500: Remove enc and ind fields from struct mmu_psize_def
  powerpc/e500: Switch to 64 bits PGD on 85xx (32 bits)
  powerpc/e500: Encode hugepage size in PTE bits
  powerpc/e500: Don't pre-check write access on data TLB error
  powerpc/e500: Free r10 for FIND_PTE
  powerpc/e500: Use contiguous PMD instead of hugepd
  powerpc/64s: Use contiguous PMD/PUD instead of HUGEPD
  powerpc/mm: Remove hugepd leftovers
  mm: Remove CONFIG_ARCH_HAS_HUGEPD

Michael Ellerman (1):
  powerpc/64e: Remove unused IBM HTW code [SQUASHED]

 arch/arm/include/asm/hugetlb-3level.h |   4 +-
 arch/arm64/include/asm/hugetlb.h  |   2 +-
 arch/arm64/mm/hugetlbpage.c   |   2 +-
 arch/powerpc/Kconfig  |   1 -
 arch/powerpc/include/asm/book3s/32/pgalloc.h  |   2 -
 arch/powerpc/include/asm/book3s/64/hash-4k.h  |  15 -
 arch/powerpc/include/asm/book3s/64/hash.h |  38 +-
 arch/powerpc/include/asm/book3s/64/hugetlb.h  |  38 --
 .../include/asm/book3s/64/pgtable-4k.h|  47 --
 .../include/asm/book3s/64/pgtable-64k.h   |  20 -
 arch/powerpc/include/asm/book3s/64/pgtable.h  |  22 +-
 arch/powerpc/include/asm/hugetlb.h|  15 +-
 .../include/asm/nohash/32/hugetlb-8xx.h   |  38 +-
 arch/powerpc/include/asm/nohash/32/mmu-8xx.h  |   9 +-
 arch/powerpc/include/asm/nohash/32/pte-40x.h  |   3 -
 arch/powerpc/include/asm/nohash/32/pte-44x.h  |   3 -
 arch/powerpc/include/asm/nohash/32/pte-85xx.h |   3 -
 arch/powerpc/include/asm/nohash/32/pte-8xx.h  |  58 ++-
 .../powerpc/include/asm/nohash/hugetlb-e500.h |  39 +-
 arch/powerpc/include/asm/nohash/mmu-e500.h|   6 +-
 arch/powerpc/include/asm/nohash/pgalloc.h |   2 -
 arch/powerpc/include/asm/nohash/pgtable.h |  46 +-
 arch/powerpc/include/asm/nohash/pte-e500.h|  63 ++-
 arch/powerpc/include/asm/page.h   |  32 --
 arch/powerpc/include/asm/pgtable-be-types.h   |  10 -
 arch/powerpc/include/asm/pgtable-types.h  |  13 +-
 arch/powerpc/include/asm/pgtable.h|   3 +
 arch/powerpc/kernel/exceptions-64e.S  |   4 +-
 arch/powerpc/kernel/head_85xx.S   |  70 +--
 arch/powerpc/kernel/head_8xx.S

Re: [PATCH V3 05/14] tools/perf: Add disasm_line__parse to parse raw instruction for powerpc

2024-06-08 Thread Christophe Leroy


On 06/06/2024 at 08:33, Namhyung Kim wrote:
> Hello,
> 
> On Sat, Jun 01, 2024 at 11:39:32AM +0530, Athira Rajeev wrote:
>> Currently, the perf tool infrastructure uses the disasm_line__parse
>> function to parse the disassembled line.
>>
>> Example snippet from objdump:
>> objdump  --start-address= --stop-address=  -d 
>> --no-show-raw-insn -C 
>>
>> c10224b4:lwz r10,0(r9)
>>
>> This line "lwz r10,0(r9)" is parsed to extract instruction name,
>> registers names and offset. In powerpc, the approach for data type
>> profiling uses raw instruction instead of result from objdump to identify
>> the instruction category and extract the source/target registers.
>>
>> Example: 38 01 81 e8 ld  r4,312(r1)
>>
>> Here "38 01 81 e8" is the raw instruction representation. Add function
>> "disasm_line__parse_powerpc" to handle parsing of raw instruction. Also
>> update "struct ins" and "struct ins_operands" to save "opcode" and
>> binary code. With the change, function captures:
>>
>> line -> "38 01 81 e8 ld  r4,312(r1)"
>> opcode and raw instruction "38 01 81 e8"
>>
>> Raw instruction is used later to extract the reg/offset fields. Macros
>> are added to extract opcode and register fields. "struct ins_operands"
>> and "struct ins" is updated to carry opcode and raw instruction binary
>> code (raw_insn). Function "disasm_line__parse_powerpc" fills the raw
>> instruction hex value and opcode in newly added fields. There are no
>> changes in existing code paths, which parse the disassembled code.
>> The architecture using the instruction name and present approach is
>> not altered. Since this approach targets powerpc, the macro
>> implementation is added for powerpc as of now.
>>
>> Since the disasm_line__parse is used in other cases (perf annotate) and
>> not only data type profiling, the powerpc callback includes changes to
>> work with binary code as well as mnemonic representation. Also, in case
>> the DSO read fails and libcapstone is not supported, the approach
>> falls back to using objdump as an option. Hence, the patch has changes to
>> ensure the objdump option also works well.
>>
>> Signed-off-by: Athira Rajeev 
>> ---
>>   tools/include/linux/string.h  |  2 +
>>   tools/lib/string.c| 13 
>>   .../perf/arch/powerpc/annotate/instructions.c |  1 +
>>   tools/perf/arch/powerpc/util/dwarf-regs.c |  9 +++
>>   tools/perf/util/disasm.c  | 63 ++-
>>   tools/perf/util/disasm.h  |  7 +++
>>   6 files changed, 94 insertions(+), 1 deletion(-)
>>
>> diff --git a/tools/include/linux/string.h b/tools/include/linux/string.h
>> index db5c99318c79..0acb1fc14e19 100644
>> --- a/tools/include/linux/string.h
>> +++ b/tools/include/linux/string.h
>> @@ -46,5 +46,7 @@ extern char * __must_check skip_spaces(const char *);
>>   
>>   extern char *strim(char *);
>>   
>> +extern void remove_spaces(char *s);
>> +
>>   extern void *memchr_inv(const void *start, int c, size_t bytes);
>>   #endif /* _TOOLS_LINUX_STRING_H_ */
>> diff --git a/tools/lib/string.c b/tools/lib/string.c
>> index 8b6892f959ab..3126d2cff716 100644
>> --- a/tools/lib/string.c
>> +++ b/tools/lib/string.c
>> @@ -153,6 +153,19 @@ char *strim(char *s)
>>  return skip_spaces(s);
>>   }
>>   
>> +/*
>> + * remove_spaces - Removes whitespaces from @s
>> + */
>> +void remove_spaces(char *s)
>> +{
>> +char *d = s;
>> +
>> +do {
>> +while (*d == ' ')
>> +++d;
>> +} while ((*s++ = *d++));
>> +}
>> +
>>   /**
>>* strreplace - Replace all occurrences of character in string.
>>* @s: The string to operate on.
>> diff --git a/tools/perf/arch/powerpc/annotate/instructions.c 
>> b/tools/perf/arch/powerpc/annotate/instructions.c
>> index a3f423c27cae..d57fd023ef9c 100644
>> --- a/tools/perf/arch/powerpc/annotate/instructions.c
>> +++ b/tools/perf/arch/powerpc/annotate/instructions.c
>> @@ -55,6 +55,7 @@ static int powerpc__annotate_init(struct arch *arch, char 
>> *cpuid __maybe_unused)
>>  arch->initialized = true;
>>  arch->associate_instruction_ops = 
>> powerpc__associate_instruction_ops;
>>  arch->objdump.comment_char  = '#';
>> +annotate_opts.show_asm_raw = true;
>>  }
>>   
>>  return 0;
>> diff --git a/tools/perf/arch/powerpc/util/dwarf-regs.c 
>> b/tools/perf/arch/powerpc/util/dwarf-regs.c
>> index 0c4f4caf53ac..430623ca5612 100644
>> --- a/tools/perf/arch/powerpc/util/dwarf-regs.c
>> +++ b/tools/perf/arch/powerpc/util/dwarf-regs.c
>> @@ -98,3 +98,12 @@ int regs_query_register_offset(const char *name)
>>  return roff->ptregs_offset;
>>  return -EINVAL;
>>   }
>> +
>> +#define PPC_OP(op)  (((op) >> 26) & 0x3F)
>> +#define PPC_RA(a)   (((a) >> 16) & 0x1f)
>> +#define PPC_RT(t)   (((t) >> 21) & 0x1f)
>> +#define PPC_RB(b)   (((b) >> 11) & 0x1f)
>> +#define PPC_D(D)((D) & 0xfffe)
>> +#define PPC_DS(DS)  ((DS) & 

Re: [RFC PATCH v4 12/16] powerpc/e500: Encode hugepage size in PTE bits

2024-05-29 Thread Christophe Leroy


On 29/05/2024 at 12:09, Oscar Salvador wrote:
> On Wed, May 29, 2024 at 09:49:48AM +0000, Christophe Leroy wrote:
>> Doesn't really matter if it's PUD or PMD at this point. On a 32 bits
>> kernel it will be all PMD while on a 64 bits kernel it is both PMD and PUD.
>>
>> At the time being (as implemented with hugepd), Linux supports 4M, 16M,
>> 64M, 256M and 1G (Shifts 22, 24, 26, 28, 30)
>>
>> The hardware supports the following page sizes, and encodes them on 4
>> bits although it is not directly a shift. Maybe it would be better to
>> use that encoding after all:
> 
> I think so.
> 
>>
>> 0001 4 Kbytes (Shift 12)
>> 0010 16 Kbytes (Shift 14)
>> 0011 64 Kbytes (Shift 16)
>> 0100 256 Kbytes (Shift 18)
>> 0101 1 Mbyte (Shift 20)
>> 0110 4 Mbytes (Shift 22)
>> 0111 16 Mbytes (Shift 24)
>> 1000 64 Mbytes (Shift 26)
>> 1001 256 Mbytes (Shift 28)
>> 1010 1 Gbyte (e500v2 only) (Shift 30)
>> 1011 4 Gbytes (e500v2 only) (Shift 32)
> 
> You say hugepages start at 2MB (shift 21), but you say that the smallest
> hugepage Linux supports is 4MB (shift 22)?
> 
> 

No, I say PMD_SIZE is 2MB on e500 with 64-bit PTEs, and at the time being
the Linux powerpc implementation for e500 supports sizes 4M, 16M, 64M, 256M
and 1G.

But for instance on 8xx we have 16k and 512k hugepages. Here on the e500
we could, in a follow-up patch, add support for smaller page sizes, for
instance 16k, 64k, 256k and 1M. Of course all of those would then be cont-PTE
and not cont-PMD.


Re: [RFC PATCH v4 14/16] powerpc/64s: Use contiguous PMD/PUD instead of HUGEPD

2024-05-29 Thread Christophe Leroy


On 29/05/2024 at 11:23, Oscar Salvador wrote:
> On Mon, May 27, 2024 at 03:30:12PM +0200, Christophe Leroy wrote:
>> On book3s/64, the only user of hugepd is hash in 4k mode.
>>
>> All other setups (hash-64, radix-4, radix-64) use leaf PMD/PUD.
>>
>> Rework hash-4k to use contiguous PMD and PUD instead.
>>
>> In that setup there are only two huge page sizes: 16M and 16G.
>>
>> 16M sits at PMD level and 16G at PUD level.
> 
> 
> On 4k mode, PMD_SIZE is 2MB and PUD_SIZE is 256MB, right?

Correct, as documented in arch/powerpc/include/asm/book3s/64/hash-4k.h

> 
>> +static inline unsigned long hash__pte_update(struct mm_struct *mm,
>> + unsigned long addr,
>> + pte_t *ptep, unsigned long clr,
>> + unsigned long set,
>> + int huge)
>> +{
>> +unsigned long old;
>> +
>> +old = hash__pte_update_one(ptep, clr, set);
>> +
>> +if (IS_ENABLED(CONFIG_PPC_4K_PAGES) && huge) {
>> +unsigned int psize = get_slice_psize(mm, addr);
>> +int nb, i;
>> +
>> +if (psize == MMU_PAGE_16M)
>> +nb = SZ_16M / PMD_SIZE;
>> +else if (psize == MMU_PAGE_16G)
>> +nb = SZ_16G / PUD_SIZE;
>> +else
>> +nb = 1;
> 
> On 4K, hugepages are either 16M or 16G. How can we end up in a situation
> where the pte is huge, but it is neither MMU_PAGE_16G nor MMU_PAGE_16M?

We can't, but I didn't want to leave nb undefined or with a value that
might lead to writing in the weeds. Value 1 seems a safe default.

> 
>> diff --git a/arch/powerpc/mm/book3s64/hugetlbpage.c 
>> b/arch/powerpc/mm/book3s64/hugetlbpage.c
>> index 5a2e512e96db..83c3361b358b 100644
>> --- a/arch/powerpc/mm/book3s64/hugetlbpage.c
>> +++ b/arch/powerpc/mm/book3s64/hugetlbpage.c
>> @@ -53,6 +53,16 @@ int __hash_page_huge(unsigned long ea, unsigned long 
>> access, unsigned long vsid,
>>  /* If PTE permissions don't match, take page fault */
>>  if (unlikely(!check_pte_access(access, old_pte)))
>>  return 1;
>> +/*
>> + * If hash-4k, hugepages use seeral contiguous PxD entries
> 'several'
>> + * so bail out and let mm make the page young or dirty
>> + */
>> +if (IS_ENABLED(CONFIG_PPC_4K_PAGES)) {
>> +if (!(old_pte & _PAGE_ACCESSED))
>> +return 1;
>> +if ((access & _PAGE_WRITE) && !(old_pte & _PAGE_DIRTY))
>> +return 1;
> 
> I have 0 clue about this code. What would happen if we do not bail out?
> 

In that case the pte_xchg() in the while () will only set the ACCESSED or
DIRTY bit on the first PxD entry, not on all cont-PxD entries.

Christophe


Re: [RFC PATCH v4 13/16] powerpc/e500: Use contiguous PMD instead of hugepd

2024-05-29 Thread Christophe Leroy


On 29/05/2024 at 10:49, Oscar Salvador wrote:
> 
> On Mon, May 27, 2024 at 03:30:11PM +0200, Christophe Leroy wrote:
>> e500 supports many page sizes, among which the following sizes are
>> implemented in the kernel at the time being: 4M, 16M, 64M, 256M, 1G.
>>
>> On e500, TLB miss for hugepages is exclusively handled by SW even
>> on e6500 which has HW assistance for 4k pages, so there are no
>> constraints like on the 8xx.
>>
>> On e500/32, all are at PGD/PMD level and can be handled as
>> cont-PMD.
>>
>> On e500/64, smaller ones are on PMD while bigger ones are on PUD.
>> Again, they can easily be handled as cont-PMD and cont-PUD instead
>> of hugepd.
>>
>> Signed-off-by: Christophe Leroy 
> 
> ...
> 
>> diff --git a/arch/powerpc/include/asm/nohash/pgtable.h 
>> b/arch/powerpc/include/asm/nohash/pgtable.h
>> index 90d6a0943b35..f7421d1a1693 100644
>> --- a/arch/powerpc/include/asm/nohash/pgtable.h
>> +++ b/arch/powerpc/include/asm/nohash/pgtable.h
>> @@ -52,11 +52,36 @@ static inline pte_basic_t pte_update(struct mm_struct 
>> *mm, unsigned long addr, p
>>   {
>>pte_basic_t old = pte_val(*p);
>>pte_basic_t new = (old & ~(pte_basic_t)clr) | set;
>> + unsigned long sz;
>> + unsigned long pdsize;
>> + int i;
>>
>>if (new == old)
>>return old;
>>
>> - *p = __pte(new);
>> +#ifdef CONFIG_PPC_E500
>> + if (huge)
>> + sz = 1UL << (((old & _PAGE_HSIZE_MSK) >> _PAGE_HSIZE_SHIFT) + 
>> 20);
>> + else
> 
> I think this will not compile when CONFIG_PPC_85xx && !CONFIG_PTE_64BIT.

Yes, I got feedback on this from the robots.

> 
> You have declared _PAGE_HSIZE_MSK and _PAGE_HSIZE_SHIFT in
> arch/powerpc/include/asm/nohash/hugetlb-e500.h.
> 
> But hugetlb-e500.h is only included if CONFIG_PPC_85xx && CONFIG_PTE_64BIT
> (see arch/powerpc/include/asm/nohash/32/pgtable.h).
> 
> 
> 
>> +#endif
>> + sz = PAGE_SIZE;
>> +
>> + if (!huge || sz < PMD_SIZE)
>> + pdsize = PAGE_SIZE;
>> + else if (sz < PUD_SIZE)
>> + pdsize = PMD_SIZE;
>> + else if (sz < P4D_SIZE)
>> + pdsize = PUD_SIZE;
>> + else if (sz < PGDIR_SIZE)
>> + pdsize = P4D_SIZE;
>> + else
>> + pdsize = PGDIR_SIZE;
>> +
>> + for (i = 0; i < sz / pdsize; i++, p++) {
>> + *p = __pte(new);
>> + if (new)
>> + new += (unsigned long long)(pdsize / PAGE_SIZE) << 
>> PTE_RPN_SHIFT;
> 
> I guess 'new' can be 0 if pte_update() is called on behalf of clearing
> the pte?

It is exactly that, and without that verification I had pmd_bad() 
returning bad pmds after freeing page tables.

> 
>> +static inline unsigned long pmd_leaf_size(pmd_t pmd)
>> +{
>> + return 1UL << (((pmd_val(pmd) & _PAGE_HSIZE_MSK) >> _PAGE_HSIZE_SHIFT) 
>> + 20);
> 
> Can we have the '20' defined somewhere with a comment on top explaining
> what it is, so it is not a magic number?
> Otherwise people might come look at this and wonder why 20.

Yes, I now have:

+#define _PAGE_HSIZE_MSK (_PAGE_U0 | _PAGE_U1 | _PAGE_U2 | _PAGE_U3)
+#define _PAGE_HSIZE_SHIFT  14
+#define _PAGE_HSIZE_SHIFT_OFFSET   20

and have added a helper to avoid doing the calculation at several places:

+static inline unsigned long pte_huge_size(pte_t pte)
+{
+   pte_basic_t val = pte_val(pte);
+
+   return 1UL << (((val & _PAGE_HSIZE_MSK) >> _PAGE_HSIZE_SHIFT) + _PAGE_HSIZE_SHIFT_OFFSET);
+}
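
A caller such as pte_update() can then compute the size as (sketch):

	sz = huge ? pte_huge_size(__pte(old)) : PAGE_SIZE;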


> 
>> --- a/arch/powerpc/mm/pgtable.c
>> +++ b/arch/powerpc/mm/pgtable.c
>> @@ -331,6 +331,37 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned 
>> long addr, pte_t *ptep,
>>__set_huge_pte_at(pmdp, ptep, pte_val(pte));
>>}
>>   }
>> +#elif defined(CONFIG_PPC_E500)
>> +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
>> +  pte_t pte, unsigned long sz)
>> +{
>> + unsigned long pdsize;
>> + int i;
>> +
>> + pte = set_pte_filter(pte, addr);
>> +
>> + /*
>> +  * Make sure hardware valid bit is not set. We don't do
>> +  * tlb flush for this update.
>> +  */
>> + VM_WARN_ON(pte_hw_valid(*ptep) && !pt

Re: [RFC PATCH v4 12/16] powerpc/e500: Encode hugepage size in PTE bits

2024-05-29 Thread Christophe Leroy


On 29/05/2024 at 10:05, Oscar Salvador wrote:
> 
> On Mon, May 27, 2024 at 03:30:10PM +0200, Christophe Leroy wrote:
>> Use U0-U3 bits to encode hugepage size, more exactly page shift.
>>
>> As we start using hugepages at shift 21 (2Mbytes), subtract 20
>> so that it fits into 4 bits. That may change in the future if
>> we want to use smaller hugepages.
> 
> What other shifts we can have here on e500? PUD_SHIFT?

Doesn't really matter if it's PUD or PMD at this point. On a 32 bits 
kernel it will be all PMD while on a 64 bits kernel it is both PMD and PUD.

At the time being (as implemented with hugepd), Linux supports 4M, 16M,
64M, 256M and 1G (Shifts 22, 24, 26, 28, 30)

The hardware supports the following page sizes, and encodes them on 4 
bits although it is not directly a shift. Maybe it would be better to
use that encoding after all:

0001 4 Kbytes (Shift 12)
0010 16 Kbytes (Shift 14)
0011 64 Kbytes (Shift 16)
0100 256 Kbytes (Shift 18)
0101 1 Mbyte (Shift 20)
0110 4 Mbytes (Shift 22)
0111 16 Mbytes (Shift 24)
1000 64 Mbytes (Shift 26)
1001 256 Mbytes (Shift 28)
1010 1 Gbyte (e500v2 only) (Shift 30)
1011 4 Gbytes (e500v2 only) (Shift 32)
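
That 4-bit encoding is the TSIZE format, where the page size is
4K * 4^TSIZE, so a conversion helper could be as simple as (sketch, name
illustrative):

	static inline unsigned int tsize_to_shift(unsigned int tsize)
	{
		return 10 + 2 * tsize;	/* e.g. 0111 (7) -> shift 24 (16M) */
	}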


> Could you please spell them out here?
> Or even better,
> 
>>
>> Signed-off-by: Christophe Leroy 
>> ---
>>   arch/powerpc/include/asm/nohash/hugetlb-e500.h | 6 ++
>>   arch/powerpc/include/asm/nohash/pte-e500.h | 3 +++
>>   2 files changed, 9 insertions(+)
>>
>> diff --git a/arch/powerpc/include/asm/nohash/hugetlb-e500.h 
>> b/arch/powerpc/include/asm/nohash/hugetlb-e500.h
>> index 8f04ad20e040..d8e51a3f8557 100644
>> --- a/arch/powerpc/include/asm/nohash/hugetlb-e500.h
>> +++ b/arch/powerpc/include/asm/nohash/hugetlb-e500.h
>> @@ -42,4 +42,10 @@ static inline int check_and_get_huge_psize(int shift)
>>return shift_to_mmu_psize(shift);
>>   }
>>
>> +static inline pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, 
>> vm_flags_t flags)
>> +{
>> + return __pte(pte_val(entry) | (_PAGE_U3 * (shift - 20)));
>> +}
>> +#define arch_make_huge_pte arch_make_huge_pte
>> +
>>   #endif /* _ASM_POWERPC_NOHASH_HUGETLB_E500_H */
>> diff --git a/arch/powerpc/include/asm/nohash/pte-e500.h 
>> b/arch/powerpc/include/asm/nohash/pte-e500.h
>> index 975facc7e38e..091e4bff1fba 100644
>> --- a/arch/powerpc/include/asm/nohash/pte-e500.h
>> +++ b/arch/powerpc/include/asm/nohash/pte-e500.h
>> @@ -46,6 +46,9 @@
>>   #define _PAGE_NO_CACHE   0x40 /* I: cache inhibit */
>>   #define _PAGE_WRITETHRU  0x80 /* W: cache write-through */
>> +#define _PAGE_HSIZE_MSK (_PAGE_U0 | _PAGE_U1 | _PAGE_U2 | _PAGE_U3)
>> +#define _PAGE_HSIZE_SHIFT14
> 
> Add a comment in above explaining which P*_SHIFT we need cover with these
> 4bits.
> 
> 
> 
> --
> Oscar Salvador
> SUSE Labs


Re: [RFC PATCH v4 08/16] powerpc/8xx: Rework support for 8M pages using contiguous PTE entries

2024-05-29 Thread Christophe Leroy


On 29/05/2024 at 10:02, Oscar Salvador wrote:
> 
> On Mon, May 27, 2024 at 03:30:06PM +0200, Christophe Leroy wrote:
>> In order to fit better with standard Linux page tables layout, add
>> support for 8M pages using contiguous PTE entries in a standard
>> page table. Page tables will then be populated with 1024 similar
>> entries and two PMD entries will point to that page table.
>>
>> The PMD entries also get a flag to tell it is addressing an 8M page,
>> this is required for the HW tablewalk assistance.
>>
>> Signed-off-by: Christophe Leroy 
>> Reviewed-by: Oscar Salvador 
>> ---
> ...
>> +#define __HAVE_ARCH_HUGE_PTEP_GET
>> +static inline pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, 
>> pte_t *ptep)
>> +{
>> + if (ptep_is_8m_pmdp(mm, addr, ptep))
>> + ptep = pte_offset_kernel((pmd_t *)ptep, 0);
> 
> Yes, you are right that this should have had the addr aligned down.
> 
> I can't speak for others, but for me it is more clear to think of it this way:
> 
> 1) check if ptep points to the first PMD entry for address
> 2) if it does, we know that the PMD describes an 8MB hugepage
> 3) return the PMD

But it is not exactly the way I see it; maybe I'm wrong, but from my
point of view:
1) check if ptep points to the first PMD entry for address
2) if it does, we know that the PMD relates to an 8MB hugepage
3) Return the first PTE in the page table pointed by the said PMD entry.

> 
> That is why I thought that directly calling pmd_page_vaddr() gave a more clear
> overview.
> 
> Now, feel free to ignore this if you think this is not clear or adds 
> confusion,
> I just wanted to give my insight reflecting on what I considered more
> logical.

For me, using pte_offset_kernel() makes it more clear that we want a PTE
in a page table, and not some raw information at a given address pointed
to by a huge-PMD.

Am I wrong ?

> 
> 
> --
> Oscar Salvador
> SUSE Labs


Re: [PATCH v2 1/2] memory: fsl_ifc: Make FSL_IFC config visible and selectable

2024-05-28 Thread Christophe Leroy


On 28/05/2024 at 14:28, Esben Haabendal wrote:
> 
> While use of fsl_ifc driver with NAND flash is fine, as the fsl_ifc_nand
> driver selects FSL_IFC automatically, we need the CONFIG_FSL_IFC option to
> be selectable for platforms using fsl_ifc with NOR flash.

I don't understand.

Shall I understand:

While use of fsl_ifc driver with NAND flash is fine as the fsl_ifc_nand 
driver selects FSL_IFC automatically, 

or

..., as the fsl_ifc_nand driver selects FSL_IFC automatically, we need
the CONFIG_FSL_IFC option to be selectable for platforms using fsl_ifc
with NOR flash?



I'm fine with the fact that you want to be able to select it when you
use NOR flashes, although I can't see why, but why do you need to
change the "select" to a "depends"? You should be able to leave it as a
"select", in which case patch 2 wouldn't be necessary.

Christophe



> 
> Fixes: ea0c0ad6b6eb ("memory: Enable compile testing for most of the drivers")
> Signed-off-by: Esben Haabendal 
> ---
>   drivers/memory/Kconfig   | 2 +-
>   drivers/mtd/nand/raw/Kconfig | 3 +--
>   2 files changed, 2 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/memory/Kconfig b/drivers/memory/Kconfig
> index 8efdd1f97139..c82d8d8a16ea 100644
> --- a/drivers/memory/Kconfig
> +++ b/drivers/memory/Kconfig
> @@ -167,7 +167,7 @@ config FSL_CORENET_CF
>represents a coherency violation.
> 
>   config FSL_IFC
> -   bool "Freescale IFC driver" if COMPILE_TEST
> +   bool "Freescale IFC driver"
>  depends on FSL_SOC || ARCH_LAYERSCAPE || SOC_LS1021A || COMPILE_TEST
>  depends on HAS_IOMEM
> 
> diff --git a/drivers/mtd/nand/raw/Kconfig b/drivers/mtd/nand/raw/Kconfig
> index cbf8ae85e1ae..614257308516 100644
> --- a/drivers/mtd/nand/raw/Kconfig
> +++ b/drivers/mtd/nand/raw/Kconfig
> @@ -234,8 +234,7 @@ config MTD_NAND_FSL_IFC
>  tristate "Freescale IFC NAND controller"
>  depends on FSL_SOC || ARCH_LAYERSCAPE || SOC_LS1021A || COMPILE_TEST
>  depends on HAS_IOMEM
> -   select FSL_IFC
> -   select MEMORY
> +   depends on FSL_IFC
>  help
>Various Freescale chips e.g P1010, include a NAND Flash machine
>with built-in hardware ECC capabilities.
> 
> --
> 2.45.1
> 


Re: [RFC PATCH v4 03/16] mm: Provide mm_struct and address to huge_ptep_get()

2024-05-28 Thread Christophe Leroy


On 28/05/2024 at 07:41, Oscar Salvador wrote:
> On Mon, May 27, 2024 at 03:30:01PM +0200, Christophe Leroy wrote:
>> --- a/mm/gup.c
>> +++ b/mm/gup.c
>> @@ -547,7 +547,7 @@ static int gup_hugepte(struct vm_area_struct *vma, pte_t 
>> *ptep, unsigned long sz
>>  if (pte_end < end)
>>  end = pte_end;
>>   
>> -pte = huge_ptep_get(ptep);
>> +pte = huge_ptep_get(vma->mm, addr, ptep);
> 
> I looked again and I stumbled upon this.
> It should have been "vma->vm_mm".

Oops ... Thanks for seeing that. As it goes away at the end of the
series, it went unnoticed by builds.

Christophe


Re: [RFC PATCH v3 08/16] powerpc/8xx: Rework support for 8M pages using contiguous PTE entries

2024-05-28 Thread Christophe Leroy


On 27/05/2024 at 14:10, Oscar Salvador wrote:
> On Sun, May 26, 2024 at 11:22:28AM +0200, Christophe Leroy wrote:
>> In order to fit better with standard Linux page tables layout, add
>> support for 8M pages using contiguous PTE entries in a standard
>> page table. Page tables will then be populated with 1024 similar
>> entries and two PMD entries will point to that page table.
>>
>> The PMD entries also get a flag to tell it is addressing an 8M page,
>> this is required for the HW tablewalk assistance.
>>
>> Signed-off-by: Christophe Leroy 
> 
> I did not look closely into the KASAN bits, and I trust you with the assembly part,
> but other than that looks good to me, so FWIW:
> 
> Reviewed-by: Oscar Salvador 
> 
> Just a nit below:
> 
>> +#define __HAVE_ARCH_HUGE_PTEP_GET
>> +static inline pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, 
>> pte_t *ptep)
>> +{
>> +if (ptep_is_8m_pmdp(mm, addr, ptep))
>> +ptep = pte_offset_kernel((pmd_t *)ptep, 0);
> 
> Would it not be more clear to use pmd_page_vaddr directly there?
> 
> 

Well, the correct way should have been:

ptep = pte_offset_kernel((pmd_t *)ptep, ALIGN_DOWN(addr, SZ_8M));

Now, is it more clear with:

ptep = (pte_t *)pmd_page_vaddr(*(pmd_t *)ptep);

I don't know.


Re: [RFC PATCH v3 03/16] mm: Provide mm_struct and address to huge_ptep_get()

2024-05-27 Thread Christophe Leroy


On 27/05/2024 at 13:19, Oscar Salvador wrote:
> On Sun, May 26, 2024 at 11:22:23AM +0200, Christophe Leroy wrote:
>> On powerpc 8xx huge_ptep_get() will need to know whether the given
>> ptep is a PTE entry or a PMD entry. This cannot be known with the
>> PMD entry itself because there is no easy way to know it from the
>> content of the entry.
>>
>> So huge_ptep_get() will need to know either the size of the page
>> or get the pmd.
>>
>> In order to be consistent with huge_ptep_get_and_clear(), give
>> mm and address to huge_ptep_get().
>>
>> Signed-off-by: Christophe Leroy 
>> ---
>> v2: Add missing changes in arch implementations
>> v3: Fixed a comment in ARM and missing changes in S390
>> ---
>>   arch/arm/include/asm/hugetlb-3level.h |  4 +--
>>   arch/arm64/include/asm/hugetlb.h  |  2 +-
>>   arch/arm64/mm/hugetlbpage.c   |  2 +-
>>   arch/riscv/include/asm/hugetlb.h  |  2 +-
>>   arch/riscv/mm/hugetlbpage.c   |  2 +-
>>   arch/s390/include/asm/hugetlb.h   |  4 +--
>>   arch/s390/mm/hugetlbpage.c|  4 +--
> 
> I was wondering whether we could do something similar to what we did in
> patch#1, so we do not touch architecture code.

We could, but is that worth the churn?

With patch 1 there was only one callsite.

Here we have many callsites, and we also have huge_ptep_get_and_clear() 
which already takes three arguments. So for me it makes more sense to
adapt huge_ptep_get() here.

Today several of the huge-related functions already have parameters that 
are used only by a few architectures, and every time one architecture
needs a new parameter it is added for all of them. There are
examples in the past of new functions added to get new parameters for
only a few architectures that ended up with a mess and a need to 
re-factor at the end.

See for instance the story around arch_make_huge_pte() and pte_mkhuge(), 
both do the same but arch_make_huge_pte() was added to take additional 
parameters by commit d9ed9faac283 ("mm: add new arch_make_huge_pte() 
method for tile support") then they were merged by commit 16785bd77431 
("mm: merge pte_mkhuge() call into arch_make_huge_pte()")

So I'm open to any suggestion, but we need to try not to make it a
bigger mess in the end.

By the way, I think most if not all huge-related helpers should take
the same parameters even if not all of them are used; that would make
things easier. And maybe the cleanest would be to give the page size to
all those functions instead of having them guess it.
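
For instance (sketch only, signatures illustrative rather than a firm
proposal):

	pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr,
			    pte_t *ptep, unsigned long sz);
	pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
				      pte_t *ptep, unsigned long sz);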

So let's have your ideas here on the most straightforward way to handle
that.

> 
>
>> diff --git a/mm/gup.c b/mm/gup.c
>> index 1611e73b1121..86b5105b82a1 100644
>> --- a/mm/gup.c
>> +++ b/mm/gup.c
>> @@ -2812,7 +2812,7 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, 
>> unsigned long addr,
>>  if (pte_end < end)
>>  end = pte_end;
>>   
>> -pte = huge_ptep_get(ptep);
>> +pte = huge_ptep_get(NULL, addr, ptep);
> 
> I know that after this series all this code is gone, but I was not sure
> about the behaviour between this patch and the last one.
> 
> It made me nervous, until I realized that this code is only used
> on CONFIG_ARCH_HAS_HUGEPD, which should not be the case anymore for 8xx after
> patch#8, and since 8xx is the only one that will use the mm parameter from
> huge_ptep_get, we are all good.
> 

By the way, after commit 01d89b93e176 ("mm/gup: fix hugepd handling in 
hugetlb rework") we now have the vma in gup_hugepte() so we now pass 
vma->vm_mm

Thanks for the review
Christophe


[RFC PATCH v4 16/16] mm: Remove CONFIG_ARCH_HAS_HUGEPD

2024-05-27 Thread Christophe Leroy
powerpc was the only user of CONFIG_ARCH_HAS_HUGEPD and doesn't
use it anymore, so remove all related code.

Signed-off-by: Christophe Leroy 
---
v4: Rebased on v6.10-rc1
---
 arch/powerpc/mm/hugetlbpage.c |   1 -
 include/linux/hugetlb.h   |   6 --
 mm/Kconfig|  10 --
 mm/gup.c  | 183 +-
 mm/pagewalk.c |  57 +--
 5 files changed, 9 insertions(+), 248 deletions(-)

diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 76846c6014e4..6b043180220a 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -78,7 +78,6 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct 
vm_area_struct *vma,
 
return pte_alloc_huge(mm, pmd, addr);
 }
-#endif
 
 #ifdef CONFIG_PPC_BOOK3S_64
 /*
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 2b3c3a404769..58daf7d14bf4 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -20,12 +20,6 @@ struct user_struct;
 struct mmu_gather;
 struct node;
 
-#ifndef CONFIG_ARCH_HAS_HUGEPD
-typedef struct { unsigned long pd; } hugepd_t;
-#define is_hugepd(hugepd) (0)
-#define __hugepd(x) ((hugepd_t) { (x) })
-#endif
-
 void free_huge_folio(struct folio *folio);
 
 #ifdef CONFIG_HUGETLB_PAGE
diff --git a/mm/Kconfig b/mm/Kconfig
index b4cb45255a54..049d29ec6e20 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1119,16 +1119,6 @@ config DMAPOOL_TEST
 config ARCH_HAS_PTE_SPECIAL
bool
 
-#
-# Some architectures require a special hugepage directory format that is
-# required to support multiple hugepage sizes. For example a4fe3ce76
-# "powerpc/mm: Allow more flexible layouts for hugepage pagetables"
-# introduced it on powerpc.  This allows for a more flexible hugepage
-# pagetable layouts.
-#
-config ARCH_HAS_HUGEPD
-   bool
-
 config MAPPING_DIRTY_HELPERS
 bool
 
diff --git a/mm/gup.c b/mm/gup.c
index 53ebb0ae53a0..f8e982a42bba 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -501,7 +501,7 @@ static inline void mm_set_has_pinned_flag(unsigned long 
*mm_flags)
 
 #ifdef CONFIG_MMU
 
-#if defined(CONFIG_ARCH_HAS_HUGEPD) || defined(CONFIG_HAVE_GUP_FAST)
+#ifdef CONFIG_HAVE_GUP_FAST
 static int record_subpages(struct page *page, unsigned long sz,
   unsigned long addr, unsigned long end,
   struct page **pages)
@@ -515,147 +515,7 @@ static int record_subpages(struct page *page, unsigned 
long sz,
 
return nr;
 }
-#endif /* CONFIG_ARCH_HAS_HUGEPD || CONFIG_HAVE_GUP_FAST */
-
-#ifdef CONFIG_ARCH_HAS_HUGEPD
-static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
- unsigned long sz)
-{
-   unsigned long __boundary = (addr + sz) & ~(sz-1);
-   return (__boundary - 1 < end - 1) ? __boundary : end;
-}
-
-/*
- * Returns 1 if succeeded, 0 if failed, -EMLINK if unshare needed.
- *
- * NOTE: for the same entry, gup-fast and gup-slow can return different
- * results (0 v.s. -EMLINK) depending on whether vma is available.  This is
- * the expected behavior, where we simply want gup-fast to fallback to
- * gup-slow to take the vma reference first.
- */
-static int gup_hugepte(struct vm_area_struct *vma, pte_t *ptep, unsigned long 
sz,
-  unsigned long addr, unsigned long end, unsigned int 
flags,
-  struct page **pages, int *nr)
-{
-   unsigned long pte_end;
-   struct page *page;
-   struct folio *folio;
-   pte_t pte;
-   int refs;
-
-   pte_end = (addr + sz) & ~(sz-1);
-   if (pte_end < end)
-   end = pte_end;
-
-   pte = huge_ptep_get(vma->mm, addr, ptep);
-
-   if (!pte_access_permitted(pte, flags & FOLL_WRITE))
-   return 0;
-
-   /* hugepages are never "special" */
-   VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
-
-   page = pte_page(pte);
-   refs = record_subpages(page, sz, addr, end, pages + *nr);
-
-   folio = try_grab_folio(page, refs, flags);
-   if (!folio)
-   return 0;
-
-   if (unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) {
-   gup_put_folio(folio, refs, flags);
-   return 0;
-   }
-
-   if (!pte_write(pte) && gup_must_unshare(vma, flags, &folio->page)) {
-   gup_put_folio(folio, refs, flags);
-   return -EMLINK;
-   }
-
-   *nr += refs;
-   folio_set_referenced(folio);
-   return 1;
-}
-
-/*
- * NOTE: currently GUP for a hugepd is only possible on hugetlbfs file
- * systems on Power, which does not have issue with folio writeback against
- * GUP updates.  When hugepd will be extended to support non-hugetlbfs or
- * even anonymous memory, we need to do extra check as what we do with most
- * of the other folios. See writable_file_mapping_allowed() and
- * gup_fast_folio_allowed() for more information.
- */
-static int gu

[RFC PATCH v4 06/16] powerpc/mm: Allow hugepages without hugepd

2024-05-27 Thread Christophe Leroy
In preparation for implementing huge pages on powerpc 8xx
without hugepd, enclose hugepd-related code inside an
ifdef CONFIG_ARCH_HAS_HUGEPD.

This also allows removing some stubs.
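
As an illustration of the new huge_pte_alloc() below (assuming 8xx, where
a PGD entry covers 4M and the intermediate levels are folded): a 512k page
resolves to a PTE inside a page table (cont-PTE), while an 8M page stops
at the PMD level since sz >= PMD_SIZE.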

Signed-off-by: Christophe Leroy 
Reviewed-by: Oscar Salvador 
---
v3:
- Prepare huge_pte_alloc() for full standard topology, not only for 2-level
- Reordered last part of huge_pte_alloc()

v4:
- Rebased of v6.10-rc1
---
 arch/powerpc/include/asm/book3s/32/pgalloc.h |  2 --
 arch/powerpc/include/asm/hugetlb.h   | 10 ++
 arch/powerpc/include/asm/nohash/pgtable.h|  2 +-
 arch/powerpc/mm/hugetlbpage.c| 33 
 arch/powerpc/mm/pgtable.c|  2 ++
 5 files changed, 38 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/32/pgalloc.h 
b/arch/powerpc/include/asm/book3s/32/pgalloc.h
index dc5c039eb28e..dd4eb3063175 100644
--- a/arch/powerpc/include/asm/book3s/32/pgalloc.h
+++ b/arch/powerpc/include/asm/book3s/32/pgalloc.h
@@ -47,8 +47,6 @@ static inline void pgtable_free(void *table, unsigned 
index_size)
}
 }
 
-#define get_hugepd_cache_index(x)  (x)
-
 static inline void pgtable_free_tlb(struct mmu_gather *tlb,
void *table, int shift)
 {
diff --git a/arch/powerpc/include/asm/hugetlb.h 
b/arch/powerpc/include/asm/hugetlb.h
index ea71f7245a63..79176a499763 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -30,10 +30,12 @@ static inline int is_hugepage_only_range(struct mm_struct 
*mm,
 }
 #define is_hugepage_only_range is_hugepage_only_range
 
+#ifdef CONFIG_ARCH_HAS_HUGEPD
 #define __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE
 void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
unsigned long end, unsigned long floor,
unsigned long ceiling);
+#endif
 
 #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR
 static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
@@ -67,14 +69,6 @@ static inline void flush_hugetlb_page(struct vm_area_struct 
*vma,
 {
 }
 
-#define hugepd_shift(x) 0
-static inline pte_t *hugepte_offset(hugepd_t hpd, unsigned long addr,
-   unsigned pdshift)
-{
-   return NULL;
-}
-
-
 static inline void __init gigantic_hugetlb_cma_reserve(void)
 {
 }
diff --git a/arch/powerpc/include/asm/nohash/pgtable.h 
b/arch/powerpc/include/asm/nohash/pgtable.h
index f5f39d4f03c8..e7fc1314c23e 100644
--- a/arch/powerpc/include/asm/nohash/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/pgtable.h
@@ -340,7 +340,7 @@ static inline void __set_pte_at(struct mm_struct *mm, 
unsigned long addr,
 
 #define pgprot_writecombine pgprot_noncached_wc
 
-#ifdef CONFIG_HUGETLB_PAGE
+#ifdef CONFIG_ARCH_HAS_HUGEPD
 static inline int hugepd_ok(hugepd_t hpd)
 {
 #ifdef CONFIG_PPC_8xx
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 594a4b7b2ca2..20fad59ff9f5 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -42,6 +42,7 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long 
addr, unsigned long s
return __find_linux_pte(mm->pgd, addr, NULL, NULL);
 }
 
+#ifdef CONFIG_ARCH_HAS_HUGEPD
 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
   unsigned long address, unsigned int pdshift,
   unsigned int pshift, spinlock_t *ptl)
@@ -193,6 +194,36 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct 
vm_area_struct *vma,
 
return hugepte_offset(*hpdp, addr, pdshift);
 }
+#else
+pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, unsigned long sz)
+{
+   p4d_t *p4d;
+   pud_t *pud;
+   pmd_t *pmd;
+
+   addr &= ~(sz - 1);
+
+   p4d = p4d_offset(pgd_offset(mm, addr), addr);
+   if (!mm_pud_folded(mm) && sz >= P4D_SIZE)
+   return (pte_t *)p4d;
+
+   pud = pud_alloc(mm, p4d, addr);
+   if (!pud)
+   return NULL;
+   if (!mm_pmd_folded(mm) && sz >= PUD_SIZE)
+   return (pte_t *)pud;
+
+   pmd = pmd_alloc(mm, pud, addr);
+   if (!pmd)
+   return NULL;
+
+   if (sz >= PMD_SIZE)
+   return (pte_t *)pmd;
+
+   return pte_alloc_huge(mm, pmd, addr);
+}
+#endif
 
 #ifdef CONFIG_PPC_BOOK3S_64
 /*
@@ -248,6 +279,7 @@ int __init alloc_bootmem_huge_page(struct hstate *h, int 
nid)
return __alloc_bootmem_huge_page(h, nid);
 }
 
+#ifdef CONFIG_ARCH_HAS_HUGEPD
 #ifndef CONFIG_PPC_BOOK3S_64
 #define HUGEPD_FREELIST_SIZE \
((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))
@@ -505,6 +537,7 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb,
}
} while (addr = next, addr != end);
 }
+#endif
 
 bool __init arch_hugetlb_valid_size(unsigned long size)
 {
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc

[RFC PATCH v4 09/16] powerpc/8xx: Simplify struct mmu_psize_def

2024-05-27 Thread Christophe Leroy
On 8xx, only the shift field is used in struct mmu_psize_def

Remove other fields and related macros.

Signed-off-by: Christophe Leroy 
Reviewed-by: Oscar Salvador 
---
 arch/powerpc/include/asm/nohash/32/mmu-8xx.h | 9 ++---
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h 
b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h
index 141d82e249a8..a756a1e59c54 100644
--- a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h
@@ -189,19 +189,14 @@ typedef struct {
 
 #define PHYS_IMMR_BASE (mfspr(SPRN_IMMR) & 0xfff8)
 
-/* Page size definitions, common between 32 and 64-bit
+/*
+ * Page size definitions for 8xx
  *
  *shift : is the "PAGE_SHIFT" value for that page size
- *penc  : is the pte encoding mask
  *
  */
 struct mmu_psize_def {
unsigned intshift;  /* number of bits */
-   unsigned intenc;/* PTE encoding */
-   unsigned intind;/* Corresponding indirect page size shift */
-   unsigned intflags;
-#define MMU_PAGE_SIZE_DIRECT   0x1 /* Supported as a direct size */
-#define MMU_PAGE_SIZE_INDIRECT 0x2 /* Supported as an indirect size */
 };
 
 extern struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
-- 
2.44.0



[RFC PATCH v4 05/16] powerpc/mm: Fix __find_linux_pte() on 32 bits with PMD leaf entries

2024-05-27 Thread Christophe Leroy
Building on 32 bits with pmd_leaf() not always returning false leads
to the following error:

  CC  arch/powerpc/mm/pgtable.o
arch/powerpc/mm/pgtable.c: In function '__find_linux_pte':
arch/powerpc/mm/pgtable.c:506:1: error: function may return address of local 
variable [-Werror=return-local-addr]
  506 | }
  | ^
arch/powerpc/mm/pgtable.c:394:15: note: declared here
  394 | pud_t pud, *pudp;
  |   ^~~

This is due to pmd_offset() being a no-op in that case.
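
A sketch of what goes wrong with folded levels:

	pud_t pud = READ_ONCE(*pudp);
	pmdp = pmd_offset(&pud, ea);	/* no-op: pmdp now points at the local 'pud' */

so the function may end up returning the address of an on-stack variable.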

So rework it for powerpc/32 so that pXd_offset() are used on real
pointers and not on on-stack copies.

Signed-off-by: Christophe Leroy 
Reviewed-by: Oscar Salvador 
---
v3: Removed p4dp and pudp locals for PPC32 and add a comment.
v4: Properly set pdshift on PPC32 case
---
 arch/powerpc/mm/pgtable.c | 11 ++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 9e7ba9c3851f..bce8a8619589 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -382,8 +382,10 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
bool *is_thp, unsigned *hpage_shift)
 {
pgd_t *pgdp;
+#ifdef CONFIG_PPC64
p4d_t p4d, *p4dp;
pud_t pud, *pudp;
+#endif
pmd_t pmd, *pmdp;
pte_t *ret_pte;
hugepd_t *hpdp = NULL;
@@ -401,8 +403,12 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
 * page fault or a page unmap. The return pte_t * is still not
 * stable. So should be checked there for above conditions.
 * Top level is an exception because it is folded into p4d.
+*
+* On PPC32, P4D/PUD/PMD are folded into PGD so go straight to
+* PMD level.
 */
pgdp = pgdir + pgd_index(ea);
+#ifdef CONFIG_PPC64
p4dp = p4d_offset(pgdp, ea);
p4d  = READ_ONCE(*p4dp);
pdshift = P4D_SHIFT;
@@ -442,8 +448,11 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
goto out_huge;
}
 
-   pdshift = PMD_SHIFT;
pmdp = pmd_offset(, ea);
+#else
+   pmdp = pmd_offset(pud_offset(p4d_offset(pgdp, ea), ea), ea);
+#endif
+   pdshift = PMD_SHIFT;
pmd  = READ_ONCE(*pmdp);
 
/*
-- 
2.44.0



[RFC PATCH v4 15/16] powerpc/mm: Remove hugepd leftovers

2024-05-27 Thread Christophe Leroy
All targets have now opted out of CONFIG_ARCH_HAS_HUGEPD so
remove leftover code.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/hugetlb.h  |   7 -
 arch/powerpc/include/asm/page.h |   6 -
 arch/powerpc/include/asm/pgtable-be-types.h |  10 -
 arch/powerpc/include/asm/pgtable-types.h|   9 -
 arch/powerpc/mm/hugetlbpage.c   | 412 
 arch/powerpc/mm/init-common.c   |   8 +-
 arch/powerpc/mm/pgtable.c   |  27 +-
 7 files changed, 3 insertions(+), 476 deletions(-)

diff --git a/arch/powerpc/include/asm/hugetlb.h 
b/arch/powerpc/include/asm/hugetlb.h
index e959c26c0b52..18a3028ac3b6 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -30,13 +30,6 @@ static inline int is_hugepage_only_range(struct mm_struct 
*mm,
 }
 #define is_hugepage_only_range is_hugepage_only_range
 
-#ifdef CONFIG_ARCH_HAS_HUGEPD
-#define __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE
-void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
-   unsigned long end, unsigned long floor,
-   unsigned long ceiling);
-#endif
-
 #define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT
 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
 pte_t pte, unsigned long sz);
diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
index c0af246a64ff..83d0a4fc5f75 100644
--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -269,12 +269,6 @@ static inline const void *pfn_to_kaddr(unsigned long pfn)
 #define is_kernel_addr(x)  ((x) >= TASK_SIZE)
 #endif
 
-/*
- * Some number of bits at the level of the page table that points to
- * a hugepte are used to encode the size.  This masks those bits.
- */
-#define HUGEPD_SHIFT_MASK 0x3f
-
 #ifndef __ASSEMBLY__
 
 #ifdef CONFIG_PPC_BOOK3S_64
diff --git a/arch/powerpc/include/asm/pgtable-be-types.h 
b/arch/powerpc/include/asm/pgtable-be-types.h
index 82633200b500..6bd8f89b25dc 100644
--- a/arch/powerpc/include/asm/pgtable-be-types.h
+++ b/arch/powerpc/include/asm/pgtable-be-types.h
@@ -101,14 +101,4 @@ static inline bool pmd_xchg(pmd_t *pmdp, pmd_t old, pmd_t 
new)
return pmd_raw(old) == prev;
 }
 
-#ifdef CONFIG_ARCH_HAS_HUGEPD
-typedef struct { __be64 pdbe; } hugepd_t;
-#define __hugepd(x) ((hugepd_t) { cpu_to_be64(x) })
-
-static inline unsigned long hpd_val(hugepd_t x)
-{
-   return be64_to_cpu(x.pdbe);
-}
-#endif
-
 #endif /* _ASM_POWERPC_PGTABLE_BE_TYPES_H */
diff --git a/arch/powerpc/include/asm/pgtable-types.h 
b/arch/powerpc/include/asm/pgtable-types.h
index db965d98e0ae..7b3d4c592a10 100644
--- a/arch/powerpc/include/asm/pgtable-types.h
+++ b/arch/powerpc/include/asm/pgtable-types.h
@@ -87,13 +87,4 @@ static inline bool pte_xchg(pte_t *ptep, pte_t old, pte_t 
new)
 }
 #endif
 
-#ifdef CONFIG_ARCH_HAS_HUGEPD
-typedef struct { unsigned long pd; } hugepd_t;
-#define __hugepd(x) ((hugepd_t) { (x) })
-static inline unsigned long hpd_val(hugepd_t x)
-{
-   return x.pd;
-}
-#endif
-
 #endif /* _ASM_POWERPC_PGTABLE_TYPES_H */
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 1fe2843f5b12..76846c6014e4 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -28,8 +28,6 @@
 
 bool hugetlb_disabled = false;
 
-#define hugepd_none(hpd)   (hpd_val(hpd) == 0)
-
 #define PTE_T_ORDER(__builtin_ffs(sizeof(pte_basic_t)) - \
 __builtin_ffs(sizeof(void *)))
 
@@ -42,156 +40,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long 
addr, unsigned long s
return __find_linux_pte(mm->pgd, addr, NULL, NULL);
 }
 
-#ifdef CONFIG_ARCH_HAS_HUGEPD
-static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
-  unsigned long address, unsigned int pdshift,
-  unsigned int pshift, spinlock_t *ptl)
-{
-   struct kmem_cache *cachep;
-   pte_t *new;
-   int i;
-   int num_hugepd;
-
-   if (pshift >= pdshift) {
-   cachep = PGT_CACHE(PTE_T_ORDER);
-   num_hugepd = 1 << (pshift - pdshift);
-   } else {
-   cachep = PGT_CACHE(pdshift - pshift);
-   num_hugepd = 1;
-   }
-
-   if (!cachep) {
-   WARN_ONCE(1, "No page table cache created for hugetlb tables");
-   return -ENOMEM;
-   }
-
-   new = kmem_cache_alloc(cachep, pgtable_gfp_flags(mm, GFP_KERNEL));
-
-   BUG_ON(pshift > HUGEPD_SHIFT_MASK);
-   BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);
-
-   if (!new)
-   return -ENOMEM;
-
-   /*
-* Make sure other cpus find the hugepd set only after a
-* properly initialized page table is visible to them.
-* For more details look for comment in __pte_alloc().
-*/
-   smp_wmb();
-
-   spin_lock(ptl

[RFC PATCH v4 13/16] powerpc/e500: Use contiguous PMD instead of hugepd

2024-05-27 Thread Christophe Leroy
e500 supports many page sizes, among which the following sizes are
implemented in the kernel at the time being: 4M, 16M, 64M, 256M, 1G.

On e500, TLB miss for hugepages is exclusively handled by SW even
on e6500 which has HW assistance for 4k pages, so there are no
constraints like on the 8xx.

On e500/32, all are at PGD/PMD level and can be handled as
cont-PMD.

On e500/64, smaller ones are on PMD while bigger ones are on PUD.
Again, they can easily be handled as cont-PMD and cont-PUD instead
of hugepd.

Signed-off-by: Christophe Leroy 
---
v3: Add missing pmd_leaf_size() and pud_leaf_size()
v4: Rebased of v6.10-rc1 : pmd_huge() and pud_huge() are gone
---
 .../powerpc/include/asm/nohash/hugetlb-e500.h | 32 +-
 arch/powerpc/include/asm/nohash/pgalloc.h |  2 -
 arch/powerpc/include/asm/nohash/pgtable.h | 37 +++
 arch/powerpc/include/asm/nohash/pte-e500.h| 28 +
 arch/powerpc/include/asm/page.h   | 15 +
 arch/powerpc/kernel/head_85xx.S   | 23 +++
 arch/powerpc/mm/hugetlbpage.c |  2 -
 arch/powerpc/mm/nohash/tlb_low_64e.S  | 63 +++
 arch/powerpc/mm/pgtable.c | 31 +
 arch/powerpc/platforms/Kconfig.cputype|  1 -
 10 files changed, 139 insertions(+), 95 deletions(-)

diff --git a/arch/powerpc/include/asm/nohash/hugetlb-e500.h 
b/arch/powerpc/include/asm/nohash/hugetlb-e500.h
index d8e51a3f8557..d30e2a3f129d 100644
--- a/arch/powerpc/include/asm/nohash/hugetlb-e500.h
+++ b/arch/powerpc/include/asm/nohash/hugetlb-e500.h
@@ -2,38 +2,12 @@
 #ifndef _ASM_POWERPC_NOHASH_HUGETLB_E500_H
 #define _ASM_POWERPC_NOHASH_HUGETLB_E500_H
 
-static inline pte_t *hugepd_page(hugepd_t hpd)
-{
-   if (WARN_ON(!hugepd_ok(hpd)))
-   return NULL;
-
-   return (pte_t *)((hpd_val(hpd) & ~HUGEPD_SHIFT_MASK) | PD_HUGE);
-}
-
-static inline unsigned int hugepd_shift(hugepd_t hpd)
-{
-   return hpd_val(hpd) & HUGEPD_SHIFT_MASK;
-}
-
-static inline pte_t *hugepte_offset(hugepd_t hpd, unsigned long addr,
-   unsigned int pdshift)
-{
-   /*
-* On FSL BookE, we have multiple higher-level table entries that
-* point to the same hugepte.  Just use the first one since they're all
-* identical.  So for that case, idx=0.
-*/
-   return hugepd_page(hpd);
-}
+#define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT
+void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
+pte_t pte, unsigned long sz);
 
 void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr);
 
-static inline void hugepd_populate(hugepd_t *hpdp, pte_t *new, unsigned int 
pshift)
-{
-   /* We use the old format for PPC_E500 */
-   *hpdp = __hugepd(((unsigned long)new & ~PD_HUGE) | pshift);
-}
-
 static inline int check_and_get_huge_psize(int shift)
 {
if (shift & 1)  /* Not a power of 4 */
diff --git a/arch/powerpc/include/asm/nohash/pgalloc.h 
b/arch/powerpc/include/asm/nohash/pgalloc.h
index 4b62376318e1..d06efac6d7aa 100644
--- a/arch/powerpc/include/asm/nohash/pgalloc.h
+++ b/arch/powerpc/include/asm/nohash/pgalloc.h
@@ -44,8 +44,6 @@ static inline void pgtable_free(void *table, int shift)
}
 }
 
-#define get_hugepd_cache_index(x)  (x)
-
 static inline void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int 
shift)
 {
unsigned long pgf = (unsigned long)table;
diff --git a/arch/powerpc/include/asm/nohash/pgtable.h 
b/arch/powerpc/include/asm/nohash/pgtable.h
index 90d6a0943b35..f7421d1a1693 100644
--- a/arch/powerpc/include/asm/nohash/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/pgtable.h
@@ -52,11 +52,36 @@ static inline pte_basic_t pte_update(struct mm_struct *mm, 
unsigned long addr, p
 {
pte_basic_t old = pte_val(*p);
pte_basic_t new = (old & ~(pte_basic_t)clr) | set;
+   unsigned long sz;
+   unsigned long pdsize;
+   int i;
 
if (new == old)
return old;
 
-   *p = __pte(new);
+#ifdef CONFIG_PPC_E500
+   if (huge)
+   sz = 1UL << (((old & _PAGE_HSIZE_MSK) >> _PAGE_HSIZE_SHIFT) + 
20);
+   else
+#endif
+   sz = PAGE_SIZE;
+
+   if (!huge || sz < PMD_SIZE)
+   pdsize = PAGE_SIZE;
+   else if (sz < PUD_SIZE)
+   pdsize = PMD_SIZE;
+   else if (sz < P4D_SIZE)
+   pdsize = PUD_SIZE;
+   else if (sz < PGDIR_SIZE)
+   pdsize = P4D_SIZE;
+   else
+   pdsize = PGDIR_SIZE;
+
+   for (i = 0; i < sz / pdsize; i++, p++) {
+   *p = __pte(new);
+   if (new)
+   new += (unsigned long long)(pdsize / PAGE_SIZE) << 
PTE_RPN_SHIFT;
+   }
 
if (IS_ENABLED(CONFIG_44x) && !is_kernel_addr(addr) && (old & 
_PAGE_EXEC))
icache_44x_need_flush = 1;
@@ -340,16 +365,6

[RFC PATCH v4 11/16] powerpc/e500: Switch to 64 bits PGD on 85xx (32 bits)

2024-05-27 Thread Christophe Leroy
At the time being, when CONFIG_PTE_64BIT is selected, PTE entries are
64 bits but PGD entries are still 32 bits.

In order to allow leaf PMD entries, switch the PGD to 64 bits entries.
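
A visible consequence is in the FIND_PTE assembly below: with 8-byte
entries the PGD offset becomes (ea >> PGDIR_SHIFT) * 8 instead of * 4,
which is why the rlwinm rotate/mask constants move from 13,19,29 to
14,18,28. In C terms (illustration only):

	/* Byte offset of the PGD entry for effective address ea */
	static inline unsigned long pgd_entry_offset(unsigned long ea)
	{
		return (ea >> PGDIR_SHIFT) * sizeof(pgd_t); /* sizeof(pgd_t): 4 -> 8 */
	}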

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/pgtable-types.h |  4 
 arch/powerpc/kernel/head_85xx.S  | 10 ++
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/pgtable-types.h 
b/arch/powerpc/include/asm/pgtable-types.h
index 082c85cc09b1..db965d98e0ae 100644
--- a/arch/powerpc/include/asm/pgtable-types.h
+++ b/arch/powerpc/include/asm/pgtable-types.h
@@ -49,7 +49,11 @@ static inline unsigned long pud_val(pud_t x)
 #endif /* CONFIG_PPC64 */
 
 /* PGD level */
+#if defined(CONFIG_PPC_E500) && defined(CONFIG_PTE_64BIT)
+typedef struct { unsigned long long pgd; } pgd_t;
+#else
 typedef struct { unsigned long pgd; } pgd_t;
+#endif
 #define __pgd(x)   ((pgd_t) { (x) })
 static inline unsigned long pgd_val(pgd_t x)
 {
diff --git a/arch/powerpc/kernel/head_85xx.S b/arch/powerpc/kernel/head_85xx.S
index 39724ff5ae1f..a305244afc9f 100644
--- a/arch/powerpc/kernel/head_85xx.S
+++ b/arch/powerpc/kernel/head_85xx.S
@@ -307,8 +307,9 @@ set_ivor:
 #ifdef CONFIG_PTE_64BIT
 #ifdef CONFIG_HUGETLB_PAGE
 #define FIND_PTE   \
-   rlwinm  r12, r10, 13, 19, 29;   /* Compute pgdir/pmd offset */  \
-   lwzxr11, r12, r11;  /* Get pgd/pmd entry */ \
+   rlwinm  r12, r10, 14, 18, 28;   /* Compute pgdir/pmd offset */  \
+   add r12, r11, r12;  \
+   lwz r11, 4(r12);/* Get pgd/pmd entry */ \
rlwinm. r12, r11, 0, 0, 20; /* Extract pt base address */   \
blt 1000f;  /* Normal non-huge page */  \
beq 2f; /* Bail if no table */  \
@@ -321,8 +322,9 @@ set_ivor:
 1001:  lwz r11, 4(r12);/* Get pte entry */
 #else
 #define FIND_PTE   \
-   rlwinm  r12, r10, 13, 19, 29;   /* Compute pgdir/pmd offset */  \
-   lwzxr11, r12, r11;  /* Get pgd/pmd entry */ \
+   rlwinm  r12, r10, 14, 18, 28;   /* Compute pgdir/pmd offset */  \
+   add r12, r11, r12;  \
+   lwz r11, 4(r12);/* Get pgd/pmd entry */ \
rlwinm. r12, r11, 0, 0, 20; /* Extract pt base address */   \
beq 2f; /* Bail if no table */  \
rlwimi  r12, r10, 23, 20, 28;   /* Compute pte address */   \
-- 
2.44.0



[RFC PATCH v4 04/16] powerpc/mm: Remove _PAGE_PSIZE

2024-05-27 Thread Christophe Leroy
The _PAGE_PSIZE macro is never used outside the place where it is
defined, and only on 8xx and e500.

Remove the indirection: drop the macro and use its content directly.

Signed-off-by: Christophe Leroy 
Reviewed-by: Oscar Salvador 
---
 arch/powerpc/include/asm/nohash/32/pte-40x.h  | 3 ---
 arch/powerpc/include/asm/nohash/32/pte-44x.h  | 3 ---
 arch/powerpc/include/asm/nohash/32/pte-85xx.h | 3 ---
 arch/powerpc/include/asm/nohash/32/pte-8xx.h  | 5 ++---
 arch/powerpc/include/asm/nohash/pte-e500.h| 4 +---
 5 files changed, 3 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/include/asm/nohash/32/pte-40x.h 
b/arch/powerpc/include/asm/nohash/32/pte-40x.h
index d759cfd74754..52ed58516fa4 100644
--- a/arch/powerpc/include/asm/nohash/32/pte-40x.h
+++ b/arch/powerpc/include/asm/nohash/32/pte-40x.h
@@ -49,9 +49,6 @@
 #define _PAGE_EXEC 0x200   /* hardware: EX permission */
 #define _PAGE_ACCESSED 0x400   /* software: R: page referenced */
 
-/* No page size encoding in the linux PTE */
-#define _PAGE_PSIZE0
-
 /* cache related flags non existing on 40x */
 #define _PAGE_COHERENT 0
 
diff --git a/arch/powerpc/include/asm/nohash/32/pte-44x.h 
b/arch/powerpc/include/asm/nohash/32/pte-44x.h
index 851813725237..da0469928273 100644
--- a/arch/powerpc/include/asm/nohash/32/pte-44x.h
+++ b/arch/powerpc/include/asm/nohash/32/pte-44x.h
@@ -75,9 +75,6 @@
 #define _PAGE_NO_CACHE 0x0400  /* H: I bit */
 #define _PAGE_WRITETHRU0x0800  /* H: W bit */
 
-/* No page size encoding in the linux PTE */
-#define _PAGE_PSIZE0
-
 /* TODO: Add large page lowmem mapping support */
 #define _PMD_PRESENT   0
 #define _PMD_PRESENT_MASK (PAGE_MASK)
diff --git a/arch/powerpc/include/asm/nohash/32/pte-85xx.h 
b/arch/powerpc/include/asm/nohash/32/pte-85xx.h
index 653a342d3b25..14d64b4f3f14 100644
--- a/arch/powerpc/include/asm/nohash/32/pte-85xx.h
+++ b/arch/powerpc/include/asm/nohash/32/pte-85xx.h
@@ -31,9 +31,6 @@
 #define _PAGE_WRITETHRU0x00400 /* H: W bit */
 #define _PAGE_SPECIAL  0x00800 /* S: Special page */
 
-/* No page size encoding in the linux PTE */
-#define _PAGE_PSIZE0
-
 #define _PMD_PRESENT   0
 #define _PMD_PRESENT_MASK (PAGE_MASK)
 #define _PMD_BAD   (~PAGE_MASK)
diff --git a/arch/powerpc/include/asm/nohash/32/pte-8xx.h 
b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
index 137dc3c84e45..625c31d6ce5c 100644
--- a/arch/powerpc/include/asm/nohash/32/pte-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
@@ -74,12 +74,11 @@
 #define _PTE_NONE_MASK 0
 
 #ifdef CONFIG_PPC_16K_PAGES
-#define _PAGE_PSIZE_PAGE_SPS
+#define _PAGE_BASE_NC  (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_SPS)
 #else
-#define _PAGE_PSIZE0
+#define _PAGE_BASE_NC  (_PAGE_PRESENT | _PAGE_ACCESSED)
 #endif
 
-#define _PAGE_BASE_NC  (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_PSIZE)
 #define _PAGE_BASE (_PAGE_BASE_NC)
 
 #include 
diff --git a/arch/powerpc/include/asm/nohash/pte-e500.h 
b/arch/powerpc/include/asm/nohash/pte-e500.h
index f516f0b5b7a8..975facc7e38e 100644
--- a/arch/powerpc/include/asm/nohash/pte-e500.h
+++ b/arch/powerpc/include/asm/nohash/pte-e500.h
@@ -65,8 +65,6 @@
 
 #define _PAGE_SPECIAL  _PAGE_SW0
 
-/* Base page size */
-#define _PAGE_PSIZE_PAGE_PSIZE_4K
 #definePTE_RPN_SHIFT   (24)
 
 #define PTE_WIMGE_SHIFT (19)
@@ -89,7 +87,7 @@
  * pages. We always set _PAGE_COHERENT when SMP is enabled or
  * the processor might need it for DMA coherency.
  */
-#define _PAGE_BASE_NC  (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_PSIZE)
+#define _PAGE_BASE_NC  (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_PSIZE_4K)
 #if defined(CONFIG_SMP)
 #define _PAGE_BASE (_PAGE_BASE_NC | _PAGE_COHERENT)
 #else
-- 
2.44.0



[RFC PATCH v4 14/16] powerpc/64s: Use contiguous PMD/PUD instead of HUGEPD

2024-05-27 Thread Christophe Leroy
On book3s/64, the only user of hugepd is hash in 4k mode.

All other setups (hash-64, radix-4, radix-64) use leaf PMD/PUD.

Rework hash-4k to use contiguous PMD and PUD instead.

In that setup there are only two huge page sizes: 16M and 16G.

16M sits at PMD level and 16G at PUD level.

pte_update() doesn't know the page size, so let's use the same trick as
hpte_need_flush() and get the page size from the segment properties.
That's not the most efficient way, but let's do that until callers of
pte_update() provide the page size instead of just a huge flag.
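
Concretely, the trick looks like this (sketch with a hypothetical helper
name; get_slice_psize() and mmu_psize_to_shift() are existing helpers,
and hash__pte_update_one() is introduced by this patch):

	/*
	 * Sketch: derive the huge page size from the slice (segment)
	 * properties, then update the whole contiguous group of entries.
	 * With 4k base pages, 16M pages are cont-PMD; 16G are cont-PUD.
	 */
	static void hash_cont_update_sketch(struct mm_struct *mm, unsigned long addr,
					    pte_t *ptep, unsigned long clr, unsigned long set)
	{
		unsigned int psize = get_slice_psize(mm, addr);
		unsigned long sz = 1UL << mmu_psize_to_shift(psize);
		int i;

		for (i = 0; i < sz / PMD_SIZE; i++)	/* PUD_SIZE for 16G pages */
			hash__pte_update_one(ptep + i, clr, set);
	}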

Signed-off-by: Christophe Leroy 
---
v3:
- Add missing pmd_leaf_size() and pud_leaf_size()
- More cleanup in hugetlbpage_init()
- Take a page fault when DIRTY or ACCESSED is missing on hash-4 hugepage

v4: Rebased on v6.10-rc1
---
 arch/powerpc/include/asm/book3s/64/hash-4k.h  | 15 --
 arch/powerpc/include/asm/book3s/64/hash.h | 38 ---
 arch/powerpc/include/asm/book3s/64/hugetlb.h  | 38 ---
 .../include/asm/book3s/64/pgtable-4k.h| 47 ---
 .../include/asm/book3s/64/pgtable-64k.h   | 20 
 arch/powerpc/include/asm/book3s/64/pgtable.h  | 22 +++--
 arch/powerpc/include/asm/hugetlb.h|  4 ++
 .../powerpc/include/asm/nohash/hugetlb-e500.h |  4 --
 arch/powerpc/include/asm/page.h   |  8 
 arch/powerpc/mm/book3s64/hash_utils.c | 11 +++--
 arch/powerpc/mm/book3s64/hugetlbpage.c| 10 
 arch/powerpc/mm/book3s64/pgtable.c| 12 -
 arch/powerpc/mm/hugetlbpage.c | 27 ---
 arch/powerpc/mm/pgtable.c |  2 +-
 arch/powerpc/platforms/Kconfig.cputype|  1 -
 15 files changed, 72 insertions(+), 187 deletions(-)
 delete mode 100644 arch/powerpc/include/asm/book3s/64/pgtable-4k.h

diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h 
b/arch/powerpc/include/asm/book3s/64/hash-4k.h
index 6472b08fa1b0..c654c376ef8b 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
@@ -74,21 +74,6 @@
 #define remap_4k_pfn(vma, addr, pfn, prot) \
remap_pfn_range((vma), (addr), (pfn), PAGE_SIZE, (prot))
 
-#ifdef CONFIG_HUGETLB_PAGE
-static inline int hash__hugepd_ok(hugepd_t hpd)
-{
-   unsigned long hpdval = hpd_val(hpd);
-   /*
-* if it is not a pte and have hugepd shift mask
-* set, then it is a hugepd directory pointer
-*/
-   if (!(hpdval & _PAGE_PTE) && (hpdval & _PAGE_PRESENT) &&
-   ((hpdval & HUGEPD_SHIFT_MASK) != 0))
-   return true;
-   return false;
-}
-#endif
-
 /*
  * 4K PTE format is different from 64K PTE format. Saving the hash_slot is just
  * a matter of returning the PTE bits that need to be modified. On 64K PTE,
diff --git a/arch/powerpc/include/asm/book3s/64/hash.h 
b/arch/powerpc/include/asm/book3s/64/hash.h
index faf3e3b4e4b2..8202c27afe23 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -4,6 +4,7 @@
 #ifdef __KERNEL__
 
 #include 
+#include 
 
 /*
  * Common bits between 4K and 64K pages in a linux-style PTE.
@@ -161,14 +162,10 @@ extern void hpte_need_flush(struct mm_struct *mm, 
unsigned long addr,
pte_t *ptep, unsigned long pte, int huge);
 unsigned long htab_convert_pte_flags(unsigned long pteflags, unsigned long 
flags);
 /* Atomic PTE updates */
-static inline unsigned long hash__pte_update(struct mm_struct *mm,
-unsigned long addr,
-pte_t *ptep, unsigned long clr,
-unsigned long set,
-int huge)
+static inline unsigned long hash__pte_update_one(pte_t *ptep, unsigned long 
clr,
+unsigned long set)
 {
__be64 old_be, tmp_be;
-   unsigned long old;
 
__asm__ __volatile__(
"1: ldarx   %0,0,%3 # pte_update\n\
@@ -182,11 +179,38 @@ static inline unsigned long hash__pte_update(struct 
mm_struct *mm,
: "r" (ptep), "r" (cpu_to_be64(clr)), "m" (*ptep),
  "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set))
: "cc" );
+
+   return be64_to_cpu(old_be);
+}
+
+static inline unsigned long hash__pte_update(struct mm_struct *mm,
+unsigned long addr,
+pte_t *ptep, unsigned long clr,
+unsigned long set,
+int huge)
+{
+   unsigned long old;
+
+   old = hash__pte_update_one(ptep, clr, set);
+
+   if (IS_ENABLED(CONFIG_PPC_4K_PAGES) && huge) {
+   unsigned int psize = get_slice_psize(mm, addr);
+   int nb, i;
+
+   if (psize == MMU_PAG

[RFC PATCH v4 10/16] powerpc/e500: Remove enc and ind fields from struct mmu_psize_def

2024-05-27 Thread Christophe Leroy
The enc field is hidden behind BOOK3E_PAGESZ_XX macros, and when you
look closer you realise that this field is nothing other than the value
of shift minus ten.

So remove the enc field and calculate tsize from the shift field.

Also remove the ind field, which is unused.
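
The correspondence is easy to check: on Book3E a TSIZE value T selects
a 2^T KB page, so T is simply shift - 10. The replacement helper in the
diff below therefore reduces to:

	static inline int mmu_get_tsize(int psize)
	{
		/* 4K: shift 12 -> tsize 2; 16M: shift 24 -> tsize 14; 1G: shift 30 -> tsize 20 */
		return mmu_psize_defs[psize].shift - 10;
	}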

Signed-off-by: Christophe Leroy 
Reviewed-by: Oscar Salvador 
---
 arch/powerpc/include/asm/nohash/mmu-e500.h | 3 ---
 arch/powerpc/mm/nohash/book3e_pgtable.c| 4 ++--
 arch/powerpc/mm/nohash/tlb.c   | 9 +
 arch/powerpc/mm/nohash/tlb_64e.c   | 2 +-
 4 files changed, 4 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/include/asm/nohash/mmu-e500.h 
b/arch/powerpc/include/asm/nohash/mmu-e500.h
index 7dc24b8632d7..b281d9eeaf1e 100644
--- a/arch/powerpc/include/asm/nohash/mmu-e500.h
+++ b/arch/powerpc/include/asm/nohash/mmu-e500.h
@@ -244,14 +244,11 @@ typedef struct {
 /* Page size definitions, common between 32 and 64-bit
  *
  *shift : is the "PAGE_SHIFT" value for that page size
- *penc  : is the pte encoding mask
  *
  */
 struct mmu_psize_def
 {
unsigned intshift;  /* number of bits */
-   unsigned intenc;/* PTE encoding */
-   unsigned intind;/* Corresponding indirect page size shift */
unsigned intflags;
 #define MMU_PAGE_SIZE_DIRECT   0x1 /* Supported as a direct size */
 #define MMU_PAGE_SIZE_INDIRECT 0x2 /* Supported as an indirect size */
diff --git a/arch/powerpc/mm/nohash/book3e_pgtable.c 
b/arch/powerpc/mm/nohash/book3e_pgtable.c
index 1c5e4ecbebeb..ad2a7c26f2a0 100644
--- a/arch/powerpc/mm/nohash/book3e_pgtable.c
+++ b/arch/powerpc/mm/nohash/book3e_pgtable.c
@@ -29,10 +29,10 @@ int __meminit vmemmap_create_mapping(unsigned long start,
_PAGE_KERNEL_RW;
 
/* PTEs only contain page size encodings up to 32M */
-   BUG_ON(mmu_psize_defs[mmu_vmemmap_psize].enc > 0xf);
+   BUG_ON(mmu_psize_defs[mmu_vmemmap_psize].shift - 10 > 0xf);
 
/* Encode the size in the PTE */
-   flags |= mmu_psize_defs[mmu_vmemmap_psize].enc << 8;
+   flags |= (mmu_psize_defs[mmu_vmemmap_psize].shift - 10) << 8;
 
/* For each PTE for that area, map things. Note that we don't
 * increment phys because all PTEs are of the large size and
diff --git a/arch/powerpc/mm/nohash/tlb.c b/arch/powerpc/mm/nohash/tlb.c
index f57dc721d063..b653a7be4cb1 100644
--- a/arch/powerpc/mm/nohash/tlb.c
+++ b/arch/powerpc/mm/nohash/tlb.c
@@ -53,37 +53,30 @@
 struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
[MMU_PAGE_4K] = {
.shift  = 12,
-   .enc= BOOK3E_PAGESZ_4K,
},
[MMU_PAGE_2M] = {
.shift  = 21,
-   .enc= BOOK3E_PAGESZ_2M,
},
[MMU_PAGE_4M] = {
.shift  = 22,
-   .enc= BOOK3E_PAGESZ_4M,
},
[MMU_PAGE_16M] = {
.shift  = 24,
-   .enc= BOOK3E_PAGESZ_16M,
},
[MMU_PAGE_64M] = {
.shift  = 26,
-   .enc= BOOK3E_PAGESZ_64M,
},
[MMU_PAGE_256M] = {
.shift  = 28,
-   .enc= BOOK3E_PAGESZ_256M,
},
[MMU_PAGE_1G] = {
.shift  = 30,
-   .enc= BOOK3E_PAGESZ_1GB,
},
 };
 
 static inline int mmu_get_tsize(int psize)
 {
-   return mmu_psize_defs[psize].enc;
+   return mmu_psize_defs[psize].shift - 10;
 }
 #else
 static inline int mmu_get_tsize(int psize)
diff --git a/arch/powerpc/mm/nohash/tlb_64e.c b/arch/powerpc/mm/nohash/tlb_64e.c
index 053128a5636c..7988238496d7 100644
--- a/arch/powerpc/mm/nohash/tlb_64e.c
+++ b/arch/powerpc/mm/nohash/tlb_64e.c
@@ -53,7 +53,7 @@ int extlb_level_exc;
  */
 void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address)
 {
-   int tsize = mmu_psize_defs[mmu_pte_psize].enc;
+   int tsize = mmu_psize_defs[mmu_pte_psize].shift - 10;
 
if (book3e_htw_mode != PPC_HTW_NONE) {
unsigned long start = address & PMD_MASK;
-- 
2.44.0



[RFC PATCH v4 12/16] powerpc/e500: Encode hugepage size in PTE bits

2024-05-27 Thread Christophe Leroy
Use the U0-U3 bits to encode the hugepage size, more exactly the page
shift.

As we start using hugepages at shift 21 (2 Mbytes), subtract 20 so
that it fits into 4 bits. That may change in the future if we want to
use smaller hugepages.
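
Decoding goes the other way. A later patch open-codes this in
pte_update(), but as a standalone sketch (hypothetical helper name,
using the _PAGE_HSIZE_* definitions added below):

	/* Recover the hugepage size from the U0..U3 field of a PTE */
	static inline unsigned long pte_hugepage_size(pte_t pte)
	{
		unsigned long shift = ((pte_val(pte) & _PAGE_HSIZE_MSK) >> _PAGE_HSIZE_SHIFT) + 20;

		return 1UL << shift;
	}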

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/nohash/hugetlb-e500.h | 6 ++
 arch/powerpc/include/asm/nohash/pte-e500.h | 3 +++
 2 files changed, 9 insertions(+)

diff --git a/arch/powerpc/include/asm/nohash/hugetlb-e500.h 
b/arch/powerpc/include/asm/nohash/hugetlb-e500.h
index 8f04ad20e040..d8e51a3f8557 100644
--- a/arch/powerpc/include/asm/nohash/hugetlb-e500.h
+++ b/arch/powerpc/include/asm/nohash/hugetlb-e500.h
@@ -42,4 +42,10 @@ static inline int check_and_get_huge_psize(int shift)
return shift_to_mmu_psize(shift);
 }
 
+static inline pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, 
vm_flags_t flags)
+{
+   return __pte(pte_val(entry) | (_PAGE_U3 * (shift - 20)));
+}
+#define arch_make_huge_pte arch_make_huge_pte
+
 #endif /* _ASM_POWERPC_NOHASH_HUGETLB_E500_H */
diff --git a/arch/powerpc/include/asm/nohash/pte-e500.h 
b/arch/powerpc/include/asm/nohash/pte-e500.h
index 975facc7e38e..091e4bff1fba 100644
--- a/arch/powerpc/include/asm/nohash/pte-e500.h
+++ b/arch/powerpc/include/asm/nohash/pte-e500.h
@@ -46,6 +46,9 @@
 #define _PAGE_NO_CACHE 0x40 /* I: cache inhibit */
 #define _PAGE_WRITETHRU0x80 /* W: cache write-through */
 
+#define _PAGE_HSIZE_MSK (_PAGE_U0 | _PAGE_U1 | _PAGE_U2 | _PAGE_U3)
+#define _PAGE_HSIZE_SHIFT  14
+
 /* "Higher level" linux bit combinations */
 #define _PAGE_EXEC (_PAGE_BAP_SX | _PAGE_BAP_UX) /* .. and was 
cache cleaned */
 #define _PAGE_READ (_PAGE_BAP_SR | _PAGE_BAP_UR) /* User read 
permission */
-- 
2.44.0



[RFC PATCH v4 08/16] powerpc/8xx: Rework support for 8M pages using contiguous PTE entries

2024-05-27 Thread Christophe Leroy
In order to fit better with the standard Linux page table layout, add
support for 8M pages using contiguous PTE entries in a standard
page table. Page tables will then be populated with 1024 similar
entries and two PMD entries will point to that page table.

The PMD entries also get a flag to tell that they address an 8M page;
this is required for the HW tablewalk assistance.
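
Put together, populating an 8M mapping amounts to the following sketch
(hypothetical helper; the real logic is spread across set_huge_pte_at()
and pte_update() in this patch, and the PMD flags are those used by the
removed hugepd_populate() below):

	/*
	 * Sketch: 1024 identical PTEs in one level-2 table, with both PMD
	 * entries covering the 8M range pointing at it and flagged as 8M
	 * for the HW tablewalk.
	 */
	static void populate_8m_sketch(pmd_t *pmdp, pte_t *ptep, pte_t entry)
	{
		int i;

		for (i = 0; i < PTRS_PER_PTE; i++)
			ptep[i] = entry;

		pmdp[0] = __pmd(__pa(ptep) | _PMD_USER | _PMD_PRESENT | _PMD_PAGE_8M);
		pmdp[1] = __pmd(__pa(ptep) | _PMD_USER | _PMD_PRESENT | _PMD_PAGE_8M);
	}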

Signed-off-by: Christophe Leroy 
Reviewed-by: Oscar Salvador 
---
v3:
- Move huge_ptep_get() for a more readable commit diff
- Flag PMD as 8Mbytes in set_huge_pte_at()
- Define __pte_leaf_size()
- Change pte_update() instead of all huge callers of pte_update()
- Added ptep_is_8m_pmdp() helper
- Fixed kasan early memory 8M allocation
---
 arch/powerpc/Kconfig  |  1 -
 .../include/asm/nohash/32/hugetlb-8xx.h   | 38 +++--
 arch/powerpc/include/asm/nohash/32/pte-8xx.h  | 53 ---
 arch/powerpc/include/asm/nohash/pgtable.h |  4 --
 arch/powerpc/include/asm/page.h   |  5 --
 arch/powerpc/include/asm/pgtable.h|  3 ++
 arch/powerpc/kernel/head_8xx.S| 10 +---
 arch/powerpc/mm/hugetlbpage.c | 18 ---
 arch/powerpc/mm/kasan/8xx.c   | 21 +---
 arch/powerpc/mm/nohash/8xx.c  | 40 +++---
 arch/powerpc/mm/pgtable.c | 27 +++---
 arch/powerpc/mm/pgtable_32.c  |  2 +-
 arch/powerpc/platforms/Kconfig.cputype|  2 +
 13 files changed, 112 insertions(+), 112 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 3c968f2f4ac4..cddccd4ca477 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -135,7 +135,6 @@ config PPC
select ARCH_HAS_DMA_MAP_DIRECT  if PPC_PSERIES
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_GCOV_PROFILE_ALL
-   select ARCH_HAS_HUGEPD  if HUGETLB_PAGE
select ARCH_HAS_KCOV
select ARCH_HAS_KERNEL_FPU_SUPPORT  if PPC_FPU
select ARCH_HAS_MEMBARRIER_CALLBACKS
diff --git a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h 
b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h
index 92df40c6cc6b..c60219269323 100644
--- a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h
@@ -4,42 +4,12 @@
 
 #define PAGE_SHIFT_8M  23
 
-static inline pte_t *hugepd_page(hugepd_t hpd)
-{
-   BUG_ON(!hugepd_ok(hpd));
-
-   return (pte_t *)__va(hpd_val(hpd) & ~HUGEPD_SHIFT_MASK);
-}
-
-static inline unsigned int hugepd_shift(hugepd_t hpd)
-{
-   return PAGE_SHIFT_8M;
-}
-
-static inline pte_t *hugepte_offset(hugepd_t hpd, unsigned long addr,
-   unsigned int pdshift)
-{
-   unsigned long idx = (addr & (SZ_4M - 1)) >> PAGE_SHIFT;
-
-   return hugepd_page(hpd) + idx;
-}
-
 static inline void flush_hugetlb_page(struct vm_area_struct *vma,
  unsigned long vmaddr)
 {
flush_tlb_page(vma, vmaddr);
 }
 
-static inline void hugepd_populate(hugepd_t *hpdp, pte_t *new, unsigned int 
pshift)
-{
-   *hpdp = __hugepd(__pa(new) | _PMD_USER | _PMD_PRESENT | _PMD_PAGE_8M);
-}
-
-static inline void hugepd_populate_kernel(hugepd_t *hpdp, pte_t *new, unsigned 
int pshift)
-{
-   *hpdp = __hugepd(__pa(new) | _PMD_PRESENT | _PMD_PAGE_8M);
-}
-
 static inline int check_and_get_huge_psize(int shift)
 {
return shift_to_mmu_psize(shift);
@@ -49,6 +19,14 @@ static inline int check_and_get_huge_psize(int shift)
 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
 pte_t pte, unsigned long sz);
 
+#define __HAVE_ARCH_HUGE_PTEP_GET
+static inline pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, 
pte_t *ptep)
+{
+   if (ptep_is_8m_pmdp(mm, addr, ptep))
+   ptep = pte_offset_kernel((pmd_t *)ptep, 0);
+   return ptep_get(ptep);
+}
+
 #define __HAVE_ARCH_HUGE_PTE_CLEAR
 static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
  pte_t *ptep, unsigned long sz)
diff --git a/arch/powerpc/include/asm/nohash/32/pte-8xx.h 
b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
index 625c31d6ce5c..54ebb91dbdcf 100644
--- a/arch/powerpc/include/asm/nohash/32/pte-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
@@ -119,7 +119,7 @@ static inline pte_t pte_mkhuge(pte_t pte)
 
 #define pte_mkhuge pte_mkhuge
 
-static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, 
pte_t *p,
+static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, 
pte_t *ptep,
 unsigned long clr, unsigned long set, int 
huge);
 
 static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long 
addr, pte_t *ptep)
@@ -141,19 +141,12 @@ static inline void __ptep_set_access_flags(struct 
vm_area_struct *vma, pte_t *pt
 }
 #define __ptep_se

[RFC PATCH v4 07/16] powerpc/8xx: Fix size given to set_huge_pte_at()

2024-05-27 Thread Christophe Leroy
set_huge_pte_at() expects the size of the hugepage (in bytes), not the
psize, which is the index of the page definition in the mmu_psize_defs[]
table.
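
The two values are easy to mix up because both are plain integers
(illustration):

	unsigned int psize = MMU_PAGE_8M;	/* index into mmu_psize_defs[] */
	unsigned long sz = 1UL << mmu_psize_to_shift(psize);	/* 0x800000: what set_huge_pte_at() expects */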

Fixes: 935d4f0c6dc8 ("mm: hugetlb: add huge page size param to 
set_huge_pte_at()")
Signed-off-by: Christophe Leroy 
Reviewed-by: Oscar Salvador 
---
 arch/powerpc/mm/nohash/8xx.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/nohash/8xx.c b/arch/powerpc/mm/nohash/8xx.c
index 43d4842bb1c7..d93433e26ded 100644
--- a/arch/powerpc/mm/nohash/8xx.c
+++ b/arch/powerpc/mm/nohash/8xx.c
@@ -94,7 +94,8 @@ static int __ref __early_map_kernel_hugepage(unsigned long 
va, phys_addr_t pa,
return -EINVAL;
 
set_huge_pte_at(_mm, va, ptep,
-   pte_mkhuge(pfn_pte(pa >> PAGE_SHIFT, prot)), psize);
+   pte_mkhuge(pfn_pte(pa >> PAGE_SHIFT, prot)),
+   1UL << mmu_psize_to_shift(psize));
 
return 0;
 }
-- 
2.44.0



[RFC PATCH v4 03/16] mm: Provide mm_struct and address to huge_ptep_get()

2024-05-27 Thread Christophe Leroy
On powerpc 8xx, huge_ptep_get() will need to know whether the given
ptep is a PTE entry or a PMD entry. This cannot be determined from the
entry itself because there is no easy way to tell it apart from the
content of the entry.

So huge_ptep_get() will need to either know the size of the page or
get hold of the PMD.

In order to be consistent with huge_ptep_get_and_clear(), give
mm and address to huge_ptep_get().
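
For architectures that do not define __HAVE_ARCH_HUGE_PTEP_GET, the
generic fallback in include/asm-generic/hugetlb.h simply gains the two
parameters and keeps ignoring them:

	static inline pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
	{
		return ptep_get(ptep);
	}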

Signed-off-by: Christophe Leroy 
---
v2: Add missing changes in arch implementations
v3: Fixed a comment in ARM and missing changes in S390
v4: Fix missing or bad changes in mm/hugetlb.c and rebased on v6.10-rc1
---
 arch/arm/include/asm/hugetlb-3level.h |  4 +--
 arch/arm64/include/asm/hugetlb.h  |  2 +-
 arch/arm64/mm/hugetlbpage.c   |  2 +-
 arch/riscv/include/asm/hugetlb.h  |  2 +-
 arch/riscv/mm/hugetlbpage.c   |  2 +-
 arch/s390/include/asm/hugetlb.h   |  4 +--
 arch/s390/mm/hugetlbpage.c|  4 +--
 fs/hugetlbfs/inode.c  |  2 +-
 fs/proc/task_mmu.c| 10 +++---
 fs/userfaultfd.c  |  2 +-
 include/asm-generic/hugetlb.h |  2 +-
 include/linux/swapops.h   |  4 +--
 mm/damon/vaddr.c  |  6 ++--
 mm/gup.c  |  2 +-
 mm/hmm.c  |  2 +-
 mm/hugetlb.c  | 44 +--
 mm/memory-failure.c   |  2 +-
 mm/mempolicy.c|  2 +-
 mm/migrate.c  |  4 +--
 mm/mincore.c  |  2 +-
 mm/userfaultfd.c  |  2 +-
 21 files changed, 53 insertions(+), 53 deletions(-)

diff --git a/arch/arm/include/asm/hugetlb-3level.h 
b/arch/arm/include/asm/hugetlb-3level.h
index a30be5505793..87d48e2d90ad 100644
--- a/arch/arm/include/asm/hugetlb-3level.h
+++ b/arch/arm/include/asm/hugetlb-3level.h
@@ -13,12 +13,12 @@
 
 /*
  * If our huge pte is non-zero then mark the valid bit.
- * This allows pte_present(huge_ptep_get(ptep)) to return true for non-zero
+ * This allows pte_present(huge_ptep_get(mm,addr,ptep)) to return true for 
non-zero
  * ptes.
  * (The valid bit is automatically cleared by set_pte_at for PROT_NONE ptes).
  */
 #define __HAVE_ARCH_HUGE_PTEP_GET
-static inline pte_t huge_ptep_get(pte_t *ptep)
+static inline pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, 
pte_t *ptep)
 {
pte_t retval = *ptep;
if (pte_val(retval))
diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h
index 3954cbd2ff56..293f880865e8 100644
--- a/arch/arm64/include/asm/hugetlb.h
+++ b/arch/arm64/include/asm/hugetlb.h
@@ -46,7 +46,7 @@ extern pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
 extern void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
   pte_t *ptep, unsigned long sz);
 #define __HAVE_ARCH_HUGE_PTEP_GET
-extern pte_t huge_ptep_get(pte_t *ptep);
+extern pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t 
*ptep);
 
 void __init arm64_hugetlb_cma_reserve(void);
 
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 3f09ac73cce3..5f1e2103888b 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -127,7 +127,7 @@ static inline int num_contig_ptes(unsigned long size, 
size_t *pgsize)
return contig_ptes;
 }
 
-pte_t huge_ptep_get(pte_t *ptep)
+pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 {
int ncontig, i;
size_t pgsize;
diff --git a/arch/riscv/include/asm/hugetlb.h b/arch/riscv/include/asm/hugetlb.h
index b1ce97a9dbfc..faf3624d8057 100644
--- a/arch/riscv/include/asm/hugetlb.h
+++ b/arch/riscv/include/asm/hugetlb.h
@@ -44,7 +44,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
   pte_t pte, int dirty);
 
 #define __HAVE_ARCH_HUGE_PTEP_GET
-pte_t huge_ptep_get(pte_t *ptep);
+pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
 
 pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags);
 #define arch_make_huge_pte arch_make_huge_pte
diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
index 0ebd968b33c9..42314f093922 100644
--- a/arch/riscv/mm/hugetlbpage.c
+++ b/arch/riscv/mm/hugetlbpage.c
@@ -3,7 +3,7 @@
 #include 
 
 #ifdef CONFIG_RISCV_ISA_SVNAPOT
-pte_t huge_ptep_get(pte_t *ptep)
+pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 {
unsigned long pte_num;
int i;
diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h
index ce5f4fe8be4d..cf1b5d6fb1a6 100644
--- a/arch/s390/include/asm/hugetlb.h
+++ b/arch/s390/include/asm/hugetlb.h
@@ -19,7 +19,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 pte_t *ptep, pte_t pte, unsigned long sz);
 void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr

[RFC PATCH v4 00/16] Reimplement huge pages without hugepd on powerpc (8xx, e500, book3s/64)

2024-05-27 Thread Christophe Leroy
This is the continuation of the RFC v1 series "Reimplement huge pages
without hugepd on powerpc 8xx". It now gets rid of hugepd completely,
after also handling e500 and book3s/64.

Also see https://github.com/linuxppc/issues/issues/483

Unlike most architectures, powerpc 8xx HW requires a two-level
pagetable topology for all page sizes. So a leaf PMD-contig approach
is not feasible as such.

Possible sizes on 8xx are 4k, 16k, 512k and 8M.

First level (PGD/PMD) covers 4M per entry. For 8M pages, two PMD entries
must point to a single entry level-2 page table. Until now that was
done using hugepd. This series changes it to use standard page tables
where the entry is replicated 1024 times on each of the two pagetables
referred to by the two associated PMD entries for that 8M page.

For e500 and book3s/64 there are fewer constraints because they are not
tied to the HW-assisted tablewalk like the 8xx, so it is easier to use
leaf PMDs (and PUDs).

On e500 the supported page sizes are 4M, 16M, 64M, 256M and 1G. All are
at PMD level on e500/32 (mpc85xx) and a mix of PMD and PUD for e500/64.
We encode the page size with 4 available bits in PTE entries. On e500/32,
PGD entry size is increased to 64 bits in order to allow leaf PMD entries,
because PTEs are 64 bits on e500.

On book3s/64 only the hash-4k mode is concerned. It supports 16M pages
as cont-PMD and 16G pages as cont-PUD. In other modes (radix-4k, radix-64k
and hash-64k) the sizes match the PMD and PUD sizes, so those are just leaf
entries. The hash processing makes things a bit more complex. To ease
things, __hash_page_huge() is modified to bail out when DIRTY or ACCESSED
bits are missing, leaving it to the mm core to fix it.

Global changes in v4:
- Fixed a few issues reported privately by robots
- Rebased on top of v6.10-rc1

Global changes in v3:
- Removed patches 1 and 2
- Squashed patch 11 into patch 5
- Replaced patches 12 and 13 with a series from Michael
- Reordered patches a bit to have more general patches up front

For more details on changes, see in each patch.

Christophe Leroy (15):
  mm: Define __pte_leaf_size() to also take a PMD entry
  mm: Provide mm_struct and address to huge_ptep_get()
  powerpc/mm: Remove _PAGE_PSIZE
  powerpc/mm: Fix __find_linux_pte() on 32 bits with PMD leaf entries
  powerpc/mm: Allow hugepages without hugepd
  powerpc/8xx: Fix size given to set_huge_pte_at()
  powerpc/8xx: Rework support for 8M pages using contiguous PTE entries
  powerpc/8xx: Simplify struct mmu_psize_def
  powerpc/e500: Remove enc and ind fields from struct mmu_psize_def
  powerpc/e500: Switch to 64 bits PGD on 85xx (32 bits)
  powerpc/e500: Encode hugepage size in PTE bits
  powerpc/e500: Use contiguous PMD instead of hugepd
  powerpc/64s: Use contiguous PMD/PUD instead of HUGEPD
  powerpc/mm: Remove hugepd leftovers
  mm: Remove CONFIG_ARCH_HAS_HUGEPD

Michael Ellerman (1):
  powerpc/64e: Remove unused IBM HTW code [SQUASHED]

 arch/arm/include/asm/hugetlb-3level.h |   4 +-
 arch/arm64/include/asm/hugetlb.h  |   2 +-
 arch/arm64/mm/hugetlbpage.c   |   2 +-
 arch/powerpc/Kconfig  |   1 -
 arch/powerpc/include/asm/book3s/32/pgalloc.h  |   2 -
 arch/powerpc/include/asm/book3s/64/hash-4k.h  |  15 -
 arch/powerpc/include/asm/book3s/64/hash.h |  38 +-
 arch/powerpc/include/asm/book3s/64/hugetlb.h  |  38 --
 .../include/asm/book3s/64/pgtable-4k.h|  47 --
 .../include/asm/book3s/64/pgtable-64k.h   |  20 -
 arch/powerpc/include/asm/book3s/64/pgtable.h  |  22 +-
 arch/powerpc/include/asm/hugetlb.h|  15 +-
 .../include/asm/nohash/32/hugetlb-8xx.h   |  38 +-
 arch/powerpc/include/asm/nohash/32/mmu-8xx.h  |   9 +-
 arch/powerpc/include/asm/nohash/32/pte-40x.h  |   3 -
 arch/powerpc/include/asm/nohash/32/pte-44x.h  |   3 -
 arch/powerpc/include/asm/nohash/32/pte-85xx.h |   3 -
 arch/powerpc/include/asm/nohash/32/pte-8xx.h  |  58 ++-
 .../powerpc/include/asm/nohash/hugetlb-e500.h |  36 +-
 arch/powerpc/include/asm/nohash/mmu-e500.h|   6 +-
 arch/powerpc/include/asm/nohash/pgalloc.h |   2 -
 arch/powerpc/include/asm/nohash/pgtable.h |  41 +-
 arch/powerpc/include/asm/nohash/pte-e500.h|  35 +-
 arch/powerpc/include/asm/page.h   |  32 --
 arch/powerpc/include/asm/pgtable-be-types.h   |  10 -
 arch/powerpc/include/asm/pgtable-types.h  |  13 +-
 arch/powerpc/include/asm/pgtable.h|   3 +
 arch/powerpc/kernel/exceptions-64e.S  |   4 +-
 arch/powerpc/kernel/head_85xx.S   |  33 +-
 arch/powerpc/kernel/head_8xx.S|  10 +-
 arch/powerpc/kernel/setup_64.c|   6 +-
 arch/powerpc/mm/book3s64/hash_utils.c |  11 +-
 arch/powerpc/mm/book3s64/hugetlbpage.c|  10 +
 arch/powerpc/mm/book3s64/pgtable.c|  12 -
 arch/powerpc/mm/hugetlbpage.c | 455 +---
 arch/powerpc/mm/init-common.c |   8 +-
 arch/powerpc/mm/kasan/8xx.c   |  2

[RFC PATCH v4 02/16] mm: Define __pte_leaf_size() to also take a PMD entry

2024-05-27 Thread Christophe Leroy
On powerpc 8xx, when a page is 8M in size, the information is in the PMD
entry. So allow architectures to provide __pte_leaf_size() instead of
pte_leaf_size(), and pass the PMD entry to that function.

When __pte_leaf_size() is not defined, define it as pte_leaf_size()
so that architectures not interested in the PMD argument are not
impacted.

Only define a default pte_leaf_size() when __pte_leaf_size() is not
defined to make sure nobody adds new calls to pte_leaf_size() in the
core.
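
As an illustration of the opt-in side (hypothetical definition; the
real 8xx one comes with the 8M rework patch later in the series), an
architecture keeping the size in the PMD entry can now do:

	/* Hypothetical: report 8M when the PMD entry carries the 8M flag */
	#define __pte_leaf_size(pmd, pte) \
		(pmd_val(pmd) & _PMD_PAGE_8M ? SZ_8M : pte_leaf_size(pte))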

Signed-off-by: Christophe Leroy 
Reviewed-by: Oscar Salvador 
---
v3: Don't change pte_leaf_size() to not impact other architectures
---
 include/linux/pgtable.h | 3 +++
 kernel/events/core.c| 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 18019f037bae..3080e7cde3de 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1888,9 +1888,12 @@ typedef unsigned int pgtbl_mod_mask;
 #ifndef pmd_leaf_size
 #define pmd_leaf_size(x) PMD_SIZE
 #endif
+#ifndef __pte_leaf_size
 #ifndef pte_leaf_size
 #define pte_leaf_size(x) PAGE_SIZE
 #endif
+#define __pte_leaf_size(x,y) pte_leaf_size(y)
+#endif
 
 /*
  * We always define pmd_pfn for all archs as it's used in lots of generic
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f0128c5ff278..880df84ce07c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7596,7 +7596,7 @@ static u64 perf_get_pgtable_size(struct mm_struct *mm, 
unsigned long addr)
 
pte = ptep_get_lockless(ptep);
if (pte_present(pte))
-   size = pte_leaf_size(pte);
+   size = __pte_leaf_size(pmd, pte);
pte_unmap(ptep);
 #endif /* CONFIG_HAVE_GUP_FAST */
 
-- 
2.44.0



[RFC PATCH v4 01/16] powerpc/64e: Remove unused IBM HTW code [SQUASHED]

2024-05-27 Thread Christophe Leroy
From: Michael Ellerman 

This is a squash of a series from Michael 
https://patchwork.ozlabs.org/project/linuxppc-dev/patch/20240524073141.1637736-1-...@ellerman.id.au/

The nohash HTW_IBM (Hardware Table Walk) code is unused since support
for A2 was removed in commit fb5a515704d7 ("powerpc: Remove platforms/
wsp and associated pieces") (2014).

The remaining supported CPUs use either no HTW (data_tlb_miss_bolted),
or the e6500 HTW (data_tlb_miss_e6500).

Signed-off-by: Michael Ellerman 

powerpc/64e: Split out nohash Book3E 64-bit code

A reasonable chunk of nohash/tlb.c is 64-bit only code, split it out
into a separate file.

Signed-off-by: Michael Ellerman 

powerpc/64e: Drop E500 ifdefs in 64-bit code

All 64-bit Book3E have E500=y, so drop the unneeded ifdefs.

Signed-off-by: Michael Ellerman 

powerpc/64e: Drop MMU_FTR_TYPE_FSL_E checks in 64-bit code

All 64-bit Book3E have MMU_FTR_TYPE_FSL_E, since A2 was removed, so
remove checks for it in 64-bit only code.

Signed-off-by: Michael Ellerman 

powerpc/64e: Consolidate TLB miss handler patching

The 64e TLB miss handler patching is done in setup_mmu_htw(), and then
again immediately afterward in early_init_mmu_global(). Consolidate it
into a single location.

Signed-off-by: Michael Ellerman 

powerpc/64e: Drop unused TLB miss handlers

There are two possibilities for book3e_htw_mode, PPC_HTW_E6500 or
PPC_HTW_NONE.

The TLB miss handlers are patched to use, respectively:
  - exc_[data|instruction]_tlb_miss_e6500_book3e
  - exc_[data|instruction]_tlb_miss_bolted_book3e

Which means the default handlers are never used. Remove those, and use
the bolted handlers (PPC_HTW_NONE) by default.

Signed-off-by: Michael Ellerman 
Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/nohash/mmu-e500.h |   3 +-
 arch/powerpc/kernel/exceptions-64e.S   |   4 +-
 arch/powerpc/kernel/setup_64.c |   6 +-
 arch/powerpc/mm/nohash/Makefile|   2 +-
 arch/powerpc/mm/nohash/tlb.c   | 398 +--
 arch/powerpc/mm/nohash/tlb_64e.c   | 314 +++
 arch/powerpc/mm/nohash/tlb_low_64e.S   | 421 -
 7 files changed, 320 insertions(+), 828 deletions(-)
 create mode 100644 arch/powerpc/mm/nohash/tlb_64e.c

diff --git a/arch/powerpc/include/asm/nohash/mmu-e500.h 
b/arch/powerpc/include/asm/nohash/mmu-e500.h
index 6ddced0415cb..7dc24b8632d7 100644
--- a/arch/powerpc/include/asm/nohash/mmu-e500.h
+++ b/arch/powerpc/include/asm/nohash/mmu-e500.h
@@ -303,8 +303,7 @@ extern unsigned long linear_map_top;
 extern int book3e_htw_mode;
 
 #define PPC_HTW_NONE   0
-#define PPC_HTW_IBM1
-#define PPC_HTW_E6500  2
+#define PPC_HTW_E6500  1
 
 /*
  * 64-bit booke platforms don't load the tlb in the tlb miss handler code.
diff --git a/arch/powerpc/kernel/exceptions-64e.S 
b/arch/powerpc/kernel/exceptions-64e.S
index dcf0591ad3c2..63f6b9f513a4 100644
--- a/arch/powerpc/kernel/exceptions-64e.S
+++ b/arch/powerpc/kernel/exceptions-64e.S
@@ -485,8 +485,8 @@ interrupt_base_book3e:  
/* fake trap */
EXCEPTION_STUB(0x160, decrementer)  /* 0x0900 */
EXCEPTION_STUB(0x180, fixed_interval)   /* 0x0980 */
EXCEPTION_STUB(0x1a0, watchdog) /* 0x09f0 */
-   EXCEPTION_STUB(0x1c0, data_tlb_miss)
-   EXCEPTION_STUB(0x1e0, instruction_tlb_miss)
+   EXCEPTION_STUB(0x1c0, data_tlb_miss_bolted)
+   EXCEPTION_STUB(0x1e0, instruction_tlb_miss_bolted)
EXCEPTION_STUB(0x200, altivec_unavailable)
EXCEPTION_STUB(0x220, altivec_assist)
EXCEPTION_STUB(0x260, perfmon)
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index ae36a129789f..22f83fbbc762 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -696,11 +696,7 @@ __init u64 ppc64_bolted_size(void)
 {
 #ifdef CONFIG_PPC_BOOK3E_64
/* Freescale BookE bolts the entire linear mapping */
-   /* XXX: BookE ppc64_rma_limit setup seems to disagree? */
-   if (early_mmu_has_feature(MMU_FTR_TYPE_FSL_E))
-   return linear_map_top;
-   /* Other BookE, we assume the first GB is bolted */
-   return 1ul << 30;
+   return linear_map_top;
 #else
/* BookS radix, does not take faults on linear mapping */
if (early_radix_enabled())
diff --git a/arch/powerpc/mm/nohash/Makefile b/arch/powerpc/mm/nohash/Makefile
index b3f0498dd42f..90e846f0c46c 100644
--- a/arch/powerpc/mm/nohash/Makefile
+++ b/arch/powerpc/mm/nohash/Makefile
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 
 obj-y  += mmu_context.o tlb.o tlb_low.o kup.o
-obj-$(CONFIG_PPC_BOOK3E_64)+= tlb_low_64e.o book3e_pgtable.o
+obj-$(CONFIG_PPC_BOOK3E_64)+= tlb_64e.o tlb_low_64e.o book3e_pgtable.o
 obj-$(CONFIG_40x)  += 40x.o
 obj-$(CONFIG_44x)  += 44x.o
 obj-$(CONFIG_PPC_8xx)  += 8xx.o
diff --git a/arch/powerpc

Re: [PATCH v2 12/14] mm/treewide: Remove pXd_huge()

2024-05-27 Thread Christophe Leroy


Le 18/03/2024 à 21:04, pet...@redhat.com a écrit :
> From: Peter Xu 
> 
> This API is not used anymore, drop it for the whole tree.

Some documentation remains in v6.10-rc1:

$ git grep -w p.d_huge
Documentation/mm/arch_pgtable_helpers.rst:| pmd_huge  | 
Tests a HugeTLB mapped PMD   |
Documentation/mm/arch_pgtable_helpers.rst:| pud_huge  | 
Tests a HugeTLB mapped PUD   |
arch/x86/mm/pat/set_memory.c:* otherwise 
pmd_present/pmd_huge will return true


Christophe


Re: [RFC PATCH v3 05/16] powerpc/mm: Fix __find_linux_pte() on 32 bits with PMD leaf entries

2024-05-26 Thread Christophe Leroy


Le 27/05/2024 à 06:55, Oscar Salvador a écrit :
> On Sun, May 26, 2024 at 11:22:25AM +0200, Christophe Leroy wrote:
>> Building on 32 bits with pmd_leaf() not returning always false leads
>> to the following error:
>>
>>CC  arch/powerpc/mm/pgtable.o
>> arch/powerpc/mm/pgtable.c: In function '__find_linux_pte':
>> arch/powerpc/mm/pgtable.c:506:1: error: function may return address of local 
>> variable [-Werror=return-local-addr]
>>506 | }
>>| ^
>> arch/powerpc/mm/pgtable.c:394:15: note: declared here
>>394 | pud_t pud, *pudp;
>>|   ^~~
>> arch/powerpc/mm/pgtable.c:394:15: note: declared here
>>
>> This is due to pmd_offset() being a no-op in that case.
>>
>> So rework it for powerpc/32 so that pXd_offset() are used on real
>> pointers and not on on-stack copies.
>>
>> Signed-off-by: Christophe Leroy 
> 
> Maybe this could be folded into the patch that makes pmd_leaf() not returning
> always false, but no strong feelings:

I prefer to keep it separate, the patch introducing pmd_leaf() is 
already big enough.

> 
> Reviewed-by: Oscar Salvador 
> 
> 


Re: [RFC PATCH 4/8] mm: Provide mm_struct and address to huge_ptep_get()

2024-05-26 Thread Christophe Leroy


Le 25/03/2024 à 17:35, Jason Gunthorpe a écrit :
> On Mon, Mar 25, 2024 at 03:55:57PM +0100, Christophe Leroy wrote:
> 
>>   arch/arm64/include/asm/hugetlb.h |  2 +-
>>   fs/hugetlbfs/inode.c |  2 +-
>>   fs/proc/task_mmu.c   |  8 +++---
>>   fs/userfaultfd.c |  2 +-
>>   include/asm-generic/hugetlb.h|  2 +-
>>   include/linux/swapops.h  |  2 +-
>>   mm/damon/vaddr.c |  6 ++---
>>   mm/gup.c |  2 +-
>>   mm/hmm.c |  2 +-
>>   mm/hugetlb.c | 46 
>>   mm/memory-failure.c  |  2 +-
>>   mm/mempolicy.c   |  2 +-
>>   mm/migrate.c |  4 +--
>>   mm/mincore.c |  2 +-
>>   mm/userfaultfd.c |  2 +-
>>   15 files changed, 43 insertions(+), 43 deletions(-)
>>
>> diff --git a/arch/arm64/include/asm/hugetlb.h 
>> b/arch/arm64/include/asm/hugetlb.h
>> index 2ddc33d93b13..1af39a74e791 100644
>> --- a/arch/arm64/include/asm/hugetlb.h
>> +++ b/arch/arm64/include/asm/hugetlb.h
>> @@ -46,7 +46,7 @@ extern pte_t huge_ptep_clear_flush(struct vm_area_struct 
>> *vma,
>>   extern void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
>> pte_t *ptep, unsigned long sz);
>>   #define __HAVE_ARCH_HUGE_PTEP_GET
>> -extern pte_t huge_ptep_get(pte_t *ptep);
>> +extern pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t 
>> *ptep);
> 
> The header changed but not the implementation? This will need to do
> riscv and s390 too.

It is now fixed.

> 
> Though, really, I think the right path is to work toward removing
> huge_ptep_get() from the arch code..
> 
> riscv and arm are doing the same thing - propogating dirty/young bits
> from the contig PTEs to the results. The core code can do this, maybe
> with a ARCH #define opt in.
> 
> s390.. Ouchy - is this because hugetlb wants to pretend that every
> level is encoded as a PTE so it takes the PGD and recodes the flags to
> the PTE layout??
> 
> Jason


[RFC PATCH v3 15/16] powerpc/mm: Remove hugepd leftovers

2024-05-26 Thread Christophe Leroy
All targets have now opted out of CONFIG_ARCH_HAS_HUGEPD, so
remove the leftover code.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/hugetlb.h  |   7 -
 arch/powerpc/include/asm/page.h |   6 -
 arch/powerpc/include/asm/pgtable-be-types.h |  10 -
 arch/powerpc/include/asm/pgtable-types.h|   9 -
 arch/powerpc/mm/hugetlbpage.c   | 412 
 arch/powerpc/mm/init-common.c   |   8 +-
 arch/powerpc/mm/pgtable.c   |  27 +-
 7 files changed, 3 insertions(+), 476 deletions(-)

diff --git a/arch/powerpc/include/asm/hugetlb.h 
b/arch/powerpc/include/asm/hugetlb.h
index e959c26c0b52..18a3028ac3b6 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -30,13 +30,6 @@ static inline int is_hugepage_only_range(struct mm_struct 
*mm,
 }
 #define is_hugepage_only_range is_hugepage_only_range
 
-#ifdef CONFIG_ARCH_HAS_HUGEPD
-#define __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE
-void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
-   unsigned long end, unsigned long floor,
-   unsigned long ceiling);
-#endif
-
 #define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT
 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
 pte_t pte, unsigned long sz);
diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
index c0af246a64ff..83d0a4fc5f75 100644
--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -269,12 +269,6 @@ static inline const void *pfn_to_kaddr(unsigned long pfn)
 #define is_kernel_addr(x)  ((x) >= TASK_SIZE)
 #endif
 
-/*
- * Some number of bits at the level of the page table that points to
- * a hugepte are used to encode the size.  This masks those bits.
- */
-#define HUGEPD_SHIFT_MASK 0x3f
-
 #ifndef __ASSEMBLY__
 
 #ifdef CONFIG_PPC_BOOK3S_64
diff --git a/arch/powerpc/include/asm/pgtable-be-types.h 
b/arch/powerpc/include/asm/pgtable-be-types.h
index 82633200b500..6bd8f89b25dc 100644
--- a/arch/powerpc/include/asm/pgtable-be-types.h
+++ b/arch/powerpc/include/asm/pgtable-be-types.h
@@ -101,14 +101,4 @@ static inline bool pmd_xchg(pmd_t *pmdp, pmd_t old, pmd_t 
new)
return pmd_raw(old) == prev;
 }
 
-#ifdef CONFIG_ARCH_HAS_HUGEPD
-typedef struct { __be64 pdbe; } hugepd_t;
-#define __hugepd(x) ((hugepd_t) { cpu_to_be64(x) })
-
-static inline unsigned long hpd_val(hugepd_t x)
-{
-   return be64_to_cpu(x.pdbe);
-}
-#endif
-
 #endif /* _ASM_POWERPC_PGTABLE_BE_TYPES_H */
diff --git a/arch/powerpc/include/asm/pgtable-types.h 
b/arch/powerpc/include/asm/pgtable-types.h
index db965d98e0ae..7b3d4c592a10 100644
--- a/arch/powerpc/include/asm/pgtable-types.h
+++ b/arch/powerpc/include/asm/pgtable-types.h
@@ -87,13 +87,4 @@ static inline bool pte_xchg(pte_t *ptep, pte_t old, pte_t 
new)
 }
 #endif
 
-#ifdef CONFIG_ARCH_HAS_HUGEPD
-typedef struct { unsigned long pd; } hugepd_t;
-#define __hugepd(x) ((hugepd_t) { (x) })
-static inline unsigned long hpd_val(hugepd_t x)
-{
-   return x.pd;
-}
-#endif
-
 #endif /* _ASM_POWERPC_PGTABLE_TYPES_H */
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 1fe2843f5b12..76846c6014e4 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -28,8 +28,6 @@
 
 bool hugetlb_disabled = false;
 
-#define hugepd_none(hpd)   (hpd_val(hpd) == 0)
-
 #define PTE_T_ORDER(__builtin_ffs(sizeof(pte_basic_t)) - \
 __builtin_ffs(sizeof(void *)))
 
@@ -42,156 +40,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long 
addr, unsigned long s
return __find_linux_pte(mm->pgd, addr, NULL, NULL);
 }
 
-#ifdef CONFIG_ARCH_HAS_HUGEPD
-static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
-  unsigned long address, unsigned int pdshift,
-  unsigned int pshift, spinlock_t *ptl)
-{
-   struct kmem_cache *cachep;
-   pte_t *new;
-   int i;
-   int num_hugepd;
-
-   if (pshift >= pdshift) {
-   cachep = PGT_CACHE(PTE_T_ORDER);
-   num_hugepd = 1 << (pshift - pdshift);
-   } else {
-   cachep = PGT_CACHE(pdshift - pshift);
-   num_hugepd = 1;
-   }
-
-   if (!cachep) {
-   WARN_ONCE(1, "No page table cache created for hugetlb tables");
-   return -ENOMEM;
-   }
-
-   new = kmem_cache_alloc(cachep, pgtable_gfp_flags(mm, GFP_KERNEL));
-
-   BUG_ON(pshift > HUGEPD_SHIFT_MASK);
-   BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);
-
-   if (!new)
-   return -ENOMEM;
-
-   /*
-* Make sure other cpus find the hugepd set only after a
-* properly initialized page table is visible to them.
-* For more details look for comment in __pte_alloc().
-*/
-   smp_wmb();
-
-   spin_lock(ptl

[RFC PATCH v3 16/16] mm: Remove CONFIG_ARCH_HAS_HUGEPD

2024-05-26 Thread Christophe Leroy
powerpc was the only user of CONFIG_ARCH_HAS_HUGEPD and doesn't
use it anymore, so remove all related code.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/mm/hugetlbpage.c |   1 -
 include/linux/hugetlb.h   |   6 --
 mm/Kconfig|  10 
 mm/gup.c  | 105 +-
 mm/pagewalk.c |  57 ++
 5 files changed, 5 insertions(+), 174 deletions(-)

diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 76846c6014e4..6b043180220a 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -78,7 +78,6 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct 
vm_area_struct *vma,
 
return pte_alloc_huge(mm, pmd, addr);
 }
-#endif
 
 #ifdef CONFIG_PPC_BOOK3S_64
 /*
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 77b30a8c6076..f6a509487773 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -20,12 +20,6 @@ struct user_struct;
 struct mmu_gather;
 struct node;
 
-#ifndef CONFIG_ARCH_HAS_HUGEPD
-typedef struct { unsigned long pd; } hugepd_t;
-#define is_hugepd(hugepd) (0)
-#define __hugepd(x) ((hugepd_t) { (x) })
-#endif
-
 void free_huge_folio(struct folio *folio);
 
 #ifdef CONFIG_HUGETLB_PAGE
diff --git a/mm/Kconfig b/mm/Kconfig
index b1448aa81e15..a52f8e3224fb 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1114,16 +1114,6 @@ config DMAPOOL_TEST
 config ARCH_HAS_PTE_SPECIAL
bool
 
-#
-# Some architectures require a special hugepage directory format that is
-# required to support multiple hugepage sizes. For example a4fe3ce76
-# "powerpc/mm: Allow more flexible layouts for hugepage pagetables"
-# introduced it on powerpc.  This allows for a more flexible hugepage
-# pagetable layouts.
-#
-config ARCH_HAS_HUGEPD
-   bool
-
 config MAPPING_DIRTY_HELPERS
 bool
 
diff --git a/mm/gup.c b/mm/gup.c
index 86b5105b82a1..95f121223f04 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2790,89 +2790,6 @@ static int record_subpages(struct page *page, unsigned 
long addr,
return nr;
 }
 
-#ifdef CONFIG_ARCH_HAS_HUGEPD
-static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
- unsigned long sz)
-{
-   unsigned long __boundary = (addr + sz) & ~(sz-1);
-   return (__boundary - 1 < end - 1) ? __boundary : end;
-}
-
-static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
-  unsigned long end, unsigned int flags,
-  struct page **pages, int *nr)
-{
-   unsigned long pte_end;
-   struct page *page;
-   struct folio *folio;
-   pte_t pte;
-   int refs;
-
-   pte_end = (addr + sz) & ~(sz-1);
-   if (pte_end < end)
-   end = pte_end;
-
-   pte = huge_ptep_get(NULL, addr, ptep);
-
-   if (!pte_access_permitted(pte, flags & FOLL_WRITE))
-   return 0;
-
-   /* hugepages are never "special" */
-   VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
-
-   page = nth_page(pte_page(pte), (addr & (sz - 1)) >> PAGE_SHIFT);
-   refs = record_subpages(page, addr, end, pages + *nr);
-
-   folio = try_grab_folio(page, refs, flags);
-   if (!folio)
-   return 0;
-
-   if (unlikely(pte_val(pte) != pte_val(ptep_get(ptep {
-   gup_put_folio(folio, refs, flags);
-   return 0;
-   }
-
-   if (!folio_fast_pin_allowed(folio, flags)) {
-   gup_put_folio(folio, refs, flags);
-   return 0;
-   }
-
-   if (!pte_write(pte) && gup_must_unshare(NULL, flags, >page)) {
-   gup_put_folio(folio, refs, flags);
-   return 0;
-   }
-
-   *nr += refs;
-   folio_set_referenced(folio);
-   return 1;
-}
-
-static int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
-   unsigned int pdshift, unsigned long end, unsigned int flags,
-   struct page **pages, int *nr)
-{
-   pte_t *ptep;
-   unsigned long sz = 1UL << hugepd_shift(hugepd);
-   unsigned long next;
-
-   ptep = hugepte_offset(hugepd, addr, pdshift);
-   do {
-   next = hugepte_addr_end(addr, end, sz);
-   if (!gup_hugepte(ptep, sz, addr, end, flags, pages, nr))
-   return 0;
-   } while (ptep++, addr = next, addr != end);
-
-   return 1;
-}
-#else
-static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
-   unsigned int pdshift, unsigned long end, unsigned int flags,
-   struct page **pages, int *nr)
-{
-   return 0;
-}
-#endif /* CONFIG_ARCH_HAS_HUGEPD */
-
 static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
unsigned long end, unsigned int flags,
struct page **pages, int *nr)
@@ -3026,14 +2943,6 @@ static int gup_p

[RFC PATCH v3 13/16] powerpc/e500: Use contiguous PMD instead of hugepd

2024-05-26 Thread Christophe Leroy
e500 supports many page sizes, among which the following sizes are
currently implemented in the kernel: 4M, 16M, 64M, 256M and 1G.

On e500, TLB miss for hugepages is exclusively handled by SW even
on e6500 which has HW assistance for 4k pages, so there are no
constraints like on the 8xx.

On e500/32, all are at PGD/PMD level and can be handled as
cont-PMD.

On e500/64, smaller ones are on PMD while bigger ones are on PUD.
Again, they can easily be handled as cont-PMD and cont-PUD instead
of hugepd.

Signed-off-by: Christophe Leroy 
---
v3: Add missing pmd_leaf_size() and pud_leaf_size()
---
 .../powerpc/include/asm/nohash/hugetlb-e500.h | 32 +-
 arch/powerpc/include/asm/nohash/pgalloc.h |  2 -
 arch/powerpc/include/asm/nohash/pgtable.h | 43 +
 arch/powerpc/include/asm/nohash/pte-e500.h| 28 +
 arch/powerpc/include/asm/page.h   | 15 +
 arch/powerpc/kernel/head_85xx.S   | 23 +++
 arch/powerpc/mm/hugetlbpage.c |  2 -
 arch/powerpc/mm/nohash/tlb_low_64e.S  | 63 +++
 arch/powerpc/mm/pgtable.c | 31 +
 arch/powerpc/platforms/Kconfig.cputype|  1 -
 10 files changed, 144 insertions(+), 96 deletions(-)

diff --git a/arch/powerpc/include/asm/nohash/hugetlb-e500.h 
b/arch/powerpc/include/asm/nohash/hugetlb-e500.h
index d8e51a3f8557..d30e2a3f129d 100644
--- a/arch/powerpc/include/asm/nohash/hugetlb-e500.h
+++ b/arch/powerpc/include/asm/nohash/hugetlb-e500.h
@@ -2,38 +2,12 @@
 #ifndef _ASM_POWERPC_NOHASH_HUGETLB_E500_H
 #define _ASM_POWERPC_NOHASH_HUGETLB_E500_H
 
-static inline pte_t *hugepd_page(hugepd_t hpd)
-{
-   if (WARN_ON(!hugepd_ok(hpd)))
-   return NULL;
-
-   return (pte_t *)((hpd_val(hpd) & ~HUGEPD_SHIFT_MASK) | PD_HUGE);
-}
-
-static inline unsigned int hugepd_shift(hugepd_t hpd)
-{
-   return hpd_val(hpd) & HUGEPD_SHIFT_MASK;
-}
-
-static inline pte_t *hugepte_offset(hugepd_t hpd, unsigned long addr,
-   unsigned int pdshift)
-{
-   /*
-* On FSL BookE, we have multiple higher-level table entries that
-* point to the same hugepte.  Just use the first one since they're all
-* identical.  So for that case, idx=0.
-*/
-   return hugepd_page(hpd);
-}
+#define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT
+void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
+pte_t pte, unsigned long sz);
 
 void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr);
 
-static inline void hugepd_populate(hugepd_t *hpdp, pte_t *new, unsigned int 
pshift)
-{
-   /* We use the old format for PPC_E500 */
-   *hpdp = __hugepd(((unsigned long)new & ~PD_HUGE) | pshift);
-}
-
 static inline int check_and_get_huge_psize(int shift)
 {
if (shift & 1)  /* Not a power of 4 */
diff --git a/arch/powerpc/include/asm/nohash/pgalloc.h 
b/arch/powerpc/include/asm/nohash/pgalloc.h
index 4b62376318e1..d06efac6d7aa 100644
--- a/arch/powerpc/include/asm/nohash/pgalloc.h
+++ b/arch/powerpc/include/asm/nohash/pgalloc.h
@@ -44,8 +44,6 @@ static inline void pgtable_free(void *table, int shift)
}
 }
 
-#define get_hugepd_cache_index(x)  (x)
-
 static inline void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int 
shift)
 {
unsigned long pgf = (unsigned long)table;
diff --git a/arch/powerpc/include/asm/nohash/pgtable.h 
b/arch/powerpc/include/asm/nohash/pgtable.h
index c4be7754e96f..28ecb2c8b433 100644
--- a/arch/powerpc/include/asm/nohash/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/pgtable.h
@@ -52,11 +52,36 @@ static inline pte_basic_t pte_update(struct mm_struct *mm, 
unsigned long addr, p
 {
pte_basic_t old = pte_val(*p);
pte_basic_t new = (old & ~(pte_basic_t)clr) | set;
+   unsigned long sz;
+   unsigned long pdsize;
+   int i;
 
if (new == old)
return old;
 
-   *p = __pte(new);
+#ifdef CONFIG_PPC_E500
+   if (huge)
+   sz = 1UL << (((old & _PAGE_HSIZE_MSK) >> _PAGE_HSIZE_SHIFT) + 
20);
+   else
+#endif
+   sz = PAGE_SIZE;
+
+   if (!huge || sz < PMD_SIZE)
+   pdsize = PAGE_SIZE;
+   else if (sz < PUD_SIZE)
+   pdsize = PMD_SIZE;
+   else if (sz < P4D_SIZE)
+   pdsize = PUD_SIZE;
+   else if (sz < PGDIR_SIZE)
+   pdsize = P4D_SIZE;
+   else
+   pdsize = PGDIR_SIZE;
+
+   for (i = 0; i < sz / pdsize; i++, p++) {
+   *p = __pte(new);
+   if (new)
+   new += (unsigned long long)(pdsize / PAGE_SIZE) << 
PTE_RPN_SHIFT;
+   }
 
if (IS_ENABLED(CONFIG_44x) && !is_kernel_addr(addr) && (old & 
_PAGE_EXEC))
icache_44x_need_flush = 1;
@@ -340,25 +365,19 @@ static inline void __set_pte_at(struct mm_struct *mm, 
uns

Re: [RFC PATCH 1/8] mm: Provide pagesize to pmd_populate()

2024-05-26 Thread Christophe Leroy


On 25/03/2024 at 17:19, Jason Gunthorpe wrote:
> On Mon, Mar 25, 2024 at 03:55:54PM +0100, Christophe Leroy wrote:
>> Unlike many architectures, powerpc 8xx hardware tablewalk requires
>> a two level process for all page sizes, although the second level only
>> has one entry when pagesize is 8M.
>>
>> To fit with Linux page table topology and without requiring special
>> page directory layout like hugepd, the page entry will be replicated
>> 1024 times in the standard page table. However for large pages it is
>> necessary to set bits in the level-1 (PMD) entry. At the time being,
>> for 512k pages the flag is kept in the PTE and inserted in the PMD
>> entry at TLB miss exception, that is necessary because we can have
>> pages of different sizes in a page table. However the 12 PTE bits are
>> fully used and there is no room for an additional bit for page size.
>>
>> For 8M pages, there will be only one page per PMD entry, it is
>> therefore possible to flag the pagesize in the PMD entry, with the
>> advantage that the information will already be at the right place for
>> the hardware.
>>
>> To do so, add a new helper called pmd_populate_size() which takes the
>> page size as an additional argument, and modify __pte_alloc() to also
>> take that argument. pte_alloc() is left unmodified in order to
>> reduce churn on callers, and a pte_alloc_size() is added for use by
>> pte_alloc_huge().
>>
>> When an architecture doesn't provide pmd_populate_size(),
>> pmd_populate() is used as a fallback.
> 
> I think it would be a good idea to document what the semantic is
> supposed to be for sz?
> 
> Just a general remark, probably nothing for this, but with these new
> arguments the historical naming seems pretty tortured for
> pte_alloc_size().. Something like pmd_populate_leaf(size) as a naming
> scheme would make this more intuitive. Ie pmd_populate_leaf() gives
> you a PMD entry where the entry points to a leaf page table able to
> store folios of at least size.

I removed patches 1 and 2; the _PMD_PAGE_8M bit is now set in the PMD 
entry afterwards, in set_huge_pte_at()

> 
> Anyhow, I thought the edits to the mm helpers were fine, certainly
> much nicer than hugepd. Do you see a path to remove hugepd entirely
> from here?
> 
> Thanks,
> Jason


[RFC PATCH v3 14/16] powerpc/64s: Use contiguous PMD/PUD instead of HUGEPD

2024-05-26 Thread Christophe Leroy
On book3s/64, the only user of hugepd is hash in 4k mode.

All other setups (hash-64k, radix-4k, radix-64k) use leaf PMD/PUD.

Rework hash-4k to use contiguous PMD and PUD instead.

In that setup there are only two huge page sizes: 16M and 16G.

16M sits at PMD level and 16G at PUD level.

pte_update() doesn't know the page size, so let's use the same trick
as hpte_need_flush() to get the page size from the segment properties.
That's not the most efficient way, but let's do that until callers of
pte_update() provide the page size instead of just a huge flag.

Signed-off-by: Christophe Leroy 
---
v3:
- Add missing pmd_leaf_size() and pud_leaf_size()
- More cleanup in hugetlbpage_init()
- Take a page fault when DIRTY or ACCESSED is missing on a hash-4k hugepage
---
 arch/powerpc/include/asm/book3s/64/hash-4k.h  | 15 
 arch/powerpc/include/asm/book3s/64/hash.h | 38 +++
 arch/powerpc/include/asm/book3s/64/hugetlb.h  | 38 ---
 .../include/asm/book3s/64/pgtable-4k.h| 34 -
 .../include/asm/book3s/64/pgtable-64k.h   | 20 --
 arch/powerpc/include/asm/book3s/64/pgtable.h  | 18 +
 arch/powerpc/include/asm/hugetlb.h|  4 ++
 .../powerpc/include/asm/nohash/hugetlb-e500.h |  4 --
 arch/powerpc/include/asm/page.h   |  8 
 arch/powerpc/mm/book3s64/hash_utils.c | 11 --
 arch/powerpc/mm/book3s64/hugetlbpage.c| 10 +
 arch/powerpc/mm/book3s64/pgtable.c| 12 --
 arch/powerpc/mm/hugetlbpage.c | 27 -
 arch/powerpc/mm/pgtable.c |  2 +-
 arch/powerpc/platforms/Kconfig.cputype|  1 -
 15 files changed, 71 insertions(+), 171 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h 
b/arch/powerpc/include/asm/book3s/64/hash-4k.h
index 6472b08fa1b0..c654c376ef8b 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
@@ -74,21 +74,6 @@
 #define remap_4k_pfn(vma, addr, pfn, prot) \
remap_pfn_range((vma), (addr), (pfn), PAGE_SIZE, (prot))
 
-#ifdef CONFIG_HUGETLB_PAGE
-static inline int hash__hugepd_ok(hugepd_t hpd)
-{
-   unsigned long hpdval = hpd_val(hpd);
-   /*
-* if it is not a pte and have hugepd shift mask
-* set, then it is a hugepd directory pointer
-*/
-   if (!(hpdval & _PAGE_PTE) && (hpdval & _PAGE_PRESENT) &&
-   ((hpdval & HUGEPD_SHIFT_MASK) != 0))
-   return true;
-   return false;
-}
-#endif
-
 /*
  * 4K PTE format is different from 64K PTE format. Saving the hash_slot is just
  * a matter of returning the PTE bits that need to be modified. On 64K PTE,
diff --git a/arch/powerpc/include/asm/book3s/64/hash.h 
b/arch/powerpc/include/asm/book3s/64/hash.h
index faf3e3b4e4b2..8202c27afe23 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -4,6 +4,7 @@
 #ifdef __KERNEL__
 
 #include 
+#include 
 
 /*
  * Common bits between 4K and 64K pages in a linux-style PTE.
@@ -161,14 +162,10 @@ extern void hpte_need_flush(struct mm_struct *mm, 
unsigned long addr,
pte_t *ptep, unsigned long pte, int huge);
 unsigned long htab_convert_pte_flags(unsigned long pteflags, unsigned long 
flags);
 /* Atomic PTE updates */
-static inline unsigned long hash__pte_update(struct mm_struct *mm,
-unsigned long addr,
-pte_t *ptep, unsigned long clr,
-unsigned long set,
-int huge)
+static inline unsigned long hash__pte_update_one(pte_t *ptep, unsigned long 
clr,
+unsigned long set)
 {
__be64 old_be, tmp_be;
-   unsigned long old;
 
__asm__ __volatile__(
"1: ldarx   %0,0,%3 # pte_update\n\
@@ -182,11 +179,38 @@ static inline unsigned long hash__pte_update(struct 
mm_struct *mm,
: "r" (ptep), "r" (cpu_to_be64(clr)), "m" (*ptep),
  "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set))
: "cc" );
+
+   return be64_to_cpu(old_be);
+}
+
+static inline unsigned long hash__pte_update(struct mm_struct *mm,
+unsigned long addr,
+pte_t *ptep, unsigned long clr,
+unsigned long set,
+int huge)
+{
+   unsigned long old;
+
+   old = hash__pte_update_one(ptep, clr, set);
+
+   if (IS_ENABLED(CONFIG_PPC_4K_PAGES) && huge) {
+   unsigned int psize = get_slice_psize(mm, addr);
+   int nb, i;
+
+   if (psize == MMU_PAGE_16M)
+   nb = SZ_16M / PMD_SIZE;
+  

[RFC PATCH v3 11/16] powerpc/e500: Switch to 64 bits PGD on 85xx (32 bits)

2024-05-26 Thread Christophe Leroy
Currently, when CONFIG_PTE_64BIT is selected, PTE entries are 64 bits
but PGD entries are still 32 bits.

In order to allow leaf PMD entries, switch the PGD to 64-bit entries.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/pgtable-types.h |  4 
 arch/powerpc/kernel/head_85xx.S  | 10 ++
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/pgtable-types.h 
b/arch/powerpc/include/asm/pgtable-types.h
index 082c85cc09b1..db965d98e0ae 100644
--- a/arch/powerpc/include/asm/pgtable-types.h
+++ b/arch/powerpc/include/asm/pgtable-types.h
@@ -49,7 +49,11 @@ static inline unsigned long pud_val(pud_t x)
 #endif /* CONFIG_PPC64 */
 
 /* PGD level */
+#if defined(CONFIG_PPC_E500) && defined(CONFIG_PTE_64BIT)
+typedef struct { unsigned long long pgd; } pgd_t;
+#else
 typedef struct { unsigned long pgd; } pgd_t;
+#endif
 #define __pgd(x)   ((pgd_t) { (x) })
 static inline unsigned long pgd_val(pgd_t x)
 {
diff --git a/arch/powerpc/kernel/head_85xx.S b/arch/powerpc/kernel/head_85xx.S
index 39724ff5ae1f..a305244afc9f 100644
--- a/arch/powerpc/kernel/head_85xx.S
+++ b/arch/powerpc/kernel/head_85xx.S
@@ -307,8 +307,9 @@ set_ivor:
 #ifdef CONFIG_PTE_64BIT
 #ifdef CONFIG_HUGETLB_PAGE
 #define FIND_PTE   \
-   rlwinm  r12, r10, 13, 19, 29;   /* Compute pgdir/pmd offset */  \
-   lwzx r11, r12, r11; /* Get pgd/pmd entry */ \
+   rlwinm  r12, r10, 14, 18, 28;   /* Compute pgdir/pmd offset */  \
+   add r12, r11, r12;  \
+   lwz r11, 4(r12);/* Get pgd/pmd entry */ \
rlwinm. r12, r11, 0, 0, 20; /* Extract pt base address */   \
blt 1000f;  /* Normal non-huge page */  \
beq 2f; /* Bail if no table */  \
@@ -321,8 +322,9 @@ set_ivor:
 1001:  lwz r11, 4(r12);/* Get pte entry */
 #else
 #define FIND_PTE   \
-   rlwinm  r12, r10, 13, 19, 29;   /* Compute pgdir/pmd offset */  \
-   lwzx r11, r12, r11; /* Get pgd/pmd entry */ \
+   rlwinm  r12, r10, 14, 18, 28;   /* Compute pgdir/pmd offset */  \
+   add r12, r11, r12;  \
+   lwz r11, 4(r12);/* Get pgd/pmd entry */ \
rlwinm. r12, r11, 0, 0, 20; /* Extract pt base address */   \
beq 2f; /* Bail if no table */  \
rlwimi  r12, r10, 23, 20, 28;   /* Compute pte address */   \
-- 
2.44.0



[RFC PATCH v3 10/16] powerpc/e500: Remove enc and ind fields from struct mmu_psize_def

2024-05-26 Thread Christophe Leroy
The enc field is hidden behind BOOK3E_PAGESZ_XX macros, and when you
look closer you realise that this field is nothing else than the value
of shift minus ten.

So remove the enc field and calculate tsize from the shift field.

Also remove the ind field, which is unused.
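A comment-only sanity check of the shift - 10 identity (my
illustration, assuming the BOOK3E_PAGESZ_* enumeration starts at
1K = 0 and increments once per power of two):

	/*
	 *  4K : shift 12 -> 12 - 10 = 2  == BOOK3E_PAGESZ_4K
	 * 16M : shift 24 -> 24 - 10 = 14 == BOOK3E_PAGESZ_16M
	 *  1G : shift 30 -> 30 - 10 = 20 == BOOK3E_PAGESZ_1GB
	 */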

Signed-off-by: Christophe Leroy 
Reviewed-by: Oscar Salvador 
---
 arch/powerpc/include/asm/nohash/mmu-e500.h | 3 ---
 arch/powerpc/mm/nohash/book3e_pgtable.c| 4 ++--
 arch/powerpc/mm/nohash/tlb.c   | 9 +
 arch/powerpc/mm/nohash/tlb_64e.c   | 2 +-
 4 files changed, 4 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/include/asm/nohash/mmu-e500.h 
b/arch/powerpc/include/asm/nohash/mmu-e500.h
index 7dc24b8632d7..b281d9eeaf1e 100644
--- a/arch/powerpc/include/asm/nohash/mmu-e500.h
+++ b/arch/powerpc/include/asm/nohash/mmu-e500.h
@@ -244,14 +244,11 @@ typedef struct {
 /* Page size definitions, common between 32 and 64-bit
  *
  *shift : is the "PAGE_SHIFT" value for that page size
- *penc  : is the pte encoding mask
  *
  */
 struct mmu_psize_def
 {
unsigned intshift;  /* number of bits */
-   unsigned intenc;/* PTE encoding */
-   unsigned intind;/* Corresponding indirect page size shift */
unsigned intflags;
 #define MMU_PAGE_SIZE_DIRECT   0x1 /* Supported as a direct size */
 #define MMU_PAGE_SIZE_INDIRECT 0x2 /* Supported as an indirect size */
diff --git a/arch/powerpc/mm/nohash/book3e_pgtable.c 
b/arch/powerpc/mm/nohash/book3e_pgtable.c
index 1c5e4ecbebeb..ad2a7c26f2a0 100644
--- a/arch/powerpc/mm/nohash/book3e_pgtable.c
+++ b/arch/powerpc/mm/nohash/book3e_pgtable.c
@@ -29,10 +29,10 @@ int __meminit vmemmap_create_mapping(unsigned long start,
_PAGE_KERNEL_RW;
 
/* PTEs only contain page size encodings up to 32M */
-   BUG_ON(mmu_psize_defs[mmu_vmemmap_psize].enc > 0xf);
+   BUG_ON(mmu_psize_defs[mmu_vmemmap_psize].shift - 10 > 0xf);
 
/* Encode the size in the PTE */
-   flags |= mmu_psize_defs[mmu_vmemmap_psize].enc << 8;
+   flags |= (mmu_psize_defs[mmu_vmemmap_psize].shift - 10) << 8;
 
/* For each PTE for that area, map things. Note that we don't
 * increment phys because all PTEs are of the large size and
diff --git a/arch/powerpc/mm/nohash/tlb.c b/arch/powerpc/mm/nohash/tlb.c
index f57dc721d063..b653a7be4cb1 100644
--- a/arch/powerpc/mm/nohash/tlb.c
+++ b/arch/powerpc/mm/nohash/tlb.c
@@ -53,37 +53,30 @@
 struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
[MMU_PAGE_4K] = {
.shift  = 12,
-   .enc= BOOK3E_PAGESZ_4K,
},
[MMU_PAGE_2M] = {
.shift  = 21,
-   .enc= BOOK3E_PAGESZ_2M,
},
[MMU_PAGE_4M] = {
.shift  = 22,
-   .enc= BOOK3E_PAGESZ_4M,
},
[MMU_PAGE_16M] = {
.shift  = 24,
-   .enc= BOOK3E_PAGESZ_16M,
},
[MMU_PAGE_64M] = {
.shift  = 26,
-   .enc= BOOK3E_PAGESZ_64M,
},
[MMU_PAGE_256M] = {
.shift  = 28,
-   .enc= BOOK3E_PAGESZ_256M,
},
[MMU_PAGE_1G] = {
.shift  = 30,
-   .enc= BOOK3E_PAGESZ_1GB,
},
 };
 
 static inline int mmu_get_tsize(int psize)
 {
-   return mmu_psize_defs[psize].enc;
+   return mmu_psize_defs[psize].shift - 10;
 }
 #else
 static inline int mmu_get_tsize(int psize)
diff --git a/arch/powerpc/mm/nohash/tlb_64e.c b/arch/powerpc/mm/nohash/tlb_64e.c
index 053128a5636c..7988238496d7 100644
--- a/arch/powerpc/mm/nohash/tlb_64e.c
+++ b/arch/powerpc/mm/nohash/tlb_64e.c
@@ -53,7 +53,7 @@ int extlb_level_exc;
  */
 void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address)
 {
-   int tsize = mmu_psize_defs[mmu_pte_psize].enc;
+   int tsize = mmu_psize_defs[mmu_pte_psize].shift - 10;
 
if (book3e_htw_mode != PPC_HTW_NONE) {
unsigned long start = address & PMD_MASK;
-- 
2.44.0



[RFC PATCH v3 12/16] powerpc/e500: Encode hugepage size in PTE bits

2024-05-26 Thread Christophe Leroy
Use the U0-U3 bits to encode the hugepage size, more exactly the page
shift. As we start using hugepages at shift 21 (2 Mbytes), subtract 20
so that it fits into 4 bits. That may change in the future if we want
to use smaller hugepages.
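The encode/decode round-trip implied by this patch can be sketched as
follows (illustration only, the hsize_*() helper names are mine).
_PAGE_U3 is the lowest bit of the 4-bit field, so multiplying it by
(shift - 20) places the value at _PAGE_HSIZE_SHIFT:

	/* sketch: store the page shift, biased by 20, in the U0-U3 field */
	static inline pte_t hsize_encode(pte_t pte, unsigned int shift)
	{
		return __pte(pte_val(pte) | (_PAGE_U3 * (shift - 20)));
	}

	/* sketch: recover the size in bytes, as done later in pte_update() */
	static inline unsigned long hsize_decode(pte_t pte)
	{
		return 1UL << (((pte_val(pte) & _PAGE_HSIZE_MSK) >> _PAGE_HSIZE_SHIFT) + 20);
	}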

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/nohash/hugetlb-e500.h | 6 ++
 arch/powerpc/include/asm/nohash/pte-e500.h | 3 +++
 2 files changed, 9 insertions(+)

diff --git a/arch/powerpc/include/asm/nohash/hugetlb-e500.h 
b/arch/powerpc/include/asm/nohash/hugetlb-e500.h
index 8f04ad20e040..d8e51a3f8557 100644
--- a/arch/powerpc/include/asm/nohash/hugetlb-e500.h
+++ b/arch/powerpc/include/asm/nohash/hugetlb-e500.h
@@ -42,4 +42,10 @@ static inline int check_and_get_huge_psize(int shift)
return shift_to_mmu_psize(shift);
 }
 
+static inline pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, 
vm_flags_t flags)
+{
+   return __pte(pte_val(entry) | (_PAGE_U3 * (shift - 20)));
+}
+#define arch_make_huge_pte arch_make_huge_pte
+
 #endif /* _ASM_POWERPC_NOHASH_HUGETLB_E500_H */
diff --git a/arch/powerpc/include/asm/nohash/pte-e500.h 
b/arch/powerpc/include/asm/nohash/pte-e500.h
index 975facc7e38e..091e4bff1fba 100644
--- a/arch/powerpc/include/asm/nohash/pte-e500.h
+++ b/arch/powerpc/include/asm/nohash/pte-e500.h
@@ -46,6 +46,9 @@
 #define _PAGE_NO_CACHE 0x40 /* I: cache inhibit */
 #define _PAGE_WRITETHRU0x80 /* W: cache write-through */
 
+#define _PAGE_HSIZE_MSK (_PAGE_U0 | _PAGE_U1 | _PAGE_U2 | _PAGE_U3)
+#define _PAGE_HSIZE_SHIFT  14
+
 /* "Higher level" linux bit combinations */
 #define _PAGE_EXEC (_PAGE_BAP_SX | _PAGE_BAP_UX) /* .. and was 
cache cleaned */
 #define _PAGE_READ (_PAGE_BAP_SR | _PAGE_BAP_UR) /* User read 
permission */
-- 
2.44.0



[RFC PATCH v3 02/16] mm: Define __pte_leaf_size() to also take a PMD entry

2024-05-26 Thread Christophe Leroy
On powerpc 8xx, when a page is 8M in size, the information is in the
PMD entry. So allow architectures to provide __pte_leaf_size() instead
of pte_leaf_size() and provide the PMD entry to that function.

When __pte_leaf_size() is not defined, define it as pte_leaf_size()
so that architectures not interested in the PMD argument are not
impacted.

Only define a default pte_leaf_size() when __pte_leaf_size() is not
defined, to make sure nobody adds new calls to pte_leaf_size() in the
core.

Signed-off-by: Christophe Leroy 
---
v3: Don't change pte_leaf_size() to not impact other architectures
---
 include/linux/pgtable.h | 3 +++
 kernel/events/core.c| 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 85fc7554cd52..514e05730df1 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1801,9 +1801,12 @@ typedef unsigned int pgtbl_mod_mask;
 #ifndef pmd_leaf_size
 #define pmd_leaf_size(x) PMD_SIZE
 #endif
+#ifndef __pte_leaf_size
 #ifndef pte_leaf_size
 #define pte_leaf_size(x) PAGE_SIZE
 #endif
+#define __pte_leaf_size(x,y) pte_leaf_size(y)
+#endif
 
 /*
  * Some architectures have MMUs that are configurable or selectable at boot
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 724e6d7e128f..d37512f2ebf2 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7585,7 +7585,7 @@ static u64 perf_get_pgtable_size(struct mm_struct *mm, 
unsigned long addr)
 
pte = ptep_get_lockless(ptep);
if (pte_present(pte))
-   size = pte_leaf_size(pte);
+   size = __pte_leaf_size(pmd, pte);
pte_unmap(ptep);
 #endif /* CONFIG_HAVE_FAST_GUP */
 
-- 
2.44.0



[RFC PATCH v3 01/16] powerpc/64e: Remove unused IBM HTW code [SQUASHED]

2024-05-26 Thread Christophe Leroy
From: Michael Ellerman 

This is a squash of a series from Michael: 
https://patchwork.ozlabs.org/project/linuxppc-dev/patch/20240524073141.1637736-1-...@ellerman.id.au/

The nohash HTW_IBM (Hardware Table Walk) code is unused since support
for A2 was removed in commit fb5a515704d7 ("powerpc: Remove platforms/
wsp and associated pieces") (2014).

The remaining supported CPUs use either no HTW (data_tlb_miss_bolted),
or the e6500 HTW (data_tlb_miss_e6500).

Signed-off-by: Michael Ellerman 

powerpc/64e: Split out nohash Book3E 64-bit code

A reasonable chunk of nohash/tlb.c is 64-bit only code, split it out
into a separate file.

Signed-off-by: Michael Ellerman 

powerpc/64e: Drop E500 ifdefs in 64-bit code

All 64-bit Book3E have E500=y, so drop the unneeded ifdefs.

Signed-off-by: Michael Ellerman 

powerpc/64e: Drop MMU_FTR_TYPE_FSL_E checks in 64-bit code

All 64-bit Book3E have MMU_FTR_TYPE_FSL_E, since A2 was removed, so
remove checks for it in 64-bit only code.

Signed-off-by: Michael Ellerman 

powerpc/64e: Consolidate TLB miss handler patching

The 64e TLB miss handler patching is done in setup_mmu_htw(), and then
again immediately afterward in early_init_mmu_global(). Consolidate it
into a single location.

Signed-off-by: Michael Ellerman 

powerpc/64e: Drop unused TLB miss handlers

There are two possibilities for book3e_htw_mode, PPC_HTW_E6500 or
PPC_HTW_NONE.

The TLB miss handlers are patched to use, respectively:
  - exc_[data|instruction]_tlb_miss_e6500_book3e
  - exc_[data|instruction]_tlb_miss_bolted_book3e

Which means the default handlers are never used. Remove those, and use
the bolted handlers (PPC_HTW_NONE) by default.

Signed-off-by: Michael Ellerman 
Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/nohash/mmu-e500.h |   3 +-
 arch/powerpc/kernel/exceptions-64e.S   |   4 +-
 arch/powerpc/kernel/setup_64.c |   6 +-
 arch/powerpc/mm/nohash/Makefile|   2 +-
 arch/powerpc/mm/nohash/tlb.c   | 398 +--
 arch/powerpc/mm/nohash/tlb_64e.c   | 314 +++
 arch/powerpc/mm/nohash/tlb_low_64e.S   | 421 -
 7 files changed, 320 insertions(+), 828 deletions(-)
 create mode 100644 arch/powerpc/mm/nohash/tlb_64e.c

diff --git a/arch/powerpc/include/asm/nohash/mmu-e500.h 
b/arch/powerpc/include/asm/nohash/mmu-e500.h
index 6ddced0415cb..7dc24b8632d7 100644
--- a/arch/powerpc/include/asm/nohash/mmu-e500.h
+++ b/arch/powerpc/include/asm/nohash/mmu-e500.h
@@ -303,8 +303,7 @@ extern unsigned long linear_map_top;
 extern int book3e_htw_mode;
 
 #define PPC_HTW_NONE   0
-#define PPC_HTW_IBM1
-#define PPC_HTW_E6500  2
+#define PPC_HTW_E6500  1
 
 /*
  * 64-bit booke platforms don't load the tlb in the tlb miss handler code.
diff --git a/arch/powerpc/kernel/exceptions-64e.S 
b/arch/powerpc/kernel/exceptions-64e.S
index dcf0591ad3c2..63f6b9f513a4 100644
--- a/arch/powerpc/kernel/exceptions-64e.S
+++ b/arch/powerpc/kernel/exceptions-64e.S
@@ -485,8 +485,8 @@ interrupt_base_book3e:  
/* fake trap */
EXCEPTION_STUB(0x160, decrementer)  /* 0x0900 */
EXCEPTION_STUB(0x180, fixed_interval)   /* 0x0980 */
EXCEPTION_STUB(0x1a0, watchdog) /* 0x09f0 */
-   EXCEPTION_STUB(0x1c0, data_tlb_miss)
-   EXCEPTION_STUB(0x1e0, instruction_tlb_miss)
+   EXCEPTION_STUB(0x1c0, data_tlb_miss_bolted)
+   EXCEPTION_STUB(0x1e0, instruction_tlb_miss_bolted)
EXCEPTION_STUB(0x200, altivec_unavailable)
EXCEPTION_STUB(0x220, altivec_assist)
EXCEPTION_STUB(0x260, perfmon)
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index ae36a129789f..22f83fbbc762 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -696,11 +696,7 @@ __init u64 ppc64_bolted_size(void)
 {
 #ifdef CONFIG_PPC_BOOK3E_64
/* Freescale BookE bolts the entire linear mapping */
-   /* XXX: BookE ppc64_rma_limit setup seems to disagree? */
-   if (early_mmu_has_feature(MMU_FTR_TYPE_FSL_E))
-   return linear_map_top;
-   /* Other BookE, we assume the first GB is bolted */
-   return 1ul << 30;
+   return linear_map_top;
 #else
/* BookS radix, does not take faults on linear mapping */
if (early_radix_enabled())
diff --git a/arch/powerpc/mm/nohash/Makefile b/arch/powerpc/mm/nohash/Makefile
index b3f0498dd42f..90e846f0c46c 100644
--- a/arch/powerpc/mm/nohash/Makefile
+++ b/arch/powerpc/mm/nohash/Makefile
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 
 obj-y  += mmu_context.o tlb.o tlb_low.o kup.o
-obj-$(CONFIG_PPC_BOOK3E_64)+= tlb_low_64e.o book3e_pgtable.o
+obj-$(CONFIG_PPC_BOOK3E_64)+= tlb_64e.o tlb_low_64e.o book3e_pgtable.o
 obj-$(CONFIG_40x)  += 40x.o
 obj-$(CONFIG_44x)  += 44x.o
 obj-$(CONFIG_PPC_8xx)  += 8xx.o
diff --git a/arch/powerpc

[RFC PATCH v3 03/16] mm: Provide mm_struct and address to huge_ptep_get()

2024-05-26 Thread Christophe Leroy
On powerpc 8xx, huge_ptep_get() will need to know whether the given
ptep is a PTE entry or a PMD entry. This cannot be known from the
PMD entry itself because there is no easy way to tell from the
content of the entry.

So huge_ptep_get() will need to know either the size of the page
or get the pmd.

In order to be consistent with huge_ptep_get_and_clear(), give
mm and address to huge_ptep_get().
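For reference, the asm-generic fallback keeps its behaviour and simply
ignores the new arguments, roughly (sketch consistent with the new
interface, not a verbatim copy of the patch):

	static inline pte_t huge_ptep_get(struct mm_struct *mm,
					  unsigned long addr, pte_t *ptep)
	{
		return ptep_get(ptep);
	}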

Signed-off-by: Christophe Leroy 
---
v2: Add missing changes in arch implementations
v3: Fixed a comment in ARM and missing changes in S390
---
 arch/arm/include/asm/hugetlb-3level.h |  4 +--
 arch/arm64/include/asm/hugetlb.h  |  2 +-
 arch/arm64/mm/hugetlbpage.c   |  2 +-
 arch/riscv/include/asm/hugetlb.h  |  2 +-
 arch/riscv/mm/hugetlbpage.c   |  2 +-
 arch/s390/include/asm/hugetlb.h   |  4 +--
 arch/s390/mm/hugetlbpage.c|  4 +--
 fs/hugetlbfs/inode.c  |  2 +-
 fs/proc/task_mmu.c|  8 ++---
 fs/userfaultfd.c  |  2 +-
 include/asm-generic/hugetlb.h |  2 +-
 include/linux/swapops.h   |  2 +-
 mm/damon/vaddr.c  |  6 ++--
 mm/gup.c  |  2 +-
 mm/hmm.c  |  2 +-
 mm/hugetlb.c  | 46 +--
 mm/memory-failure.c   |  2 +-
 mm/mempolicy.c|  2 +-
 mm/migrate.c  |  4 +--
 mm/mincore.c  |  2 +-
 mm/userfaultfd.c  |  2 +-
 21 files changed, 52 insertions(+), 52 deletions(-)

diff --git a/arch/arm/include/asm/hugetlb-3level.h 
b/arch/arm/include/asm/hugetlb-3level.h
index a30be5505793..87d48e2d90ad 100644
--- a/arch/arm/include/asm/hugetlb-3level.h
+++ b/arch/arm/include/asm/hugetlb-3level.h
@@ -13,12 +13,12 @@
 
 /*
  * If our huge pte is non-zero then mark the valid bit.
- * This allows pte_present(huge_ptep_get(ptep)) to return true for non-zero
+ * This allows pte_present(huge_ptep_get(mm,addr,ptep)) to return true for 
non-zero
  * ptes.
  * (The valid bit is automatically cleared by set_pte_at for PROT_NONE ptes).
  */
 #define __HAVE_ARCH_HUGE_PTEP_GET
-static inline pte_t huge_ptep_get(pte_t *ptep)
+static inline pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, 
pte_t *ptep)
 {
pte_t retval = *ptep;
if (pte_val(retval))
diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h
index 2ddc33d93b13..1af39a74e791 100644
--- a/arch/arm64/include/asm/hugetlb.h
+++ b/arch/arm64/include/asm/hugetlb.h
@@ -46,7 +46,7 @@ extern pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
 extern void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
   pte_t *ptep, unsigned long sz);
 #define __HAVE_ARCH_HUGE_PTEP_GET
-extern pte_t huge_ptep_get(pte_t *ptep);
+extern pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t 
*ptep);
 
 void __init arm64_hugetlb_cma_reserve(void);
 
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index b872b003a55f..19c4abde13a3 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -141,7 +141,7 @@ static inline int num_contig_ptes(unsigned long size, 
size_t *pgsize)
return contig_ptes;
 }
 
-pte_t huge_ptep_get(pte_t *ptep)
+pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 {
int ncontig, i;
size_t pgsize;
diff --git a/arch/riscv/include/asm/hugetlb.h b/arch/riscv/include/asm/hugetlb.h
index 22deb7a2a6ec..6321bca08740 100644
--- a/arch/riscv/include/asm/hugetlb.h
+++ b/arch/riscv/include/asm/hugetlb.h
@@ -44,7 +44,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
   pte_t pte, int dirty);
 
 #define __HAVE_ARCH_HUGE_PTEP_GET
-pte_t huge_ptep_get(pte_t *ptep);
+pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
 
 pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags);
 #define arch_make_huge_pte arch_make_huge_pte
diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
index 5ef2a6891158..20bf499044b7 100644
--- a/arch/riscv/mm/hugetlbpage.c
+++ b/arch/riscv/mm/hugetlbpage.c
@@ -3,7 +3,7 @@
 #include 
 
 #ifdef CONFIG_RISCV_ISA_SVNAPOT
-pte_t huge_ptep_get(pte_t *ptep)
+pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 {
unsigned long pte_num;
int i;
diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h
index deb198a61039..3b4835094fd5 100644
--- a/arch/s390/include/asm/hugetlb.h
+++ b/arch/s390/include/asm/hugetlb.h
@@ -19,7 +19,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 pte_t *ptep, pte_t pte, unsigned long sz);
 void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 pte_t *ptep, pte_t pte);
-pte_t huge_ptep_get(pte_t *ptep

[RFC PATCH v3 00/16] Reimplement huge pages without hugepd on powerpc (8xx, e500, book3s/64)

2024-05-26 Thread Christophe Leroy
This is the continuation of the RFC v1 series "Reimplement huge pages
without hugepd on powerpc 8xx". It now gets rid of hugepd completely,
after also handling e500 and book3s/64.

Also see https://github.com/linuxppc/issues/issues/483

Unlike most architectures, powerpc 8xx HW requires a two-level
pagetable topology for all page sizes. So a leaf PMD-contig approach
is not feasible as such.

Possible sizes on 8xx are 4k, 16k, 512k and 8M.

First level (PGD/PMD) covers 4M per entry. For 8M pages, two PMD
entries must point to a single-entry level-2 page table. Until now that
was done using hugepd. This series changes it to use standard page
tables where the entry is replicated 1024 times on each of the two
page tables referred to by the two associated PMD entries for that 8M
page.

For e500 and book3s/64 there are fewer constraints because they are
not tied to the HW-assisted tablewalk like the 8xx is, so it is easier
to use leaf PMDs (and PUDs).

On e500 the supported page sizes are 4M, 16M, 64M, 256M and 1G. All
are at PMD level on e500/32 (mpc85xx), with a mix of PMD and PUD for
e500/64. We encode the page size with 4 available bits in PTE entries.
On e500/32 the PGD entry size is increased to 64 bits in order to
allow leaf-PMD entries, because PTEs are 64 bits on e500.

On book3s/64 only the hash-4k mode is concerned. It supports 16M pages
as cont-PMD and 16G pages as cont-PUD. In the other modes (radix-4k,
radix-64k and hash-64k) the sizes match the PMD and PUD sizes, so those
are just leaf entries. The hash processing makes things a bit more
complex. To ease things, __hash_page_huge() is modified to bail out
when the DIRTY or ACCESSED bits are missing, leaving it to the mm core
to fix them up.

Global changes in v3:
- Removed patches 1 and 2
- Squashed patch 11 into patch 5
- Replaced patches 12 and 13 with a series from Michael
- Reordered patches a bit to have more general patches up front

For more details on changes, see in each patch.

Christophe Leroy (15):
  mm: Define __pte_leaf_size() to also take a PMD entry
  mm: Provide mm_struct and address to huge_ptep_get()
  powerpc/mm: Remove _PAGE_PSIZE
  powerpc/mm: Fix __find_linux_pte() on 32 bits with PMD leaf entries
  powerpc/mm: Allow hugepages without hugepd
  powerpc/8xx: Fix size given to set_huge_pte_at()
  powerpc/8xx: Rework support for 8M pages using contiguous PTE entries
  powerpc/8xx: Simplify struct mmu_psize_def
  powerpc/e500: Remove enc and ind fields from struct mmu_psize_def
  powerpc/e500: Switch to 64 bits PGD on 85xx (32 bits)
  powerpc/e500: Encode hugepage size in PTE bits
  powerpc/e500: Use contiguous PMD instead of hugepd
  powerpc/64s: Use contiguous PMD/PUD instead of HUGEPD
  powerpc/mm: Remove hugepd leftovers
  mm: Remove CONFIG_ARCH_HAS_HUGEPD

Michael Ellerman (1):
  powerpc/64e: Remove unused IBM HTW code [SQUASHED]

 arch/arm/include/asm/hugetlb-3level.h |   4 +-
 arch/arm64/include/asm/hugetlb.h  |   2 +-
 arch/arm64/mm/hugetlbpage.c   |   2 +-
 arch/powerpc/Kconfig  |   1 -
 arch/powerpc/include/asm/book3s/32/pgalloc.h  |   2 -
 arch/powerpc/include/asm/book3s/64/hash-4k.h  |  15 -
 arch/powerpc/include/asm/book3s/64/hash.h |  38 +-
 arch/powerpc/include/asm/book3s/64/hugetlb.h  |  38 --
 .../include/asm/book3s/64/pgtable-4k.h|  34 --
 .../include/asm/book3s/64/pgtable-64k.h   |  20 -
 arch/powerpc/include/asm/book3s/64/pgtable.h  |  18 +
 arch/powerpc/include/asm/hugetlb.h|  15 +-
 .../include/asm/nohash/32/hugetlb-8xx.h   |  38 +-
 arch/powerpc/include/asm/nohash/32/mmu-8xx.h  |   9 +-
 arch/powerpc/include/asm/nohash/32/pte-40x.h  |   3 -
 arch/powerpc/include/asm/nohash/32/pte-44x.h  |   3 -
 arch/powerpc/include/asm/nohash/32/pte-85xx.h |   3 -
 arch/powerpc/include/asm/nohash/32/pte-8xx.h  |  58 ++-
 .../powerpc/include/asm/nohash/hugetlb-e500.h |  36 +-
 arch/powerpc/include/asm/nohash/mmu-e500.h|   6 +-
 arch/powerpc/include/asm/nohash/pgalloc.h |   2 -
 arch/powerpc/include/asm/nohash/pgtable.h |  45 +-
 arch/powerpc/include/asm/nohash/pte-e500.h|  35 +-
 arch/powerpc/include/asm/page.h   |  32 --
 arch/powerpc/include/asm/pgtable-be-types.h   |  10 -
 arch/powerpc/include/asm/pgtable-types.h  |  13 +-
 arch/powerpc/include/asm/pgtable.h|   3 +
 arch/powerpc/kernel/exceptions-64e.S  |   4 +-
 arch/powerpc/kernel/head_85xx.S   |  33 +-
 arch/powerpc/kernel/head_8xx.S|  10 +-
 arch/powerpc/kernel/setup_64.c|   6 +-
 arch/powerpc/mm/book3s64/hash_utils.c |  11 +-
 arch/powerpc/mm/book3s64/hugetlbpage.c|  10 +
 arch/powerpc/mm/book3s64/pgtable.c|  12 -
 arch/powerpc/mm/hugetlbpage.c | 455 +---
 arch/powerpc/mm/init-common.c |   8 +-
 arch/powerpc/mm/kasan/8xx.c   |  21 +-
 arch/powerpc/mm/nohash/8xx.c  |  43 +-
 arch/powerpc/mm/nohash/Makefile  

[RFC PATCH v3 06/16] powerpc/mm: Allow hugepages without hugepd

2024-05-26 Thread Christophe Leroy
In preparation for implementing huge pages on powerpc 8xx without
hugepd, enclose hugepd-related code inside an
ifdef CONFIG_ARCH_HAS_HUGEPD.

This also allows removing some stubs.

Signed-off-by: Christophe Leroy 
---
v3:
- Prepare huge_pte_alloc() for full standard topology, not only for 2-level
- Reordered last part of huge_pte_alloc()
---
 arch/powerpc/include/asm/book3s/32/pgalloc.h |  2 --
 arch/powerpc/include/asm/hugetlb.h   | 10 ++
 arch/powerpc/include/asm/nohash/pgtable.h|  8 +++--
 arch/powerpc/mm/hugetlbpage.c| 33 
 arch/powerpc/mm/pgtable.c|  2 ++
 5 files changed, 42 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/32/pgalloc.h 
b/arch/powerpc/include/asm/book3s/32/pgalloc.h
index dc5c039eb28e..dd4eb3063175 100644
--- a/arch/powerpc/include/asm/book3s/32/pgalloc.h
+++ b/arch/powerpc/include/asm/book3s/32/pgalloc.h
@@ -47,8 +47,6 @@ static inline void pgtable_free(void *table, unsigned 
index_size)
}
 }
 
-#define get_hugepd_cache_index(x)  (x)
-
 static inline void pgtable_free_tlb(struct mmu_gather *tlb,
void *table, int shift)
 {
diff --git a/arch/powerpc/include/asm/hugetlb.h 
b/arch/powerpc/include/asm/hugetlb.h
index ea71f7245a63..79176a499763 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -30,10 +30,12 @@ static inline int is_hugepage_only_range(struct mm_struct 
*mm,
 }
 #define is_hugepage_only_range is_hugepage_only_range
 
+#ifdef CONFIG_ARCH_HAS_HUGEPD
 #define __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE
 void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
unsigned long end, unsigned long floor,
unsigned long ceiling);
+#endif
 
 #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR
 static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
@@ -67,14 +69,6 @@ static inline void flush_hugetlb_page(struct vm_area_struct 
*vma,
 {
 }
 
-#define hugepd_shift(x) 0
-static inline pte_t *hugepte_offset(hugepd_t hpd, unsigned long addr,
-   unsigned pdshift)
-{
-   return NULL;
-}
-
-
 static inline void __init gigantic_hugetlb_cma_reserve(void)
 {
 }
diff --git a/arch/powerpc/include/asm/nohash/pgtable.h 
b/arch/powerpc/include/asm/nohash/pgtable.h
index 427db14292c9..ac3353f7f2ac 100644
--- a/arch/powerpc/include/asm/nohash/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/pgtable.h
@@ -340,7 +340,7 @@ static inline void __set_pte_at(struct mm_struct *mm, 
unsigned long addr,
 
 #define pgprot_writecombine pgprot_noncached_wc
 
-#ifdef CONFIG_HUGETLB_PAGE
+#ifdef CONFIG_ARCH_HAS_HUGEPD
 static inline int hugepd_ok(hugepd_t hpd)
 {
 #ifdef CONFIG_PPC_8xx
@@ -351,6 +351,10 @@ static inline int hugepd_ok(hugepd_t hpd)
 #endif
 }
 
+#define is_hugepd(hpd) (hugepd_ok(hpd))
+#endif
+
+#ifdef CONFIG_HUGETLB_PAGE
 static inline int pmd_huge(pmd_t pmd)
 {
return 0;
@@ -360,8 +364,6 @@ static inline int pud_huge(pud_t pud)
 {
return 0;
 }
-
-#define is_hugepd(hpd) (hugepd_ok(hpd))
 #endif
 
 int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot);
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 594a4b7b2ca2..20fad59ff9f5 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -42,6 +42,7 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long 
addr, unsigned long s
return __find_linux_pte(mm->pgd, addr, NULL, NULL);
 }
 
+#ifdef CONFIG_ARCH_HAS_HUGEPD
 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
   unsigned long address, unsigned int pdshift,
   unsigned int pshift, spinlock_t *ptl)
@@ -193,6 +194,36 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct 
vm_area_struct *vma,
 
return hugepte_offset(*hpdp, addr, pdshift);
 }
+#else
+pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, unsigned long sz)
+{
+   p4d_t *p4d;
+   pud_t *pud;
+   pmd_t *pmd;
+
+   addr &= ~(sz - 1);
+
+   p4d = p4d_offset(pgd_offset(mm, addr), addr);
+   if (!mm_pud_folded(mm) && sz >= P4D_SIZE)
+   return (pte_t *)p4d;
+
+   pud = pud_alloc(mm, p4d, addr);
+   if (!pud)
+   return NULL;
+   if (!mm_pmd_folded(mm) && sz >= PUD_SIZE)
+   return (pte_t *)pud;
+
+   pmd = pmd_alloc(mm, pud, addr);
+   if (!pmd)
+   return NULL;
+
+   if (sz >= PMD_SIZE)
+   return (pte_t *)pmd;
+
+   return pte_alloc_huge(mm, pmd, addr);
+}
+#endif
 
 #ifdef CONFIG_PPC_BOOK3S_64
 /*
@@ -248,6 +279,7 @@ int __init alloc_bootmem_huge_page(struct hstate *h, int 
nid)
return __alloc_bootmem_huge_page(h, nid);
 }
 
+#ifdef CONFIG_ARCH_HAS_HUGEPD
 #ifnd

[RFC PATCH v3 08/16] powerpc/8xx: Rework support for 8M pages using contiguous PTE entries

2024-05-26 Thread Christophe Leroy
In order to fit better with the standard Linux page table layout, add
support for 8M pages using contiguous PTE entries in a standard page
table. Page tables will then be populated with 1024 similar entries,
and the two PMD entries covering an 8M page will each point to their
own such page table.

The PMD entries also get a flag telling that they address an 8M page;
this is required for the HW tablewalk assistance.
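The resulting layout for one 8M page can be pictured like this
(comment-only illustration):

	/*
	 * PMD[n]   (_PMD_PAGE_8M set) -> page table A: 1024 identical PTEs
	 * PMD[n+1] (_PMD_PAGE_8M set) -> page table B: 1024 identical PTEs
	 *
	 * Each PMD entry covers 4M, so the two standard page tables
	 * together map the whole 8M page.
	 */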

Signed-off-by: Christophe Leroy 
---
v3:
- Move huge_ptep_get() for a more readable commit diff
- Flag PMD as 8Mbytes in set_huge_pte_at()
- Define __pte_leaf_size()
- Change pte_update() instead of all huge callers of pte_update()
- Added ptep_is_8m_pmdp() helper
- Fixed kasan early memory 8M allocation
---
 arch/powerpc/Kconfig  |  1 -
 .../include/asm/nohash/32/hugetlb-8xx.h   | 38 +++--
 arch/powerpc/include/asm/nohash/32/pte-8xx.h  | 53 ---
 arch/powerpc/include/asm/nohash/pgtable.h |  4 --
 arch/powerpc/include/asm/page.h   |  5 --
 arch/powerpc/include/asm/pgtable.h|  3 ++
 arch/powerpc/kernel/head_8xx.S| 10 +---
 arch/powerpc/mm/hugetlbpage.c | 18 ---
 arch/powerpc/mm/kasan/8xx.c   | 21 +---
 arch/powerpc/mm/nohash/8xx.c  | 40 +++---
 arch/powerpc/mm/pgtable.c | 27 +++---
 arch/powerpc/mm/pgtable_32.c  |  2 +-
 arch/powerpc/platforms/Kconfig.cputype|  2 +
 13 files changed, 112 insertions(+), 112 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index a1a3b3363008..6a4ea7dad23f 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -135,7 +135,6 @@ config PPC
select ARCH_HAS_DMA_MAP_DIRECT  if PPC_PSERIES
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_GCOV_PROFILE_ALL
-   select ARCH_HAS_HUGEPD  if HUGETLB_PAGE
select ARCH_HAS_KCOV
select ARCH_HAS_MEMBARRIER_CALLBACKS
select ARCH_HAS_MEMBARRIER_SYNC_CORE
diff --git a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h 
b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h
index 92df40c6cc6b..c60219269323 100644
--- a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h
@@ -4,42 +4,12 @@
 
 #define PAGE_SHIFT_8M  23
 
-static inline pte_t *hugepd_page(hugepd_t hpd)
-{
-   BUG_ON(!hugepd_ok(hpd));
-
-   return (pte_t *)__va(hpd_val(hpd) & ~HUGEPD_SHIFT_MASK);
-}
-
-static inline unsigned int hugepd_shift(hugepd_t hpd)
-{
-   return PAGE_SHIFT_8M;
-}
-
-static inline pte_t *hugepte_offset(hugepd_t hpd, unsigned long addr,
-   unsigned int pdshift)
-{
-   unsigned long idx = (addr & (SZ_4M - 1)) >> PAGE_SHIFT;
-
-   return hugepd_page(hpd) + idx;
-}
-
 static inline void flush_hugetlb_page(struct vm_area_struct *vma,
  unsigned long vmaddr)
 {
flush_tlb_page(vma, vmaddr);
 }
 
-static inline void hugepd_populate(hugepd_t *hpdp, pte_t *new, unsigned int 
pshift)
-{
-   *hpdp = __hugepd(__pa(new) | _PMD_USER | _PMD_PRESENT | _PMD_PAGE_8M);
-}
-
-static inline void hugepd_populate_kernel(hugepd_t *hpdp, pte_t *new, unsigned 
int pshift)
-{
-   *hpdp = __hugepd(__pa(new) | _PMD_PRESENT | _PMD_PAGE_8M);
-}
-
 static inline int check_and_get_huge_psize(int shift)
 {
return shift_to_mmu_psize(shift);
@@ -49,6 +19,14 @@ static inline int check_and_get_huge_psize(int shift)
 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
 pte_t pte, unsigned long sz);
 
+#define __HAVE_ARCH_HUGE_PTEP_GET
+static inline pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, 
pte_t *ptep)
+{
+   if (ptep_is_8m_pmdp(mm, addr, ptep))
+   ptep = pte_offset_kernel((pmd_t *)ptep, 0);
+   return ptep_get(ptep);
+}
+
 #define __HAVE_ARCH_HUGE_PTE_CLEAR
 static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
  pte_t *ptep, unsigned long sz)
diff --git a/arch/powerpc/include/asm/nohash/32/pte-8xx.h 
b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
index 625c31d6ce5c..54ebb91dbdcf 100644
--- a/arch/powerpc/include/asm/nohash/32/pte-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
@@ -119,7 +119,7 @@ static inline pte_t pte_mkhuge(pte_t pte)
 
 #define pte_mkhuge pte_mkhuge
 
-static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, 
pte_t *p,
+static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, 
pte_t *ptep,
 unsigned long clr, unsigned long set, int 
huge);
 
 static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long 
addr, pte_t *ptep)
@@ -141,19 +141,12 @@ static inline void __ptep_set_access_flags(struct 
vm_area_struct *vma, pte_t *pt
 }
 #define __ptep_set_access_flags __ptep_set_access_flags
 

[RFC PATCH v3 07/16] powerpc/8xx: Fix size given to set_huge_pte_at()

2024-05-26 Thread Christophe Leroy
set_huge_pte_at() expects the size of the hugepage in bytes, not the
psize, which is the index of the page definition in table
mmu_psize_defs[].

Fixes: 935d4f0c6dc8 ("mm: hugetlb: add huge page size param to 
set_huge_pte_at()")
Signed-off-by: Christophe Leroy 
---
 arch/powerpc/mm/nohash/8xx.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/nohash/8xx.c b/arch/powerpc/mm/nohash/8xx.c
index 43d4842bb1c7..d93433e26ded 100644
--- a/arch/powerpc/mm/nohash/8xx.c
+++ b/arch/powerpc/mm/nohash/8xx.c
@@ -94,7 +94,8 @@ static int __ref __early_map_kernel_hugepage(unsigned long 
va, phys_addr_t pa,
return -EINVAL;
 
set_huge_pte_at(_mm, va, ptep,
-   pte_mkhuge(pfn_pte(pa >> PAGE_SHIFT, prot)), psize);
+   pte_mkhuge(pfn_pte(pa >> PAGE_SHIFT, prot)),
+   1UL << mmu_psize_to_shift(psize));
 
return 0;
 }
-- 
2.44.0



[RFC PATCH v3 04/16] powerpc/mm: Remove _PAGE_PSIZE

2024-05-26 Thread Christophe Leroy
The _PAGE_PSIZE macro is never used outside the place where it is
defined, and is only used on 8xx and e500.

Remove the indirection: remove it and use its content directly.

Signed-off-by: Christophe Leroy 
Reviewed-by: Oscar Salvador 
---
 arch/powerpc/include/asm/nohash/32/pte-40x.h  | 3 ---
 arch/powerpc/include/asm/nohash/32/pte-44x.h  | 3 ---
 arch/powerpc/include/asm/nohash/32/pte-85xx.h | 3 ---
 arch/powerpc/include/asm/nohash/32/pte-8xx.h  | 5 ++---
 arch/powerpc/include/asm/nohash/pte-e500.h| 4 +---
 5 files changed, 3 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/include/asm/nohash/32/pte-40x.h 
b/arch/powerpc/include/asm/nohash/32/pte-40x.h
index d759cfd74754..52ed58516fa4 100644
--- a/arch/powerpc/include/asm/nohash/32/pte-40x.h
+++ b/arch/powerpc/include/asm/nohash/32/pte-40x.h
@@ -49,9 +49,6 @@
 #define _PAGE_EXEC 0x200   /* hardware: EX permission */
 #define _PAGE_ACCESSED 0x400   /* software: R: page referenced */
 
-/* No page size encoding in the linux PTE */
-#define _PAGE_PSIZE0
-
 /* cache related flags non existing on 40x */
 #define _PAGE_COHERENT 0
 
diff --git a/arch/powerpc/include/asm/nohash/32/pte-44x.h 
b/arch/powerpc/include/asm/nohash/32/pte-44x.h
index 851813725237..da0469928273 100644
--- a/arch/powerpc/include/asm/nohash/32/pte-44x.h
+++ b/arch/powerpc/include/asm/nohash/32/pte-44x.h
@@ -75,9 +75,6 @@
 #define _PAGE_NO_CACHE 0x0400  /* H: I bit */
 #define _PAGE_WRITETHRU0x0800  /* H: W bit */
 
-/* No page size encoding in the linux PTE */
-#define _PAGE_PSIZE0
-
 /* TODO: Add large page lowmem mapping support */
 #define _PMD_PRESENT   0
 #define _PMD_PRESENT_MASK (PAGE_MASK)
diff --git a/arch/powerpc/include/asm/nohash/32/pte-85xx.h 
b/arch/powerpc/include/asm/nohash/32/pte-85xx.h
index 653a342d3b25..14d64b4f3f14 100644
--- a/arch/powerpc/include/asm/nohash/32/pte-85xx.h
+++ b/arch/powerpc/include/asm/nohash/32/pte-85xx.h
@@ -31,9 +31,6 @@
 #define _PAGE_WRITETHRU0x00400 /* H: W bit */
 #define _PAGE_SPECIAL  0x00800 /* S: Special page */
 
-/* No page size encoding in the linux PTE */
-#define _PAGE_PSIZE0
-
 #define _PMD_PRESENT   0
 #define _PMD_PRESENT_MASK (PAGE_MASK)
 #define _PMD_BAD   (~PAGE_MASK)
diff --git a/arch/powerpc/include/asm/nohash/32/pte-8xx.h 
b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
index 137dc3c84e45..625c31d6ce5c 100644
--- a/arch/powerpc/include/asm/nohash/32/pte-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
@@ -74,12 +74,11 @@
 #define _PTE_NONE_MASK 0
 
 #ifdef CONFIG_PPC_16K_PAGES
-#define _PAGE_PSIZE_PAGE_SPS
+#define _PAGE_BASE_NC  (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_SPS)
 #else
-#define _PAGE_PSIZE0
+#define _PAGE_BASE_NC  (_PAGE_PRESENT | _PAGE_ACCESSED)
 #endif
 
-#define _PAGE_BASE_NC  (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_PSIZE)
 #define _PAGE_BASE (_PAGE_BASE_NC)
 
 #include 
diff --git a/arch/powerpc/include/asm/nohash/pte-e500.h 
b/arch/powerpc/include/asm/nohash/pte-e500.h
index f516f0b5b7a8..975facc7e38e 100644
--- a/arch/powerpc/include/asm/nohash/pte-e500.h
+++ b/arch/powerpc/include/asm/nohash/pte-e500.h
@@ -65,8 +65,6 @@
 
 #define _PAGE_SPECIAL  _PAGE_SW0
 
-/* Base page size */
-#define _PAGE_PSIZE_PAGE_PSIZE_4K
 #definePTE_RPN_SHIFT   (24)
 
 #define PTE_WIMGE_SHIFT (19)
@@ -89,7 +87,7 @@
  * pages. We always set _PAGE_COHERENT when SMP is enabled or
  * the processor might need it for DMA coherency.
  */
-#define _PAGE_BASE_NC  (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_PSIZE)
+#define _PAGE_BASE_NC  (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_PSIZE_4K)
 #if defined(CONFIG_SMP)
 #define _PAGE_BASE (_PAGE_BASE_NC | _PAGE_COHERENT)
 #else
-- 
2.44.0



[RFC PATCH v3 05/16] powerpc/mm: Fix __find_linux_pte() on 32 bits with PMD leaf entries

2024-05-26 Thread Christophe Leroy
Building on 32 bits with a pmd_leaf() that does not always return
false leads to the following error:

  CC  arch/powerpc/mm/pgtable.o
arch/powerpc/mm/pgtable.c: In function '__find_linux_pte':
arch/powerpc/mm/pgtable.c:506:1: error: function may return address of local 
variable [-Werror=return-local-addr]
  506 | }
  | ^
arch/powerpc/mm/pgtable.c:394:15: note: declared here
  394 | pud_t pud, *pudp;
  |   ^~~
arch/powerpc/mm/pgtable.c:394:15: note: declared here

This is due to pmd_offset() being a no-op in that case.

So rework it for powerpc/32 so that the pXd_offset() helpers are used
on real pointers and not on on-stack copies.

Signed-off-by: Christophe Leroy 
---
v3: Removed p4dp and pudp locals for PPC32 and add a comment.
---
 arch/powerpc/mm/pgtable.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 9e7ba9c3851f..10adef5967a3 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -382,8 +382,10 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
bool *is_thp, unsigned *hpage_shift)
 {
pgd_t *pgdp;
+#ifdef CONFIG_PPC64
p4d_t p4d, *p4dp;
pud_t pud, *pudp;
+#endif
pmd_t pmd, *pmdp;
pte_t *ret_pte;
hugepd_t *hpdp = NULL;
@@ -401,8 +403,12 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
 * page fault or a page unmap. The return pte_t * is still not
 * stable. So should be checked there for above conditions.
 * Top level is an exception because it is folded into p4d.
+*
+* On PPC32, P4D/PUD/PMD are folded into PGD so go straight to
+* PMD level.
 */
pgdp = pgdir + pgd_index(ea);
+#ifdef CONFIG_PPC64
p4dp = p4d_offset(pgdp, ea);
p4d  = READ_ONCE(*p4dp);
pdshift = P4D_SHIFT;
@@ -444,6 +450,9 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
 
pdshift = PMD_SHIFT;
pmdp = pmd_offset(, ea);
+#else
+   pmdp = pmd_offset(pud_offset(p4d_offset(pgdp, ea), ea), ea);
+#endif
pmd  = READ_ONCE(*pmdp);
 
/*
-- 
2.44.0



[RFC PATCH v3 09/16] powerpc/8xx: Simplify struct mmu_psize_def

2024-05-26 Thread Christophe Leroy
On 8xx, only the shift field of struct mmu_psize_def is used.

Remove the other fields and related macros.

Signed-off-by: Christophe Leroy 
Reviewed-by: Oscar Salvador 
---
 arch/powerpc/include/asm/nohash/32/mmu-8xx.h | 9 ++---
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h 
b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h
index 141d82e249a8..a756a1e59c54 100644
--- a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h
@@ -189,19 +189,14 @@ typedef struct {
 
 #define PHYS_IMMR_BASE (mfspr(SPRN_IMMR) & 0xfff8)
 
-/* Page size definitions, common between 32 and 64-bit
+/*
+ * Page size definitions for 8xx
  *
  *shift : is the "PAGE_SHIFT" value for that page size
- *penc  : is the pte encoding mask
  *
  */
 struct mmu_psize_def {
unsigned intshift;  /* number of bits */
-   unsigned intenc;/* PTE encoding */
-   unsigned intind;/* Corresponding indirect page size shift */
-   unsigned intflags;
-#define MMU_PAGE_SIZE_DIRECT   0x1 /* Supported as a direct size */
-#define MMU_PAGE_SIZE_INDIRECT 0x2 /* Supported as an indirect size */
 };
 
 extern struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
-- 
2.44.0



Re: [RFC PATCH v2 15/20] powerpc/85xx: Switch to 64 bits PGD

2024-05-25 Thread Christophe Leroy


On 25/05/2024 at 06:54, Oscar Salvador wrote:
> On Fri, May 17, 2024 at 09:00:09PM +0200, Christophe Leroy wrote:
>> In order to allow leaf PMD entries, switch the PGD to 64 bits entries.
>>
>> Signed-off-by: Christophe Leroy 
> 
> I do not quite understand this change.
> Are not powerE500 and power85xx two different things?

Yes, they are two different things, but one contains the other.

e500 is the processor core which is included inside the MPC85xx 
microcontroller.

But CONFIG_PPC_E500 is a bit more than the e500 core: it also includes 
e5500 and e6500, which are evolutions of the e500.

mpc85xx is 32 bits
e5500 and e6500 are 64 bits



> You are changing making it 64 for PPC_E500_64bits, but you are updating 
> head_85xx.
> Are they sharing this code?

Not exactly. mpc85xx can be built with 32-bit PTEs or 64-bit PTEs, 
based on CONFIG_PTE_64BIT.

When CONFIG_PTE_64BIT is selected, it uses the same PTE layout on 
32-bit and 64-bit. But on 32-bit the PGD is still 32 bits, so it is 
not possible to use leaf entries at PGD level, hence the change.

When CONFIG_PTE_64BIT is not selected, huge pages are not supported.
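To summarise the matrix described above (my recap):

	/*
	 * mpc85xx (32-bit, e500 core) + CONFIG_PTE_64BIT:
	 *	64-bit PTEs, PGD switched to 64-bit entries -> leaf PMD possible
	 * e5500/e6500 (64-bit):
	 *	everything is 64 bits already
	 * mpc85xx without CONFIG_PTE_64BIT:
	 *	no huge page support
	 */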

> 
> Also, we would benefit from a slightly bigger changelog, explaining why
> do we need this change in some more detail.

Yes, I can write that this is because PTEs are 64 bits, although I 
thought it was obvious.

> 
>   
>> diff --git a/arch/powerpc/include/asm/pgtable-types.h 
>> b/arch/powerpc/include/asm/pgtable-types.h
>> index 082c85cc09b1..db965d98e0ae 100644
>> --- a/arch/powerpc/include/asm/pgtable-types.h
>> +++ b/arch/powerpc/include/asm/pgtable-types.h
>> @@ -49,7 +49,11 @@ static inline unsigned long pud_val(pud_t x)
>>   #endif /* CONFIG_PPC64 */
>>   
>>   /* PGD level */
>> +#if defined(CONFIG_PPC_E500) && defined(CONFIG_PTE_64BIT)
>> +typedef struct { unsigned long long pgd; } pgd_t;
>> +#else
>>   typedef struct { unsigned long pgd; } pgd_t;
>> +#endif
>>   #define __pgd(x)   ((pgd_t) { (x) })
>>   static inline unsigned long pgd_val(pgd_t x)
>>   {
>> diff --git a/arch/powerpc/kernel/head_85xx.S 
>> b/arch/powerpc/kernel/head_85xx.S
>> index 39724ff5ae1f..a305244afc9f 100644
>> --- a/arch/powerpc/kernel/head_85xx.S
>> +++ b/arch/powerpc/kernel/head_85xx.S
>> @@ -307,8 +307,9 @@ set_ivor:
>>   #ifdef CONFIG_PTE_64BIT
>>   #ifdef CONFIG_HUGETLB_PAGE
>>   #define FIND_PTE   \
>> -rlwinm  r12, r10, 13, 19, 29;   /* Compute pgdir/pmd offset */  \
>> -lwzx r11, r12, r11;  /* Get pgd/pmd entry */ \
>> +rlwinm  r12, r10, 14, 18, 28;   /* Compute pgdir/pmd offset */  \
>> +add r12, r11, r12;
> 
> You add the offset to pgdir?

Yes, because later r12 points to the PTE, so when it is a leaf PGD 
entry we need r12 to point to that entry.

> 
>> +lwz r11, 4(r12);/* Get pgd/pmd entry */ \
> 
> What is i offset 4?

It is big endian: the entry is now 64 bits, but the real content of 
the entry is still 32 bits, so it is in the lower word.
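A small sketch of why offset 4 is the right one (illustration only):

	/*
	 * 64-bit big-endian PGD entry holding a 32-bit payload:
	 *
	 *   byte offset:   0          4
	 *                 +----------+----------+
	 *                 | 0        | payload  |  -> lwz r11, 4(r12)
	 *                 +----------+----------+
	 */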

> 
> 


Re: [RFC PATCH v2 11/20] powerpc/mm: Complement huge_pte_alloc() for all non HUGEPD setups

2024-05-25 Thread Christophe Leroy


On 25/05/2024 at 06:29, Oscar Salvador wrote:
> On Fri, May 17, 2024 at 09:00:05PM +0200, Christophe Leroy wrote:
>> huge_pte_alloc() for non-HUGEPD targets is reserved for 8xx at the
>> moment. In order to convert other targets for non-HUGEPD, complement
>> huge_pte_alloc() to support any standard cont-PxD setup.
>>
>> Signed-off-by: Christophe Leroy 
>> ---
>>   arch/powerpc/mm/hugetlbpage.c | 25 -
>>   1 file changed, 24 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
>> index 42b12e1ec851..f8aefa1e7363 100644
>> --- a/arch/powerpc/mm/hugetlbpage.c
>> +++ b/arch/powerpc/mm/hugetlbpage.c
>> @@ -195,11 +195,34 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct 
>> vm_area_struct *vma,
>>   pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
>>unsigned long addr, unsigned long sz)
>>   {
>> -pmd_t *pmd = pmd_off(mm, addr);
>> +pgd_t *pgd;
>> +p4d_t *p4d;
>> +pud_t *pud;
>> +pmd_t *pmd;
>> +
>> +addr &= ~(sz - 1);
>> +pgd = pgd_offset(mm, addr);
>> +
>> +p4d = p4d_offset(pgd, addr);
>> +if (sz >= PGDIR_SIZE)
>> +return (pte_t *)p4d;
>> +
>> +pud = pud_alloc(mm, p4d, addr);
>> +if (!pud)
>> +return NULL;
>> +if (sz >= PUD_SIZE)
>> +return (pte_t *)pud;
>> +
>> +pmd = pmd_alloc(mm, pud, addr);
>> +if (!pmd)
>> +return NULL;
>>   
>>  if (sz < PMD_SIZE)
>>  return pte_alloc_huge(mm, pmd, addr, sz);
>>   
>> +if (!IS_ENABLED(CONFIG_PPC_8xx))
>> +return (pte_t *)pmd;
> 
> So only 8xx has cont-PMD for hugepages?

No, all have cont-PMD but only 8xx handles pages greater than PMD_SIZE 
as cont-PTE instead of cont-PMD.

> 
>> +
>>  if (sz != SZ_8M)
>>  return NULL;
> 
> Since this function is the core for allocation huge pages, I think it would
> benefit from a comment at the top explaining the possible layouts.
> e.g: Who can have cont-{P4d,PUD,PMD} etc.
> A brief explanation of the possible scheme for all powerpc platforms.

Everything is standard except 8xx; let's just have a comment for 8xx.

> 
> That would help people looking into this in a future.
> 
>   
> 


Re: [RFC PATCH v2 10/20] powerpc/mm: Fix __find_linux_pte() on 32 bits with PMD leaf entries

2024-05-25 Thread Christophe Leroy


On 25/05/2024 at 06:12, Oscar Salvador wrote:
> On Fri, May 17, 2024 at 09:00:04PM +0200, Christophe Leroy wrote:
>> Building on 32 bits with pmd_leaf() not returning always false leads
>> to the following error:
> 
> I am curious though.
> pmd_leaf is only defined in include/linux/pgtable.h for 32bits, and is 
> hardcoded
> to false.
> I do not see where we change it in previous patches, so is this artificial?

Patch 17 brings pmd_leaf()

> 
>>
>>CC  arch/powerpc/mm/pgtable.o
>> arch/powerpc/mm/pgtable.c: In function '__find_linux_pte':
>> arch/powerpc/mm/pgtable.c:506:1: error: function may return address of local 
>> variable [-Werror=return-local-addr]
>>506 | }
>>| ^
>> arch/powerpc/mm/pgtable.c:394:15: note: declared here
>>394 | pud_t pud, *pudp;
>>|   ^~~
>> arch/powerpc/mm/pgtable.c:394:15: note: declared here
>>
>> This is due to pmd_offset() being a no-op in that case.
> 
> This is because 32bits powerpc include pgtable-nopmd.h?
> 
>> So rework it for powerpc/32 so that pXd_offset() are used on real
>> pointers and not on on-stack copies.
>>
>> Signed-off-by: Christophe Leroy 
>> ---
>>   arch/powerpc/mm/pgtable.c | 14 --
>>   1 file changed, 12 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
>> index 59f0d7706d2f..51ee508eeb5b 100644
>> --- a/arch/powerpc/mm/pgtable.c
>> +++ b/arch/powerpc/mm/pgtable.c
>> @@ -390,8 +390,12 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
>>  bool *is_thp, unsigned *hpage_shift)
>>   {
>>  pgd_t *pgdp;
>> -p4d_t p4d, *p4dp;
>> -pud_t pud, *pudp;
>> +p4d_t *p4dp;
>> +pud_t *pudp;
>> +#ifdef CONFIG_PPC64
>> +p4d_t p4d;
>> +pud_t pud;
>> +#endif
>>  pmd_t pmd, *pmdp;
>>  pte_t *ret_pte;
>>  hugepd_t *hpdp = NULL;
>> @@ -412,6 +416,7 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
>>   */
>>  pgdp = pgdir + pgd_index(ea);
>>  p4dp = p4d_offset(pgdp, ea);
>> +#ifdef CONFIG_PPC64
>>  p4d  = READ_ONCE(*p4dp);
>>  pdshift = P4D_SHIFT;
>>   
>> @@ -452,6 +457,11 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
>>   
>>  pdshift = PMD_SHIFT;
>>  pmdp = pmd_offset(, ea);
>> +#else
>> +p4dp = p4d_offset(pgdp, ea);
>> +pudp = pud_offset(p4dp, ea);
>> +pmdp = pmd_offset(pudp, ea);
> 
> I would drop a comment on top explaining that these are no-op for 32bits,
> otherwise it might not be obvious to people as why this distiction between 64 
> and
> 32bits.

Ok

> 
> Other than that looks good to me
> 
>   
> 


Re: [PATCH V2 8/9] tools/perf: Add support to find global register variables using find_data_type_global_reg

2024-05-24 Thread Christophe Leroy


On 24/05/2024 at 14:17, Athira Rajeev wrote:
> 
> 
>> On 7 May 2024, at 3:33 PM, Christophe Leroy  
>> wrote:
>>
>>
>>
>> On 06/05/2024 at 14:19, Athira Rajeev wrote:
>>> There are cases where define a global register variable and associate it
>>> with a specified register. Example, in powerpc, two registers are
>>> defined to represent variable:
>>> 1. r13: represents local_paca
>>> register struct paca_struct *local_paca asm("r13");
>>>
>>> 2. r1: represents stack_pointer
>>> register void *__stack_pointer asm("r1");
>>
>> What about r2:
>>
>> register struct task_struct *current asm ("r2");
> 
> Hi Christophe,
> 
> Referring to arch/powerpc/include/asm/current.h, “current” in powerpc 64 bit
> is from paca_struct, which is handled with r13.
> The r2 definition which you shared above is for the 32 bit case.
> 

Hi Athira,

Yes I know.

Your patches are meant to handle both powerpc/64 and powerpc/32, aren't they?

Christophe


Re: [RFC PATCH v2 07/20] powerpc/8xx: Rework support for 8M pages using contiguous PTE entries

2024-05-24 Thread Christophe Leroy


Le 24/05/2024 à 12:02, Oscar Salvador a écrit :
> On Fri, May 17, 2024 at 09:00:01PM +0200, Christophe Leroy wrote:
>> In order to fit better with standard Linux page tables layout, add
>> support for 8M pages using contiguous PTE entries in a standard
>> page table. Page tables will then be populated with 1024 similar
>> entries and two PMD entries will point to that page table.
>>
>> The PMD entries also get a flag to tell that they address an 8M page;
>> this is required for the HW tablewalk assistance.
>>
>> Signed-off-by: Christophe Leroy 
> 
> I guess that this will slightly change if you remove patch#1 and patch#2
> as you said you will.
> So I will not comment on the overall design because I do not know how it will
> look afterwards, but just some things that caught my eye

Sure. I should send out a v3 today or tomorrow, once I've done a few
more tests.


> 
>> --- a/arch/powerpc/include/asm/hugetlb.h
>> +++ b/arch/powerpc/include/asm/hugetlb.h
>> @@ -41,7 +41,16 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, 
>> unsigned long addr,
>>   static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
>>  unsigned long addr, pte_t *ptep)
>>   {
>> -return __pte(pte_update(mm, addr, ptep, ~0UL, 0, 1));
>> +pmd_t *pmdp = (pmd_t *)ptep;
>> +pte_t pte;
>> +
>> +if (IS_ENABLED(CONFIG_PPC_8xx) && pmdp == pmd_off(mm, ALIGN_DOWN(addr, SZ_8M))) {
> 
> There are quite a few places where you do the "pmd_off" check to see whether
> that is an 8MB entry.

I refactored the code, now I have only two places with it: pte_update() 
and huge_ptep_get()

By the way it doesn't check that the PMD is 8M, it checks that the ptep
points to the first PMD entry matching the said address.

> I think it would make some sense to have some kind of macro/function to make
> it clearer what we are checking against.
> e.g:
> 
>   #define pmd_is_SZ_8M(mm, addr, pmdp) (pmdp == pmd_off(mm, ALIGN_DOWN(addr, SZ_8M)))
>   (or whatever name you see fit)
>   
> then you would just need
> 
>   if (IS_ENABLED(CONFIG_PPC_8xx) && pmd_is_SZ_8M(mm, addr, pmdp))
> 
> Because I see that is also scattered in 8xx code.
> 
> 
>> +pte = __pte(pte_update(mm, addr, pte_offset_kernel(pmdp, 0), ~0UL, 0, 1));
>> +pte_update(mm, addr, pte_offset_kernel(pmdp + 1, 0), ~0UL, 0, 1);
> 
> I have this fresh one because I recently read about 8xx pagetables, but not
> sure how my memory will survive this, so maybe throw a little comment in
> there that we are pointing the two pmds to the area.

The two PMDs are now pointing to their own areas; we are no longer in
the hugepd case where the PMD was pointing to a single HUGEPD containing
a single HUGEPTE.
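
So it now looks like:

  [PMD#n]   --> [page table A: 1024 identical PTEs] --\
                                                       > same 8M page
  [PMD#n+1] --> [page table B: 1024 identical PTEs] --/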

> 
> Also, the way we pass the parameters here to pte_update() is a bit awkward.
> Ideally we should be using some meaningful names?
> 
>   clr_all_bits = ~0UL
>   set_bits = 0
>   bool is_huge = true
> 
>   pte_update(mm, addr, pte_offset_kernel(pmdp + 1, 0), clr_all_bits, 
> set_bits, is_huge)
> 
> or something along those lines

Well, with my refactoring those functions are not modified anymore so I 
won't change them.

> 
>> -static inline int check_and_get_huge_psize(int shift)
>> -{
>> -return shift_to_mmu_psize(shift);
>> +if (pmdp == pmd_off(mm, ALIGN_DOWN(addr, SZ_8M)))
> 
> Here you could also use the pmd_is_SZ_8M()

Yes, I may do that.

> 
>> +ptep = pte_offset_kernel(pmdp, 0);
>> +return ptep_get(ptep);
>>   }
>>   
>>   #define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT
>> @@ -53,7 +33,14 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long 
>> addr, pte_t *ptep,
>>   static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
>>pte_t *ptep, unsigned long sz)
>>   {
>> -pte_update(mm, addr, ptep, ~0UL, 0, 1);
>> +pmd_t *pmdp = (pmd_t *)ptep;
>> +
>> +if (pmdp == pmd_off(mm, ALIGN_DOWN(addr, SZ_8M))) {
>> +pte_update(mm, addr, pte_offset_kernel(pmdp, 0), ~0UL, 0, 1);
>> +pte_update(mm, addr, pte_offset_kernel(pmdp + 1, 0), ~0UL, 0, 1);
>> +} else {
>> +pte_update(mm, addr, ptep, ~0UL, 0, 1);
>> +}
> 
> Could we not leverage this in huge_ptep_get_and_clear()?

I'm not modifying that anymore

> AFAICS,
> 
>   huge_ptep_get_and_clear(mm, addr, pte_t *p)
>   {
>pte_t pte = *p;
> 
>huge_pte_clear(mm, addr, p);
>return pte;
>   }
> 
> Or maybe it is not tha

Re: [RFC PATCH v2 12/20] powerpc/64e: Remove unneeded #ifdef CONFIG_PPC_E500

2024-05-24 Thread Christophe Leroy


Le 24/05/2024 à 09:31, Michael Ellerman a écrit :
> Christophe Leroy  writes:
>> When it is nohash/64 it can't be anything other than
>> CONFIG_PPC_E500, so remove the #ifdefs as they are always true.
> 
> I have a series doing some similar cleanups, I'll post it. We can decide
> whether to merge it before your series or combine them or whatever.
> 

Great. I'll apply my series on top.

Note that it doesn't apply cleanly on the merge branch (47279113c5d0); a
3-way merge is needed:

$ LANG= git am -3 
~/Téléchargements/1-6-powerpc-64e-Remove-unused-IBM-HTW-code.patch
Applying: powerpc/64e: Remove unused IBM HTW code
Applying: powerpc/64e: Split out nohash Book3E 64-bit code
Using index info to reconstruct a base tree...
M   arch/powerpc/mm/nohash/Makefile
.git/rebase-apply/patch:554: trailing whitespace.
def->shift = 0; 
warning: 1 line adds whitespace errors.
Falling back to patching base and 3-way merge...
Auto-merging arch/powerpc/mm/nohash/Makefile
Applying: powerpc/64e: Drop E500 ifdefs in 64-bit code
Applying: powerpc/64e: Drop MMU_FTR_TYPE_FSL_E checks in 64-bit code
Applying: powerpc/64e: Consolidate TLB miss handler patching
Applying: powerpc/64e: Drop unused TLB miss handlers

Thanks
Christophe


Re: [RFC PATCH v2 06/20] powerpc/8xx: Fix size given to set_huge_pte_at()

2024-05-22 Thread Christophe Leroy
+Peter Z. who added that commit.

Le 22/05/2024 à 10:32, Christophe Leroy a écrit :
> 
> 
> Le 21/05/2024 à 11:26, Oscar Salvador a écrit :
>> On Tue, May 21, 2024 at 10:48:21AM +1000, Michael Ellerman wrote:
>>> Yeah I can. Does it actually cause a bug at runtime (I assume so)?
>>
>> No, currently set_huge_pte_at() from 8xx ignores the 'sz' parameter.
>> But it will be used after this series.
>>
> 
> Ah yes, I mixed things up with something else in my mind.
> 
> So this patch doesn't qualify as a fix and doesn't need to be handled 
> separately from the series and doesn't really need to go on top of the 
> series either, I think it is better to keep it grouped with other 8xx 
> changes.
> 

I remember now, what I had in mind was commit c5eecbb58f65 
("powerpc/8xx: Implement pXX_leaf_size() support")

That commit is buggy, because pgd_leaf() will always return false on 
8xx. First of all pgd_leaf() could only return true on a target with 
P4Ds. Without P4Ds it should just return 0 like pgd_none(), pgd_bad(), 
... as defined in include/asm-generic/pgtable-nop4d.h
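(The generic fallback in include/linux/pgtable.h is along those lines:

	#ifndef pgd_leaf
	#define pgd_leaf(x)	false
	#endif

so without an override pgd_leaf() stays false, and the pgd_leaf_size()
added by that commit is dead code.)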

So it is pmd_leaf_size() that could eventually return something for 8xx.
But as 8xx is using hugepd, in the best case it will return crap, in the
worst case the read will go into the weeds.

To be correct we should add support for hugepd in perf_get_pgtable_size(),
but that's not trivial and this series is aiming at removing hugepd
completely, so there is no point in fixing stuff here, except maybe for
stable?


Re: [RFC PATCH v2 03/20] mm: Provide pmd to pte_leaf_size()

2024-05-22 Thread Christophe Leroy


Le 21/05/2024 à 11:39, Oscar Salvador a écrit :
> On Fri, May 17, 2024 at 08:59:57PM +0200, Christophe Leroy wrote:
>> On powerpc 8xx, when a page is 8M size, the information is in the PMD
>> entry. So provide it to pte_leaf_size().
>>
>> Signed-off-by: Christophe Leroy 
> 
> Overall looks good to me.
> 
> Would be nicer if we could leave the arch code untouched.
> I wanted to see how this would be if we go down that road and focus only
> on 8xx at the risk of being more esoteric.
> pmd_pte_leaf_size() is a hell of a name, but could be replaced
> with __pte_leaf_size for example.
> 
> Worth it? Maybe not, anyway, just wanted to give it a go:

I like the idea, it doesn't look that bad after all, it avoids changes 
to other arches.

> 
> 
>   diff --git a/arch/powerpc/include/asm/nohash/32/pte-8xx.h 
> b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
>   index 137dc3c84e45..9e3fe6e1083f 100644
>   --- a/arch/powerpc/include/asm/nohash/32/pte-8xx.h
>   +++ b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
>   @@ -151,7 +151,7 @@ static inline unsigned long pgd_leaf_size(pgd_t pgd)
>   
>#define pgd_leaf_size pgd_leaf_size
>   
>   -static inline unsigned long pte_leaf_size(pte_t pte)
>   +static inline unsigned long pmd_pte_leaf_size(pmd_t pmd, pte_t pte)
>{
>   pte_basic_t val = pte_val(pte);
>   
>   @@ -162,7 +162,7 @@ static inline unsigned long pte_leaf_size(pte_t pte)
>   return SZ_4K;
>}
>   
>   -#define pte_leaf_size pte_leaf_size
>   +#define pmd_pte_leaf_size pmd_pte_leaf_size
>   
>/*
> * On the 8xx, the page tables are a bit special. For 16k pages, we have
>   diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
>   index 18019f037bae..2bc2fe3b2b53 100644
>   --- a/include/linux/pgtable.h
>   +++ b/include/linux/pgtable.h
>   @@ -1891,6 +1891,9 @@ typedef unsigned int pgtbl_mod_mask;
>#ifndef pte_leaf_size
>#define pte_leaf_size(x) PAGE_SIZE
>#endif
>   +#ifndef pmd_pte_leaf_size
>   +#define pmd_pte_leaf_size(x, y) pte_leaf_size(y)
>   +#endif
>   
>/*
> * We always define pmd_pfn for all archs as it's used in lots of generic
>   diff --git a/kernel/events/core.c b/kernel/events/core.c
>   index f0128c5ff278..e90a547d2fb2 100644
>   --- a/kernel/events/core.c
>   +++ b/kernel/events/core.c
>   @@ -7596,7 +7596,7 @@ static u64 perf_get_pgtable_size(struct mm_struct 
> *mm, unsigned long addr)
>   
>   pte = ptep_get_lockless(ptep);
>   if (pte_present(pte))
>   -   size = pte_leaf_size(pte);
>   +   size = pmd_pte_leaf_size(pmd, pte);
>   pte_unmap(ptep);
>#endif /* CONFIG_HAVE_GUP_FAST */
> 
>   
> 


Re: [RFC PATCH v2 18/20] powerpc/64s: Use contiguous PMD/PUD instead of HUGEPD

2024-05-22 Thread Christophe Leroy


Le 22/05/2024 à 03:13, Nicholas Piggin a écrit :
> On Tue May 21, 2024 at 2:43 AM AEST, Christophe Leroy wrote:
>>
>>
>> Le 20/05/2024 à 14:54, Nicholas Piggin a écrit :
>>> On Sat May 18, 2024 at 5:00 AM AEST, Christophe Leroy wrote:
>>>> On book3s/64, the only user of hugepd is hash in 4k mode.
>>>>
>>>> All other setups (hash-64, radix-4, radix-64) use leaf PMD/PUD.
>>>>
>>>> Rework hash-4k to use contiguous PMD and PUD instead.
>>>>
>>>> In that setup there are only two huge page sizes: 16M and 16G.
>>>>
>>>> 16M sits at PMD level and 16G at PUD level.
>>>>
>>>> pte_update doesn't know page size, let's use the same trick as
>>>> hpte_need_flush() to get page size from segment properties. That's
>>>> not the most efficient way but let's do that until callers of
>>>> pte_update() provide page size instead of just a huge flag.
>>>>
>>>> Signed-off-by: Christophe Leroy 
>>>> ---
>>>>arch/powerpc/include/asm/book3s/64/hash-4k.h  | 15 
>>>>arch/powerpc/include/asm/book3s/64/hash.h | 38 +++
>>>>arch/powerpc/include/asm/book3s/64/hugetlb.h  | 38 ---
>>>>.../include/asm/book3s/64/pgtable-4k.h| 34 -
>>>>.../include/asm/book3s/64/pgtable-64k.h   | 20 --
>>>>arch/powerpc/include/asm/hugetlb.h|  4 ++
>>>>.../include/asm/nohash/32/hugetlb-8xx.h   |  4 --
>>>>.../powerpc/include/asm/nohash/hugetlb-e500.h |  4 --
>>>>arch/powerpc/include/asm/page.h   |  8 
>>>>arch/powerpc/mm/book3s64/hash_utils.c | 11 --
>>>>arch/powerpc/mm/book3s64/pgtable.c| 12 --
>>>>arch/powerpc/mm/hugetlbpage.c | 19 --
>>>>arch/powerpc/mm/pgtable.c |  2 +-
>>>>arch/powerpc/platforms/Kconfig.cputype|  1 -
>>>>14 files changed, 43 insertions(+), 167 deletions(-)
>>>>
>>>> diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h 
>>>> b/arch/powerpc/include/asm/book3s/64/hash-4k.h
>>>> index 6472b08fa1b0..c654c376ef8b 100644
>>>> --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
>>>> +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
>>>> @@ -74,21 +74,6 @@
>>>>#define remap_4k_pfn(vma, addr, pfn, prot)  \
>>>>remap_pfn_range((vma), (addr), (pfn), PAGE_SIZE, (prot))
>>>>
>>>> -#ifdef CONFIG_HUGETLB_PAGE
>>>> -static inline int hash__hugepd_ok(hugepd_t hpd)
>>>> -{
>>>> -  unsigned long hpdval = hpd_val(hpd);
>>>> -  /*
>>>> -   * if it is not a pte and have hugepd shift mask
>>>> -   * set, then it is a hugepd directory pointer
>>>> -   */
>>>> -  if (!(hpdval & _PAGE_PTE) && (hpdval & _PAGE_PRESENT) &&
>>>> -  ((hpdval & HUGEPD_SHIFT_MASK) != 0))
>>>> -  return true;
>>>> -  return false;
>>>> -}
>>>> -#endif
>>>> -
>>>>/*
>>>> * 4K PTE format is different from 64K PTE format. Saving the hash_slot 
>>>> is just
>>>> * a matter of returning the PTE bits that need to be modified. On 64K 
>>>> PTE,
>>>> diff --git a/arch/powerpc/include/asm/book3s/64/hash.h 
>>>> b/arch/powerpc/include/asm/book3s/64/hash.h
>>>> index faf3e3b4e4b2..509811ca7695 100644
>>>> --- a/arch/powerpc/include/asm/book3s/64/hash.h
>>>> +++ b/arch/powerpc/include/asm/book3s/64/hash.h
>>>> @@ -4,6 +4,7 @@
>>>>#ifdef __KERNEL__
>>>>
>>>>#include 
>>>> +#include 
>>>>
>>>>/*
>>>> * Common bits between 4K and 64K pages in a linux-style PTE.
>>>> @@ -161,14 +162,10 @@ extern void hpte_need_flush(struct mm_struct *mm, 
>>>> unsigned long addr,
>>>>pte_t *ptep, unsigned long pte, int huge);
>>>>unsigned long htab_convert_pte_flags(unsigned long pteflags, unsigned 
>>>> long flags);
>>>>/* Atomic PTE updates */
>>>> -static inline unsigned long hash__pte_update(struct mm_struct *mm,
>>>> -   unsigned long addr,
>>

Re: [RFC PATCH 0/8] Reimplement huge pages without hugepd on powerpc 8xx

2024-05-22 Thread Christophe Leroy


Le 17/05/2024 à 16:27, Oscar Salvador a écrit :
> On Mon, Mar 25, 2024 at 03:55:53PM +0100, Christophe Leroy wrote:
>> This series reimplements hugepages without hugepd on powerpc 8xx.
>>
>> Unlike most architectures, powerpc 8xx HW requires a two-level
>> pagetable topology for all page sizes. So a leaf PMD-contig approach
>> is not feasible as such.
>>
>> Possible sizes are 4k, 16k, 512k and 8M.
>>
>> First level (PGD/PMD) covers 4M per entry. For 8M pages, two PMD entries
>> must point to a single entry level-2 page table. Until now that was
>> done using hugepd. This series changes it to use standard page tables
>> where the entry is replicated 1024 times on each of the two pagetables
>> refered by the two associated PMD entries for that 8M page.
>>
>> At the moment it has to look into each helper to know if the
>> hugepage ptep is a PTE or a PMD in order to know whether it is an 8M page
>> or a lower size. I hope this can be handled by core-mm in the future.
>>
>> There are probably several ways to implement stuff, so feedback is
>> very welcome.
> 
> 
> Hi Christophe,
> 
> I have been looking into this because I am interested in the ongoing work of
> the hugetlb unification, but my knowledge of ppc pagetables tends to zero,
> So be prepared for some stupid questions.
> 
> First, let me have a clear picture of the current situation:
> 
> power8xx has 4KB, 16KB, 512KB, and 8MB page sizes, and operates on 2-level
> pagetables. Wiki [1] mentions PGD + PTE, here you seem to be referring to them
> as PMD + PTE though.
> 
> And we can have 1024 PGDs, each of which covers 4MB, so we can cover a
> total of 4GB.
> 
> Looking at the page table diagram for power8xx, it seems power8xx also has
> some sort of CONTIG_PTE? (same as arm64 does) So we can have contig_ptes
> representing bigger page sizes?
> I also guess that although power8xx supports all these different sizes,
> only one of them can be active at any time, right?

Don't know what you mean by "active at any time". In a running system 
with PAGE_SIZE defined as 4k, you can at any time have some hugepages of 
size 16K, some 512K and some 8M.

> 
> It also seems that this whole hugepd thing is only used when we are using 8MB
> PAGE_SIZE, right?

Today yes. In the past it was also used for 512K pages, until commit 
b250c8c08c79 ("powerpc/8xx: Manage 512k huge pages as standard pages.")

> And when that is active, we do have 2 PGDs(4MB each) pointing to the same 8MB
> hugepd.
> E.g:
>  
>   [PGD#0] > ||
> | 8MB hugepd |
>   [PGD#1] > ||
> 
> What you want to do with this work is to get rid of that hugepd abstraction
> because it is something power8xx/hugetlb specific and cannot be represented
> with our "normal" page table layout (PGD,P4D,PUD,PMD,PTE).

It is more than 8xx, it is also used on e500 and book3s/64, but for sure
that's specific to powerpc and it would help to get rid of it completely.

> I did not check, but I guess we cannot walk the hugepd thing with a normal
> page table walker, or can we? (how special is a hugepd? can you describe its
> internal layout?)

depends on what you mean by "normal". For instance 
walk_page_range_novma() handles hugepd.

> 
> So, what you proprose, is something like the following?
> 
>   [PGD#X] ---> [PTE#0]
>   ---> [PTE..#1023]
>   [PGD#Y] ---> [PTE#0]
>   ---> [PTE..#1023]
> 
> so a 8MB hugepage will be covered by PGD#X and PGD#Y using contiguos PTEs.
> 
> The diagram at [1] for 8xx 16K seems a bit misleading to me (or maybe it is
> just me). They say that a Level2 table (aka PTE) covers 4KB chunks regardless
> of the pagesize, but either I read that wrong or..else.

What it means is that when PAGE_SIZE is 16k, the pte_t is a table of 4
longs, see
https://elixir.bootlin.com/linux/v6.9/source/arch/powerpc/include/asm/pgtable-types.h#L11
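
From memory the definition is something like this (see the link above for
the authoritative version):

	#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PPC_16K_PAGES)
	typedef struct { pte_basic_t pte, pte1, pte2, pte3; } pte_t;
	#else
	typedef struct { pte_basic_t pte; } pte_t;
	#endif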

> Because on 16K page size, they show that each pte covers a 16KB memory chunk.
> But that would mean 16KB << 10 = 16M each PGD, which is not really that, so
> what is the deal there? Or is it just that we always use 4KB PTEs, and use
> contiguous PTEs for bigger sizes?

In a way yes, we cheat the HW by defining a PTE as a table of 4 u32 
values to behave like a cont-PTE.

> 
> Now, it seems that power8xx has no-p4d, no-pud and no-pmd, right?
> 
> Peter mentioned that we should have something like:
> 
> X   X
> [PGD] - [P4D] - [PUD] - [PMD] - [PTE]
> 
> where the PMD and PTE would be the ones we use for representing the 2Lev

Re: [RFC PATCH v2 06/20] powerpc/8xx: Fix size given to set_huge_pte_at()

2024-05-22 Thread Christophe Leroy


Le 20/05/2024 à 19:42, Oscar Salvador a écrit :
> On Mon, May 20, 2024 at 04:31:39PM +0000, Christophe Leroy wrote:
>> Hi Oscar, hi Michael,
>>
>> Le 20/05/2024 à 11:14, Oscar Salvador a écrit :
>>> On Fri, May 17, 2024 at 09:00:00PM +0200, Christophe Leroy wrote:
>>>> set_huge_pte_at() expects the real page size, not the psize which is
>>>
>>> "expects the size of the huge page" sounds bettter?
>>
>> Parameter 'psize' already provides the size of the hugepage, but not in
>> the way set_huge_pte_at() expects it.
>>
>> psize has one of the values defined by MMU_PAGE_XXX macros defined in
>> arch/powerpc/include/asm/mmu.h while set_huge_pte_at() expects the size
>> as a value.
> 
> Yes, psize is an index, which is not a size by itself but used to get
> mmu_psize_def.shift to see the actual size, I guess.
> This is why I thought that being explicit about "expects the size of the
> huge page" was better.
> 
> But no strong feelings here.
> 

Thanks, I'll try a rephrase.

Christophe


Re: [RFC PATCH v2 01/20] mm: Provide pagesize to pmd_populate()

2024-05-22 Thread Christophe Leroy


Le 21/05/2024 à 13:57, Oscar Salvador a écrit :
> On Mon, May 20, 2024 at 04:24:51PM +0000, Christophe Leroy wrote:
>> I had a quick look at that document and it seems to provide a good
>> summary of MMU features and principles. However there is some
>> theoretical information which is not fully right in practice. For
>> instance when they say "Segment attributes. These fields define
>> attributes common to all pages in this segment.". This is right in
>> theory if you consider it from Linux page table topology point of view,
>> hence what they call a segment is a PMD entry for Linux. However, in
>> practice each page has its own L1 and L2 attributes and there is no
>> requirement at HW level to have all L1 attributes of all pages of a
>> segment the same.
> 
> Thanks for taking the time Christophe, highly appreciated.
> 
>   
>> rlwimi = Rotate Left Word Immediate then Mask Insert. Here it rotates
>> r10 by 23 bits to the left (or 9 to the right) then masks with
>> _PMD_PAGE_512K and inserts it into r11.
>>
>> It means _PAGE_HUGE bit is copied into lower bit of PS attribute.
>>
>> PS takes the following values:
>>
>> PS = 00 ==> Small page (4k or 16k)
>> PS = 01 ==> 512k page
>> PS = 10 ==> Undefined
>> PS = 11 ==> 8M page
> 
> I see, thanks for the explanation.
> 
>> That's an RFC, all ideas are welcome, I needed something to replace
>> hugepd_populate()
> 
> The only user interested in pmd_populate() having a sz parameter
> is 8xx because it will toggle _PMD_PAGE_8M in case of a 8MB mapping.
> 
> Would it be possible for 8xx to encode the 'sz' in the *pmd pointer
> prior to calling down the chain? (something like what we do for PTR_ERR()).
> Then pmd_populate_{kernel_}size() from 8xx, would extract it like:
> 
>   unsigned long sz = PTR_SIZE(pmd)
> 
> Then we would not need all these 'sz' parameters scattered.
> 
> Can that work?

Indeed _PMD_PAGE_8M can be set in set_huge_pte_at(), no need to do it 
atomically as part of pmd_populate, so I'll drop patches 1 and 2.
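
i.e. something like this at the top of set_huge_pte_at() (untested sketch,
assuming the flag keeps the _PMD_PAGE_8M name):

	if (IS_ENABLED(CONFIG_PPC_8xx) && sz == SZ_8M) {
		pmd_t *pmdp = pmd_off(mm, addr);

		/* Flag both PMD entries covering the 8M page */
		*pmdp = __pmd(pmd_val(*pmdp) | _PMD_PAGE_8M);
		*(pmdp + 1) = __pmd(pmd_val(*(pmdp + 1)) | _PMD_PAGE_8M);
	}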

> 
> 
> PS: Do you know a way to emulate an 8xx VM? qemu does not seem to have
> support for it.
> 

I don't know any way. You are right that 8xx is not supported by QEMU 
unfortunately. I don't know how difficult it would be to add it to QEMU.

Christophe


Re: [RFC PATCH v2 06/20] powerpc/8xx: Fix size given to set_huge_pte_at()

2024-05-22 Thread Christophe Leroy


Le 21/05/2024 à 11:26, Oscar Salvador a écrit :
> On Tue, May 21, 2024 at 10:48:21AM +1000, Michael Ellerman wrote:
>> Yeah I can. Does it actually cause a bug at runtime (I assume so)?
> 
> No, currently set_huge_pte_at() from 8xx ignores the 'sz' parameter.
> But it will be used after this series.
> 

Ah yes, I mixed things up with something else in my mind.

So this patch doesn't qualify as a fix and doesn't need to be handled 
separately from the series and doesn't really need to go on top of the 
series either, I think it is better to keep it grouped with other 8xx 
changes.

Christophe


Re: [RFC PATCH v2 18/20] powerpc/64s: Use contiguous PMD/PUD instead of HUGEPD

2024-05-20 Thread Christophe Leroy


Le 20/05/2024 à 14:54, Nicholas Piggin a écrit :
> On Sat May 18, 2024 at 5:00 AM AEST, Christophe Leroy wrote:
>> On book3s/64, the only user of hugepd is hash in 4k mode.
>>
>> All other setups (hash-64, radix-4, radix-64) use leaf PMD/PUD.
>>
>> Rework hash-4k to use contiguous PMD and PUD instead.
>>
>> In that setup there are only two huge page sizes: 16M and 16G.
>>
>> 16M sits at PMD level and 16G at PUD level.
>>
>> pte_update doesn't know page size, let's use the same trick as
>> hpte_need_flush() to get page size from segment properties. That's
>> not the most efficient way but let's do that until callers of
>> pte_update() provide page size instead of just a huge flag.
>>
>> Signed-off-by: Christophe Leroy 
>> ---
>>   arch/powerpc/include/asm/book3s/64/hash-4k.h  | 15 
>>   arch/powerpc/include/asm/book3s/64/hash.h | 38 +++
>>   arch/powerpc/include/asm/book3s/64/hugetlb.h  | 38 ---
>>   .../include/asm/book3s/64/pgtable-4k.h| 34 -
>>   .../include/asm/book3s/64/pgtable-64k.h   | 20 --
>>   arch/powerpc/include/asm/hugetlb.h|  4 ++
>>   .../include/asm/nohash/32/hugetlb-8xx.h   |  4 --
>>   .../powerpc/include/asm/nohash/hugetlb-e500.h |  4 --
>>   arch/powerpc/include/asm/page.h   |  8 
>>   arch/powerpc/mm/book3s64/hash_utils.c | 11 --
>>   arch/powerpc/mm/book3s64/pgtable.c| 12 --
>>   arch/powerpc/mm/hugetlbpage.c | 19 --
>>   arch/powerpc/mm/pgtable.c |  2 +-
>>   arch/powerpc/platforms/Kconfig.cputype|  1 -
>>   14 files changed, 43 insertions(+), 167 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h 
>> b/arch/powerpc/include/asm/book3s/64/hash-4k.h
>> index 6472b08fa1b0..c654c376ef8b 100644
>> --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
>> +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
>> @@ -74,21 +74,6 @@
>>   #define remap_4k_pfn(vma, addr, pfn, prot) \
>>  remap_pfn_range((vma), (addr), (pfn), PAGE_SIZE, (prot))
>>   
>> -#ifdef CONFIG_HUGETLB_PAGE
>> -static inline int hash__hugepd_ok(hugepd_t hpd)
>> -{
>> -unsigned long hpdval = hpd_val(hpd);
>> -/*
>> - * if it is not a pte and have hugepd shift mask
>> - * set, then it is a hugepd directory pointer
>> - */
>> -if (!(hpdval & _PAGE_PTE) && (hpdval & _PAGE_PRESENT) &&
>> -((hpdval & HUGEPD_SHIFT_MASK) != 0))
>> -return true;
>> -return false;
>> -}
>> -#endif
>> -
>>   /*
>>* 4K PTE format is different from 64K PTE format. Saving the hash_slot is 
>> just
>>* a matter of returning the PTE bits that need to be modified. On 64K PTE,
>> diff --git a/arch/powerpc/include/asm/book3s/64/hash.h 
>> b/arch/powerpc/include/asm/book3s/64/hash.h
>> index faf3e3b4e4b2..509811ca7695 100644
>> --- a/arch/powerpc/include/asm/book3s/64/hash.h
>> +++ b/arch/powerpc/include/asm/book3s/64/hash.h
>> @@ -4,6 +4,7 @@
>>   #ifdef __KERNEL__
>>   
>>   #include 
>> +#include 
>>   
>>   /*
>>* Common bits between 4K and 64K pages in a linux-style PTE.
>> @@ -161,14 +162,10 @@ extern void hpte_need_flush(struct mm_struct *mm, 
>> unsigned long addr,
>>  pte_t *ptep, unsigned long pte, int huge);
>>   unsigned long htab_convert_pte_flags(unsigned long pteflags, unsigned long 
>> flags);
>>   /* Atomic PTE updates */
>> -static inline unsigned long hash__pte_update(struct mm_struct *mm,
>> - unsigned long addr,
>> - pte_t *ptep, unsigned long clr,
>> - unsigned long set,
>> - int huge)
>> +static inline unsigned long hash__pte_update_one(pte_t *ptep, unsigned long 
>> clr,
>> + unsigned long set)
>>   {
>>  __be64 old_be, tmp_be;
>> -unsigned long old;
>>   
>>  __asm__ __volatile__(
>>  "1: ldarx   %0,0,%3 # pte_update\n\
>> @@ -182,11 +179,38 @@ static inline unsigned long hash__pte_update(struct 
>> mm_struct *mm,
>>  : "r" (ptep), "r" (cpu_to_be64(clr)), "m" (*ptep),
>>"r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set))
>>  : "cc" );

Re: [RFC PATCH v2 06/20] powerpc/8xx: Fix size given to set_huge_pte_at()

2024-05-20 Thread Christophe Leroy
Hi Oscar, hi Michael,

Le 20/05/2024 à 11:14, Oscar Salvador a écrit :
> On Fri, May 17, 2024 at 09:00:00PM +0200, Christophe Leroy wrote:
>> set_huge_pte_at() expects the real page size, not the psize which is
> 
> "expects the size of the huge page" sounds bettter?

Parameter 'psize' already provides the size of the hugepage, but not in
the way set_huge_pte_at() expects it.

psize has one of the values defined by MMU_PAGE_XXX macros defined in 
arch/powerpc/include/asm/mmu.h while set_huge_pte_at() expects the size 
as a value.


> 
>> the index of the page definition in table mmu_psize_defs[]
>>
>> Fixes: 935d4f0c6dc8 ("mm: hugetlb: add huge page size param to 
>> set_huge_pte_at()")
>> Signed-off-by: Christophe Leroy 
> 
> Reviewed-by: Oscar Salvador 
> 
> AFAICS, this fixup is not related to the series, right? (yes, you will
> use the parameter later)
> I would have it at the very beginning of the series.

You are right, I should have submitted it separately.

Michael, can you take it as a fix for 6.10?

> 
> 
>> ---
>>   arch/powerpc/mm/nohash/8xx.c | 3 ++-
>>   1 file changed, 2 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/powerpc/mm/nohash/8xx.c b/arch/powerpc/mm/nohash/8xx.c
>> index 43d4842bb1c7..d93433e26ded 100644
>> --- a/arch/powerpc/mm/nohash/8xx.c
>> +++ b/arch/powerpc/mm/nohash/8xx.c
>> @@ -94,7 +94,8 @@ static int __ref __early_map_kernel_hugepage(unsigned long va, phys_addr_t pa,
>>  return -EINVAL;
>>   
>>  set_huge_pte_at(&init_mm, va, ptep,
>> -pte_mkhuge(pfn_pte(pa >> PAGE_SHIFT, prot)), psize);
>> +pte_mkhuge(pfn_pte(pa >> PAGE_SHIFT, prot)),
>> +1UL << mmu_psize_to_shift(psize));
>>   
>>  return 0;
>>   }
>> -- 
>> 2.44.0
>>
> 


Re: [RFC PATCH v2 01/20] mm: Provide pagesize to pmd_populate()

2024-05-20 Thread Christophe Leroy


Le 20/05/2024 à 11:01, Oscar Salvador a écrit :
> On Fri, May 17, 2024 at 08:59:55PM +0200, Christophe Leroy wrote:
>> Unlike many architectures, powerpc 8xx hardware tablewalk requires
>> a two-level process for all page sizes, although the second level only
>> has one entry when the pagesize is 8M.
> 
> So, I went on a quick reading on
> 
> https://www.nxp.com/docs/en/application-note-software/AN3066.pdf
> 
> to get more insight, and I realized that some of the questions I made
> in v1 were quite dumb.

I had a quick look at that document and it seems to provide a good 
summary of MMU features and principles. However there is some
theoretical information which is not fully right in practice. For
instance when they say "Segment attributes. These fields define 
attributes common to all pages in this segment.". This is right in 
theory if you consider it from Linux page table topology point of view, 
hence what they call a segment is a PMD entry for Linux. However, in 
practice each page has its own L1 and L2 attributes and there is no
requirement at HW level to have all L1 attributes of all pages of a 
segment the same.

> 
>>
>> To fit with Linux page table topology and without requiring special
>> page directory layout like hugepd, the page entry will be replicated
>> 1024 times in the standard page table. However for large pages it is
> 
> You only have to replicate 1024 times in case the page size is 4KB, and you
> will have to replicate that twice and have 2 PMDs pointing to it, right?

Indeed.

> 
> For 16KB, you will have the PMD containing 512 entries of 16KB.

Exactly.
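
(The arithmetic: with 4KB pages a page table maps 1024 x 4K = 4M, so an 8M
page needs two tables of 1024 identical entries each; with 16KB pages a
single table maps 512 x 16K = 8M, so 512 identical entries are enough.)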

> 
>> necessary to set bits in the level-1 (PMD) entry. At the time being,
>> for 512k pages the flag is kept in the PTE and inserted in the PMD
>> entry at TLB miss exception, that is necessary because we can have
> 
>   rlwimi  r11, r10, 32 - 9, _PMD_PAGE_512K

rlwimi = Rotate Left Word Immediate then Mask Insert. Here it rotates 
r10 by 23 bits to the left (or 9 to the right) then masks with 
_PMD_PAGE_512K and inserts it into r11.

It means _PAGE_HUGE bit is copied into lower bit of PS attribute.

PS takes the following values:

PS = 00 ==> Small page (4k or 16k)
PS = 01 ==> 512k page
PS = 10 ==> Undefined
PS = 11 ==> 8M page
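
In C terms the rlwimi boils down to something like:

	/* conceptual equivalent, not the real TLB miss code */
	if (pte_val(pte) & _PAGE_HUGE)
		twc |= _PMD_PAGE_512K;	/* PS: 00 -> 01, i.e. 512k page */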

>   mtspr   SPRN_MI_TWC, r11
> 
> So we shift the value and compare it to _PMD_PAGE_512K to see if the PTE
> is a 512K page, and then we set it to SPRN_MI_TWC which I guess is some
> CPU special register?

TWC is where you store the Level 1 attributes, see figure 3 in the 
document you mentioned.

> 
>> pages of different sizes in a page table. However the 12 PTE bits are
>> fully used and there is no room for an additional bit for page size.
> 
> You are referring to the bits in
> arch/powerpc/include/asm/nohash/32/pte-8xx.h ?

Yes, pages are 4k so only the 12 lower bits are available to encode PTE
bits and all are used.

> 
>> For 8M pages, there will be only one page per PMD entry, it is
>> therefore possible to flag the pagesize in the PMD entry, with the
> 
> I am confused, and it might be just terminology, or I am getting wrong
> the design.
> You say that for 8MB pages, there will be one page per PMD entry, but
> based on the above, you will have 1024 entries (replicated)?
> So, maybe this wanted to be read as "there will be only one page size per PMD
> entry".

You have 1024 entries in the PTE table. The PMD entry points to that
table where all 1024 entries are the same because they all define the
same (half) of an 8M page.

So you are also right, there is only one page size because there is only 
one 8M page.
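
Filling one of the two page tables boils down to something like
(simplified, untested):

	int i;

	/* replicate the same PTE over the whole page table */
	for (i = 0; i < SZ_4M / PAGE_SIZE; i++)
		ptep[i] = entry;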

> 
>> advantage that the information will already be at the right place for
>> the hardware.
>>
>> To do so, add a new helper called pmd_populate_size() which takes the
>> page size as an additional argument, and modify __pte_alloc() to also
> 
> "page size" makes me think of the standard page size the kernel is
> operating on (aka PAGE_SIZE), but it is actually the size of the huge
> page, so I think we should clarify it.

Page size means "size of the page".

> 
>> take that argument. pte_alloc() is left unmodified in order to
>> reduce churn on callers, and a pte_alloc_size() is added for use by
>> pte_alloc_huge().
>>
>> When an architecture doesn't provide pmd_populate_size(),
>> pmd_populate() is used as a fallback.
> 
> It is a bit unfortunate that we have to touch the code for other
> architectures (in patch#2)

That's an RFC, all ideas are welcome, I needed something to replace
hugepd_populate()

> 
>> Signed-off-by: Christophe Leroy 
> 
> So far I only looked at this patch and patch#2, and code-wise it looks good
> and makes sense, but I fin

Re: [RFC PATCH v2 00/20] Reimplement huge pages without hugepd on powerpc (8xx, e500, book3s/64)

2024-05-18 Thread Christophe Leroy


Le 17/05/2024 à 21:06, Jason Gunthorpe a écrit :
> On Fri, May 17, 2024 at 08:59:54PM +0200, Christophe Leroy wrote:
>> This is the continuation of the RFC v1 series "Reimplement huge pages
>> without hugepd on powerpc 8xx". It now get rid of hugepd completely
>> after handling also e500 and book3s/64
> 
> This is really amazing, thank you for doing it!
> 

You are welcome.

I have not yet taken into account your review comments on v1. I first 
wanted to have a global picture.

Christophe


[RFC PATCH v2 00/20] Reimplement huge pages without hugepd on powerpc (8xx, e500, book3s/64)

2024-05-17 Thread Christophe Leroy
This is the continuation of the RFC v1 series "Reimplement huge pages
without hugepd on powerpc 8xx". It now gets rid of hugepd completely
after also handling e500 and book3s/64

Unlike most architectures, powerpc 8xx HW requires a two-level
pagetable topology for all page sizes. So a leaf PMD-contig approach
is not feasible as such.

Possible sizes are 4k, 16k, 512k and 8M.

First level (PGD/PMD) covers 4M per entry. For 8M pages, two PMD entries
must point to a single entry level-2 page table. Until now that was
done using hugepd. This series changes it to use standard page tables
where the entry is replicated 1024 times on each of the two pagetables
refered by the two associated PMD entries for that 8M page.

At the moment it has to look into each helper to know if the
hugepage ptep is a PTE or a PMD in order to know whether it is an 8M page or
a lower size. I hope this can be handled by core-mm in the future.

For e500 and book3s/64 there are less constraints because it is not
tied to the HW assisted tablewalk like on 8xx, so it is easier to use
leaf PMDs (and PUDs).

On e500 the supported page sizes are 4M, 16M, 64M, 256M and 1G. All at
PMD level on e500/32 and mix of PMD and PUD for e500/64. We encode page
size with 4 available bits in PTE entries. On e500/32 the PGD entry size
is increased to 64 bits in order to allow leaf-PMD entries because PTEs
are 64 bits on e500.

On book3s/64 only the hash-4k mode is concerned. It supports 16M pages
as cont-PMD and 16G pages as cont-PUD. In other modes (radix-4k, radix-64k
and hash-64k) the sizes match with PMD and PUD sizes so that's just leaf
entries.

Christophe Leroy (20):
  mm: Provide pagesize to pmd_populate()
  mm: Provide page size to pte_alloc_huge()
  mm: Provide pmd to pte_leaf_size()
  mm: Provide mm_struct and address to huge_ptep_get()
  powerpc/mm: Allow hugepages without hugepd
  powerpc/8xx: Fix size given to set_huge_pte_at()
  powerpc/8xx: Rework support for 8M pages using contiguous PTE entries
  powerpc/8xx: Simplify struct mmu_psize_def
  powerpc/mm: Remove _PAGE_PSIZE
  powerpc/mm: Fix __find_linux_pte() on 32 bits with PMD leaf entries
  powerpc/mm: Complement huge_pte_alloc() for all non HUGEPD setups
  powerpc/64e: Remove unneeded #ifdef CONFIG_PPC_E500
  powerpc/64e: Clean up impossible setups
  powerpc/e500: Remove enc field from struct mmu_psize_def
  powerpc/85xx: Switch to 64 bits PGD
  powerpc/e500: Encode hugepage size in PTE bits
  powerpc/e500: Use contiguous PMD instead of hugepd
  powerpc/64s: Use contiguous PMD/PUD instead of HUGEPD
  powerpc/mm: Remove hugepd leftovers
  mm: Remove CONFIG_ARCH_HAS_HUGEPD

 arch/arm/include/asm/hugetlb-3level.h |   2 +-
 arch/arm64/include/asm/hugetlb.h  |   2 +-
 arch/arm64/include/asm/pgtable.h  |   2 +-
 arch/arm64/mm/hugetlbpage.c   |   4 +-
 arch/parisc/mm/hugetlbpage.c  |   2 +-
 arch/powerpc/Kconfig  |   1 -
 arch/powerpc/include/asm/book3s/32/pgalloc.h  |   2 -
 arch/powerpc/include/asm/book3s/64/hash-4k.h  |  15 -
 arch/powerpc/include/asm/book3s/64/hash.h |  38 +-
 arch/powerpc/include/asm/book3s/64/hugetlb.h  |  38 --
 .../include/asm/book3s/64/pgtable-4k.h|  34 --
 .../include/asm/book3s/64/pgtable-64k.h   |  20 -
 arch/powerpc/include/asm/hugetlb.h|  26 +-
 .../include/asm/nohash/32/hugetlb-8xx.h   |  58 +--
 arch/powerpc/include/asm/nohash/32/mmu-8xx.h  |   9 +-
 arch/powerpc/include/asm/nohash/32/pgalloc.h  |   2 +
 arch/powerpc/include/asm/nohash/32/pte-40x.h  |   3 -
 arch/powerpc/include/asm/nohash/32/pte-44x.h  |   3 -
 arch/powerpc/include/asm/nohash/32/pte-85xx.h |   3 -
 arch/powerpc/include/asm/nohash/32/pte-8xx.h  |  64 ++-
 .../powerpc/include/asm/nohash/hugetlb-e500.h |  36 +-
 arch/powerpc/include/asm/nohash/mmu-e500.h|   4 -
 arch/powerpc/include/asm/nohash/pgalloc.h |   2 -
 arch/powerpc/include/asm/nohash/pgtable.h |  45 +-
 arch/powerpc/include/asm/nohash/pte-e500.h|  22 +-
 arch/powerpc/include/asm/page.h   |  32 --
 arch/powerpc/include/asm/pgtable-be-types.h   |  10 -
 arch/powerpc/include/asm/pgtable-types.h  |  13 +-
 arch/powerpc/include/asm/pgtable.h|   3 +
 arch/powerpc/kernel/head_85xx.S   |  33 +-
 arch/powerpc/kernel/head_8xx.S|  10 +-
 arch/powerpc/mm/book3s64/hash_utils.c |  11 +-
 arch/powerpc/mm/book3s64/pgtable.c|  12 -
 arch/powerpc/mm/hugetlbpage.c | 450 ++
 arch/powerpc/mm/init-common.c |   8 +-
 arch/powerpc/mm/kasan/8xx.c   |  15 +-
 arch/powerpc/mm/nohash/8xx.c  |  46 +-
 arch/powerpc/mm/nohash/book3e_pgtable.c   |   4 +-
 arch/powerpc/mm/nohash/tlb.c  | 172 ++-
 arch/powerpc/mm/nohash/tlb_low_64e.S  | 257 ++
 arch/powerpc/mm/pgtable.c |  94 ++--
 arch/powerpc/mm/pgtable_32.c 

[RFC PATCH v2 05/20] powerpc/mm: Allow hugepages without hugepd

2024-05-17 Thread Christophe Leroy
In preparation for implementing huge pages on powerpc 8xx
without hugepd, enclose hugepd related code inside an
ifdef CONFIG_ARCH_HAS_HUGEPD

This also allows removing some stubs.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/book3s/32/pgalloc.h |  2 --
 arch/powerpc/include/asm/hugetlb.h   | 10 ++
 arch/powerpc/include/asm/nohash/pgtable.h|  8 +---
 arch/powerpc/mm/hugetlbpage.c| 13 +
 arch/powerpc/mm/pgtable.c|  2 ++
 5 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/32/pgalloc.h 
b/arch/powerpc/include/asm/book3s/32/pgalloc.h
index dc5c039eb28e..dd4eb3063175 100644
--- a/arch/powerpc/include/asm/book3s/32/pgalloc.h
+++ b/arch/powerpc/include/asm/book3s/32/pgalloc.h
@@ -47,8 +47,6 @@ static inline void pgtable_free(void *table, unsigned 
index_size)
}
 }
 
-#define get_hugepd_cache_index(x)  (x)
-
 static inline void pgtable_free_tlb(struct mmu_gather *tlb,
void *table, int shift)
 {
diff --git a/arch/powerpc/include/asm/hugetlb.h 
b/arch/powerpc/include/asm/hugetlb.h
index ea71f7245a63..79176a499763 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -30,10 +30,12 @@ static inline int is_hugepage_only_range(struct mm_struct 
*mm,
 }
 #define is_hugepage_only_range is_hugepage_only_range
 
+#ifdef CONFIG_ARCH_HAS_HUGEPD
 #define __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE
 void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
unsigned long end, unsigned long floor,
unsigned long ceiling);
+#endif
 
 #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR
 static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
@@ -67,14 +69,6 @@ static inline void flush_hugetlb_page(struct vm_area_struct 
*vma,
 {
 }
 
-#define hugepd_shift(x) 0
-static inline pte_t *hugepte_offset(hugepd_t hpd, unsigned long addr,
-   unsigned pdshift)
-{
-   return NULL;
-}
-
-
 static inline void __init gigantic_hugetlb_cma_reserve(void)
 {
 }
diff --git a/arch/powerpc/include/asm/nohash/pgtable.h 
b/arch/powerpc/include/asm/nohash/pgtable.h
index 427db14292c9..ac3353f7f2ac 100644
--- a/arch/powerpc/include/asm/nohash/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/pgtable.h
@@ -340,7 +340,7 @@ static inline void __set_pte_at(struct mm_struct *mm, 
unsigned long addr,
 
 #define pgprot_writecombine pgprot_noncached_wc
 
-#ifdef CONFIG_HUGETLB_PAGE
+#ifdef CONFIG_ARCH_HAS_HUGEPD
 static inline int hugepd_ok(hugepd_t hpd)
 {
 #ifdef CONFIG_PPC_8xx
@@ -351,6 +351,10 @@ static inline int hugepd_ok(hugepd_t hpd)
 #endif
 }
 
+#define is_hugepd(hpd) (hugepd_ok(hpd))
+#endif
+
+#ifdef CONFIG_HUGETLB_PAGE
 static inline int pmd_huge(pmd_t pmd)
 {
return 0;
@@ -360,8 +364,6 @@ static inline int pud_huge(pud_t pud)
 {
return 0;
 }
-
-#define is_hugepd(hpd) (hugepd_ok(hpd))
 #endif
 
 int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot);
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 66ac56b26007..82495b8ea793 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -42,6 +42,7 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long 
addr, unsigned long s
return __find_linux_pte(mm->pgd, addr, NULL, NULL);
 }
 
+#ifdef CONFIG_ARCH_HAS_HUGEPD
 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
   unsigned long address, unsigned int pdshift,
   unsigned int pshift, spinlock_t *ptl)
@@ -193,6 +194,16 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct 
vm_area_struct *vma,
 
return hugepte_offset(*hpdp, addr, pdshift);
 }
+#else
+pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, unsigned long sz)
+{
+   if (sz < PMD_SIZE)
+   return pte_alloc_huge(mm, pmd_off(mm, addr), addr, sz);
+
+   return NULL;
+}
+#endif
 
 #ifdef CONFIG_PPC_BOOK3S_64
 /*
@@ -248,6 +259,7 @@ int __init alloc_bootmem_huge_page(struct hstate *h, int 
nid)
return __alloc_bootmem_huge_page(h, nid);
 }
 
+#ifdef CONFIG_ARCH_HAS_HUGEPD
 #ifndef CONFIG_PPC_BOOK3S_64
 #define HUGEPD_FREELIST_SIZE \
((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))
@@ -505,6 +517,7 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb,
}
} while (addr = next, addr != end);
 }
+#endif
 
 bool __init arch_hugetlb_valid_size(unsigned long size)
 {
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 9e7ba9c3851f..acdf64c9b93e 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -487,8 +487,10 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
if (!hpdp)
return NULL;
 
+

[RFC PATCH v2 03/20] mm: Provide pmd to pte_leaf_size()

2024-05-17 Thread Christophe Leroy
On powerpc 8xx, when a page is 8M size, the information is in the PMD
entry. So provide it to pte_leaf_size().

Signed-off-by: Christophe Leroy 
---
 arch/arm64/include/asm/pgtable.h | 2 +-
 arch/powerpc/include/asm/nohash/32/pte-8xx.h | 2 +-
 arch/riscv/include/asm/pgtable.h | 2 +-
 arch/sparc/include/asm/pgtable_64.h  | 2 +-
 arch/sparc/mm/hugetlbpage.c  | 2 +-
 include/linux/pgtable.h  | 2 +-
 kernel/events/core.c | 2 +-
 7 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index afdd56d26ad7..57c40f2498ab 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -624,7 +624,7 @@ extern pgprot_t phys_mem_access_prot(struct file *file, 
unsigned long pfn,
 #define pmd_bad(pmd)   (!pmd_table(pmd))
 
 #define pmd_leaf_size(pmd) (pmd_cont(pmd) ? CONT_PMD_SIZE : PMD_SIZE)
-#define pte_leaf_size(pte) (pte_cont(pte) ? CONT_PTE_SIZE : PAGE_SIZE)
+#define pte_leaf_size(pmd, pte)(pte_cont(pte) ? CONT_PTE_SIZE : 
PAGE_SIZE)
 
 #if defined(CONFIG_ARM64_64K_PAGES) || CONFIG_PGTABLE_LEVELS < 3
 static inline bool pud_sect(pud_t pud) { return false; }
diff --git a/arch/powerpc/include/asm/nohash/32/pte-8xx.h 
b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
index 137dc3c84e45..07df6b664861 100644
--- a/arch/powerpc/include/asm/nohash/32/pte-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
@@ -151,7 +151,7 @@ static inline unsigned long pgd_leaf_size(pgd_t pgd)
 
 #define pgd_leaf_size pgd_leaf_size
 
-static inline unsigned long pte_leaf_size(pte_t pte)
+static inline unsigned long pte_leaf_size(pmd_t pmd, pte_t pte)
 {
pte_basic_t val = pte_val(pte);
 
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 6afd6bb4882e..9d9abe161a89 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -434,7 +434,7 @@ static inline pte_t pte_mkhuge(pte_t pte)
 }
 
 #ifdef CONFIG_RISCV_ISA_SVNAPOT
-#define pte_leaf_size(pte) (pte_napot(pte) ?   
\
+#define pte_leaf_size(pmd, pte)(pte_napot(pte) ?   
\
napot_cont_size(napot_cont_order(pte)) 
:\
PAGE_SIZE)
 #endif
diff --git a/arch/sparc/include/asm/pgtable_64.h 
b/arch/sparc/include/asm/pgtable_64.h
index 4d1bafaba942..67063af2ff8f 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -1175,7 +1175,7 @@ extern unsigned long pud_leaf_size(pud_t pud);
 extern unsigned long pmd_leaf_size(pmd_t pmd);
 
 #define pte_leaf_size pte_leaf_size
-extern unsigned long pte_leaf_size(pte_t pte);
+extern unsigned long pte_leaf_size(pmd_t pmd, pte_t pte);
 
 #endif /* CONFIG_HUGETLB_PAGE */
 
diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c
index 5a342199e837..60c845a15bee 100644
--- a/arch/sparc/mm/hugetlbpage.c
+++ b/arch/sparc/mm/hugetlbpage.c
@@ -276,7 +276,7 @@ static unsigned long huge_tte_to_size(pte_t pte)
 
unsigned long pud_leaf_size(pud_t pud) { return 1UL << tte_to_shift(*(pte_t *)&pud); }
unsigned long pmd_leaf_size(pmd_t pmd) { return 1UL << tte_to_shift(*(pte_t *)&pmd); }
-unsigned long pte_leaf_size(pte_t pte) { return 1UL << tte_to_shift(pte); }
+unsigned long pte_leaf_size(pmd_t pmd, pte_t pte) { return 1UL << tte_to_shift(pte); }
 
 pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, unsigned long sz)
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 85fc7554cd52..e605a4149fc7 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1802,7 +1802,7 @@ typedef unsigned int pgtbl_mod_mask;
 #define pmd_leaf_size(x) PMD_SIZE
 #endif
 #ifndef pte_leaf_size
-#define pte_leaf_size(x) PAGE_SIZE
+#define pte_leaf_size(x, y) PAGE_SIZE
 #endif
 
 /*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 724e6d7e128f..5c1c083222b2 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7585,7 +7585,7 @@ static u64 perf_get_pgtable_size(struct mm_struct *mm, 
unsigned long addr)
 
pte = ptep_get_lockless(ptep);
if (pte_present(pte))
-   size = pte_leaf_size(pte);
+   size = pte_leaf_size(pmd, pte);
pte_unmap(ptep);
 #endif /* CONFIG_HAVE_FAST_GUP */
 
-- 
2.44.0



[RFC PATCH v2 04/20] mm: Provide mm_struct and address to huge_ptep_get()

2024-05-17 Thread Christophe Leroy
On powerpc 8xx huge_ptep_get() will need to know whether the given
ptep is a PTE entry or a PMD entry. This cannot be known with the
PMD entry itself because there is no easy way to know it from the
content of the entry.

So huge_ptep_get() will need to know either the size of the page
or get the pmd.

In order to be consistent with huge_ptep_get_and_clear(), give
mm and address to huge_ptep_get().
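
As an illustration, this is the kind of implementation it enables on 8xx
(untested sketch based on the 8xx rework elsewhere in the series, not part
of this patch):

pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	pmd_t *pmdp = (pmd_t *)ptep;

	/* For an 8M page, ptep really is the first of the two PMD entries */
	if (IS_ENABLED(CONFIG_PPC_8xx) && pmdp == pmd_off(mm, ALIGN_DOWN(addr, SZ_8M)))
		ptep = pte_offset_kernel(pmdp, 0);

	return ptep_get(ptep);
}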

Signed-off-by: Christophe Leroy 
---
v2: Add missing changes in arch implementations
---
 arch/arm/include/asm/hugetlb-3level.h |  2 +-
 arch/arm64/include/asm/hugetlb.h  |  2 +-
 arch/arm64/mm/hugetlbpage.c   |  2 +-
 arch/riscv/include/asm/hugetlb.h  |  2 +-
 arch/riscv/mm/hugetlbpage.c   |  2 +-
 arch/s390/include/asm/hugetlb.h   |  2 +-
 arch/s390/mm/hugetlbpage.c|  2 +-
 fs/hugetlbfs/inode.c  |  2 +-
 fs/proc/task_mmu.c|  8 ++---
 fs/userfaultfd.c  |  2 +-
 include/asm-generic/hugetlb.h |  2 +-
 include/linux/swapops.h   |  2 +-
 mm/damon/vaddr.c  |  6 ++--
 mm/gup.c  |  2 +-
 mm/hmm.c  |  2 +-
 mm/hugetlb.c  | 46 +--
 mm/memory-failure.c   |  2 +-
 mm/mempolicy.c|  2 +-
 mm/migrate.c  |  4 +--
 mm/mincore.c  |  2 +-
 mm/userfaultfd.c  |  2 +-
 21 files changed, 49 insertions(+), 49 deletions(-)

diff --git a/arch/arm/include/asm/hugetlb-3level.h 
b/arch/arm/include/asm/hugetlb-3level.h
index a30be5505793..470c45c22e80 100644
--- a/arch/arm/include/asm/hugetlb-3level.h
+++ b/arch/arm/include/asm/hugetlb-3level.h
@@ -18,7 +18,7 @@
  * (The valid bit is automatically cleared by set_pte_at for PROT_NONE ptes).
  */
 #define __HAVE_ARCH_HUGE_PTEP_GET
-static inline pte_t huge_ptep_get(pte_t *ptep)
+static inline pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, 
pte_t *ptep)
 {
pte_t retval = *ptep;
if (pte_val(retval))
diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h
index 2ddc33d93b13..1af39a74e791 100644
--- a/arch/arm64/include/asm/hugetlb.h
+++ b/arch/arm64/include/asm/hugetlb.h
@@ -46,7 +46,7 @@ extern pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
 extern void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
   pte_t *ptep, unsigned long sz);
 #define __HAVE_ARCH_HUGE_PTEP_GET
-extern pte_t huge_ptep_get(pte_t *ptep);
+extern pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t 
*ptep);
 
 void __init arm64_hugetlb_cma_reserve(void);
 
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index aa7ded49f8cf..7c6a24d29b3f 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -141,7 +141,7 @@ static inline int num_contig_ptes(unsigned long size, 
size_t *pgsize)
return contig_ptes;
 }
 
-pte_t huge_ptep_get(pte_t *ptep)
+pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 {
int ncontig, i;
size_t pgsize;
diff --git a/arch/riscv/include/asm/hugetlb.h b/arch/riscv/include/asm/hugetlb.h
index 22deb7a2a6ec..6321bca08740 100644
--- a/arch/riscv/include/asm/hugetlb.h
+++ b/arch/riscv/include/asm/hugetlb.h
@@ -44,7 +44,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
   pte_t pte, int dirty);
 
 #define __HAVE_ARCH_HUGE_PTEP_GET
-pte_t huge_ptep_get(pte_t *ptep);
+pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
 
 pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags);
 #define arch_make_huge_pte arch_make_huge_pte
diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
index dc77a58c6321..56abd6213ca1 100644
--- a/arch/riscv/mm/hugetlbpage.c
+++ b/arch/riscv/mm/hugetlbpage.c
@@ -3,7 +3,7 @@
 #include 
 
 #ifdef CONFIG_RISCV_ISA_SVNAPOT
-pte_t huge_ptep_get(pte_t *ptep)
+pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 {
unsigned long pte_num;
int i;
diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h
index deb198a61039..caabc01c1812 100644
--- a/arch/s390/include/asm/hugetlb.h
+++ b/arch/s390/include/asm/hugetlb.h
@@ -19,7 +19,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 pte_t *ptep, pte_t pte, unsigned long sz);
 void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 pte_t *ptep, pte_t pte);
-pte_t huge_ptep_get(pte_t *ptep);
+pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
 pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
  unsigned long addr, pte_t *ptep);
 
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index dc3db86e13ff..ee7da593f36c

[RFC PATCH v2 06/20] powerpc/8xx: Fix size given to set_huge_pte_at()

2024-05-17 Thread Christophe Leroy
set_huge_pte_at() expects the real page size, not the psize which is
the index of the page definition in table mmu_psize_defs[]

Fixes: 935d4f0c6dc8 ("mm: hugetlb: add huge page size param to 
set_huge_pte_at()")
Signed-off-by: Christophe Leroy 
---
 arch/powerpc/mm/nohash/8xx.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/nohash/8xx.c b/arch/powerpc/mm/nohash/8xx.c
index 43d4842bb1c7..d93433e26ded 100644
--- a/arch/powerpc/mm/nohash/8xx.c
+++ b/arch/powerpc/mm/nohash/8xx.c
@@ -94,7 +94,8 @@ static int __ref __early_map_kernel_hugepage(unsigned long va, phys_addr_t pa,
return -EINVAL;
 
set_huge_pte_at(&init_mm, va, ptep,
-   pte_mkhuge(pfn_pte(pa >> PAGE_SHIFT, prot)), psize);
+   pte_mkhuge(pfn_pte(pa >> PAGE_SHIFT, prot)),
+   1UL << mmu_psize_to_shift(psize));
 
return 0;
 }
-- 
2.44.0



[RFC PATCH v2 02/20] mm: Provide page size to pte_alloc_huge()

2024-05-17 Thread Christophe Leroy
In order to be able to flag the PMD entry with _PMD_HUGE_8M on
powerpc 8xx, provide page size to pte_alloc_huge() and use it
through the newly introduced pte_alloc_size().

Signed-off-by: Christophe Leroy 
---
 arch/arm64/mm/hugetlbpage.c   | 2 +-
 arch/parisc/mm/hugetlbpage.c  | 2 +-
 arch/powerpc/mm/hugetlbpage.c | 2 +-
 arch/riscv/mm/hugetlbpage.c   | 2 +-
 arch/sh/mm/hugetlbpage.c  | 2 +-
 arch/sparc/mm/hugetlbpage.c   | 2 +-
 include/linux/hugetlb.h   | 4 ++--
 7 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index b872b003a55f..aa7ded49f8cf 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -292,7 +292,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct 
vm_area_struct *vma,
return NULL;
 
WARN_ON(addr & (sz - 1));
-   ptep = pte_alloc_huge(mm, pmdp, addr);
+   ptep = pte_alloc_huge(mm, pmdp, addr, sz);
} else if (sz == PMD_SIZE) {
if (want_pmd_share(vma, addr) && pud_none(READ_ONCE(*pudp)))
ptep = huge_pmd_share(mm, vma, addr, pudp);
diff --git a/arch/parisc/mm/hugetlbpage.c b/arch/parisc/mm/hugetlbpage.c
index a9f7e21f6656..2f4c6b440710 100644
--- a/arch/parisc/mm/hugetlbpage.c
+++ b/arch/parisc/mm/hugetlbpage.c
@@ -66,7 +66,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct 
vm_area_struct *vma,
if (pud) {
pmd = pmd_alloc(mm, pud, addr);
if (pmd)
-   pte = pte_alloc_huge(mm, pmd, addr);
+   pte = pte_alloc_huge(mm, pmd, addr, sz);
}
return pte;
 }
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 594a4b7b2ca2..66ac56b26007 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -183,7 +183,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct 
vm_area_struct *vma,
return NULL;
 
if (IS_ENABLED(CONFIG_PPC_8xx) && pshift < PMD_SHIFT)
-   return pte_alloc_huge(mm, (pmd_t *)hpdp, addr);
+   return pte_alloc_huge(mm, (pmd_t *)hpdp, addr, sz);
 
BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));
 
diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
index 5ef2a6891158..dc77a58c6321 100644
--- a/arch/riscv/mm/hugetlbpage.c
+++ b/arch/riscv/mm/hugetlbpage.c
@@ -67,7 +67,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
 
for_each_napot_order(order) {
if (napot_cont_size(order) == sz) {
-   pte = pte_alloc_huge(mm, pmd, addr & 
napot_cont_mask(order));
+   pte = pte_alloc_huge(mm, pmd, addr & 
napot_cont_mask(order), sz);
break;
}
}
diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c
index 6cb0ad73dbb9..26579429e5ed 100644
--- a/arch/sh/mm/hugetlbpage.c
+++ b/arch/sh/mm/hugetlbpage.c
@@ -38,7 +38,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct 
vm_area_struct *vma,
if (pud) {
pmd = pmd_alloc(mm, pud, addr);
if (pmd)
-   pte = pte_alloc_huge(mm, pmd, addr);
+   pte = pte_alloc_huge(mm, pmd, addr, sz);
}
}
}
diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c
index b432500c13a5..5a342199e837 100644
--- a/arch/sparc/mm/hugetlbpage.c
+++ b/arch/sparc/mm/hugetlbpage.c
@@ -298,7 +298,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct 
vm_area_struct *vma,
return NULL;
if (sz >= PMD_SIZE)
return (pte_t *)pmd;
-   return pte_alloc_huge(mm, pmd, addr);
+   return pte_alloc_huge(mm, pmd, addr, sz);
 }
 
 pte_t *huge_pte_offset(struct mm_struct *mm,
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 77b30a8c6076..d9c5d9daadc5 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -193,9 +193,9 @@ static inline pte_t *pte_offset_huge(pmd_t *pmd, unsigned 
long address)
return pte_offset_kernel(pmd, address);
 }
 static inline pte_t *pte_alloc_huge(struct mm_struct *mm, pmd_t *pmd,
-   unsigned long address)
+   unsigned long address, unsigned long sz)
 {
-   return pte_alloc(mm, pmd) ? NULL : pte_offset_huge(pmd, address);
+   return pte_alloc_size(mm, pmd, sz) ? NULL : pte_offset_huge(pmd, address);
 }
 #endif
 
-- 
2.44.0



[RFC PATCH v2 07/20] powerpc/8xx: Rework support for 8M pages using contiguous PTE entries

2024-05-17 Thread Christophe Leroy
In order to fit better with the standard Linux page table layout, add
support for 8M pages using contiguous PTE entries in a standard
page table. Page tables will then be populated with 1024 similar
entries and two PMD entries will point to that page table.

The PMD entries also get a flag to tell that they address an 8M page;
this is required for the HW tablewalk assistance.
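
As an illustration (not part of the patch): what populating one 8M
mapping amounts to under this scheme. The helper below is hypothetical;
only the 1024-entry replication, the pair of PMD entries and the
_PMD_PAGE_8M flag come from the description above.

	/* Hypothetical sketch: one page table backs a whole 8M page.
	 * All 1024 PTEs carry the same huge PTE value, and both PMD
	 * entries point to that single table, flagged as 8M. */
	static void sketch_populate_8m(pmd_t *pmdp, pte_t *table, pte_t hugepte)
	{
		int i;

		for (i = 0; i < 1024; i++)
			table[i] = hugepte;

		pmdp[0] = __pmd(__pa(table) | _PMD_PRESENT | _PMD_PAGE_8M);
		pmdp[1] = __pmd(__pa(table) | _PMD_PRESENT | _PMD_PAGE_8M);
	}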

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/Kconfig  |  1 -
 arch/powerpc/include/asm/hugetlb.h| 11 +++-
 .../include/asm/nohash/32/hugetlb-8xx.h   | 54 --
 arch/powerpc/include/asm/nohash/32/pgalloc.h  |  2 +
 arch/powerpc/include/asm/nohash/32/pte-8xx.h  | 57 +--
 arch/powerpc/include/asm/nohash/pgtable.h |  4 --
 arch/powerpc/include/asm/page.h   |  5 --
 arch/powerpc/include/asm/pgtable.h|  3 +
 arch/powerpc/kernel/head_8xx.S| 10 +---
 arch/powerpc/mm/hugetlbpage.c | 18 +++---
 arch/powerpc/mm/kasan/8xx.c   | 15 +++--
 arch/powerpc/mm/nohash/8xx.c  | 43 +++---
 arch/powerpc/mm/pgtable.c | 24 +---
 arch/powerpc/mm/pgtable_32.c  |  2 +-
 arch/powerpc/platforms/Kconfig.cputype|  2 +
 15 files changed, 139 insertions(+), 112 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index a1a3b3363008..6a4ea7dad23f 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -135,7 +135,6 @@ config PPC
select ARCH_HAS_DMA_MAP_DIRECT  if PPC_PSERIES
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_GCOV_PROFILE_ALL
-   select ARCH_HAS_HUGEPD  if HUGETLB_PAGE
select ARCH_HAS_KCOV
select ARCH_HAS_MEMBARRIER_CALLBACKS
select ARCH_HAS_MEMBARRIER_SYNC_CORE
diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h
index 79176a499763..36ed6d976cf9 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -41,7 +41,16 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
 static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
 {
-   return __pte(pte_update(mm, addr, ptep, ~0UL, 0, 1));
+   pmd_t *pmdp = (pmd_t *)ptep;
+   pte_t pte;
+
+   if (IS_ENABLED(CONFIG_PPC_8xx) && pmdp == pmd_off(mm, ALIGN_DOWN(addr, SZ_8M))) {
+   pte = __pte(pte_update(mm, addr, pte_offset_kernel(pmdp, 0), ~0UL, 0, 1));
+   pte_update(mm, addr, pte_offset_kernel(pmdp + 1, 0), ~0UL, 0, 1);
+   } else {
+   pte = __pte(pte_update(mm, addr, ptep, ~0UL, 0, 1));
+   }
+   return pte;
 }
 
 #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH
diff --git a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h
index 92df40c6cc6b..1414cfd28987 100644
--- a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h
@@ -4,45 +4,25 @@
 
 #define PAGE_SHIFT_8M  23
 
-static inline pte_t *hugepd_page(hugepd_t hpd)
-{
-   BUG_ON(!hugepd_ok(hpd));
-
-   return (pte_t *)__va(hpd_val(hpd) & ~HUGEPD_SHIFT_MASK);
-}
-
-static inline unsigned int hugepd_shift(hugepd_t hpd)
-{
-   return PAGE_SHIFT_8M;
-}
-
-static inline pte_t *hugepte_offset(hugepd_t hpd, unsigned long addr,
-   unsigned int pdshift)
-{
-   unsigned long idx = (addr & (SZ_4M - 1)) >> PAGE_SHIFT;
-
-   return hugepd_page(hpd) + idx;
-}
-
 static inline void flush_hugetlb_page(struct vm_area_struct *vma,
  unsigned long vmaddr)
 {
flush_tlb_page(vma, vmaddr);
 }
 
-static inline void hugepd_populate(hugepd_t *hpdp, pte_t *new, unsigned int pshift)
+static inline int check_and_get_huge_psize(int shift)
 {
-   *hpdp = __hugepd(__pa(new) | _PMD_USER | _PMD_PRESENT | _PMD_PAGE_8M);
+   return shift_to_mmu_psize(shift);
 }
 
-static inline void hugepd_populate_kernel(hugepd_t *hpdp, pte_t *new, unsigned int pshift)
+#define __HAVE_ARCH_HUGE_PTEP_GET
+static inline pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 {
-   *hpdp = __hugepd(__pa(new) | _PMD_PRESENT | _PMD_PAGE_8M);
-}
+   pmd_t *pmdp = (pmd_t *)ptep;
 
-static inline int check_and_get_huge_psize(int shift)
-{
-   return shift_to_mmu_psize(shift);
+   if (pmdp == pmd_off(mm, ALIGN_DOWN(addr, SZ_8M)))
+   ptep = pte_offset_kernel(pmdp, 0);
+   return ptep_get(ptep);
 }
 
 #define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT
@@ -53,7 +33,14 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
 static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
  pte_t *ptep, unsigned long sz)
 {
-

[RFC PATCH v2 01/20] mm: Provide pagesize to pmd_populate()

2024-05-17 Thread Christophe Leroy
Unlike many architectures, the powerpc 8xx hardware tablewalk requires
a two-level process for all page sizes, although the second level only
has one entry when the page size is 8M.

To fit with the Linux page table topology, and without requiring a
special page directory layout like hugepd, the page entry will be
replicated 1024 times in the standard page table. However, for large
pages it is necessary to set bits in the level-1 (PMD) entry. At the
time being, for 512k pages the flag is kept in the PTE and inserted
into the PMD entry at TLB miss time; that is necessary because a
single page table can contain pages of different sizes. However, the
12 PTE bits are fully used and there is no room for an additional
page size bit.

For 8M pages there will be only one page per PMD entry, so it is
possible to flag the page size in the PMD entry itself, with the
advantage that the information is already at the right place for the
hardware.

To do so, add a new helper called pmd_populate_size() which takes the
page size as an additional argument, and modify __pte_alloc() to also
take that argument. pte_alloc() is left unmodified in order to reduce
churn on callers, and a pte_alloc_size() is added for use by
pte_alloc_huge().

When an architecture doesn't provide pmd_populate_size(),
pmd_populate() is used as a fallback.
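
For illustration (not part of the patch): a hypothetical architecture
override of pmd_populate_size(), borrowing the 8xx flag name. The body
is a sketch, assuming powerpc's pte_t * flavour of pgtable_t; other
architectures simply keep the pmd_populate() fallback defined below.

	/* Sketch only: encode the page size in the PMD entry when it
	 * is a huge one, otherwise behave like pmd_populate(). */
	static inline void pmd_populate_size(struct mm_struct *mm, pmd_t *pmdp,
					     pgtable_t pte, unsigned long sz)
	{
		unsigned long flags = sz == SZ_8M ? _PMD_PAGE_8M : 0;

		*pmdp = __pmd(__pa(pte) | _PMD_PRESENT | flags);
	}
	#define pmd_populate_size pmd_populate_size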

Signed-off-by: Christophe Leroy 
---
 include/linux/mm.h | 12 +++-
 mm/filemap.c   |  2 +-
 mm/internal.h  |  2 +-
 mm/memory.c| 19 ---
 mm/pgalloc-track.h |  2 +-
 mm/userfaultfd.c   |  4 ++--
 6 files changed, 24 insertions(+), 17 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index b6bdaa18b9e9..158cb87bc604 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2803,8 +2803,8 @@ static inline void mm_inc_nr_ptes(struct mm_struct *mm) {}
 static inline void mm_dec_nr_ptes(struct mm_struct *mm) {}
 #endif
 
-int __pte_alloc(struct mm_struct *mm, pmd_t *pmd);
-int __pte_alloc_kernel(pmd_t *pmd);
+int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long sz);
+int __pte_alloc_kernel(pmd_t *pmd, unsigned long sz);
 
 #if defined(CONFIG_MMU)
 
@@ -2989,7 +2989,8 @@ pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
pte_unmap(pte); \
 } while (0)
 
-#define pte_alloc(mm, pmd) (unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, pmd))
+#define pte_alloc_size(mm, pmd, sz) (unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, pmd, sz))
+#define pte_alloc(mm, pmd) pte_alloc_size(mm, pmd, PAGE_SIZE)
 
 #define pte_alloc_map(mm, pmd, address)\
(pte_alloc(mm, pmd) ? NULL : pte_offset_map(pmd, address))
@@ -2998,9 +2999,10 @@ pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
(pte_alloc(mm, pmd) ?   \
 NULL : pte_offset_map_lock(mm, pmd, address, ptlp))
 
-#define pte_alloc_kernel(pmd, address) \
-   ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd))? \
+#define pte_alloc_kernel_size(pmd, address, sz)\
+   ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd, sz))? \
NULL: pte_offset_kernel(pmd, address))
+#define pte_alloc_kernel(pmd, address) pte_alloc_kernel_size(pmd, address, PAGE_SIZE)
 
 #if USE_SPLIT_PMD_PTLOCKS
 
diff --git a/mm/filemap.c b/mm/filemap.c
index 30de18c4fd28..5a783063d1f6 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3428,7 +3428,7 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio,
}
 
if (pmd_none(*vmf->pmd) && vmf->prealloc_pte)
-   pmd_install(mm, vmf->pmd, &vmf->prealloc_pte);
+   pmd_install(mm, vmf->pmd, &vmf->prealloc_pte, PAGE_SIZE);
 
return false;
 }
diff --git a/mm/internal.h b/mm/internal.h
index 07ad2675a88b..4a01bbf55264 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -206,7 +206,7 @@ void folio_activate(struct folio *folio);
 void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
   struct vm_area_struct *start_vma, unsigned long floor,
   unsigned long ceiling, bool mm_wr_locked);
-void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);
+void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte, unsigned long sz);
 
 struct zap_details;
 void unmap_page_range(struct mmu_gather *tlb,
diff --git a/mm/memory.c b/mm/memory.c
index d2155ced45f8..2a9eba13a95f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -409,7 +409,12 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
} while (vma);
 }
 
-void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte)
+#ifndef pmd_populate_size
+#define pmd_populate_size(mm, pmdp, pte, sz) pmd_populate(mm, pmdp, pte)
+#define pmd_populate_kernel_size(mm, pmdp, pte, sz) pmd_populate_kernel(mm, pmdp, pte)
+#endif
+
+void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgt

[RFC PATCH v2 20/20] mm: Remove CONFIG_ARCH_HAS_HUGEPD

2024-05-17 Thread Christophe Leroy
powerpc was the only user of CONFIG_ARCH_HAS_HUGEPD and doesn't
use it anymore, so remove all related code.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/mm/hugetlbpage.c |   1 -
 include/linux/hugetlb.h   |   6 --
 mm/Kconfig|  10 
 mm/gup.c  | 105 +-
 mm/pagewalk.c |  57 ++
 5 files changed, 5 insertions(+), 174 deletions(-)

diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 6fad89d7bff3..1df9e4fa1001 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -79,7 +79,6 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
return NULL;
return (pte_t *)pmd;
 }
-#endif
 
 #ifdef CONFIG_PPC_BOOK3S_64
 /*
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index d9c5d9daadc5..c020e3bdf62b 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -20,12 +20,6 @@ struct user_struct;
 struct mmu_gather;
 struct node;
 
-#ifndef CONFIG_ARCH_HAS_HUGEPD
-typedef struct { unsigned long pd; } hugepd_t;
-#define is_hugepd(hugepd) (0)
-#define __hugepd(x) ((hugepd_t) { (x) })
-#endif
-
 void free_huge_folio(struct folio *folio);
 
 #ifdef CONFIG_HUGETLB_PAGE
diff --git a/mm/Kconfig b/mm/Kconfig
index b1448aa81e15..a52f8e3224fb 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1114,16 +1114,6 @@ config DMAPOOL_TEST
 config ARCH_HAS_PTE_SPECIAL
bool
 
-#
-# Some architectures require a special hugepage directory format that is
-# required to support multiple hugepage sizes. For example a4fe3ce76
-# "powerpc/mm: Allow more flexible layouts for hugepage pagetables"
-# introduced it on powerpc.  This allows for a more flexible hugepage
-# pagetable layouts.
-#
-config ARCH_HAS_HUGEPD
-   bool
-
 config MAPPING_DIRTY_HELPERS
 bool
 
diff --git a/mm/gup.c b/mm/gup.c
index 86b5105b82a1..95f121223f04 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2790,89 +2790,6 @@ static int record_subpages(struct page *page, unsigned long addr,
return nr;
 }
 
-#ifdef CONFIG_ARCH_HAS_HUGEPD
-static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
- unsigned long sz)
-{
-   unsigned long __boundary = (addr + sz) & ~(sz-1);
-   return (__boundary - 1 < end - 1) ? __boundary : end;
-}
-
-static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
-  unsigned long end, unsigned int flags,
-  struct page **pages, int *nr)
-{
-   unsigned long pte_end;
-   struct page *page;
-   struct folio *folio;
-   pte_t pte;
-   int refs;
-
-   pte_end = (addr + sz) & ~(sz-1);
-   if (pte_end < end)
-   end = pte_end;
-
-   pte = huge_ptep_get(NULL, addr, ptep);
-
-   if (!pte_access_permitted(pte, flags & FOLL_WRITE))
-   return 0;
-
-   /* hugepages are never "special" */
-   VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
-
-   page = nth_page(pte_page(pte), (addr & (sz - 1)) >> PAGE_SHIFT);
-   refs = record_subpages(page, addr, end, pages + *nr);
-
-   folio = try_grab_folio(page, refs, flags);
-   if (!folio)
-   return 0;
-
-   if (unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) {
-   gup_put_folio(folio, refs, flags);
-   return 0;
-   }
-
-   if (!folio_fast_pin_allowed(folio, flags)) {
-   gup_put_folio(folio, refs, flags);
-   return 0;
-   }
-
-   if (!pte_write(pte) && gup_must_unshare(NULL, flags, &folio->page)) {
-   gup_put_folio(folio, refs, flags);
-   return 0;
-   }
-
-   *nr += refs;
-   folio_set_referenced(folio);
-   return 1;
-}
-
-static int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
-   unsigned int pdshift, unsigned long end, unsigned int flags,
-   struct page **pages, int *nr)
-{
-   pte_t *ptep;
-   unsigned long sz = 1UL << hugepd_shift(hugepd);
-   unsigned long next;
-
-   ptep = hugepte_offset(hugepd, addr, pdshift);
-   do {
-   next = hugepte_addr_end(addr, end, sz);
-   if (!gup_hugepte(ptep, sz, addr, end, flags, pages, nr))
-   return 0;
-   } while (ptep++, addr = next, addr != end);
-
-   return 1;
-}
-#else
-static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
-   unsigned int pdshift, unsigned long end, unsigned int flags,
-   struct page **pages, int *nr)
-{
-   return 0;
-}
-#endif /* CONFIG_ARCH_HAS_HUGEPD */
-
 static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
unsigned long end, unsigned int flags,
struct page **pages, int *nr)
@@ -3026,14 +2943,6 @@ static int gup_p

[RFC PATCH v2 19/20] powerpc/mm: Remove hugepd leftovers

2024-05-17 Thread Christophe Leroy
All targets have now opted out of CONFIG_ARCH_HAS_HUGEPD, so
remove the left-over code.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/hugetlb.h  |   7 -
 arch/powerpc/include/asm/page.h |   6 -
 arch/powerpc/include/asm/pgtable-be-types.h |  10 -
 arch/powerpc/include/asm/pgtable-types.h|   9 -
 arch/powerpc/mm/hugetlbpage.c   | 412 
 arch/powerpc/mm/init-common.c   |   8 +-
 arch/powerpc/mm/pgtable.c   |  27 +-
 7 files changed, 3 insertions(+), 476 deletions(-)

diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h
index d022722e6530..00327aef2dec 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -30,13 +30,6 @@ static inline int is_hugepage_only_range(struct mm_struct *mm,
 }
 #define is_hugepage_only_range is_hugepage_only_range
 
-#ifdef CONFIG_ARCH_HAS_HUGEPD
-#define __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE
-void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
-   unsigned long end, unsigned long floor,
-   unsigned long ceiling);
-#endif
-
 #define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT
 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
 pte_t pte, unsigned long sz);
diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
index c0af246a64ff..83d0a4fc5f75 100644
--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -269,12 +269,6 @@ static inline const void *pfn_to_kaddr(unsigned long pfn)
 #define is_kernel_addr(x)  ((x) >= TASK_SIZE)
 #endif
 
-/*
- * Some number of bits at the level of the page table that points to
- * a hugepte are used to encode the size.  This masks those bits.
- */
-#define HUGEPD_SHIFT_MASK 0x3f
-
 #ifndef __ASSEMBLY__
 
 #ifdef CONFIG_PPC_BOOK3S_64
diff --git a/arch/powerpc/include/asm/pgtable-be-types.h b/arch/powerpc/include/asm/pgtable-be-types.h
index 82633200b500..6bd8f89b25dc 100644
--- a/arch/powerpc/include/asm/pgtable-be-types.h
+++ b/arch/powerpc/include/asm/pgtable-be-types.h
@@ -101,14 +101,4 @@ static inline bool pmd_xchg(pmd_t *pmdp, pmd_t old, pmd_t new)
return pmd_raw(old) == prev;
 }
 
-#ifdef CONFIG_ARCH_HAS_HUGEPD
-typedef struct { __be64 pdbe; } hugepd_t;
-#define __hugepd(x) ((hugepd_t) { cpu_to_be64(x) })
-
-static inline unsigned long hpd_val(hugepd_t x)
-{
-   return be64_to_cpu(x.pdbe);
-}
-#endif
-
 #endif /* _ASM_POWERPC_PGTABLE_BE_TYPES_H */
diff --git a/arch/powerpc/include/asm/pgtable-types.h b/arch/powerpc/include/asm/pgtable-types.h
index db965d98e0ae..7b3d4c592a10 100644
--- a/arch/powerpc/include/asm/pgtable-types.h
+++ b/arch/powerpc/include/asm/pgtable-types.h
@@ -87,13 +87,4 @@ static inline bool pte_xchg(pte_t *ptep, pte_t old, pte_t new)
 }
 #endif
 
-#ifdef CONFIG_ARCH_HAS_HUGEPD
-typedef struct { unsigned long pd; } hugepd_t;
-#define __hugepd(x) ((hugepd_t) { (x) })
-static inline unsigned long hpd_val(hugepd_t x)
-{
-   return x.pd;
-}
-#endif
-
 #endif /* _ASM_POWERPC_PGTABLE_TYPES_H */
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 64b9029d86de..6fad89d7bff3 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -28,8 +28,6 @@
 
 bool hugetlb_disabled = false;
 
-#define hugepd_none(hpd)   (hpd_val(hpd) == 0)
-
 #define PTE_T_ORDER(__builtin_ffs(sizeof(pte_basic_t)) - \
 __builtin_ffs(sizeof(void *)))
 
@@ -42,156 +40,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz)
return __find_linux_pte(mm->pgd, addr, NULL, NULL);
 }
 
-#ifdef CONFIG_ARCH_HAS_HUGEPD
-static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
-  unsigned long address, unsigned int pdshift,
-  unsigned int pshift, spinlock_t *ptl)
-{
-   struct kmem_cache *cachep;
-   pte_t *new;
-   int i;
-   int num_hugepd;
-
-   if (pshift >= pdshift) {
-   cachep = PGT_CACHE(PTE_T_ORDER);
-   num_hugepd = 1 << (pshift - pdshift);
-   } else {
-   cachep = PGT_CACHE(pdshift - pshift);
-   num_hugepd = 1;
-   }
-
-   if (!cachep) {
-   WARN_ONCE(1, "No page table cache created for hugetlb tables");
-   return -ENOMEM;
-   }
-
-   new = kmem_cache_alloc(cachep, pgtable_gfp_flags(mm, GFP_KERNEL));
-
-   BUG_ON(pshift > HUGEPD_SHIFT_MASK);
-   BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);
-
-   if (!new)
-   return -ENOMEM;
-
-   /*
-* Make sure other cpus find the hugepd set only after a
-* properly initialized page table is visible to them.
-* For more details look for comment in __pte_alloc().
-*/
-   smp_wmb();
-
-   spin_lock(ptl

[RFC PATCH v2 18/20] powerpc/64s: Use contiguous PMD/PUD instead of HUGEPD

2024-05-17 Thread Christophe Leroy
On book3s/64, the only user of hugepd is hash in 4k mode.

All other setups (hash-64, radix-4, radix-64) use leaf PMD/PUD.

Rework hash-4k to use contiguous PMD and PUD instead.

In that setup there are only two huge page sizes: 16M and 16G.

16M sits at PMD level and 16G at PUD level.

pte_update() doesn't know the page size, so use the same trick as
hpte_need_flush() and get the page size from the segment properties.
That's not the most efficient way, but let's do that until callers of
pte_update() provide the page size instead of just a huge flag.
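
Spelled out as a sketch (the helper name is made up; the psize logic is
the one added by the patch below): the number of contiguous page
directory entries one update has to touch for the hash-4k geometry.

	/* 16M is a cont-PMD, 16G a cont-PUD, anything else is a
	 * single entry. */
	static inline int nb_cont_entries(struct mm_struct *mm, unsigned long addr)
	{
		unsigned int psize = get_slice_psize(mm, addr);

		if (psize == MMU_PAGE_16M)
			return SZ_16M / PMD_SIZE;
		if (psize == MMU_PAGE_16G)
			return SZ_16G / PUD_SIZE;
		return 1;
	}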

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/book3s/64/hash-4k.h  | 15 
 arch/powerpc/include/asm/book3s/64/hash.h | 38 +++
 arch/powerpc/include/asm/book3s/64/hugetlb.h  | 38 ---
 .../include/asm/book3s/64/pgtable-4k.h| 34 -
 .../include/asm/book3s/64/pgtable-64k.h   | 20 --
 arch/powerpc/include/asm/hugetlb.h|  4 ++
 .../include/asm/nohash/32/hugetlb-8xx.h   |  4 --
 .../powerpc/include/asm/nohash/hugetlb-e500.h |  4 --
 arch/powerpc/include/asm/page.h   |  8 
 arch/powerpc/mm/book3s64/hash_utils.c | 11 --
 arch/powerpc/mm/book3s64/pgtable.c| 12 --
 arch/powerpc/mm/hugetlbpage.c | 19 --
 arch/powerpc/mm/pgtable.c |  2 +-
 arch/powerpc/platforms/Kconfig.cputype|  1 -
 14 files changed, 43 insertions(+), 167 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h
index 6472b08fa1b0..c654c376ef8b 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
@@ -74,21 +74,6 @@
 #define remap_4k_pfn(vma, addr, pfn, prot) \
remap_pfn_range((vma), (addr), (pfn), PAGE_SIZE, (prot))
 
-#ifdef CONFIG_HUGETLB_PAGE
-static inline int hash__hugepd_ok(hugepd_t hpd)
-{
-   unsigned long hpdval = hpd_val(hpd);
-   /*
-* if it is not a pte and have hugepd shift mask
-* set, then it is a hugepd directory pointer
-*/
-   if (!(hpdval & _PAGE_PTE) && (hpdval & _PAGE_PRESENT) &&
-   ((hpdval & HUGEPD_SHIFT_MASK) != 0))
-   return true;
-   return false;
-}
-#endif
-
 /*
  * 4K PTE format is different from 64K PTE format. Saving the hash_slot is just
  * a matter of returning the PTE bits that need to be modified. On 64K PTE,
diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h
index faf3e3b4e4b2..509811ca7695 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -4,6 +4,7 @@
 #ifdef __KERNEL__
 
 #include 
+#include 
 
 /*
  * Common bits between 4K and 64K pages in a linux-style PTE.
@@ -161,14 +162,10 @@ extern void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, unsigned long pte, int huge);
unsigned long htab_convert_pte_flags(unsigned long pteflags, unsigned long flags);
 /* Atomic PTE updates */
-static inline unsigned long hash__pte_update(struct mm_struct *mm,
-unsigned long addr,
-pte_t *ptep, unsigned long clr,
-unsigned long set,
-int huge)
+static inline unsigned long hash__pte_update_one(pte_t *ptep, unsigned long clr,
+unsigned long set)
 {
__be64 old_be, tmp_be;
-   unsigned long old;
 
__asm__ __volatile__(
"1: ldarx   %0,0,%3 # pte_update\n\
@@ -182,11 +179,38 @@ static inline unsigned long hash__pte_update(struct mm_struct *mm,
: "r" (ptep), "r" (cpu_to_be64(clr)), "m" (*ptep),
  "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set))
: "cc" );
+
+   return be64_to_cpu(old_be);
+}
+
+static inline unsigned long hash__pte_update(struct mm_struct *mm,
+unsigned long addr,
+pte_t *ptep, unsigned long clr,
+unsigned long set,
+int huge)
+{
+   unsigned long old;
+
+   old = hash__pte_update_one(ptep, clr, set);
+
+   if (huge && IS_ENABLED(CONFIG_PPC_4K_PAGES)) {
+   unsigned int psize = get_slice_psize(mm, addr);
+   int nb, i;
+
+   if (psize == MMU_PAGE_16M)
+   nb = SZ_16M / PMD_SIZE;
+   else if (psize == MMU_PAGE_16G)
+   nb = SZ_16G / PUD_SIZE;
+   else
+   nb = 1;
+
+   for (i = 1; i < nb; i++)
+   hash__pte_update_o

[RFC PATCH v2 17/20] powerpc/e500: Use contiguous PMD instead of hugepd

2024-05-17 Thread Christophe Leroy
e500 supports many page sizes, among which the following sizes are
implemented in the kernel at the time being: 4M, 16M, 64M, 256M and 1G.

On e500, TLB misses for hugepages are exclusively handled by SW, even
on e6500 which has HW assistance for 4k pages, so there are no
constraints like on the 8xx.

On e500/32, all are at PGD/PMD level and can be handled as
cont-PMD.

On e500/64, smaller ones are on PMD while bigger ones are on PUD.
Again, they can easily be handled as cont-PMD and cont-PUD instead
of hugepd.
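
As a sketch of the resulting size-to-level mapping (hypothetical
helper, simplified to the two levels mentioned above; the patch itself
also handles the P4D and PGDIR cases):

	/* Which directory level a given e500 hugepage size sits at. */
	static inline unsigned long e500_pd_size(unsigned long sz)
	{
		if (sz < PMD_SIZE)
			return PAGE_SIZE;	/* normal page table */
		if (sz < PUD_SIZE)
			return PMD_SIZE;	/* handled as cont-PMD */
		return PUD_SIZE;		/* handled as cont-PUD */
	}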

Signed-off-by: Christophe Leroy 
---
 .../powerpc/include/asm/nohash/hugetlb-e500.h | 32 +-
 arch/powerpc/include/asm/nohash/pgalloc.h |  2 -
 arch/powerpc/include/asm/nohash/pgtable.h | 43 +
 arch/powerpc/include/asm/nohash/pte-e500.h| 15 +
 arch/powerpc/include/asm/page.h   | 15 +
 arch/powerpc/kernel/head_85xx.S   | 23 +++
 arch/powerpc/mm/hugetlbpage.c |  2 -
 arch/powerpc/mm/nohash/tlb_low_64e.S  | 63 +++
 arch/powerpc/mm/pgtable.c | 31 +
 arch/powerpc/platforms/Kconfig.cputype|  1 -
 10 files changed, 131 insertions(+), 96 deletions(-)

diff --git a/arch/powerpc/include/asm/nohash/hugetlb-e500.h b/arch/powerpc/include/asm/nohash/hugetlb-e500.h
index d8e51a3f8557..d30e2a3f129d 100644
--- a/arch/powerpc/include/asm/nohash/hugetlb-e500.h
+++ b/arch/powerpc/include/asm/nohash/hugetlb-e500.h
@@ -2,38 +2,12 @@
 #ifndef _ASM_POWERPC_NOHASH_HUGETLB_E500_H
 #define _ASM_POWERPC_NOHASH_HUGETLB_E500_H
 
-static inline pte_t *hugepd_page(hugepd_t hpd)
-{
-   if (WARN_ON(!hugepd_ok(hpd)))
-   return NULL;
-
-   return (pte_t *)((hpd_val(hpd) & ~HUGEPD_SHIFT_MASK) | PD_HUGE);
-}
-
-static inline unsigned int hugepd_shift(hugepd_t hpd)
-{
-   return hpd_val(hpd) & HUGEPD_SHIFT_MASK;
-}
-
-static inline pte_t *hugepte_offset(hugepd_t hpd, unsigned long addr,
-   unsigned int pdshift)
-{
-   /*
-* On FSL BookE, we have multiple higher-level table entries that
-* point to the same hugepte.  Just use the first one since they're all
-* identical.  So for that case, idx=0.
-*/
-   return hugepd_page(hpd);
-}
+#define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT
+void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
+pte_t pte, unsigned long sz);
 
 void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr);
 
-static inline void hugepd_populate(hugepd_t *hpdp, pte_t *new, unsigned int pshift)
-{
-   /* We use the old format for PPC_E500 */
-   *hpdp = __hugepd(((unsigned long)new & ~PD_HUGE) | pshift);
-}
-
 static inline int check_and_get_huge_psize(int shift)
 {
if (shift & 1)  /* Not a power of 4 */
diff --git a/arch/powerpc/include/asm/nohash/pgalloc.h b/arch/powerpc/include/asm/nohash/pgalloc.h
index 4b62376318e1..d06efac6d7aa 100644
--- a/arch/powerpc/include/asm/nohash/pgalloc.h
+++ b/arch/powerpc/include/asm/nohash/pgalloc.h
@@ -44,8 +44,6 @@ static inline void pgtable_free(void *table, int shift)
}
 }
 
-#define get_hugepd_cache_index(x)  (x)
-
static inline void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
 {
unsigned long pgf = (unsigned long)table;
diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h
index c4be7754e96f..28ecb2c8b433 100644
--- a/arch/powerpc/include/asm/nohash/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/pgtable.h
@@ -52,11 +52,36 @@ static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, pte_t *p
 {
pte_basic_t old = pte_val(*p);
pte_basic_t new = (old & ~(pte_basic_t)clr) | set;
+   unsigned long sz;
+   unsigned long pdsize;
+   int i;
 
if (new == old)
return old;
 
-   *p = __pte(new);
+#ifdef CONFIG_PPC_E500
+   if (huge)
+   sz = 1UL << (((old & _PAGE_HSIZE_MSK) >> _PAGE_HSIZE_SHIFT) + 20);
+   else
+#endif
+   sz = PAGE_SIZE;
+
+   if (!huge || sz < PMD_SIZE)
+   pdsize = PAGE_SIZE;
+   else if (sz < PUD_SIZE)
+   pdsize = PMD_SIZE;
+   else if (sz < P4D_SIZE)
+   pdsize = PUD_SIZE;
+   else if (sz < PGDIR_SIZE)
+   pdsize = P4D_SIZE;
+   else
+   pdsize = PGDIR_SIZE;
+
+   for (i = 0; i < sz / pdsize; i++, p++) {
+   *p = __pte(new);
+   if (new)
+   new += (unsigned long long)(pdsize / PAGE_SIZE) << PTE_RPN_SHIFT;
+   }
 
if (IS_ENABLED(CONFIG_44x) && !is_kernel_addr(addr) && (old & _PAGE_EXEC))
icache_44x_need_flush = 1;
@@ -340,25 +365,19 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
 
 #define pgprot_writecombine pgpr

[RFC PATCH v2 16/20] powerpc/e500: Encode hugepage size in PTE bits

2024-05-17 Thread Christophe Leroy
Use the U0-U3 bits to encode the hugepage size, more exactly the page
shift.

As we start using hugepages at shift 21 (2 Mbytes), subtract 20 so
that the value fits into 4 bits. That may change in the future if we
want to use smaller hugepages.
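
A sketch of the encode/decode pair this yields (the helper names are
made up; the bit arithmetic is the one used by the series):

	/* _PAGE_U3 is the lowest bit of the U0-U3 field, so multiplying
	 * by it shifts the value into place; e.g. a 2M page (shift 21)
	 * is stored as 1. */
	static inline unsigned long hsize_encode(unsigned int shift)
	{
		return _PAGE_U3 * (shift - 20);
	}

	static inline unsigned int hsize_decode(pte_t pte)
	{
		return ((pte_val(pte) & _PAGE_HSIZE_MSK) >> _PAGE_HSIZE_SHIFT) + 20;
	}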

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/nohash/hugetlb-e500.h | 6 ++
 arch/powerpc/include/asm/nohash/pte-e500.h | 3 +++
 2 files changed, 9 insertions(+)

diff --git a/arch/powerpc/include/asm/nohash/hugetlb-e500.h b/arch/powerpc/include/asm/nohash/hugetlb-e500.h
index 8f04ad20e040..d8e51a3f8557 100644
--- a/arch/powerpc/include/asm/nohash/hugetlb-e500.h
+++ b/arch/powerpc/include/asm/nohash/hugetlb-e500.h
@@ -42,4 +42,10 @@ static inline int check_and_get_huge_psize(int shift)
return shift_to_mmu_psize(shift);
 }
 
+static inline pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
+{
+   return __pte(pte_val(entry) | (_PAGE_U3 * (shift - 20)));
+}
+#define arch_make_huge_pte arch_make_huge_pte
+
 #endif /* _ASM_POWERPC_NOHASH_HUGETLB_E500_H */
diff --git a/arch/powerpc/include/asm/nohash/pte-e500.h b/arch/powerpc/include/asm/nohash/pte-e500.h
index 975facc7e38e..091e4bff1fba 100644
--- a/arch/powerpc/include/asm/nohash/pte-e500.h
+++ b/arch/powerpc/include/asm/nohash/pte-e500.h
@@ -46,6 +46,9 @@
 #define _PAGE_NO_CACHE 0x40 /* I: cache inhibit */
 #define _PAGE_WRITETHRU0x80 /* W: cache write-through */
 
+#define _PAGE_HSIZE_MSK (_PAGE_U0 | _PAGE_U1 | _PAGE_U2 | _PAGE_U3)
+#define _PAGE_HSIZE_SHIFT  14
+
 /* "Higher level" linux bit combinations */
#define _PAGE_EXEC (_PAGE_BAP_SX | _PAGE_BAP_UX) /* .. and was cache cleaned */
#define _PAGE_READ (_PAGE_BAP_SR | _PAGE_BAP_UR) /* User read permission */
-- 
2.44.0



[RFC PATCH v2 15/20] powerpc/85xx: Switch to 64 bits PGD

2024-05-17 Thread Christophe Leroy
In order to allow leaf PMD entries, switch the PGD to 64-bit entries.
PGD entries then become 8 bytes wide, which is why the FIND_PTE macro
now computes the PGD offset with one extra shift and loads the least
significant word of the entry at offset 4.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/pgtable-types.h |  4 
 arch/powerpc/kernel/head_85xx.S  | 10 ++
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/pgtable-types.h b/arch/powerpc/include/asm/pgtable-types.h
index 082c85cc09b1..db965d98e0ae 100644
--- a/arch/powerpc/include/asm/pgtable-types.h
+++ b/arch/powerpc/include/asm/pgtable-types.h
@@ -49,7 +49,11 @@ static inline unsigned long pud_val(pud_t x)
 #endif /* CONFIG_PPC64 */
 
 /* PGD level */
+#if defined(CONFIG_PPC_E500) && defined(CONFIG_PTE_64BIT)
+typedef struct { unsigned long long pgd; } pgd_t;
+#else
 typedef struct { unsigned long pgd; } pgd_t;
+#endif
 #define __pgd(x)   ((pgd_t) { (x) })
 static inline unsigned long pgd_val(pgd_t x)
 {
diff --git a/arch/powerpc/kernel/head_85xx.S b/arch/powerpc/kernel/head_85xx.S
index 39724ff5ae1f..a305244afc9f 100644
--- a/arch/powerpc/kernel/head_85xx.S
+++ b/arch/powerpc/kernel/head_85xx.S
@@ -307,8 +307,9 @@ set_ivor:
 #ifdef CONFIG_PTE_64BIT
 #ifdef CONFIG_HUGETLB_PAGE
 #define FIND_PTE   \
-   rlwinm  r12, r10, 13, 19, 29;   /* Compute pgdir/pmd offset */  \
-   lwzxr11, r12, r11;  /* Get pgd/pmd entry */ \
+   rlwinm  r12, r10, 14, 18, 28;   /* Compute pgdir/pmd offset */  \
+   add r12, r11, r12;  \
+   lwz r11, 4(r12);/* Get pgd/pmd entry */ \
rlwinm. r12, r11, 0, 0, 20; /* Extract pt base address */   \
blt 1000f;  /* Normal non-huge page */  \
beq 2f; /* Bail if no table */  \
@@ -321,8 +322,9 @@ set_ivor:
 1001:  lwz r11, 4(r12);/* Get pte entry */
 #else
 #define FIND_PTE   \
-   rlwinm  r12, r10, 13, 19, 29;   /* Compute pgdir/pmd offset */  \
-   lwzxr11, r12, r11;  /* Get pgd/pmd entry */ \
+   rlwinm  r12, r10, 14, 18, 28;   /* Compute pgdir/pmd offset */  \
+   add r12, r11, r12;  \
+   lwz r11, 4(r12);/* Get pgd/pmd entry */ \
rlwinm. r12, r11, 0, 0, 20; /* Extract pt base address */   \
beq 2f; /* Bail if no table */  \
rlwimi  r12, r10, 23, 20, 28;   /* Compute pte address */   \
-- 
2.44.0



[RFC PATCH v2 14/20] powerpc/e500: Remove enc field from struct mmu_psize_def

2024-05-17 Thread Christophe Leroy
The enc field is hidden behind BOOK3E_PAGESZ_XX macros, and when you
look closer you realise that this field is nothing else than the value
of shift minus ten.

So remove the enc field and calculate tsize from the shift field.

Also remove the ind field, which is unused.
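
Checked on a couple of sizes: 4K pages have shift 12 and
BOOK3E_PAGESZ_4K is 2; 1G pages have shift 30 and BOOK3E_PAGESZ_1GB
is 20. The replacement therefore boils down to:

	/* tsize derived from the page shift, no enc field needed. */
	static inline int mmu_get_tsize(int psize)
	{
		return mmu_psize_defs[psize].shift - 10;
	}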

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/nohash/mmu-e500.h |  3 ---
 arch/powerpc/mm/nohash/book3e_pgtable.c|  4 ++--
 arch/powerpc/mm/nohash/tlb.c   | 11 ++-
 3 files changed, 4 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/include/asm/nohash/mmu-e500.h b/arch/powerpc/include/asm/nohash/mmu-e500.h
index 792bfaafd70b..4167da0c0241 100644
--- a/arch/powerpc/include/asm/nohash/mmu-e500.h
+++ b/arch/powerpc/include/asm/nohash/mmu-e500.h
@@ -244,14 +244,11 @@ typedef struct {
 /* Page size definitions, common between 32 and 64-bit
  *
  *shift : is the "PAGE_SHIFT" value for that page size
- *penc  : is the pte encoding mask
  *
  */
 struct mmu_psize_def
 {
unsigned intshift;  /* number of bits */
-   unsigned intenc;/* PTE encoding */
-   unsigned intind;/* Corresponding indirect page size shift */
unsigned intflags;
 #define MMU_PAGE_SIZE_DIRECT   0x1 /* Supported as a direct size */
 #define MMU_PAGE_SIZE_INDIRECT 0x2 /* Supported as an indirect size */
diff --git a/arch/powerpc/mm/nohash/book3e_pgtable.c b/arch/powerpc/mm/nohash/book3e_pgtable.c
index 1c5e4ecbebeb..ad2a7c26f2a0 100644
--- a/arch/powerpc/mm/nohash/book3e_pgtable.c
+++ b/arch/powerpc/mm/nohash/book3e_pgtable.c
@@ -29,10 +29,10 @@ int __meminit vmemmap_create_mapping(unsigned long start,
_PAGE_KERNEL_RW;
 
/* PTEs only contain page size encodings up to 32M */
-   BUG_ON(mmu_psize_defs[mmu_vmemmap_psize].enc > 0xf);
+   BUG_ON(mmu_psize_defs[mmu_vmemmap_psize].shift - 10 > 0xf);
 
/* Encode the size in the PTE */
-   flags |= mmu_psize_defs[mmu_vmemmap_psize].enc << 8;
+   flags |= (mmu_psize_defs[mmu_vmemmap_psize].shift - 10) << 8;
 
/* For each PTE for that area, map things. Note that we don't
 * increment phys because all PTEs are of the large size and
diff --git a/arch/powerpc/mm/nohash/tlb.c b/arch/powerpc/mm/nohash/tlb.c
index 1caccbf4c138..10b5a6b60450 100644
--- a/arch/powerpc/mm/nohash/tlb.c
+++ b/arch/powerpc/mm/nohash/tlb.c
@@ -53,37 +53,30 @@
 struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
[MMU_PAGE_4K] = {
.shift  = 12,
-   .enc= BOOK3E_PAGESZ_4K,
},
[MMU_PAGE_2M] = {
.shift  = 21,
-   .enc= BOOK3E_PAGESZ_2M,
},
[MMU_PAGE_4M] = {
.shift  = 22,
-   .enc= BOOK3E_PAGESZ_4M,
},
[MMU_PAGE_16M] = {
.shift  = 24,
-   .enc= BOOK3E_PAGESZ_16M,
},
[MMU_PAGE_64M] = {
.shift  = 26,
-   .enc= BOOK3E_PAGESZ_64M,
},
[MMU_PAGE_256M] = {
.shift  = 28,
-   .enc= BOOK3E_PAGESZ_256M,
},
[MMU_PAGE_1G] = {
.shift  = 30,
-   .enc= BOOK3E_PAGESZ_1GB,
},
 };
 
 static inline int mmu_get_tsize(int psize)
 {
-   return mmu_psize_defs[psize].enc;
+   return mmu_psize_defs[psize].shift - 10;
 }
 #else
 static inline int mmu_get_tsize(int psize)
@@ -371,7 +364,7 @@ void tlb_flush(struct mmu_gather *tlb)
  */
 void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address)
 {
-   int tsize = mmu_psize_defs[mmu_pte_psize].enc;
+   int tsize = mmu_get_tsize(mmu_pte_psize);
 
if (book3e_htw_mode != PPC_HTW_NONE) {
unsigned long start = address & PMD_MASK;
-- 
2.44.0



[RFC PATCH v2 13/20] powerpc/64e: Clean up impossible setups

2024-05-17 Thread Christophe Leroy
All e500 cores have MMU_FTR_TYPE_FSL_E.

So remove all the impossible combinations.

This leads to removing PPC_HTW_IBM and the related exception handlers.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/nohash/mmu-e500.h |   1 -
 arch/powerpc/mm/nohash/tlb.c   | 148 
 arch/powerpc/mm/nohash/tlb_low_64e.S   | 194 -
 3 files changed, 36 insertions(+), 307 deletions(-)

diff --git a/arch/powerpc/include/asm/nohash/mmu-e500.h b/arch/powerpc/include/asm/nohash/mmu-e500.h
index 6ddced0415cb..792bfaafd70b 100644
--- a/arch/powerpc/include/asm/nohash/mmu-e500.h
+++ b/arch/powerpc/include/asm/nohash/mmu-e500.h
@@ -303,7 +303,6 @@ extern unsigned long linear_map_top;
 extern int book3e_htw_mode;
 
 #define PPC_HTW_NONE   0
-#define PPC_HTW_IBM1
 #define PPC_HTW_E6500  2
 
 /*
diff --git a/arch/powerpc/mm/nohash/tlb.c b/arch/powerpc/mm/nohash/tlb.c
index d16f1ef7516c..1caccbf4c138 100644
--- a/arch/powerpc/mm/nohash/tlb.c
+++ b/arch/powerpc/mm/nohash/tlb.c
@@ -400,13 +400,11 @@ void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address)
 static void __init setup_page_sizes(void)
 {
unsigned int tlb0cfg;
-   unsigned int tlb0ps;
unsigned int eptcfg;
-   int i, psize;
+   int psize;
unsigned int mmucfg = mfspr(SPRN_MMUCFG);
-   int fsl_mmu = mmu_has_feature(MMU_FTR_TYPE_FSL_E);
 
-   if (fsl_mmu && (mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V1) {
+   if ((mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V1) {
unsigned int tlb1cfg = mfspr(SPRN_TLB1CFG);
unsigned int min_pg, max_pg;
 
@@ -429,11 +427,7 @@ static void __init setup_page_sizes(void)
if ((shift >= min_pg) && (shift <= max_pg))
def->flags |= MMU_PAGE_SIZE_DIRECT;
}
-
-   goto out;
-   }
-
-   if (fsl_mmu && (mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V2) {
+   } else if ((mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V2) {
u32 tlb1cfg, tlb1ps;
 
tlb0cfg = mfspr(SPRN_TLB0CFG);
@@ -465,54 +459,8 @@ static void __init setup_page_sizes(void)
def->flags |= MMU_PAGE_SIZE_INDIRECT;
}
}
-
-   goto out;
-   }
-
-   tlb0cfg = mfspr(SPRN_TLB0CFG);
-   tlb0ps = mfspr(SPRN_TLB0PS);
-   eptcfg = mfspr(SPRN_EPTCFG);
-
-   /* Look for supported direct sizes */
-   for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
-   struct mmu_psize_def *def = &mmu_psize_defs[psize];
-
-   if (tlb0ps & (1U << (def->shift - 10)))
-   def->flags |= MMU_PAGE_SIZE_DIRECT;
-   }
-
-   /* Indirect page sizes supported ? */
-   if ((tlb0cfg & TLBnCFG_IND) == 0 ||
-   (tlb0cfg & TLBnCFG_PT) == 0)
-   goto out;
-
-   book3e_htw_mode = PPC_HTW_IBM;
-
-   /* Now, we only deal with one IND page size for each
-* direct size. Hopefully all implementations today are
-* unambiguous, but we might want to be careful in the
-* future.
-*/
-   for (i = 0; i < 3; i++) {
-   unsigned int ps, sps;
-
-   sps = eptcfg & 0x1f;
-   eptcfg >>= 5;
-   ps = eptcfg & 0x1f;
-   eptcfg >>= 5;
-   if (!ps || !sps)
-   continue;
-   for (psize = 0; psize < MMU_PAGE_COUNT; psize++) {
-   struct mmu_psize_def *def = &mmu_psize_defs[psize];
-
-   if (ps == (def->shift - 10))
-   def->flags |= MMU_PAGE_SIZE_INDIRECT;
-   if (sps == (def->shift - 10))
-   def->ind = ps + 10;
-   }
}
 
-out:
/* Cleanup array and print summary */
pr_info("MMU: Supported page sizes\n");
for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
@@ -540,10 +488,6 @@ static void __init setup_mmu_htw(void)
 */
 
switch (book3e_htw_mode) {
-   case PPC_HTW_IBM:
-   patch_exception(0x1c0, exc_data_tlb_miss_htw_book3e);
-   patch_exception(0x1e0, exc_instruction_tlb_miss_htw_book3e);
-   break;
case PPC_HTW_E6500:
extlb_level_exc = EX_TLB_SIZE;
patch_exception(0x1c0, exc_data_tlb_miss_e6500_book3e);
@@ -560,6 +504,8 @@ static void __init setup_mmu_htw(void)
 static void early_init_this_mmu(void)
 {
unsigned int mas4;
+   unsigned int num_cams;
+   bool map = true;
 
/* Set MAS4 based on page table setting */
 
@@ -572,12 +518,6 @@ static void early_init_this_mmu(void)
mmu_pte_psize = MMU_PAGE_2M;
break;
 
-   case PPC_HTW_IBM:
-   mas4 |= MAS4_INDD;
