Re: [PATCH v9 4/7] powerpc: mm: Default p{m,u}d_pte implementations

2023-11-29 Thread Christophe Leroy


Le 30/11/2023 à 03:53, Rohan McLure a écrit :
> For 32-bit systems which have no usecases for p{m,u}d_pte() prior to
> page table checking, implement default stubs.

Is that the best solution?

If I understand correctly, it is only needed for 
pmd_user_accessible_page(). Why not provide a stub 
pmd_user_accessible_page() that returns false on those architectures?

Same for pud_user_accessible_page()
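
For instance, an untested sketch of the suggested fallback:

static inline bool pmd_user_accessible_page(pmd_t pmd)
{
	return false;
}

static inline bool pud_user_accessible_page(pud_t pud)
{
	return false;
}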

But if you decide to keep it, I think that:
- It should be squashed with the following patch to make it clear it's 
needed for that only.
- Remove the WARN_ONCE().
- Only have a special one for book3s/64 and a generic one common to the 3 
others.

> 
> Signed-off-by: Rohan McLure 
> ---
> v9: New patch
> ---
>   arch/powerpc/include/asm/book3s/64/pgtable.h |  3 +++
>   arch/powerpc/include/asm/nohash/64/pgtable.h |  2 ++
>   arch/powerpc/include/asm/pgtable.h   | 17 +
>   3 files changed, 22 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
> b/arch/powerpc/include/asm/book3s/64/pgtable.h
> index 8fdb7667c509..2454174b26cb 100644
> --- a/arch/powerpc/include/asm/book3s/64/pgtable.h
> +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
> @@ -887,6 +887,8 @@ static inline int pud_present(pud_t pud)
>   
>   extern struct page *pud_page(pud_t pud);
>   extern struct page *pmd_page(pmd_t pmd);
> +
> +#define pud_pte pud_pte
>   static inline pte_t pud_pte(pud_t pud)
>   {
>   return __pte_raw(pud_raw(pud));
> @@ -1043,6 +1045,7 @@ static inline void __kernel_map_pages(struct page 
> *page, int numpages, int enabl
>   }
>   #endif
>   
> +#define pmd_pte pmd_pte
>   static inline pte_t pmd_pte(pmd_t pmd)
>   {
>   return __pte_raw(pmd_raw(pmd));
> diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h 
> b/arch/powerpc/include/asm/nohash/64/pgtable.h
> index f58cbebde26e..09a34fe196ae 100644
> --- a/arch/powerpc/include/asm/nohash/64/pgtable.h
> +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h
> @@ -93,6 +93,7 @@ static inline void pmd_clear(pmd_t *pmdp)
>   *pmdp = __pmd(0);
>   }
>   
> +#define pmd_pte pmd_pte
>   static inline pte_t pmd_pte(pmd_t pmd)
>   {
>   return __pte(pmd_val(pmd));
> @@ -134,6 +135,7 @@ static inline pmd_t *pud_pgtable(pud_t pud)
>   
>   extern struct page *pud_page(pud_t pud);
>   
> +#define pud_pte pud_pte
>   static inline pte_t pud_pte(pud_t pud)
>   {
>   return __pte(pud_val(pud));
> diff --git a/arch/powerpc/include/asm/pgtable.h 
> b/arch/powerpc/include/asm/pgtable.h
> index 9c0f2151f08f..d7d0f47760d3 100644
> --- a/arch/powerpc/include/asm/pgtable.h
> +++ b/arch/powerpc/include/asm/pgtable.h
> @@ -233,6 +233,23 @@ static inline int pud_pfn(pud_t pud)
>   }
>   #endif
>   
> +#ifndef pmd_pte
> +#define pmd_pte pmd_pte
> +static inline pte_t pmd_pte(pmd_t pmd)
> +{
> + WARN_ONCE(1, "pmd: platform does not use pmd entries directly");
> + return __pte(pmd_val(pmd));
> +}
> +#endif
> +
> +#ifndef pud_pte
> +#define pud_pte pud_pte
> +static inline pte_t pud_pte(pud_t pud)
> +{
> + WARN_ONCE(1, "pud: platform does not use pud entries directly");
> + return __pte(pud_val(pud));
> +}
> +#endif
>   #endif /* __ASSEMBLY__ */
>   
>   #endif /* _ASM_POWERPC_PGTABLE_H */


Re: [PATCH v9 7/7] powerpc: mm: Support page table check

2023-11-29 Thread Christophe Leroy


Le 30/11/2023 à 03:54, Rohan McLure a écrit :
> On creation and clearing of a page table mapping, instrument such calls
> by invoking page_table_check_pte_set and page_table_check_pte_clear
> respectively. These calls serve as a sanity check against illegal
> mappings.
> 
> Enable ARCH_SUPPORTS_PAGE_TABLE_CHECK for all platforms.
> 
> See also:
> 
> riscv support in commit 3fee229a8eb9 ("riscv/mm: enable
> ARCH_SUPPORTS_PAGE_TABLE_CHECK")
> arm64 in commit 42b2547137f5 ("arm64/mm: enable
> ARCH_SUPPORTS_PAGE_TABLE_CHECK")
> x86_64 in commit d283d422c6c4 ("x86: mm: add x86_64 support for page table
> check")
> 
> Reviewed-by: Christophe Leroy 
> Signed-off-by: Rohan McLure 
> ---
> v9: Updated for new API. Instrument pmdp_collapse_flush's two
> constituent calls to avoid header hell
> ---
>   arch/powerpc/Kconfig |  1 +
>   arch/powerpc/include/asm/book3s/32/pgtable.h |  7 +++-
>   arch/powerpc/include/asm/book3s/64/pgtable.h | 39 
>   arch/powerpc/mm/book3s64/hash_pgtable.c  |  4 ++
>   arch/powerpc/mm/book3s64/pgtable.c   | 13 +--
>   arch/powerpc/mm/book3s64/radix_pgtable.c |  3 ++
>   arch/powerpc/mm/pgtable.c|  4 ++
>   7 files changed, 58 insertions(+), 13 deletions(-)
> 
> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
> index 6f105ee4f3cf..5bd6d367ef40 100644
> --- a/arch/powerpc/Kconfig
> +++ b/arch/powerpc/Kconfig
> @@ -166,6 +166,7 @@ config PPC
>   select ARCH_STACKWALK
>   select ARCH_SUPPORTS_ATOMIC_RMW
> 	select ARCH_SUPPORTS_DEBUG_PAGEALLOC	if PPC_BOOK3S || PPC_8xx || 40x
> + select ARCH_SUPPORTS_PAGE_TABLE_CHECK
>   select ARCH_USE_BUILTIN_BSWAP
>   select ARCH_USE_CMPXCHG_LOCKREF if PPC64
>   select ARCH_USE_MEMTEST
> diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h 
> b/arch/powerpc/include/asm/book3s/32/pgtable.h
> index bd6f8cdd25aa..48f4e7b98340 100644
> --- a/arch/powerpc/include/asm/book3s/32/pgtable.h
> +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h
> @@ -201,6 +201,7 @@ void unmap_kernel_page(unsigned long va);
>   #ifndef __ASSEMBLY__
>   #include <linux/sched.h>
>   #include <linux/threads.h>
> +#include <linux/page_table_check.h>
>   
>   /* Bits to mask out from a PGD to get to the PUD page */
>   #define PGD_MASKED_BITS 0
> @@ -319,7 +320,11 @@ static inline int __ptep_test_and_clear_young(struct 
> mm_struct *mm,
>   static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long 
> addr,
>  pte_t *ptep)
>   {
> - return __pte(pte_update(mm, addr, ptep, ~_PAGE_HASHPTE, 0, 0));
> + pte_t old_pte = __pte(pte_update(mm, addr, ptep, ~_PAGE_HASHPTE, 0, 0));
> +
> + page_table_check_pte_clear(mm, old_pte);
> +
> + return old_pte;
>   }
>   
>   #define __HAVE_ARCH_PTEP_SET_WRPROTECT
> diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
> b/arch/powerpc/include/asm/book3s/64/pgtable.h
> index dd3e7b190ab7..834c997ba657 100644
> --- a/arch/powerpc/include/asm/book3s/64/pgtable.h
> +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
> @@ -151,6 +151,8 @@
>   #define PAGE_KERNEL_ROX __pgprot(_PAGE_BASE | _PAGE_KERNEL_ROX)
>   
>   #ifndef __ASSEMBLY__
> +#include <linux/page_table_check.h>
> +
>   /*
>* page table defines
>*/
> @@ -421,8 +423,11 @@ static inline void huge_ptep_set_wrprotect(struct 
> mm_struct *mm,
>   static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
>  unsigned long addr, pte_t *ptep)
>   {
> - unsigned long old = pte_update(mm, addr, ptep, ~0UL, 0, 0);
> - return __pte(old);
> + pte_t old_pte = __pte(pte_update(mm, addr, ptep, ~0UL, 0, 0));
> +
> + page_table_check_pte_clear(mm, old_pte);
> +
> + return old_pte;
>   }
>   
>   #define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
> @@ -431,11 +436,16 @@ static inline pte_t ptep_get_and_clear_full(struct 
> mm_struct *mm,
>   pte_t *ptep, int full)
>   {
>   if (full && radix_enabled()) {
> + pte_t old_pte;
> +
>   /*
>* We know that this is a full mm pte clear and
>* hence can be sure there is no parallel set_pte.
>*/
> - return radix__ptep_get_and_clear_full(mm, addr, ptep, full);
> + old_pte = radix__ptep_get_and_clear_full(mm, addr, ptep, full);
> + page_table_check_pte_clear(mm, old_pte);
> +
> + return old_pte;
>   }
>   return ptep_get_and_clear(mm, addr, ptep);
>   }
> @@ -1339,17 +1349,30 @@ extern int pudp_test_and_clear_young(struct 
> vm_area_struct *vma,
>   static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
>   unsigned long addr, pmd_t *pmdp)
>   {
> - if (radix_enabled())
> - return radix__pmdp_huge_get_and_clear(mm, addr, pmdp);
> - return hash__pmdp_huge_get_and_clear(mm, addr, pmdp);
> + pmd_t old_pmd;
> +
> + if (radix_enabled()) {
> + old_pmd = radix__pmdp_huge_get_and_clear(mm, addr, pmdp);
> + } else {
> + old_pmd = hash__pmdp_huge_get_and_clear(mm, addr, pmdp);
> + }
> +
> + page_table_check_pmd_clear(mm, old_pmd);
> +
> + return old_pmd;
>  }

Re: [PATCH v9 6/7] powerpc: mm: Use __set_pte_at() for early-boot / internal usages

2023-11-29 Thread Christophe Leroy


Le 30/11/2023 à 03:53, Rohan McLure a écrit :
> In the new set_ptes() API, set_pte_at() (a special case of set_ptes())
> is intended to be instrumented by the page table check facility. There
> are however several other routines that constitute the API for setting
> page table entries, including set_pmd_at() among others. Such routines
> are themselves implemented in terms of set_pte_at().
> 
> A future patch providing support for page table checking on powerpc
> must take care to avoid duplicate calls to
> page_table_check_p{te,md,ud}_set().
> 
> Cause API-facing routines that call set_pte_at() to instead call
> __set_pte_at(), which will remain uninstrumented by page table check.
> set_ptes() is itself implemented by calls to __set_pte_at(), so this
> eliminates redundant code.
> 
> Also prefer __set_pte_at() in early-boot usages which should not be
> instrumented.

__set_pte_at() lacks the call to set_pte_filter() and is only intended 
to be called directly in very specific situations like early KASAN where 
set_pte_filter() cannot be called but where we absolutely know that 
set_pte_filter() is not needed.

Are you really sure that all places you change are safe and do not 
require a call to set_pte_filter()?

> 
> Signed-off-by: Rohan McLure 
> ---
> v9: New patch
> ---
>   arch/powerpc/mm/book3s64/hash_pgtable.c  |  2 +-
>   arch/powerpc/mm/book3s64/pgtable.c   |  4 ++--
>   arch/powerpc/mm/book3s64/radix_pgtable.c | 10 +-
>   arch/powerpc/mm/nohash/book3e_pgtable.c  |  2 +-
>   arch/powerpc/mm/pgtable_32.c |  2 +-
>   5 files changed, 10 insertions(+), 10 deletions(-)
> 
> diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c 
> b/arch/powerpc/mm/book3s64/hash_pgtable.c
> index 988948d69bc1..ae52c8db45b7 100644
> --- a/arch/powerpc/mm/book3s64/hash_pgtable.c
> +++ b/arch/powerpc/mm/book3s64/hash_pgtable.c
> @@ -165,7 +165,7 @@ int hash__map_kernel_page(unsigned long ea, unsigned long 
> pa, pgprot_t prot)
>   ptep = pte_alloc_kernel(pmdp, ea);
>   if (!ptep)
>   return -ENOMEM;
> - set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, prot));
> + __set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, prot), 0);
>   } else {
>   /*
>* If the mm subsystem is not fully up, we cannot create a
> diff --git a/arch/powerpc/mm/book3s64/pgtable.c 
> b/arch/powerpc/mm/book3s64/pgtable.c
> index be229290a6a7..9a0a2accb261 100644
> --- a/arch/powerpc/mm/book3s64/pgtable.c
> +++ b/arch/powerpc/mm/book3s64/pgtable.c
> @@ -116,7 +116,7 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,
>   WARN_ON(!(pmd_large(pmd)));
>   #endif
>   trace_hugepage_set_pmd(addr, pmd_val(pmd));
> - return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
> + return __set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd), 0);
>   }
>   
>   void set_pud_at(struct mm_struct *mm, unsigned long addr,
> @@ -539,7 +539,7 @@ void ptep_modify_prot_commit(struct vm_area_struct *vma, 
> unsigned long addr,
>   if (radix_enabled())
>   return radix__ptep_modify_prot_commit(vma, addr,
> ptep, old_pte, pte);
> - set_pte_at(vma->vm_mm, addr, ptep, pte);
> + __set_pte_at(vma->vm_mm, addr, ptep, pte, 0);
>   }
>   
>   /*
> diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c 
> b/arch/powerpc/mm/book3s64/radix_pgtable.c
> index 1f8db10693e3..ae4a5f66ccd2 100644
> --- a/arch/powerpc/mm/book3s64/radix_pgtable.c
> +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
> @@ -109,7 +109,7 @@ static int early_map_kernel_page(unsigned long ea, 
> unsigned long pa,
>   ptep = pte_offset_kernel(pmdp, ea);
>   
>   set_the_pte:
> - set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
> + __set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags), 0);
>   asm volatile("ptesync": : :"memory");
>   return 0;
>   }
> @@ -169,7 +169,7 @@ static int __map_kernel_page(unsigned long ea, unsigned 
> long pa,
>   return -ENOMEM;
>   
>   set_the_pte:
> - set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
> + __set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags), 0);
>   asm volatile("ptesync": : :"memory");
>   return 0;
>   }
> @@ -1536,7 +1536,7 @@ void radix__ptep_modify_prot_commit(struct 
> vm_area_struct *vma,
>   (atomic_read(&mm->context.copros) > 0))
>   radix__flush_tlb_page(vma, addr);
>   
> - set_pte_at(mm, addr, ptep, pte);
> + __set_pte_at(mm, addr, ptep, pte, 0);
>   }
>   
>   int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
> @@ -1547,7 +1547,7 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t 
> prot)
>   if (!radix_enabled())
>   return 0;
>   
> - set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pud);
> + __set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pud, 0);
>   
>   return 1;
>   }
> @@ -1594,7 +1594,7 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
>   if (!radix_enabled())
>   return 0;
>   
> - set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pmd);
> + __set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pmd, 0);
>   
>   return 1;
> }

[PATCH] powerpc/ftrace: Fix stack teardown in ftrace_no_trace

2023-11-29 Thread Naveen N Rao
Commit 41a506ef71eb ("powerpc/ftrace: Create a dummy stackframe to fix
stack unwind") added use of a new stack frame on ftrace entry to fix
stack unwind. However, the commit missed updating the offset used while
tearing down the ftrace stack when ftrace is disabled. Fix the same.

In addition, the commit missed saving the correct stack pointer in
pt_regs. Update the same.

Fixes: 41a506ef71eb ("powerpc/ftrace: Create a dummy stackframe to fix stack 
unwind")
Cc: sta...@vger.kernel.org
Signed-off-by: Naveen N Rao 
---
 arch/powerpc/kernel/trace/ftrace_entry.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/trace/ftrace_entry.S 
b/arch/powerpc/kernel/trace/ftrace_entry.S
index 90701885762c..40677416d7b2 100644
--- a/arch/powerpc/kernel/trace/ftrace_entry.S
+++ b/arch/powerpc/kernel/trace/ftrace_entry.S
@@ -62,7 +62,7 @@
.endif
 
/* Save previous stack pointer (r1) */
-   addir8, r1, SWITCH_FRAME_SIZE
+   addir8, r1, SWITCH_FRAME_SIZE+STACK_FRAME_MIN_SIZE
PPC_STL r8, GPR1(r1)
 
.if \allregs == 1
@@ -182,7 +182,7 @@ ftrace_no_trace:
mflrr3
mtctr   r3
REST_GPR(3, r1)
-   addir1, r1, SWITCH_FRAME_SIZE
+   addir1, r1, SWITCH_FRAME_SIZE+STACK_FRAME_MIN_SIZE
mtlrr0
bctr
 #endif

base-commit: 9a15ae60f2c9707433b01e55815cd9142be102b2
-- 
2.43.0



Re: [PATCH v9 5/7] powerpc: mm: Implement *_user_accessible_page() for ptes

2023-11-29 Thread Christophe Leroy


Le 30/11/2023 à 03:53, Rohan McLure a écrit :
> Page table checking depends on architectures providing an
> implementation of p{te,md,ud}_user_accessible_page. With
> refactorisations made on powerpc/mm, the pte_access_permitted() and
> similar methods verify whether a userland page is accessible with the
> required permissions.
> 
> Since page table checking is the only user of
> p{te,md,ud}_user_accessible_page(), implement these for all platforms,
> using some of the same preliminary checks taken by pte_access_permitted()
> on that platform.

pte_access_permitted() returns false on an exec-only page.

As far as I can see in arm64, pte_user_accessible_page() returns true on 
an exec-only page.

In addition, pte_access_permitted() is called only from GUP so it is 
guaranteed to be called only for user pages. Do we have the same guarantee 
from callers of pte_user_accessible_page()? If not, the address needs to 
be checked as well, see commit a78587473642 ("powerpc: Rely on 
address instead of pte_user()").
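
(If that guarantee is absent, the helper would also need the address; an 
untested sketch, where taking addr as a parameter is an assumption:

static inline bool pte_user_accessible_page(pte_t pte, unsigned long addr)
{
	return addr < TASK_SIZE && pte_present(pte) && pte_read(pte);
}
)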

> 
> Since Commit 8e9bd41e4ce1 ("powerpc/nohash: Replace pte_user() by pte_read()")
> pte_user() is no longer required to be present on all platforms as it
> may be equivalent to or implied by pte_read(). Hence implementations are
> specialised.

pte_user() is neither equivalent to nor implied by pte_read(). On most 
platforms it is implied by the address being below TASK_SIZE.
pte_read() will also return true on kernel-readable pages.

> 
> Signed-off-by: Rohan McLure 
> ---
> v9: New implementation
> ---
>   arch/powerpc/include/asm/book3s/32/pgtable.h |  5 +
>   arch/powerpc/include/asm/book3s/64/pgtable.h |  5 +
>   arch/powerpc/include/asm/nohash/pgtable.h|  5 +
>   arch/powerpc/include/asm/pgtable.h   | 15 +++
>   4 files changed, 30 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h 
> b/arch/powerpc/include/asm/book3s/32/pgtable.h
> index 9cc95a61d2a6..bd6f8cdd25aa 100644
> --- a/arch/powerpc/include/asm/book3s/32/pgtable.h
> +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h
> @@ -441,6 +441,11 @@ static inline bool pte_access_permitted(pte_t pte, bool 
> write)
>   return true;
>   }
>   
> +static inline bool pte_user_accessible_page(pte_t pte)
> +{
> + return pte_present(pte) && pte_read(pte);
> +}
> +
>   /* Conversion functions: convert a page and protection to a page entry,
>* and a page entry and page directory to the page they refer to.
>*
> diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
> b/arch/powerpc/include/asm/book3s/64/pgtable.h
> index 2454174b26cb..dd3e7b190ab7 100644
> --- a/arch/powerpc/include/asm/book3s/64/pgtable.h
> +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
> @@ -544,6 +544,11 @@ static inline bool pte_access_permitted(pte_t pte, bool 
> write)
>   return arch_pte_access_permitted(pte_val(pte), write, 0);
>   }
>   
> +static inline bool pte_user_accessible_page(pte_t pte)
> +{
> + return pte_present(pte) && pte_user(pte) && pte_read(pte);
> +}
> +
>   /*
>* Conversion functions: convert a page and protection to a page entry,
>* and a page entry and page directory to the page they refer to.
> diff --git a/arch/powerpc/include/asm/nohash/pgtable.h 
> b/arch/powerpc/include/asm/nohash/pgtable.h
> index 427db14292c9..33b4a4267f66 100644
> --- a/arch/powerpc/include/asm/nohash/pgtable.h
> +++ b/arch/powerpc/include/asm/nohash/pgtable.h
> @@ -213,6 +213,11 @@ static inline bool pte_access_permitted(pte_t pte, bool 
> write)
>   return true;
>   }
>   
> +static inline bool pte_user_accessible_page(pte_t pte)
> +{
> + return pte_present(pte) && pte_read(pte);
> +}
> +
>   /* Conversion functions: convert a page and protection to a page entry,
>* and a page entry and page directory to the page they refer to.
>*
> diff --git a/arch/powerpc/include/asm/pgtable.h 
> b/arch/powerpc/include/asm/pgtable.h
> index d7d0f47760d3..661bf3afca37 100644
> --- a/arch/powerpc/include/asm/pgtable.h
> +++ b/arch/powerpc/include/asm/pgtable.h
> @@ -250,6 +250,21 @@ static inline pte_t pud_pte(pud_t pud)
>   return __pte(pud_val(pud));
>   }
>   #endif
> +
> +static inline bool pmd_user_accessible_page(pmd_t pmd)
> +{
> + pte_t pte = pmd_pte(pmd);
> +
> + return pte_user_accessible_page(pte);

No need for that pte local var, this can fit on a single line.
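
e.g. simply:

	return pte_user_accessible_page(pmd_pte(pmd));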

> +}
> +
> +static inline bool pud_user_accessible_page(pud_t pud)
> +{
> + pte_t pte = pud_pte(pud);
> +
> + return pte_user_accessible_page(pte);

Same.

> +}
> +
>   #endif /* __ASSEMBLY__ */
>   
>   #endif /* _ASM_POWERPC_PGTABLE_H */


Re: [PATCH v9 3/7] powerpc: mm: Add common pud_pfn stub for all platforms

2023-11-29 Thread Christophe Leroy


Le 30/11/2023 à 03:53, Rohan McLure a écrit :
> Prior to this commit, pud_pfn was implemented with BUILD_BUG as the inline
> function for 64-bit Book3S systems but is never included, as its
> invocations in generic code are guarded by calls to pud_devmap which return
> zero on such systems. A future patch will provide support for page table
> checks, the generic code for which depends on a pud_pfn stub being
> implemented, even while the patch will not interact with puds directly.

This is not correct anymore, that was changed by commit 27af67f35631 
("powerpc/book3s64/mm: enable transparent pud hugepage")


> 
> Remove the 64-bit Book3S stub and define pud_pfn to warn on all
> platforms. pud_pfn may be defined properly on a per-platform basis
> should it grow real usages in future.

Your patch removes nothing, it just adds a fallback, is that still correct?

> 
> Signed-off-by: Rohan McLure 
> ---
>   arch/powerpc/include/asm/pgtable.h | 14 ++
>   1 file changed, 14 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/pgtable.h 
> b/arch/powerpc/include/asm/pgtable.h
> index db0231afca9d..9c0f2151f08f 100644
> --- a/arch/powerpc/include/asm/pgtable.h
> +++ b/arch/powerpc/include/asm/pgtable.h
> @@ -219,6 +219,20 @@ static inline bool 
> arch_supports_memmap_on_memory(unsigned long vmemmap_size)
>   
>   #endif /* CONFIG_PPC64 */
>   
> +/*
> + * Currently only consumed by page_table_check_pud_{set,clear}. Since clears
> + * and sets to page table entries at any level are done through
> + * page_table_check_pte_{set,clear}, provide stub implementation.
> + */
> +#ifndef pud_pfn
> +#define pud_pfn pud_pfn
> +static inline int pud_pfn(pud_t pud)
> +{
> + WARN_ONCE(1, "pud: platform does not use pud entries directly");
> + return 0;
> +}
> +#endif
> +
>   #endif /* __ASSEMBLY__ */
>   
>   #endif /* _ASM_POWERPC_PGTABLE_H */


Re: [PATCH 5/5] powerpc/64s: Fix CONFIG_NUMA=n build

2023-11-29 Thread Michael Ellerman
"Arnd Bergmann"  writes:
> On Wed, Nov 29, 2023, at 14:19, Michael Ellerman wrote:
>> diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
>> index 7f9ff0640124..72341b9fb552 100644
>> --- a/arch/powerpc/mm/mmu_decl.h
>> +++ b/arch/powerpc/mm/mmu_decl.h
>> +
>> +#ifdef CONFIG_MEMORY_HOTPLUG
>> +int create_section_mapping(unsigned long start, unsigned long end,
>> +   int nid, pgprot_t prot);
>> +#endif
>
> This one should probably go next to the remove_section_mapping()
> declaration in arch/powerpc/include/asm/sparsemem.h for consistency.
 
That doesn't work due to:

In file included from ../include/linux/numa.h:26,
 from ../include/linux/async.h:13,
 from ../init/initramfs.c:3:
../arch/powerpc/include/asm/sparsemem.h:19:44: error: unknown type name ‘pgprot_t’
   19 |   int nid, pgprot_t prot);
      |            ^~~~~~~~

Which might be fixable, but I'd rather just move
remove_section_mapping() into mmu_decl.h as well.

cheers


Re: Ping? Re: [PATCH rc] kvm: Prevent compiling virt/kvm/vfio.c unless VFIO is selected

2023-11-29 Thread Michael Ellerman
Sean Christopherson  writes:
> On Wed, Nov 29, 2023, Jason Gunthorpe wrote:
>> On Wed, Nov 29, 2023 at 05:07:45PM -0800, Sean Christopherson wrote:
>> > On Wed, Nov 29, 2023, Michael Ellerman wrote:
>> > > Sean Christopherson  writes:
>> > > > On Fri, Nov 10, 2023, Michael Ellerman wrote:
>> > > >> Jason Gunthorpe  writes:
>> > > >> > There are a bunch of reported randconfig failures now because of 
>> > > >> > this,
>> > > >> > something like:
>> > > >> >
>> > > >> >>> arch/powerpc/kvm/../../../virt/kvm/vfio.c:89:7: warning: 
>> > > >> >>> attribute declaration must precede definition 
>> > > >> >>> [-Wignored-attributes]
>> > > >> >fn = symbol_get(vfio_file_iommu_group);
>> > > >> > ^
>> > > >> >include/linux/module.h:805:60: note: expanded from macro 
>> > > >> > 'symbol_get'
>> > > >> >#define symbol_get(x) ({ extern typeof(x) x 
>> > > >> > __attribute__((weak,visibility("hidden"))); &(x); })
>> > > >> >
>> > > >> > It happens because the arch forces KVM_VFIO without knowing if VFIO 
>> > > >> > is
>> > > >> > even enabled.
>> > > >> 
>> > > >> This is still breaking some builds. Can we get this fix in please?
>> > > >> 
>> > > >> cheers
>> > > >> 
>> > > >> > Split the kconfig so the arch selects the usual HAVE_KVM_ARCH_VFIO 
>> > > >> > and
>> > > >> > then KVM_VFIO is only enabled if the arch wants it and VFIO is 
>> > > >> > turned on.
>> > > >
>> > > > Heh, so I was trying to figure out why things like vfio_file_set_kvm() 
>> > > > aren't
>> > > > problematic, i.e. why the existing mess didn't cause failures.  I 
>> > > > can't repro the
>> > > > warning (requires clang-16?), but IIUC the reason only the group code 
>> > > > is problematic
>> > > > is that vfio.h creates a stub for vfio_file_iommu_group() and thus 
>> > > > there's no symbol,
>> > > > whereas vfio.h declares vfio_file_set_kvm() unconditionally.
>> > > 
>> > > That warning I'm unsure about.
>> > 
>> > Ah, it's the same warning, I just missed the CONFIG_MODULES=n requirement.
>> 
>> Oh, wait, doesn't that mean the approach won't work? IIRC doesn't
>> symbol_get turn into just &(x) when non-modular, turning this into a
>> link failure without the kconfig part?

It does build.

I haven't boot tested it, but TBH I don't really care as long as the
build is green, I don't think anyone's actually using this weird
combination of config options.

> Yes, but it doesn't cause linker errors.  IIUC, because the extern declaration
> is tagged "weak", a dummy default is used.  E.g. on x86, this is what is 
> generated
> with VFIO=y
>
> fn = symbol_get(vfio_file_is_valid);
> if (!fn)
>0x810396c5 <+5>:   mov$0x81829230,%rax
>0x810396cc <+12>:  test   %rax,%rax
>
> whereas VFIO=n gets
>
> fn = symbol_get(vfio_file_is_valid);
> if (!fn)
>0x810396c5 <+5>:   mov$0x0,%rax
>0x810396cc <+12>:  test   %rax,%rax
>
> I have no idea if the fact that symbol_get() generates '0', i.e. the !NULL 
> checks
> work as expected, is intentional or if KVM works by sheer dumb luck.

I think it's intentional:

  https://lore.kernel.org/all/20030117045054.9a2f72c...@lists.samba.org/
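
(The mechanism is plain weak-symbol linking: a reference to an undefined 
weak symbol resolves to address 0 at link time, e.g. in this illustrative 
fragment:

	extern int missing_sym __attribute__((weak));
	int *p = &missing_sym;	/* NULL when missing_sym is defined nowhere */
)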

cheers


Re: [PATCH v9 2/7] powerpc: mm: Implement p{m,u,4}d_leaf on all platforms

2023-11-29 Thread Christophe Leroy


Le 30/11/2023 à 03:53, Rohan McLure a écrit :
> The check that a higher-level entry in multi-level pages contains a page
> translation entry (pte) is performed by p{m,u,4}d_leaf stubs, which may
> be specialised for each choice of mmu. In a prior commit, we replaced
> uses of the catch-all stubs p{m,u,4}d_is_leaf with p{m,u,4}d_leaf.
> 
> Replace the catch-all stub definitions for p{m,u,4}d_is_leaf with
> definitions for p{m,u,4}d_leaf. A future patch will assume that
> p{m,u,4}d_leaf is defined on all platforms.
> 
> In particular, implement pud_leaf for Book3E-64, pmd_leaf for all Book3E
> and Book3S-64 platforms, with a catch-all definition for p4d_leaf.

Is that needed? There are fallback definitions of all pXd_leaf() in 
include/linux/pgtable.h.
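
(Those fallbacks are of the form:

	#ifndef pmd_leaf
	#define pmd_leaf(x)	0
	#endif

so arch definitions that just return false add nothing over the generic 
ones.)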

> 
> Reviewed-by: Christophe Leroy 
> Signed-off-by: Rohan McLure 
> ---
> v9: No longer required in order to implement page table check, just
> refactoring.
> ---
>   arch/powerpc/include/asm/book3s/32/pgtable.h |  5 +
>   arch/powerpc/include/asm/book3s/64/pgtable.h | 10 -
>   arch/powerpc/include/asm/nohash/64/pgtable.h |  6 ++
>   arch/powerpc/include/asm/pgtable.h   | 22 ++--
>   4 files changed, 17 insertions(+), 26 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h 
> b/arch/powerpc/include/asm/book3s/32/pgtable.h
> index 52971ee30717..9cc95a61d2a6 100644
> --- a/arch/powerpc/include/asm/book3s/32/pgtable.h
> +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h
> @@ -223,6 +223,11 @@ static inline void pmd_clear(pmd_t *pmdp)
>   *pmdp = __pmd(0);
>   }
>   
> +#define pmd_leaf pmd_leaf
> +static inline bool pmd_leaf(pmd_t pmd)
> +{
> + return false;
> +}

Is that needed? There is a fallback definition of pmd_leaf() in 
include/linux/pgtable.h.

>   
>   /*
>* When flushing the tlb entry for a page, we also need to flush the hash
> diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
> b/arch/powerpc/include/asm/book3s/64/pgtable.h
> index cb77eddca54b..8fdb7667c509 100644
> --- a/arch/powerpc/include/asm/book3s/64/pgtable.h
> +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
> @@ -1459,16 +1459,14 @@ static inline bool is_pte_rw_upgrade(unsigned long 
> old_val, unsigned long new_va
>   /*
>* Like pmd_huge() and pmd_large(), but works regardless of config options
>*/
> -#define pmd_is_leaf pmd_is_leaf
> -#define pmd_leaf pmd_is_leaf
> -static inline bool pmd_is_leaf(pmd_t pmd)
> +#define pmd_leaf pmd_leaf
> +static inline bool pmd_leaf(pmd_t pmd)
>   {
>   return !!(pmd_raw(pmd) & cpu_to_be64(_PAGE_PTE));
>   }
>   
> -#define pud_is_leaf pud_is_leaf
> -#define pud_leaf pud_is_leaf
> -static inline bool pud_is_leaf(pud_t pud)
> +#define pud_leaf pud_leaf
> +static inline bool pud_leaf(pud_t pud)
>   {
>   return !!(pud_raw(pud) & cpu_to_be64(_PAGE_PTE));
>   }
> diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h 
> b/arch/powerpc/include/asm/nohash/64/pgtable.h
> index 2202c78730e8..f58cbebde26e 100644
> --- a/arch/powerpc/include/asm/nohash/64/pgtable.h
> +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h
> @@ -116,6 +116,12 @@ static inline void pud_clear(pud_t *pudp)
>   *pudp = __pud(0);
>   }
>   
> +#define pud_leaf pud_leaf
> +static inline bool pud_leaf(pud_t pud)
> +{
> + return false;
> +}
> +

Is that needed? There is a fallback definition of pud_leaf() in 
include/linux/pgtable.h.

>   #define pud_none(pud)   (!pud_val(pud))
>   #define pud_bad(pud)	(!is_kernel_addr(pud_val(pud)) \
>   			 || (pud_val(pud) & PUD_BAD_BITS))
> diff --git a/arch/powerpc/include/asm/pgtable.h 
> b/arch/powerpc/include/asm/pgtable.h
> index 9224f23065ff..db0231afca9d 100644
> --- a/arch/powerpc/include/asm/pgtable.h
> +++ b/arch/powerpc/include/asm/pgtable.h
> @@ -180,29 +180,11 @@ static inline void pte_frag_set(mm_context_t *ctx, void 
> *p)
>   }
>   #endif
>   
> -#ifndef pmd_is_leaf
> -#define pmd_is_leaf pmd_is_leaf
> -static inline bool pmd_is_leaf(pmd_t pmd)
> +#define p4d_leaf p4d_leaf
> +static inline bool p4d_leaf(p4d_t p4d)
>   {
>   return false;
>   }

Is that needed? There is a fallback definition of p4d_leaf() in 
include/linux/pgtable.h.

> -#endif
> -
> -#ifndef pud_is_leaf
> -#define pud_is_leaf pud_is_leaf
> -static inline bool pud_is_leaf(pud_t pud)
> -{
> - return false;
> -}
> -#endif
> -
> -#ifndef p4d_is_leaf
> -#define p4d_is_leaf p4d_is_leaf
> -static inline bool p4d_is_leaf(p4d_t p4d)
> -{
> - return false;
> -}
> -#endif
>   
>   #define pmd_pgtable pmd_pgtable
>   static inline pgtable_t pmd_pgtable(pmd_t pmd)


[PATCH v9 6/7] powerpc: mm: Use __set_pte_at() for early-boot / internal usages

2023-11-29 Thread Rohan McLure
In the new set_ptes() API, set_pte_at() (a special case of set_ptes())
is intended to be instrumented by the page table check facility. There
are however several other routines that constitute the API for setting
page table entries, including set_pmd_at() among others. Such routines
are themselves implemented in terms of set_pte_at().

A future patch providing support for page table checking on powerpc
must take care to avoid duplicate calls to
page_table_check_p{te,md,ud}_set().

Cause API-facing routines that call set_pte_at() to instead call
__set_pte_at(), which will remain uninstrumented by page table check.
set_ptes() is itself implemented by calls to __set_pte_at(), so this
eliminates redundant code.

Also prefer __set_pte_at() in early-boot usages which should not be
instrumented.

Signed-off-by: Rohan McLure 
---
v9: New patch
---
 arch/powerpc/mm/book3s64/hash_pgtable.c  |  2 +-
 arch/powerpc/mm/book3s64/pgtable.c   |  4 ++--
 arch/powerpc/mm/book3s64/radix_pgtable.c | 10 +-
 arch/powerpc/mm/nohash/book3e_pgtable.c  |  2 +-
 arch/powerpc/mm/pgtable_32.c |  2 +-
 5 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c 
b/arch/powerpc/mm/book3s64/hash_pgtable.c
index 988948d69bc1..ae52c8db45b7 100644
--- a/arch/powerpc/mm/book3s64/hash_pgtable.c
+++ b/arch/powerpc/mm/book3s64/hash_pgtable.c
@@ -165,7 +165,7 @@ int hash__map_kernel_page(unsigned long ea, unsigned long 
pa, pgprot_t prot)
ptep = pte_alloc_kernel(pmdp, ea);
if (!ptep)
return -ENOMEM;
-   set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, prot));
+   __set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, prot), 0);
} else {
/*
 * If the mm subsystem is not fully up, we cannot create a
diff --git a/arch/powerpc/mm/book3s64/pgtable.c 
b/arch/powerpc/mm/book3s64/pgtable.c
index be229290a6a7..9a0a2accb261 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -116,7 +116,7 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,
WARN_ON(!(pmd_large(pmd)));
 #endif
trace_hugepage_set_pmd(addr, pmd_val(pmd));
-   return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
+   return __set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd), 0);
 }
 
 void set_pud_at(struct mm_struct *mm, unsigned long addr,
@@ -539,7 +539,7 @@ void ptep_modify_prot_commit(struct vm_area_struct *vma, 
unsigned long addr,
if (radix_enabled())
return radix__ptep_modify_prot_commit(vma, addr,
  ptep, old_pte, pte);
-   set_pte_at(vma->vm_mm, addr, ptep, pte);
+   __set_pte_at(vma->vm_mm, addr, ptep, pte, 0);
 }
 
 /*
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c 
b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 1f8db10693e3..ae4a5f66ccd2 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -109,7 +109,7 @@ static int early_map_kernel_page(unsigned long ea, unsigned 
long pa,
ptep = pte_offset_kernel(pmdp, ea);
 
 set_the_pte:
-   set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
+   __set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags), 0);
asm volatile("ptesync": : :"memory");
return 0;
 }
@@ -169,7 +169,7 @@ static int __map_kernel_page(unsigned long ea, unsigned 
long pa,
return -ENOMEM;
 
 set_the_pte:
-   set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
+   __set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags), 0);
asm volatile("ptesync": : :"memory");
return 0;
 }
@@ -1536,7 +1536,7 @@ void radix__ptep_modify_prot_commit(struct vm_area_struct 
*vma,
(atomic_read(&mm->context.copros) > 0))
radix__flush_tlb_page(vma, addr);
 
-   set_pte_at(mm, addr, ptep, pte);
+   __set_pte_at(mm, addr, ptep, pte, 0);
 }
 
 int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
@@ -1547,7 +1547,7 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t 
prot)
if (!radix_enabled())
return 0;
 
-   set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pud);
+   __set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pud, 0);
 
return 1;
 }
@@ -1594,7 +1594,7 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t 
prot)
if (!radix_enabled())
return 0;
 
-   set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pmd);
+   __set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pmd, 0);
 
return 1;
 }
diff --git a/arch/powerpc/mm/nohash/book3e_pgtable.c 
b/arch/powerpc/mm/nohash/book3e_pgtable.c
index 1c5e4ecbebeb..4de91b54fd89 100644
--- a/arch/powerpc/mm/nohash/book3e_pgtable.c
+++ b/arch/powerpc/mm/nohash/book3e_pgtable.c
@@ -111,7 +111,7 @@ int __ref map_kernel_page(unsigned long ea, phys_addr_t pa, 
pgprot_t prot)
  

[PATCH v9 2/7] powerpc: mm: Implement p{m,u,4}d_leaf on all platforms

2023-11-29 Thread Rohan McLure
The check that a higher-level entry in multi-level pages contains a page
translation entry (pte) is performed by p{m,u,4}d_leaf stubs, which may
be specialised for each choice of mmu. In a prior commit, we replaced
uses of the catch-all stubs p{m,u,4}d_is_leaf with p{m,u,4}d_leaf.

Replace the catch-all stub definitions for p{m,u,4}d_is_leaf with
definitions for p{m,u,4}d_leaf. A future patch will assume that
p{m,u,4}d_leaf is defined on all platforms.

In particular, implement pud_leaf for Book3E-64, pmd_leaf for all Book3E
and Book3S-64 platforms, with a catch-all definition for p4d_leaf.

Reviewed-by: Christophe Leroy 
Signed-off-by: Rohan McLure 
---
v9: No longer required in order to implement page table check, just
refactoring.
---
 arch/powerpc/include/asm/book3s/32/pgtable.h |  5 +
 arch/powerpc/include/asm/book3s/64/pgtable.h | 10 -
 arch/powerpc/include/asm/nohash/64/pgtable.h |  6 ++
 arch/powerpc/include/asm/pgtable.h   | 22 ++--
 4 files changed, 17 insertions(+), 26 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h 
b/arch/powerpc/include/asm/book3s/32/pgtable.h
index 52971ee30717..9cc95a61d2a6 100644
--- a/arch/powerpc/include/asm/book3s/32/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/32/pgtable.h
@@ -223,6 +223,11 @@ static inline void pmd_clear(pmd_t *pmdp)
*pmdp = __pmd(0);
 }
 
+#define pmd_leaf pmd_leaf
+static inline bool pmd_leaf(pmd_t pmd)
+{
+   return false;
+}
 
 /*
  * When flushing the tlb entry for a page, we also need to flush the hash
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index cb77eddca54b..8fdb7667c509 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -1459,16 +1459,14 @@ static inline bool is_pte_rw_upgrade(unsigned long 
old_val, unsigned long new_va
 /*
  * Like pmd_huge() and pmd_large(), but works regardless of config options
  */
-#define pmd_is_leaf pmd_is_leaf
-#define pmd_leaf pmd_is_leaf
-static inline bool pmd_is_leaf(pmd_t pmd)
+#define pmd_leaf pmd_leaf
+static inline bool pmd_leaf(pmd_t pmd)
 {
return !!(pmd_raw(pmd) & cpu_to_be64(_PAGE_PTE));
 }
 
-#define pud_is_leaf pud_is_leaf
-#define pud_leaf pud_is_leaf
-static inline bool pud_is_leaf(pud_t pud)
+#define pud_leaf pud_leaf
+static inline bool pud_leaf(pud_t pud)
 {
return !!(pud_raw(pud) & cpu_to_be64(_PAGE_PTE));
 }
diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h 
b/arch/powerpc/include/asm/nohash/64/pgtable.h
index 2202c78730e8..f58cbebde26e 100644
--- a/arch/powerpc/include/asm/nohash/64/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/64/pgtable.h
@@ -116,6 +116,12 @@ static inline void pud_clear(pud_t *pudp)
*pudp = __pud(0);
 }
 
+#define pud_leaf pud_leaf
+static inline bool pud_leaf(pud_t pud)
+{
+   return false;
+}
+
 #define pud_none(pud)  (!pud_val(pud))
 #define pud_bad(pud)	(!is_kernel_addr(pud_val(pud)) \
 			 || (pud_val(pud) & PUD_BAD_BITS))
diff --git a/arch/powerpc/include/asm/pgtable.h 
b/arch/powerpc/include/asm/pgtable.h
index 9224f23065ff..db0231afca9d 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -180,29 +180,11 @@ static inline void pte_frag_set(mm_context_t *ctx, void 
*p)
 }
 #endif
 
-#ifndef pmd_is_leaf
-#define pmd_is_leaf pmd_is_leaf
-static inline bool pmd_is_leaf(pmd_t pmd)
+#define p4d_leaf p4d_leaf
+static inline bool p4d_leaf(p4d_t p4d)
 {
return false;
 }
-#endif
-
-#ifndef pud_is_leaf
-#define pud_is_leaf pud_is_leaf
-static inline bool pud_is_leaf(pud_t pud)
-{
-   return false;
-}
-#endif
-
-#ifndef p4d_is_leaf
-#define p4d_is_leaf p4d_is_leaf
-static inline bool p4d_is_leaf(p4d_t p4d)
-{
-   return false;
-}
-#endif
 
 #define pmd_pgtable pmd_pgtable
 static inline pgtable_t pmd_pgtable(pmd_t pmd)
-- 
2.43.0



[PATCH v9 3/7] powerpc: mm: Add common pud_pfn stub for all platforms

2023-11-29 Thread Rohan McLure
Prior to this commit, pud_pfn was implemented with BUILD_BUG as the inline
function for 64-bit Book3S systems but is never included, as its
invocations in generic code are guarded by calls to pud_devmap which return
zero on such systems. A future patch will provide support for page table
checks, the generic code for which depends on a pud_pfn stub being
implemented, even while the patch will not interact with puds directly.

Remove the 64-bit Book3S stub and define pud_pfn to warn on all
platforms. pud_pfn may be defined properly on a per-platform basis
should it grow real usages in future.

Signed-off-by: Rohan McLure 
---
 arch/powerpc/include/asm/pgtable.h | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/arch/powerpc/include/asm/pgtable.h 
b/arch/powerpc/include/asm/pgtable.h
index db0231afca9d..9c0f2151f08f 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -219,6 +219,20 @@ static inline bool arch_supports_memmap_on_memory(unsigned 
long vmemmap_size)
 
 #endif /* CONFIG_PPC64 */
 
+/*
+ * Currently only consumed by page_table_check_pud_{set,clear}. Since clears
+ * and sets to page table entries at any level are done through
+ * page_table_check_pte_{set,clear}, provide stub implementation.
+ */
+#ifndef pud_pfn
+#define pud_pfn pud_pfn
+static inline int pud_pfn(pud_t pud)
+{
+   WARN_ONCE(1, "pud: platform does not use pud entries directly");
+   return 0;
+}
+#endif
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_POWERPC_PGTABLE_H */
-- 
2.43.0



[PATCH v9 1/7] powerpc: mm: Replace p{u,m,4}d_is_leaf with p{u,m,4}d_leaf

2023-11-29 Thread Rohan McLure
Replace occurrences of p{u,m,4}d_is_leaf with p{u,m,4}d_leaf, as the
latter is the name given to checking that a higher-level entry in
multi-level paging contains a page translation entry (pte) throughout
all other archs.

A future patch will implement p{u,m,4}d_leaf stubs on all platforms so
that they may be referenced in generic code.

Reviewed-by: Christophe Leroy 
Signed-off-by: Rohan McLure 
---
v9: No longer required in order to implement page table check, just a
refactor.
---
 arch/powerpc/kvm/book3s_64_mmu_radix.c   | 12 ++--
 arch/powerpc/mm/book3s64/radix_pgtable.c | 14 +++---
 arch/powerpc/mm/pgtable.c|  6 +++---
 arch/powerpc/mm/pgtable_64.c |  6 +++---
 arch/powerpc/xmon/xmon.c |  6 +++---
 5 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c 
b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index 175a8eb2681f..fdb178fe754c 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -498,7 +498,7 @@ static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t 
*pmd, bool full,
for (im = 0; im < PTRS_PER_PMD; ++im, ++p) {
if (!pmd_present(*p))
continue;
-   if (pmd_is_leaf(*p)) {
+   if (pmd_leaf(*p)) {
if (full) {
pmd_clear(p);
} else {
@@ -527,7 +527,7 @@ static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t 
*pud,
for (iu = 0; iu < PTRS_PER_PUD; ++iu, ++p) {
if (!pud_present(*p))
continue;
-   if (pud_is_leaf(*p)) {
+   if (pud_leaf(*p)) {
pud_clear(p);
} else {
pmd_t *pmd;
@@ -630,12 +630,12 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, 
pte_t pte,
new_pud = pud_alloc_one(kvm->mm, gpa);
 
pmd = NULL;
-   if (pud && pud_present(*pud) && !pud_is_leaf(*pud))
+   if (pud && pud_present(*pud) && !pud_leaf(*pud))
pmd = pmd_offset(pud, gpa);
else if (level <= 1)
new_pmd = kvmppc_pmd_alloc();
 
-   if (level == 0 && !(pmd && pmd_present(*pmd) && !pmd_is_leaf(*pmd)))
+   if (level == 0 && !(pmd && pmd_present(*pmd) && !pmd_leaf(*pmd)))
new_ptep = kvmppc_pte_alloc();
 
/* Check if we might have been invalidated; let the guest retry if so */
@@ -653,7 +653,7 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, 
pte_t pte,
new_pud = NULL;
}
pud = pud_offset(p4d, gpa);
-   if (pud_is_leaf(*pud)) {
+   if (pud_leaf(*pud)) {
unsigned long hgpa = gpa & PUD_MASK;
 
/* Check if we raced and someone else has set the same thing */
@@ -704,7 +704,7 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, 
pte_t pte,
new_pmd = NULL;
}
pmd = pmd_offset(pud, gpa);
-   if (pmd_is_leaf(*pmd)) {
+   if (pmd_leaf(*pmd)) {
unsigned long lgpa = gpa & PMD_MASK;
 
/* Check if we raced and someone else has set the same thing */
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c 
b/arch/powerpc/mm/book3s64/radix_pgtable.c
index c6a4ac766b2b..1f8db10693e3 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -204,14 +204,14 @@ static void radix__change_memory_range(unsigned long 
start, unsigned long end,
	pudp = pud_alloc(&init_mm, p4dp, idx);
if (!pudp)
continue;
-   if (pud_is_leaf(*pudp)) {
+   if (pud_leaf(*pudp)) {
ptep = (pte_t *)pudp;
goto update_the_pte;
}
	pmdp = pmd_alloc(&init_mm, pudp, idx);
if (!pmdp)
continue;
-   if (pmd_is_leaf(*pmdp)) {
+   if (pmd_leaf(*pmdp)) {
ptep = pmdp_ptep(pmdp);
goto update_the_pte;
}
@@ -767,7 +767,7 @@ static void __meminit remove_pmd_table(pmd_t *pmd_start, 
unsigned long addr,
if (!pmd_present(*pmd))
continue;
 
-   if (pmd_is_leaf(*pmd)) {
+   if (pmd_leaf(*pmd)) {
if (IS_ALIGNED(addr, PMD_SIZE) &&
IS_ALIGNED(next, PMD_SIZE)) {
if (!direct)
@@ -807,7 +807,7 @@ static void __meminit remove_pud_table(pud_t *pud_start, 
unsigned long addr,
if (!pud_present(*pud))
continue;
 
-   if (pud_is_leaf(*pud)) {
+   if (pud_leaf(*pud)) {
if (!IS_ALIGNED(addr, PUD_SIZE) ||
!IS_ALIGNED(next, PUD_SIZE)) {
 

[PATCH v9 7/7] powerpc: mm: Support page table check

2023-11-29 Thread Rohan McLure
On creation and clearing of a page table mapping, instrument such calls
by invoking page_table_check_pte_set and page_table_check_pte_clear
respectively. These calls serve as a sanity check against illegal
mappings.

Enable ARCH_SUPPORTS_PAGE_TABLE_CHECK for all platforms.

See also:

riscv support in commit 3fee229a8eb9 ("riscv/mm: enable
ARCH_SUPPORTS_PAGE_TABLE_CHECK")
arm64 in commit 42b2547137f5 ("arm64/mm: enable
ARCH_SUPPORTS_PAGE_TABLE_CHECK")
x86_64 in commit d283d422c6c4 ("x86: mm: add x86_64 support for page table
check")

Reviewed-by: Christophe Leroy 
Signed-off-by: Rohan McLure 
---
v9: Updated for new API. Instrument pmdp_collapse_flush's two
constituent calls to avoid header hell
---
 arch/powerpc/Kconfig |  1 +
 arch/powerpc/include/asm/book3s/32/pgtable.h |  7 +++-
 arch/powerpc/include/asm/book3s/64/pgtable.h | 39 
 arch/powerpc/mm/book3s64/hash_pgtable.c  |  4 ++
 arch/powerpc/mm/book3s64/pgtable.c   | 13 +--
 arch/powerpc/mm/book3s64/radix_pgtable.c |  3 ++
 arch/powerpc/mm/pgtable.c|  4 ++
 7 files changed, 58 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 6f105ee4f3cf..5bd6d367ef40 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -166,6 +166,7 @@ config PPC
select ARCH_STACKWALK
select ARCH_SUPPORTS_ATOMIC_RMW
	select ARCH_SUPPORTS_DEBUG_PAGEALLOC	if PPC_BOOK3S || PPC_8xx || 40x
+   select ARCH_SUPPORTS_PAGE_TABLE_CHECK
select ARCH_USE_BUILTIN_BSWAP
select ARCH_USE_CMPXCHG_LOCKREF if PPC64
select ARCH_USE_MEMTEST
diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h 
b/arch/powerpc/include/asm/book3s/32/pgtable.h
index bd6f8cdd25aa..48f4e7b98340 100644
--- a/arch/powerpc/include/asm/book3s/32/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/32/pgtable.h
@@ -201,6 +201,7 @@ void unmap_kernel_page(unsigned long va);
 #ifndef __ASSEMBLY__
 #include <linux/sched.h>
 #include <linux/threads.h>
+#include <linux/page_table_check.h>
 
 /* Bits to mask out from a PGD to get to the PUD page */
 #define PGD_MASKED_BITS	0
@@ -319,7 +320,11 @@ static inline int __ptep_test_and_clear_young(struct 
mm_struct *mm,
 static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long 
addr,
   pte_t *ptep)
 {
-   return __pte(pte_update(mm, addr, ptep, ~_PAGE_HASHPTE, 0, 0));
+   pte_t old_pte = __pte(pte_update(mm, addr, ptep, ~_PAGE_HASHPTE, 0, 0));
+
+   page_table_check_pte_clear(mm, old_pte);
+
+   return old_pte;
 }
 
 #define __HAVE_ARCH_PTEP_SET_WRPROTECT
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index dd3e7b190ab7..834c997ba657 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -151,6 +151,8 @@
 #define PAGE_KERNEL_ROX	__pgprot(_PAGE_BASE | _PAGE_KERNEL_ROX)
 
 #ifndef __ASSEMBLY__
+#include <linux/page_table_check.h>
+
 /*
  * page table defines
  */
@@ -421,8 +423,11 @@ static inline void huge_ptep_set_wrprotect(struct 
mm_struct *mm,
 static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
   unsigned long addr, pte_t *ptep)
 {
-   unsigned long old = pte_update(mm, addr, ptep, ~0UL, 0, 0);
-   return __pte(old);
+   pte_t old_pte = __pte(pte_update(mm, addr, ptep, ~0UL, 0, 0));
+
+   page_table_check_pte_clear(mm, old_pte);
+
+   return old_pte;
 }
 
 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
@@ -431,11 +436,16 @@ static inline pte_t ptep_get_and_clear_full(struct 
mm_struct *mm,
pte_t *ptep, int full)
 {
if (full && radix_enabled()) {
+   pte_t old_pte;
+
/*
 * We know that this is a full mm pte clear and
 * hence can be sure there is no parallel set_pte.
 */
-   return radix__ptep_get_and_clear_full(mm, addr, ptep, full);
+   old_pte = radix__ptep_get_and_clear_full(mm, addr, ptep, full);
+   page_table_check_pte_clear(mm, old_pte);
+
+   return old_pte;
}
return ptep_get_and_clear(mm, addr, ptep);
 }
@@ -1339,17 +1349,30 @@ extern int pudp_test_and_clear_young(struct 
vm_area_struct *vma,
 static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
unsigned long addr, pmd_t *pmdp)
 {
-   if (radix_enabled())
-   return radix__pmdp_huge_get_and_clear(mm, addr, pmdp);
-   return hash__pmdp_huge_get_and_clear(mm, addr, pmdp);
+   pmd_t old_pmd;
+
+   if (radix_enabled()) {
+   old_pmd = radix__pmdp_huge_get_and_clear(mm, addr, pmdp);
+   } else {
+   old_pmd = hash__pmdp_huge_get_and_clear(mm, addr, pmdp);
+   }
+
+   page_table_check_pmd_clear(mm, old_pmd);
+
+   return old_pmd;
 }
 
 #define 

[PATCH v9 5/7] powerpc: mm: Implement *_user_accessible_page() for ptes

2023-11-29 Thread Rohan McLure
Page table checking depends on architectures providing an
implementation of p{te,md,ud}_user_accessible_page. With
refactorisations made on powerpc/mm, the pte_access_permitted() and
similar methods verify whether a userland page is accessible with the
required permissions.

Since page table checking is the only user of
p{te,md,ud}_user_accessible_page(), implement these for all platforms,
using some of the same preliminary checks taken by pte_access_permitted()
on that platform.

Since Commit 8e9bd41e4ce1 ("powerpc/nohash: Replace pte_user() by pte_read()")
pte_user() is no longer required to be present on all platforms as it
may be equivalent to or implied by pte_read(). Hence implementations are
specialised.

Signed-off-by: Rohan McLure 
---
v9: New implementation
---
 arch/powerpc/include/asm/book3s/32/pgtable.h |  5 +
 arch/powerpc/include/asm/book3s/64/pgtable.h |  5 +
 arch/powerpc/include/asm/nohash/pgtable.h|  5 +
 arch/powerpc/include/asm/pgtable.h   | 15 +++
 4 files changed, 30 insertions(+)

diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h 
b/arch/powerpc/include/asm/book3s/32/pgtable.h
index 9cc95a61d2a6..bd6f8cdd25aa 100644
--- a/arch/powerpc/include/asm/book3s/32/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/32/pgtable.h
@@ -441,6 +441,11 @@ static inline bool pte_access_permitted(pte_t pte, bool 
write)
return true;
 }
 
+static inline bool pte_user_accessible_page(pte_t pte)
+{
+   return pte_present(pte) && pte_read(pte);
+}
+
 /* Conversion functions: convert a page and protection to a page entry,
  * and a page entry and page directory to the page they refer to.
  *
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 2454174b26cb..dd3e7b190ab7 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -544,6 +544,11 @@ static inline bool pte_access_permitted(pte_t pte, bool 
write)
return arch_pte_access_permitted(pte_val(pte), write, 0);
 }
 
+static inline bool pte_user_accessible_page(pte_t pte)
+{
+   return pte_present(pte) && pte_user(pte) && pte_read(pte);
+}
+
 /*
  * Conversion functions: convert a page and protection to a page entry,
  * and a page entry and page directory to the page they refer to.
diff --git a/arch/powerpc/include/asm/nohash/pgtable.h 
b/arch/powerpc/include/asm/nohash/pgtable.h
index 427db14292c9..33b4a4267f66 100644
--- a/arch/powerpc/include/asm/nohash/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/pgtable.h
@@ -213,6 +213,11 @@ static inline bool pte_access_permitted(pte_t pte, bool 
write)
return true;
 }
 
+static inline bool pte_user_accessible_page(pte_t pte)
+{
+   return pte_present(pte) && pte_read(pte);
+}
+
 /* Conversion functions: convert a page and protection to a page entry,
  * and a page entry and page directory to the page they refer to.
  *
diff --git a/arch/powerpc/include/asm/pgtable.h 
b/arch/powerpc/include/asm/pgtable.h
index d7d0f47760d3..661bf3afca37 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -250,6 +250,21 @@ static inline pte_t pud_pte(pud_t pud)
return __pte(pud_val(pud));
 }
 #endif
+
+static inline bool pmd_user_accessible_page(pmd_t pmd)
+{
+   pte_t pte = pmd_pte(pmd);
+
+   return pte_user_accessible_page(pte);
+}
+
+static inline bool pud_user_accessible_page(pud_t pud)
+{
+   pte_t pte = pud_pte(pud);
+
+   return pte_user_accessible_page(pte);
+}
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_POWERPC_PGTABLE_H */
-- 
2.43.0



[PATCH v9 4/7] powerpc: mm: Default p{m,u}d_pte implementations

2023-11-29 Thread Rohan McLure
For 32-bit systems which have no usecases for p{m,u}d_pte() prior to
page table checking, implement default stubs.

Signed-off-by: Rohan McLure 
---
v9: New patch
---
 arch/powerpc/include/asm/book3s/64/pgtable.h |  3 +++
 arch/powerpc/include/asm/nohash/64/pgtable.h |  2 ++
 arch/powerpc/include/asm/pgtable.h   | 17 +
 3 files changed, 22 insertions(+)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 8fdb7667c509..2454174b26cb 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -887,6 +887,8 @@ static inline int pud_present(pud_t pud)
 
 extern struct page *pud_page(pud_t pud);
 extern struct page *pmd_page(pmd_t pmd);
+
+#define pud_pte pud_pte
 static inline pte_t pud_pte(pud_t pud)
 {
return __pte_raw(pud_raw(pud));
@@ -1043,6 +1045,7 @@ static inline void __kernel_map_pages(struct page *page, 
int numpages, int enabl
 }
 #endif
 
+#define pmd_pte pmd_pte
 static inline pte_t pmd_pte(pmd_t pmd)
 {
return __pte_raw(pmd_raw(pmd));
diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h 
b/arch/powerpc/include/asm/nohash/64/pgtable.h
index f58cbebde26e..09a34fe196ae 100644
--- a/arch/powerpc/include/asm/nohash/64/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/64/pgtable.h
@@ -93,6 +93,7 @@ static inline void pmd_clear(pmd_t *pmdp)
*pmdp = __pmd(0);
 }
 
+#define pmd_pte pmd_pte
 static inline pte_t pmd_pte(pmd_t pmd)
 {
return __pte(pmd_val(pmd));
@@ -134,6 +135,7 @@ static inline pmd_t *pud_pgtable(pud_t pud)
 
 extern struct page *pud_page(pud_t pud);
 
+#define pud_pte pud_pte
 static inline pte_t pud_pte(pud_t pud)
 {
return __pte(pud_val(pud));
diff --git a/arch/powerpc/include/asm/pgtable.h 
b/arch/powerpc/include/asm/pgtable.h
index 9c0f2151f08f..d7d0f47760d3 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -233,6 +233,23 @@ static inline int pud_pfn(pud_t pud)
 }
 #endif
 
+#ifndef pmd_pte
+#define pmd_pte pmd_pte
+static inline pte_t pmd_pte(pmd_t pmd)
+{
+   WARN_ONCE(1, "pmd: platform does not use pmd entries directly");
+   return __pte(pmd_val(pmd));
+}
+#endif
+
+#ifndef pud_pte
+#define pud_pte pud_pte
+static inline pte_t pud_pte(pud_t pud)
+{
+   WARN_ONCE(1, "pud: platform does not use pud entries directly");
+   return __pte(pud_val(pud));
+}
+#endif
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_POWERPC_PGTABLE_H */
-- 
2.43.0



[PATCH v9 0/7] Support page table check

2023-11-29 Thread Rohan McLure
Support the page table check sanitiser on all PowerPC platforms. This
sanitiser works by serialising assignments, reassignments and clears of
page table entries at each level in order to ensure that anonymous
mappings have at most one writable consumer, and likewise that
file-backed mappings are not simultaneously also anonymous mappings.

In order to support this infrastructure, a number of stubs must be
defined for all powerpc platforms. Additionally, separate set_pte_at
and set_pte, to allow for internal, uninstrumented mappings.
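
Roughly, the split looks as follows (a simplified sketch, not the exact 
powerpc signatures):

/* Uninstrumented: early boot and kernel-internal mappings. */
void __set_pte_at(struct mm_struct *mm, unsigned long addr,
		  pte_t *ptep, pte_t pte, int percpu);

/* Instrumented: the API-facing entry point. */
static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
			      pte_t *ptep, pte_t pte)
{
	page_table_check_ptes_set(mm, ptep, pte, 1);	/* sanity-check */
	__set_pte_at(mm, addr, ptep, pte, 0);
}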

v9:
 * Adapt to using the set_ptes() API, using __set_pte_at() where we
   must avoid instrumentation.
 * Use the logic of *_access_permitted() for implementing
   *_user_accessible_page(), which are required routines for page table
   check.
 * Even though we no longer need p{m,u,4}d_leaf(), still default
   implement these to assist in refactoring out extant
   p{m,u,4}d_is_leaf().
 * Add p{m,u}d_pte() stubs where asm-generic does not provide them, as
   page table check wants all *user_accessible_page() variants, and we
   would like to default implement the variants in terms of
   pte_user_accessible_page().
 * Avoid the ugly pmdp_collapse_flush() macro nonsense! Just instrument
   its constituent calls instead for radix and hash.

v8:
 * Fix linux/page_table_check.h include in asm/pgtable.h breaking
   32-bit.
Link: 
https://lore.kernel.org/linuxppc-dev/20230215231153.2147454-1-rmcl...@linux.ibm.com/

v7:
 * Remove use of extern in set_pte prototypes
 * Clean up pmdp_collapse_flush macro
 * Replace set_pte_at with static inline function
 * Fix commit message for patch 7
Link: 
https://lore.kernel.org/linuxppc-dev/20230215020155.1969194-1-rmcl...@linux.ibm.com/

v6:
 * Support huge pages and p{m,u}d accounting.
 * Remove instrumentation from set_pte from kernel internal pages.
 * 64s: Implement pmdp_collapse_flush in terms of __pmdp_collapse_flush
   as access to the mm_struct * is required.
Link: 
https://lore.kernel.org/linuxppc-dev/20230214015939.1853438-1-rmcl...@linux.ibm.com/

v5:
Link: 
https://lore.kernel.org/linuxppc-dev/20221118002146.25979-1-rmcl...@linux.ibm.com/

Rohan McLure (7):
  powerpc: mm: Replace p{u,m,4}d_is_leaf with p{u,m,4}d_leaf
  powerpc: mm: Implement p{m,u,4}d_leaf on all platforms
  powerpc: mm: Add common pud_pfn stub for all platforms
  powerpc: mm: Default p{m,u}d_pte implementations
  powerpc: mm: Implement *_user_accessible_page() for ptes
  powerpc: mm: Use __set_pte_at() for early-boot / internal usages
  powerpc: mm: Support page table check

 arch/powerpc/Kconfig |  1 +
 arch/powerpc/include/asm/book3s/32/pgtable.h | 17 -
 arch/powerpc/include/asm/book3s/64/pgtable.h | 57 
 arch/powerpc/include/asm/nohash/64/pgtable.h |  8 +++
 arch/powerpc/include/asm/nohash/pgtable.h|  5 ++
 arch/powerpc/include/asm/pgtable.h   | 68 ++--
 arch/powerpc/kvm/book3s_64_mmu_radix.c   | 12 ++--
 arch/powerpc/mm/book3s64/hash_pgtable.c  |  6 +-
 arch/powerpc/mm/book3s64/pgtable.c   | 17 +++--
 arch/powerpc/mm/book3s64/radix_pgtable.c | 27 
 arch/powerpc/mm/nohash/book3e_pgtable.c  |  2 +-
 arch/powerpc/mm/pgtable.c| 10 ++-
 arch/powerpc/mm/pgtable_32.c |  2 +-
 arch/powerpc/mm/pgtable_64.c |  6 +-
 arch/powerpc/xmon/xmon.c |  6 +-
 15 files changed, 173 insertions(+), 71 deletions(-)

-- 
2.43.0



Re: [PATCH v3 2/7] kexec_file: print out debugging message if required

2023-11-29 Thread Joe Perches
On Thu, 2023-11-30 at 10:39 +0800, Baoquan He wrote:
> Replace pr_debug() with the newly added kexec_dprintk() in kexec_file
> loading related codes.

trivia:

> diff --git a/kernel/crash_core.c b/kernel/crash_core.c
[]
> @@ -551,9 +551,12 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, 
> int need_kernel_map,
>   phdr->p_filesz = phdr->p_memsz = mend - mstart + 1;
>   phdr->p_align = 0;
>   ehdr->e_phnum++;
> - pr_debug("Crash PT_LOAD ELF header. phdr=%p vaddr=0x%llx, 
> paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n",
> - phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz,
> - ehdr->e_phnum, phdr->p_offset);
> +#ifdef CONFIG_KEXEC_FILE
> + kexec_dprintk("Crash PT_LOAD ELF header. phdr=%p vaddr=0x%llx, 
> paddr=0x%llx, "
> +   "sz=0x%llx e_phnum=%d p_offset=0x%llx\n",
> +   phdr, phdr->p_vaddr, phdr->p_paddr, 
> phdr->p_filesz,
> +   ehdr->e_phnum, phdr->p_offset);
> +#endif

Perhaps run checkpatch and coalesce the format string.
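
For instance (a sketch of the coalesced form, reusing the single string
literal the original pr_debug() already had; untested):

	kexec_dprintk("Crash PT_LOAD ELF header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n",
		      phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz,
		      ehdr->e_phnum, phdr->p_offset);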



[PATCH v3 7/7] kexec_file, parisc: print out debugging message if required

2023-11-29 Thread Baoquan He
Replace pr_debug() with the newly added kexec_dprintk() in kexec_file
loading related codes.

Signed-off-by: Baoquan He 
---
 arch/parisc/kernel/kexec_file.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/parisc/kernel/kexec_file.c b/arch/parisc/kernel/kexec_file.c
index 8c534204f0fd..3fc82130b6c3 100644
--- a/arch/parisc/kernel/kexec_file.c
+++ b/arch/parisc/kernel/kexec_file.c
@@ -38,8 +38,8 @@ static void *elf_load(struct kimage *image, char *kernel_buf,
for (i = 0; i < image->nr_segments; i++)
image->segment[i].mem = __pa(image->segment[i].mem);
 
-   pr_debug("Loaded the kernel at 0x%lx, entry at 0x%lx\n",
-kernel_load_addr, image->start);
+   kexec_dprintk("Loaded the kernel at 0x%lx, entry at 0x%lx\n",
+ kernel_load_addr, image->start);
 
if (initrd != NULL) {
kbuf.buffer = initrd;
@@ -51,7 +51,7 @@ static void *elf_load(struct kimage *image, char *kernel_buf,
if (ret)
goto out;
 
-   pr_debug("Loaded initrd at 0x%lx\n", kbuf.mem);
+   kexec_dprintk("Loaded initrd at 0x%lx\n", kbuf.mem);
image->arch.initrd_start = kbuf.mem;
image->arch.initrd_end = kbuf.mem + initrd_len;
}
@@ -68,7 +68,7 @@ static void *elf_load(struct kimage *image, char *kernel_buf,
if (ret)
goto out;
 
-   pr_debug("Loaded cmdline at 0x%lx\n", kbuf.mem);
+   kexec_dprintk("Loaded cmdline at 0x%lx\n", kbuf.mem);
image->arch.cmdline = kbuf.mem;
}
 out:
-- 
2.41.0



[PATCH v3 6/7] kexec_file, power: print out debugging message if required

2023-11-29 Thread Baoquan He
Replace pr_debug() with the newly added kexec_dprintk() in kexec_file
loading related codes.

Signed-off-by: Baoquan He 
---
 arch/powerpc/kexec/elf_64.c   |  8 
 arch/powerpc/kexec/file_load_64.c | 18 +-
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/kexec/elf_64.c b/arch/powerpc/kexec/elf_64.c
index eeb258002d1e..904016cf89ea 100644
--- a/arch/powerpc/kexec/elf_64.c
+++ b/arch/powerpc/kexec/elf_64.c
@@ -59,7 +59,7 @@ static void *elf64_load(struct kimage *image, char 
*kernel_buf,
if (ret)
goto out;
 
-   pr_debug("Loaded the kernel at 0x%lx\n", kernel_load_addr);
+   kexec_dprintk("Loaded the kernel at 0x%lx\n", kernel_load_addr);
 
ret = kexec_load_purgatory(image, );
if (ret) {
@@ -67,7 +67,7 @@ static void *elf64_load(struct kimage *image, char 
*kernel_buf,
goto out;
}
 
-   pr_debug("Loaded purgatory at 0x%lx\n", pbuf.mem);
+   kexec_dprintk("Loaded purgatory at 0x%lx\n", pbuf.mem);
 
/* Load additional segments needed for panic kernel */
if (image->type == KEXEC_TYPE_CRASH) {
@@ -99,7 +99,7 @@ static void *elf64_load(struct kimage *image, char 
*kernel_buf,
goto out;
initrd_load_addr = kbuf.mem;
 
-   pr_debug("Loaded initrd at 0x%lx\n", initrd_load_addr);
+   kexec_dprintk("Loaded initrd at 0x%lx\n", initrd_load_addr);
}
 
fdt = of_kexec_alloc_and_setup_fdt(image, initrd_load_addr,
@@ -132,7 +132,7 @@ static void *elf64_load(struct kimage *image, char 
*kernel_buf,
 
fdt_load_addr = kbuf.mem;
 
-   pr_debug("Loaded device tree at 0x%lx\n", fdt_load_addr);
+   kexec_dprintk("Loaded device tree at 0x%lx\n", fdt_load_addr);
 
slave_code = elf_info.buffer + elf_info.proghdrs[0].p_offset;
ret = setup_purgatory_ppc64(image, slave_code, fdt, kernel_load_addr,
diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c
index 961a6dd67365..5b4c5cb23354 100644
--- a/arch/powerpc/kexec/file_load_64.c
+++ b/arch/powerpc/kexec/file_load_64.c
@@ -577,7 +577,7 @@ static int add_usable_mem_property(void *fdt, struct 
device_node *dn,
   NODE_PATH_LEN, dn);
return -EOVERFLOW;
}
-   pr_debug("Memory node path: %s\n", path);
+   kexec_dprintk("Memory node path: %s\n", path);
 
/* Now that we know the path, find its offset in kdump kernel's fdt */
node = fdt_path_offset(fdt, path);
@@ -590,8 +590,8 @@ static int add_usable_mem_property(void *fdt, struct 
device_node *dn,
/* Get the address & size cells */
n_mem_addr_cells = of_n_addr_cells(dn);
n_mem_size_cells = of_n_size_cells(dn);
-   pr_debug("address cells: %d, size cells: %d\n", n_mem_addr_cells,
-n_mem_size_cells);
+   kexec_dprintk("address cells: %d, size cells: %d\n", n_mem_addr_cells,
+ n_mem_size_cells);
 
um_info->idx  = 0;
if (!check_realloc_usable_mem(um_info, 2)) {
@@ -664,7 +664,7 @@ static int update_usable_mem_fdt(void *fdt, struct 
crash_mem *usable_mem)
 
node = fdt_path_offset(fdt, "/ibm,dynamic-reconfiguration-memory");
if (node == -FDT_ERR_NOTFOUND)
-   pr_debug("No dynamic reconfiguration memory found\n");
+   kexec_dprintk("No dynamic reconfiguration memory found\n");
else if (node < 0) {
pr_err("Malformed device tree: error reading 
/ibm,dynamic-reconfiguration-memory.\n");
return -EINVAL;
@@ -776,8 +776,8 @@ static void update_backup_region_phdr(struct kimage *image, 
Elf64_Ehdr *ehdr)
for (i = 0; i < ehdr->e_phnum; i++) {
if (phdr->p_paddr == BACKUP_SRC_START) {
phdr->p_offset = image->arch.backup_start;
-   pr_debug("Backup region offset updated to 0x%lx\n",
-image->arch.backup_start);
+   kexec_dprintk("Backup region offset updated to 0x%lx\n",
+ image->arch.backup_start);
return;
}
}
@@ -850,7 +850,7 @@ int load_crashdump_segments_ppc64(struct kimage *image,
pr_err("Failed to load backup segment\n");
return ret;
}
-   pr_debug("Loaded the backup region at 0x%lx\n", kbuf->mem);
+   kexec_dprintk("Loaded the backup region at 0x%lx\n", kbuf->mem);
 
/* Load elfcorehdr segment - to export crashing kernel's vmcore */
ret = load_elfcorehdr_segment(image, kbuf);
@@ -858,8 +858,8 @@ int load_crashdump_segments_ppc64(struct kimage *image,
pr_err("Failed to load elfcorehdr segment\n");
return ret;
}
-   pr_debug("Loaded elf core header at 0x%lx, bufsz=0x%lx memsz=0x%lx\n",
-image->elf_load_addr, kbuf->bufsz, 

[PATCH v3 5/7] kexec_file, riscv: print out debugging message if required

2023-11-29 Thread Baoquan He
Replace pr_debug() with the newly added kexec_dprintk() in kexec_file
loading related codes.

And also remove kexec_image_info() because the content has been printed
out in generic code.

Signed-off-by: Baoquan He 
---
 arch/riscv/kernel/elf_kexec.c | 11 ++-
 arch/riscv/kernel/machine_kexec.c | 26 --
 2 files changed, 6 insertions(+), 31 deletions(-)

diff --git a/arch/riscv/kernel/elf_kexec.c b/arch/riscv/kernel/elf_kexec.c
index e60fbd8660c4..5bd1ec3341fe 100644
--- a/arch/riscv/kernel/elf_kexec.c
+++ b/arch/riscv/kernel/elf_kexec.c
@@ -216,7 +216,6 @@ static void *elf_kexec_load(struct kimage *image, char 
*kernel_buf,
if (ret)
goto out;
kernel_start = image->start;
-   pr_notice("The entry point of kernel at 0x%lx\n", image->start);
 
/* Add the kernel binary to the image */
ret = riscv_kexec_elf_load(image, , _info,
@@ -252,8 +251,8 @@ static void *elf_kexec_load(struct kimage *image, char 
*kernel_buf,
image->elf_load_addr = kbuf.mem;
image->elf_headers_sz = headers_sz;
 
-   pr_debug("Loaded elf core header at 0x%lx bufsz=0x%lx 
memsz=0x%lx\n",
-image->elf_load_addr, kbuf.bufsz, kbuf.memsz);
+   kexec_dprintk("Loaded elf core header at 0x%lx bufsz=0x%lx 
memsz=0x%lx\n",
+ image->elf_load_addr, kbuf.bufsz, kbuf.memsz);
 
/* Setup cmdline for kdump kernel case */
modified_cmdline = setup_kdump_cmdline(image, cmdline,
@@ -275,6 +274,8 @@ static void *elf_kexec_load(struct kimage *image, char 
*kernel_buf,
pr_err("Error loading purgatory ret=%d\n", ret);
goto out;
}
+   kexec_dprintk("Loaded purgatory at 0x%lx\n", kbuf.mem);
+
ret = kexec_purgatory_get_set_symbol(image, "riscv_kernel_entry",
 _start,
 sizeof(kernel_start), 0);
@@ -293,7 +294,7 @@ static void *elf_kexec_load(struct kimage *image, char 
*kernel_buf,
if (ret)
goto out;
initrd_pbase = kbuf.mem;
-   pr_notice("Loaded initrd at 0x%lx\n", initrd_pbase);
+   kexec_dprintk("Loaded initrd at 0x%lx\n", initrd_pbase);
}
 
/* Add the DTB to the image */
@@ -318,7 +319,7 @@ static void *elf_kexec_load(struct kimage *image, char 
*kernel_buf,
}
/* Cache the fdt buffer address for memory cleanup */
image->arch.fdt = fdt;
-   pr_notice("Loaded device tree at 0x%lx\n", kbuf.mem);
+   kexec_dprintk("Loaded device tree at 0x%lx\n", kbuf.mem);
goto out;
 
 out_free_fdt:
diff --git a/arch/riscv/kernel/machine_kexec.c b/arch/riscv/kernel/machine_kexec.c
index 2d139b724bc8..ed9cad20c039 100644
--- a/arch/riscv/kernel/machine_kexec.c
+++ b/arch/riscv/kernel/machine_kexec.c
@@ -18,30 +18,6 @@
 #include 
 #include 
 
-/*
- * kexec_image_info - Print received image details
- */
-static void
-kexec_image_info(const struct kimage *image)
-{
-   unsigned long i;
-
-   pr_debug("Kexec image info:\n");
-   pr_debug("\ttype:%d\n", image->type);
-   pr_debug("\tstart:   %lx\n", image->start);
-   pr_debug("\thead:%lx\n", image->head);
-   pr_debug("\tnr_segments: %lu\n", image->nr_segments);
-
-   for (i = 0; i < image->nr_segments; i++) {
-   pr_debug("\tsegment[%lu]: %016lx - %016lx", i,
-   image->segment[i].mem,
-   image->segment[i].mem + image->segment[i].memsz);
-   pr_debug("\t\t0x%lx bytes, %lu pages\n",
-   (unsigned long) image->segment[i].memsz,
-   (unsigned long) image->segment[i].memsz /  PAGE_SIZE);
-   }
-}
-
 /*
  * machine_kexec_prepare - Initialize kexec
  *
@@ -60,8 +36,6 @@ machine_kexec_prepare(struct kimage *image)
unsigned int control_code_buffer_sz = 0;
int i = 0;
 
-   kexec_image_info(image);
-
/* Find the Flattened Device Tree and save its physical address */
for (i = 0; i < image->nr_segments; i++) {
if (image->segment[i].memsz <= sizeof(fdt))
-- 
2.41.0



[PATCH v3 4/7] kexec_file, arm64: print out debugging message if required

2023-11-29 Thread Baoquan He
Replace pr_debug() with the newly added kexec_dprintk() in kexec_file
loading related codes.

And also remove the kimage->segment[] printing because the generic code
has done the printing.

Signed-off-by: Baoquan He 
---
 arch/arm64/kernel/kexec_image.c|  6 +++---
 arch/arm64/kernel/machine_kexec.c  | 26 ++
 arch/arm64/kernel/machine_kexec_file.c | 12 ++--
 3 files changed, 15 insertions(+), 29 deletions(-)

diff --git a/arch/arm64/kernel/kexec_image.c b/arch/arm64/kernel/kexec_image.c
index 636be6715155..532d72ea42ee 100644
--- a/arch/arm64/kernel/kexec_image.c
+++ b/arch/arm64/kernel/kexec_image.c
@@ -122,9 +122,9 @@ static void *image_load(struct kimage *image,
kernel_segment->memsz -= text_offset;
image->start = kernel_segment->mem;
 
-   pr_debug("Loaded kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
-   kernel_segment->mem, kbuf.bufsz,
-   kernel_segment->memsz);
+   kexec_dprintk("Loaded kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+ kernel_segment->mem, kbuf.bufsz,
+ kernel_segment->memsz);
 
return NULL;
 }
diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c
index 078910db77a4..b38aae5b488d 100644
--- a/arch/arm64/kernel/machine_kexec.c
+++ b/arch/arm64/kernel/machine_kexec.c
@@ -32,26 +32,12 @@
 static void _kexec_image_info(const char *func, int line,
const struct kimage *kimage)
 {
-   unsigned long i;
-
-   pr_debug("%s:%d:\n", func, line);
-   pr_debug("  kexec kimage info:\n");
-   pr_debug("type:%d\n", kimage->type);
-   pr_debug("start:   %lx\n", kimage->start);
-   pr_debug("head:%lx\n", kimage->head);
-   pr_debug("nr_segments: %lu\n", kimage->nr_segments);
-   pr_debug("dtb_mem: %pa\n", >arch.dtb_mem);
-   pr_debug("kern_reloc: %pa\n", >arch.kern_reloc);
-   pr_debug("el2_vectors: %pa\n", >arch.el2_vectors);
-
-   for (i = 0; i < kimage->nr_segments; i++) {
-   pr_debug("  segment[%lu]: %016lx - %016lx, 0x%lx bytes, %lu 
pages\n",
-   i,
-   kimage->segment[i].mem,
-   kimage->segment[i].mem + kimage->segment[i].memsz,
-   kimage->segment[i].memsz,
-   kimage->segment[i].memsz /  PAGE_SIZE);
-   }
+   kexec_dprintk("%s:%d:\n", func, line);
+   kexec_dprintk("  kexec kimage info:\n");
+   kexec_dprintk("type:%d\n", kimage->type);
+   kexec_dprintk("head:%lx\n", kimage->head);
+   kexec_dprintk("kern_reloc: %pa\n", >arch.kern_reloc);
+   kexec_dprintk("el2_vectors: %pa\n", >arch.el2_vectors);
 }
 
 void machine_kexec_cleanup(struct kimage *kimage)
diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c
index a11a6e14ba89..0e017358f4ba 100644
--- a/arch/arm64/kernel/machine_kexec_file.c
+++ b/arch/arm64/kernel/machine_kexec_file.c
@@ -127,8 +127,8 @@ int load_other_segments(struct kimage *image,
image->elf_load_addr = kbuf.mem;
image->elf_headers_sz = headers_sz;
 
-   pr_debug("Loaded elf core header at 0x%lx bufsz=0x%lx 
memsz=0x%lx\n",
-image->elf_load_addr, kbuf.bufsz, kbuf.memsz);
+   kexec_dprintk("Loaded elf core header at 0x%lx bufsz=0x%lx 
memsz=0x%lx\n",
+ image->elf_load_addr, kbuf.bufsz, kbuf.memsz);
}
 
/* load initrd */
@@ -148,8 +148,8 @@ int load_other_segments(struct kimage *image,
goto out_err;
initrd_load_addr = kbuf.mem;
 
-   pr_debug("Loaded initrd at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
-   initrd_load_addr, kbuf.bufsz, kbuf.memsz);
+   kexec_dprintk("Loaded initrd at 0x%lx bufsz=0x%lx 
memsz=0x%lx\n",
+ initrd_load_addr, kbuf.bufsz, kbuf.memsz);
}
 
/* load dtb */
@@ -179,8 +179,8 @@ int load_other_segments(struct kimage *image,
image->arch.dtb = dtb;
image->arch.dtb_mem = kbuf.mem;
 
-   pr_debug("Loaded dtb at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
-   kbuf.mem, kbuf.bufsz, kbuf.memsz);
+   kexec_dprintk("Loaded dtb at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+ kbuf.mem, kbuf.bufsz, kbuf.memsz);
 
return 0;
 
-- 
2.41.0



[PATCH v3 3/7] kexec_file, x86: print out debugging message if required

2023-11-29 Thread Baoquan He
Replace pr_debug() with the newly added kexec_dprintk() in kexec_file
loading related codes.

And also print out e820 memmap passed to 2nd kernel just as kexec_load
interface has been doing.

Signed-off-by: Baoquan He 
---
 arch/x86/kernel/crash.c   |  4 ++--
 arch/x86/kernel/kexec-bzimage64.c | 23 ++-
 2 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index c92d88680dbf..1715e5f06a59 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -386,8 +386,8 @@ int crash_load_segments(struct kimage *image)
if (ret)
return ret;
image->elf_load_addr = kbuf.mem;
-   pr_debug("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
-image->elf_load_addr, kbuf.bufsz, kbuf.memsz);
+   kexec_dprintk("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+ image->elf_load_addr, kbuf.bufsz, kbuf.memsz);
 
return ret;
 }
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index a61c12c01270..e9ae0eac6bf9 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -82,7 +82,7 @@ static int setup_cmdline(struct kimage *image, struct 
boot_params *params,
 
cmdline_ptr[cmdline_len - 1] = '\0';
 
-   pr_debug("Final command line is: %s\n", cmdline_ptr);
+   kexec_dprintk("Final command line is: %s\n", cmdline_ptr);
cmdline_ptr_phys = bootparams_load_addr + cmdline_offset;
cmdline_low_32 = cmdline_ptr_phys & 0xUL;
cmdline_ext_32 = cmdline_ptr_phys >> 32;
@@ -272,7 +272,12 @@ setup_boot_parameters(struct kimage *image, struct 
boot_params *params,
 
nr_e820_entries = params->e820_entries;
 
+   kexec_dprintk("E820 memmap:\n");
for (i = 0; i < nr_e820_entries; i++) {
+   kexec_dprintk("%016llx-%016llx (%d)\n",
+ params->e820_table[i].addr,
+ params->e820_table[i].addr + 
params->e820_table[i].size - 1,
+ params->e820_table[i].type);
if (params->e820_table[i].type != E820_TYPE_RAM)
continue;
start = params->e820_table[i].addr;
@@ -424,7 +429,7 @@ static void *bzImage64_load(struct kimage *image, char 
*kernel,
 * command line. Make sure it does not overflow
 */
if (cmdline_len + MAX_ELFCOREHDR_STR_LEN > header->cmdline_size) {
-   pr_debug("Appending elfcorehdr= to command line exceeds 
maximum allowed length\n");
+   kexec_dprintk("Appending elfcorehdr= to command line 
exceeds maximum allowed length\n");
return ERR_PTR(-EINVAL);
}
 
@@ -445,7 +450,7 @@ static void *bzImage64_load(struct kimage *image, char 
*kernel,
return ERR_PTR(ret);
}
 
-   pr_debug("Loaded purgatory at 0x%lx\n", pbuf.mem);
+   kexec_dprintk("Loaded purgatory at 0x%lx\n", pbuf.mem);
 
 
/*
@@ -490,8 +495,8 @@ static void *bzImage64_load(struct kimage *image, char 
*kernel,
if (ret)
goto out_free_params;
bootparam_load_addr = kbuf.mem;
-   pr_debug("Loaded boot_param, command line and misc at 0x%lx bufsz=0x%lx 
memsz=0x%lx\n",
-bootparam_load_addr, kbuf.bufsz, kbuf.bufsz);
+   kexec_dprintk("Loaded boot_param, command line and misc at 0x%lx 
bufsz=0x%lx memsz=0x%lx\n",
+ bootparam_load_addr, kbuf.bufsz, kbuf.bufsz);
 
/* Load kernel */
kbuf.buffer = kernel + kern16_size;
@@ -505,8 +510,8 @@ static void *bzImage64_load(struct kimage *image, char 
*kernel,
goto out_free_params;
kernel_load_addr = kbuf.mem;
 
-   pr_debug("Loaded 64bit kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
-kernel_load_addr, kbuf.bufsz, kbuf.memsz);
+   kexec_dprintk("Loaded 64bit kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+ kernel_load_addr, kbuf.bufsz, kbuf.memsz);
 
/* Load initrd high */
if (initrd) {
@@ -520,8 +525,8 @@ static void *bzImage64_load(struct kimage *image, char 
*kernel,
goto out_free_params;
initrd_load_addr = kbuf.mem;
 
-   pr_debug("Loaded initrd at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
-   initrd_load_addr, initrd_len, initrd_len);
+   kexec_dprintk("Loaded initrd at 0x%lx bufsz=0x%lx 
memsz=0x%lx\n",
+ initrd_load_addr, initrd_len, initrd_len);
 
setup_initrd(params, initrd_load_addr, initrd_len);
}
-- 
2.41.0



[PATCH v3 1/7] kexec_file: add kexec_file flag to control debug printing

2023-11-29 Thread Baoquan He
When specifying 'kexec -c -d', kexec_load interface will print loading
information, e.g the regions where kernel/initrd/purgatory/cmdline
are put, the memmap passed to 2nd kernel taken as system RAM ranges,
and printing all contents of struct kexec_segment, etc. These are
very helpful for analyzing or positioning what's happening when
kexec/kdump itself failed. The debugging printing for kexec_load
interface is made in user space utility kexec-tools.

Whereas, with kexec_file_load interface, 'kexec -s -d' print nothing.
Because kexec_file code is mostly implemented in kernel space, and the
debugging printing functionality is missed. It's not convenient when
debugging kexec/kdump loading and jumping with kexec_file_load
interface.

Now add KEXEC_FILE_DEBUG to kexec_file flag to control the debugging
message printing. And add global variable kexec_file_dbg_print and
macro kexec_dprintk() to facilitate the printing.

This is a preparation, later kexec_dprintk() will be used to replace the
existing pr_debug(). Once 'kexec -s -d' is specified, it will print out
kexec/kdump loading information. If '-d' is not specified, it regresses
to pr_debug().

Signed-off-by: Baoquan He 
---
 include/linux/kexec.h  | 9 -
 include/uapi/linux/kexec.h | 1 +
 kernel/kexec_core.c| 2 ++
 kernel/kexec_file.c| 3 +++
 4 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 8227455192b7..400cb6c02176 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -403,7 +403,7 @@ bool kexec_load_permitted(int kexec_image_type);
 
 /* List of defined/legal kexec file flags */
 #define KEXEC_FILE_FLAGS   (KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \
-KEXEC_FILE_NO_INITRAMFS)
+KEXEC_FILE_NO_INITRAMFS | KEXEC_FILE_DEBUG)
 
 /* flag to track if kexec reboot is in progress */
 extern bool kexec_in_progress;
@@ -500,6 +500,13 @@ static inline int crash_hotplug_memory_support(void) { 
return 0; }
 static inline unsigned int crash_get_elfcorehdr_size(void) { return 0; }
 #endif
 
+extern bool kexec_file_dbg_print;
+
+#define kexec_dprintk(fmt, ...)\
+   printk("%s" fmt,\
+  kexec_file_dbg_print ? KERN_INFO : KERN_DEBUG,   \
+  ##__VA_ARGS__)
+
 #else /* !CONFIG_KEXEC_CORE */
 struct pt_regs;
 struct task_struct;
diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h
index 01766dd839b0..c17bb096ea68 100644
--- a/include/uapi/linux/kexec.h
+++ b/include/uapi/linux/kexec.h
@@ -25,6 +25,7 @@
 #define KEXEC_FILE_UNLOAD  0x0001
 #define KEXEC_FILE_ON_CRASH0x0002
 #define KEXEC_FILE_NO_INITRAMFS0x0004
+#define KEXEC_FILE_DEBUG   0x0008
 
 /* These values match the ELF architecture values.
  * Unless there is a good reason that should continue to be the case.
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index be5642a4ec49..f70bf3a7885c 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -52,6 +52,8 @@ atomic_t __kexec_lock = ATOMIC_INIT(0);
 /* Flag to indicate we are going to kexec a new kernel */
 bool kexec_in_progress = false;
 
+bool kexec_file_dbg_print;
+
 int kexec_should_crash(struct task_struct *p)
 {
/*
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index f9a419cd22d4..aca5dac74044 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -123,6 +123,8 @@ void kimage_file_post_load_cleanup(struct kimage *image)
 */
kfree(image->image_loader_data);
image->image_loader_data = NULL;
+
+   kexec_file_dbg_print = false;
 }
 
 #ifdef CONFIG_KEXEC_SIG
@@ -278,6 +280,7 @@ kimage_file_alloc_init(struct kimage **rimage, int 
kernel_fd,
if (!image)
return -ENOMEM;
 
+   kexec_file_dbg_print = !!(flags & KEXEC_FILE_DEBUG);
image->file_mode = 1;
 
if (kexec_on_panic) {
-- 
2.41.0



[PATCH v3 2/7] kexec_file: print out debugging message if required

2023-11-29 Thread Baoquan He
Replace pr_debug() with the newly added kexec_dprintk() in kexec_file
loading related codes.

And also print out type/start/head of kimage and flags to help debug.

Signed-off-by: Baoquan He 
---
 kernel/crash_core.c|  9 ++---
 kernel/kexec_file.c| 11 ---
 security/integrity/ima/ima_kexec.c |  4 ++--
 3 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index efe87d501c8c..b2531eaacd1e 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -551,9 +551,12 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int 
need_kernel_map,
phdr->p_filesz = phdr->p_memsz = mend - mstart + 1;
phdr->p_align = 0;
ehdr->e_phnum++;
-   pr_debug("Crash PT_LOAD ELF header. phdr=%p vaddr=0x%llx, 
paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n",
-   phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz,
-   ehdr->e_phnum, phdr->p_offset);
+#ifdef CONFIG_KEXEC_FILE
+   kexec_dprintk("Crash PT_LOAD ELF header. phdr=%p vaddr=0x%llx, 
paddr=0x%llx, "
+ "sz=0x%llx e_phnum=%d p_offset=0x%llx\n",
+ phdr, phdr->p_vaddr, phdr->p_paddr, 
phdr->p_filesz,
+ ehdr->e_phnum, phdr->p_offset);
+#endif
phdr++;
}
 
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index aca5dac74044..76de1ac7c424 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -204,6 +204,8 @@ kimage_file_prepare_segments(struct kimage *image, int 
kernel_fd, int initrd_fd,
if (ret < 0)
return ret;
image->kernel_buf_len = ret;
+   kexec_dprintk("kernel: %p kernel_size: %#lx\n",
+ image->kernel_buf, image->kernel_buf_len);
 
/* Call arch image probe handlers */
ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
@@ -387,13 +389,14 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, 
initrd_fd,
if (ret)
goto out;
 
+   kexec_dprintk("nr_segments = %lu\n", image->nr_segments);
for (i = 0; i < image->nr_segments; i++) {
struct kexec_segment *ksegment;
 
ksegment = >segment[i];
-   pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx 
memsz=0x%zx\n",
-i, ksegment->buf, ksegment->bufsz, ksegment->mem,
-ksegment->memsz);
+   kexec_dprintk("segment[%d]: buf=0x%p bufsz=0x%zx mem=0x%lx 
memsz=0x%zx\n",
+ i, ksegment->buf, ksegment->bufsz, ksegment->mem,
+ ksegment->memsz);
 
ret = kimage_load_segment(image, >segment[i]);
if (ret)
@@ -406,6 +409,8 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, 
initrd_fd,
if (ret)
goto out;
 
+   kexec_dprintk("kexec_file_load: type:%u, start:0x%lx head:0x%lx 
flags:0x%lx\n",
+ image->type, image->start, image->head, flags);
/*
 * Free up any temporary buffers allocated which are not needed
 * after image has been loaded
diff --git a/security/integrity/ima/ima_kexec.c b/security/integrity/ima/ima_kexec.c
index ad133fe120db..dadc1d138118 100644
--- a/security/integrity/ima/ima_kexec.c
+++ b/security/integrity/ima/ima_kexec.c
@@ -129,8 +129,8 @@ void ima_add_kexec_buffer(struct kimage *image)
image->ima_buffer_size = kexec_segment_size;
image->ima_buffer = kexec_buffer;
 
-   pr_debug("kexec measurement buffer for the loaded kernel at 0x%lx.\n",
-kbuf.mem);
+   kexec_dprintk("kexec measurement buffer for the loaded kernel at 
0x%lx.\n",
+ kbuf.mem);
 }
 #endif /* IMA_KEXEC */
 
-- 
2.41.0



[PATCH v3 0/7] kexec_file: print out debugging message if required

2023-11-29 Thread Baoquan He
Currently, specifying '-d' on the kexec command will print a lot of debugging
information about kexec/kdump loading with the kexec_load interface.

However, kexec_file_load prints nothing even though '-d' is specified.
It's very inconvenient to debug or analyze kexec/kdump loading when
something goes wrong with kexec/kdump itself, or when a developer wants
to check the kexec/kdump loading.

In this patchset, a kexec_file flag, KEXEC_FILE_DEBUG, is added and checked
in code. If it's passed in, debugging messages of the kexec_file code will be
printed out and can be seen from the console and dmesg. Otherwise, the
debugging messages are printed as before via pr_debug().

Note:

=
1) The code in the kexec-tools utility also needs to be changed to support
passing KEXEC_FILE_DEBUG to the kernel when 'kexec -s -d' is specified
(a sketch of the syscall usage follows these notes).
The patch link is here:
=
[PATCH] kexec_file: add kexec_file flag to support debug printing
http://lists.infradead.org/pipermail/kexec/2023-November/028505.html

2) s390 also has kexec_file code, but I am not sure what debugging
information is necessary there, so that is left to the s390 developers.
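
As referenced in note 1), a minimal sketch of how userspace could pass
the new flag (hypothetical helper; assumes a uapi header carrying the
new KEXEC_FILE_DEBUG definition, and elides error handling):

	#include <linux/kexec.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static long kexec_file_load_debug(int kernel_fd, int initrd_fd,
					  const char *cmdline)
	{
		/* KEXEC_FILE_DEBUG asks the kernel to print load info. */
		return syscall(SYS_kexec_file_load, kernel_fd, initrd_fd,
			       strlen(cmdline) + 1, cmdline,
			       KEXEC_FILE_DEBUG);
	}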

Test:


Testing was done in v1 on x86_64 and arm64. For v3, tested on x86_64
again. And on x86_64, the printed messages look like below:
--
kexec measurement buffer for the loaded kernel at 0x207fffe000.
Loaded purgatory at 0x207fff9000
Loaded boot_param, command line and misc at 0x207fff3000 bufsz=0x1180 
memsz=0x1180
Loaded 64bit kernel at 0x207c00 bufsz=0xc88200 memsz=0x3c4a000
Loaded initrd at 0x2079e79000 bufsz=0x2186280 memsz=0x2186280
Final command line is: 
root=/dev/mapper/fedora_intel--knightslanding--lb--02-root ro
rd.lvm.lv=fedora_intel-knightslanding-lb-02/root console=ttyS0,115200N81 
crashkernel=256M
E820 memmap:
-0009a3ff (1)
0009a400-0009 (2)
000e-000f (2)
0010-6ff83fff (1)
6ff84000-7ac50fff (2)
..
00207fff6150-00207fff615f (128)
00207fff6160-00207fff714f (1)
00207fff7150-00207fff715f (128)
00207fff7160-00207fff814f (1)
00207fff8150-00207fff815f (128)
00207fff8160-00207fff (1)
nr_segments = 5
segment[0]: buf=0x4e5ece74 bufsz=0x211 mem=0x207fffe000 memsz=0x1000
segment[1]: buf=0x9e871498 bufsz=0x4000 mem=0x207fff9000 memsz=0x5000
segment[2]: buf=0xd879f1fe bufsz=0x1180 mem=0x207fff3000 memsz=0x2000
segment[3]: buf=0x1101cd86 bufsz=0xc88200 mem=0x207c00 
memsz=0x3c4a000
segment[4]: buf=0xc6e38ac7 bufsz=0x2186280 mem=0x2079e79000 
memsz=0x2187000
kexec_file_load: type:0, start:0x207fff91a0 head:0x109e004002 flags:0x8
---

History:

=
v2->v3:
- Adjust all the indentation of continuation line to the open parenthesis
  for all kexec_dprintk() call sites. Thank Joe to point this out.
- Fix the LKP report that macro kexec_dprintk() is invalid when
  CONFIG_KEXEC=Y, CONFIG_KEXEC_FILE=n, CONFIG_CRASH_DUMP=y.

v1->v2:
- Take the new format of kexec_dprintk() suggested by Joe which can
  reduce kernel text size.
- Fix building error of patch 2 in kernel/crash_core.c reported by LKP.
- Fix building warning on arm64 in patch 4 reported by LKP.

Baoquan He (7):
  kexec_file: add kexec_file flag to control debug printing
  kexec_file: print out debugging message if required
  kexec_file, x86: print out debugging message if required
  kexec_file, arm64: print out debugging message if required
  kexec_file, riscv: print out debugging message if required
  kexec_file, power: print out debugging message if required
  kexec_file, parisc: print out debugging message if required

 arch/arm64/kernel/kexec_image.c|  6 +++---
 arch/arm64/kernel/machine_kexec.c  | 26 ++
 arch/arm64/kernel/machine_kexec_file.c | 12 ++--
 arch/parisc/kernel/kexec_file.c|  8 
 arch/powerpc/kexec/elf_64.c|  8 
 arch/powerpc/kexec/file_load_64.c  | 18 +-
 arch/riscv/kernel/elf_kexec.c  | 11 ++-
 arch/riscv/kernel/machine_kexec.c  | 26 --
 arch/x86/kernel/crash.c|  4 ++--
 arch/x86/kernel/kexec-bzimage64.c  | 23 ++-
 include/linux/kexec.h  |  9 -
 include/uapi/linux/kexec.h |  1 +
 kernel/crash_core.c|  9 ++---
 kernel/kexec_core.c|  2 ++
 kernel/kexec_file.c| 14 +++---
 security/integrity/ima/ima_kexec.c |  4 ++--
 16 files changed, 84 insertions(+), 97 deletions(-)

-- 
2.41.0



Re: [PATCH v2] powerpc/pseries/vas: Use usleep_range() to support HCALL delay

2023-11-29 Thread Michael Ellerman
Haren Myneni  writes:
> VAS allocate, modify and deallocate HCALLs returns
> H_LONG_BUSY_ORDER_1_MSEC or H_LONG_BUSY_ORDER_10_MSEC for busy
> delay and expects OS to reissue HCALL after that delay. But using
> msleep() will often sleep at least 20 msecs even though the
> hypervisor expects to reissue these HCALLs after 1 or 10msecs.
> It might cause these HCALLs takes longer when multiple threads
> issue open or close VAS windows simultaneously.
>
> So instead of msleep(), use usleep_range() to ensure sleep with
> the expected value before issuing HCALL again.
>
> Signed-off-by: Haren Myneni 
> Suggested-by: Nathan Lynch 
>
> ---
> v1 -> v2:
> - Use usleep_range instead of using RTAS sleep routine as
>   suggested by Nathan
> ---
>  arch/powerpc/platforms/pseries/vas.c | 24 +++-
>  1 file changed, 23 insertions(+), 1 deletion(-)
>
> diff --git a/arch/powerpc/platforms/pseries/vas.c 
> b/arch/powerpc/platforms/pseries/vas.c
> index 71d52a670d95..bade4402741f 100644
> --- a/arch/powerpc/platforms/pseries/vas.c
> +++ b/arch/powerpc/platforms/pseries/vas.c
> @@ -36,9 +36,31 @@ static bool migration_in_progress;
>  
>  static long hcall_return_busy_check(long rc)
>  {
> + unsigned int ms;
> +
>   /* Check if we are stalled for some time */
>   if (H_IS_LONG_BUSY(rc)) {
> - msleep(get_longbusy_msecs(rc));
> + ms = get_longbusy_msecs(rc);
> + /*
> +  * Allocate, Modify and Deallocate HCALLs returns
> +  * H_LONG_BUSY_ORDER_1_MSEC or H_LONG_BUSY_ORDER_10_MSEC
> +  * for the long delay. So the delay should always be 1
> +  * or 10msecs, but sleeps 1msec in case if the long
> +  * delay is > H_LONG_BUSY_ORDER_10_MSEC.
> +  */
> + if (ms > 10)
> + ms = 1;
 
I don't understand this. The hypervisor asked you to sleep for more than
10 milliseconds, so instead you sleep for 1?

I can understand that we don't want to usleep() for the longer durations
that could be returned, but then shouldn't the code be using msleep() for
those values?

Sleeping for a very short duration definitely seems wrong.


> + /*
> +  * msleep() will often sleep at least 20 msecs even
> +  * though the hypervisor expects to reissue these
 
That makes it sound like the hypervisor is reissuing the hcalls.

Better would be "the hypervisor suggests the kernel should reissue the
hcall after ...".

> +  * HCALLs after 1 or 10msecs. So use usleep_range()
> +  * to sleep with the expected value.
> +  *
> +  * See Documentation/timers/timers-howto.rst on using
> +  * the value range in usleep_range().
> +  */
> + usleep_range(ms * 100, ms * 1000);

If ms == 1, then that's 100 usecs, which is not 1 millisecond?

Please use USEC_PER_MSEC.
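
Something along these lines, perhaps (an untested sketch of the two
points above, not a drop-in replacement):

	if (H_IS_LONG_BUSY(rc)) {
		unsigned int ms = get_longbusy_msecs(rc);

		if (ms <= 10)
			/* Sleep at least the suggested 1 or 10 msecs. */
			usleep_range(ms * USEC_PER_MSEC,
				     (ms + 1) * USEC_PER_MSEC);
		else
			msleep(ms);
		rc = H_BUSY;
	}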

>   rc = H_BUSY;
>   } else if (rc == H_BUSY) {
>   cond_resched();

cheers


Re: Ping? Re: [PATCH rc] kvm: Prevent compiling virt/kvm/vfio.c unless VFIO is selected

2023-11-29 Thread Sean Christopherson
On Wed, Nov 29, 2023, Jason Gunthorpe wrote:
> On Wed, Nov 29, 2023 at 05:07:45PM -0800, Sean Christopherson wrote:
> > On Wed, Nov 29, 2023, Michael Ellerman wrote:
> > > Sean Christopherson  writes:
> > > > On Fri, Nov 10, 2023, Michael Ellerman wrote:
> > > >> Jason Gunthorpe  writes:
> > > >> > There are a bunch of reported randconfig failures now because of 
> > > >> > this,
> > > >> > something like:
> > > >> >
> > > >> >>> arch/powerpc/kvm/../../../virt/kvm/vfio.c:89:7: warning: attribute 
> > > >> >>> declaration must precede definition [-Wignored-attributes]
> > > >> >fn = symbol_get(vfio_file_iommu_group);
> > > >> > ^
> > > >> >include/linux/module.h:805:60: note: expanded from macro 
> > > >> > 'symbol_get'
> > > >> >#define symbol_get(x) ({ extern typeof(x) x 
> > > >> > __attribute__((weak,visibility("hidden"))); &(x); })
> > > >> >
> > > >> > It happens because the arch forces KVM_VFIO without knowing if VFIO 
> > > >> > is
> > > >> > even enabled.
> > > >> 
> > > >> This is still breaking some builds. Can we get this fix in please?
> > > >> 
> > > >> cheers
> > > >> 
> > > >> > Split the kconfig so the arch selects the usual HAVE_KVM_ARCH_VFIO 
> > > >> > and
> > > >> > then KVM_VFIO is only enabled if the arch wants it and VFIO is 
> > > >> > turned on.
> > > >
> > > > Heh, so I was trying to figure out why things like vfio_file_set_kvm() 
> > > > aren't
> > > > problematic, i.e. why the existing mess didn't cause failures.  I can't 
> > > > repro the
> > > > warning (requires clang-16?), but IIUC the reason only the group code 
> > > > is problematic
> > > > is that vfio.h creates a stub for vfio_file_iommu_group() and thus 
> > > > there's no symbol,
> > > > whereas vfio.h declares vfio_file_set_kvm() unconditionally.
> > > 
> > > That warning I'm unsure about.
> > 
> > Ah, it's the same warning, I just missed the CONFIG_MODULES=n requirement.
> 
> Oh, wait, doesn't that mean the approach won't work? IIRC doesn't
> symbol_get turn into just  when non-modular turning this into a
> link failure without the kconfig part?

Yes, but it doesn't cause linker errors.  IIUC, because the extern declaration
is tagged "weak", a dummy default is used.  E.g. on x86, this is what is 
generated
with VFIO=y

fn = symbol_get(vfio_file_is_valid);
if (!fn)
   0x810396c5 <+5>: mov    $0x81829230,%rax
   0x810396cc <+12>:    test   %rax,%rax

whereas VFIO=n gets

fn = symbol_get(vfio_file_is_valid);
if (!fn)
   0x810396c5 <+5>: mov    $0x0,%rax
   0x810396cc <+12>:    test   %rax,%rax

I have no idea if the fact that symbol_get() generates '0', i.e. the !NULL 
checks
work as expected, is intentional or if KVM works by sheer dumb luck.  I'm not
entirely sure I want to find out, though it's definitely extra motivation to
get the KVM mess fixed sooner rather than later.
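
To make the weak-symbol behavior concrete, a standalone illustration
(hypothetical symbol name; this mirrors what the symbol_get() macro
expands to, and is not taken from the kernel sources):

	/* Weak + hidden: an undefined symbol resolves to address 0 at
	 * link time instead of failing the link. */
	extern int maybe_absent_fn(int arg)
		__attribute__((weak, visibility("hidden")));

	int call_if_present(int arg)
	{
		int (*fn)(int) = &maybe_absent_fn;

		return fn ? fn(arg) : -1;	/* fn == NULL when absent */
	}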


Re: [PATCH v2] powerpc/pseries/vas: Use usleep_range() to support HCALL delay

2023-11-29 Thread Nathan Lynch
Haren Myneni  writes:
> VAS allocate, modify and deallocate HCALLs returns
> H_LONG_BUSY_ORDER_1_MSEC or H_LONG_BUSY_ORDER_10_MSEC for busy
> delay and expects OS to reissue HCALL after that delay. But using
> msleep() will often sleep at least 20 msecs even though the
> hypervisor expects to reissue these HCALLs after 1 or 10msecs.

I would word this as "the architecture suggests that the OS reissue
these [...]" instead of framing it as something the platform "expects".

> It might cause these HCALLs takes longer when multiple threads
> issue open or close VAS windows simultaneously.

This is imprecise. Over-sleeping by the OS doesn't cause individual
hcalls to take longer. It is more accurate to say that the higher-level
operation (allocate, modify, free) may take longer than necessary in
cases where the OS must retry the hcalls involved.

> So instead of msleep(), use usleep_range() to ensure sleep with
> the expected value before issuing HCALL again.
>
> Signed-off-by: Haren Myneni 
> Suggested-by: Nathan Lynch 
>
> ---
> v1 -> v2:
> - Use usleep_range instead of using RTAS sleep routine as
>   suggested by Nathan
> ---
>  arch/powerpc/platforms/pseries/vas.c | 24 +++-
>  1 file changed, 23 insertions(+), 1 deletion(-)
>
> diff --git a/arch/powerpc/platforms/pseries/vas.c 
> b/arch/powerpc/platforms/pseries/vas.c
> index 71d52a670d95..bade4402741f 100644
> --- a/arch/powerpc/platforms/pseries/vas.c
> +++ b/arch/powerpc/platforms/pseries/vas.c
> @@ -36,9 +36,31 @@ static bool migration_in_progress;
>  
>  static long hcall_return_busy_check(long rc)
>  {
> + unsigned int ms;

This should move down into the H_IS_LONG_BUSY() block if it's not used
outside of it.

> +
>   /* Check if we are stalled for some time */
>   if (H_IS_LONG_BUSY(rc)) {
> - msleep(get_longbusy_msecs(rc));
> + ms = get_longbusy_msecs(rc);
> + /*
> +  * Allocate, Modify and Deallocate HCALLs returns
> +  * H_LONG_BUSY_ORDER_1_MSEC or H_LONG_BUSY_ORDER_10_MSEC
> +  * for the long delay. So the delay should always be 1
> +  * or 10msecs, but sleeps 1msec in case if the long
> +  * delay is > H_LONG_BUSY_ORDER_10_MSEC.
> +  */
> + if (ms > 10)
> + ms = 1;

It's strange to coerce ms to 1 when it's greater than 10. Just clamp it
to 10, e.g.

ms = clamp(get_longbusy_msecs(rc), 1, 10);

> +
> + /*
> +  * msleep() will often sleep at least 20 msecs even
> +  * though the hypervisor expects to reissue these
> +  * HCALLs after 1 or 10msecs. So use usleep_range()
> +  * to sleep with the expected value.
> +  *
> +  * See Documentation/timers/timers-howto.rst on using
> +  * the value range in usleep_range().
> +  */
> + usleep_range(ms * 100, ms * 1000);

If there's going to be commentary here I think it should just explain
why potentially sleeping for less than the suggested time is OK. There
is wording you can crib from rtas_busy_delay().


>   rc = H_BUSY;
>   } else if (rc == H_BUSY) {
>   cond_resched();
> -- 
> 2.26.3


Re: Ping? Re: [PATCH rc] kvm: Prevent compiling virt/kvm/vfio.c unless VFIO is selected

2023-11-29 Thread Jason Gunthorpe
On Wed, Nov 29, 2023 at 05:07:45PM -0800, Sean Christopherson wrote:
> On Wed, Nov 29, 2023, Michael Ellerman wrote:
> > Sean Christopherson  writes:
> > > On Fri, Nov 10, 2023, Michael Ellerman wrote:
> > >> Jason Gunthorpe  writes:
> > >> > There are a bunch of reported randconfig failures now because of this,
> > >> > something like:
> > >> >
> > >> >>> arch/powerpc/kvm/../../../virt/kvm/vfio.c:89:7: warning: attribute 
> > >> >>> declaration must precede definition [-Wignored-attributes]
> > >> >fn = symbol_get(vfio_file_iommu_group);
> > >> > ^
> > >> >include/linux/module.h:805:60: note: expanded from macro 
> > >> > 'symbol_get'
> > >> >#define symbol_get(x) ({ extern typeof(x) x 
> > >> > __attribute__((weak,visibility("hidden"))); &(x); })
> > >> >
> > >> > It happens because the arch forces KVM_VFIO without knowing if VFIO is
> > >> > even enabled.
> > >> 
> > >> This is still breaking some builds. Can we get this fix in please?
> > >> 
> > >> cheers
> > >> 
> > >> > Split the kconfig so the arch selects the usual HAVE_KVM_ARCH_VFIO and
> > >> > then KVM_VFIO is only enabled if the arch wants it and VFIO is turned 
> > >> > on.
> > >
> > > Heh, so I was trying to figure out why things like vfio_file_set_kvm() 
> > > aren't
> > > problematic, i.e. why the existing mess didn't cause failures.  I can't 
> > > repro the
> > > warning (requires clang-16?), but IIUC the reason only the group code is 
> > > problematic
> > > is that vfio.h creates a stub for vfio_file_iommu_group() and thus 
> > > there's no symbol,
> > > whereas vfio.h declares vfio_file_set_kvm() unconditionally.
> > 
> > That warning I'm unsure about.
> 
> Ah, it's the same warning, I just missed the CONFIG_MODULES=n requirement.

Oh, wait, doesn't that mean the approach won't work? IIRC doesn't
symbol_get turn into just  when non-modular, turning this into a
link failure without the kconfig part?

Jason


Re: Ping? Re: [PATCH rc] kvm: Prevent compiling virt/kvm/vfio.c unless VFIO is selected

2023-11-29 Thread Sean Christopherson
On Wed, Nov 29, 2023, Michael Ellerman wrote:
> Sean Christopherson  writes:
> > On Fri, Nov 10, 2023, Michael Ellerman wrote:
> >> Jason Gunthorpe  writes:
> >> > There are a bunch of reported randconfig failures now because of this,
> >> > something like:
> >> >
> >> >>> arch/powerpc/kvm/../../../virt/kvm/vfio.c:89:7: warning: attribute 
> >> >>> declaration must precede definition [-Wignored-attributes]
> >> >fn = symbol_get(vfio_file_iommu_group);
> >> > ^
> >> >include/linux/module.h:805:60: note: expanded from macro 'symbol_get'
> >> >#define symbol_get(x) ({ extern typeof(x) x 
> >> > __attribute__((weak,visibility("hidden"))); &(x); })
> >> >
> >> > It happens because the arch forces KVM_VFIO without knowing if VFIO is
> >> > even enabled.
> >> 
> >> This is still breaking some builds. Can we get this fix in please?
> >> 
> >> cheers
> >> 
> >> > Split the kconfig so the arch selects the usual HAVE_KVM_ARCH_VFIO and
> >> > then KVM_VFIO is only enabled if the arch wants it and VFIO is turned on.
> >
> > Heh, so I was trying to figure out why things like vfio_file_set_kvm() 
> > aren't
> > problematic, i.e. why the existing mess didn't cause failures.  I can't 
> > repro the
> > warning (requires clang-16?), but IIUC the reason only the group code is 
> > problematic
> > is that vfio.h creates a stub for vfio_file_iommu_group() and thus there's 
> > no symbol,
> > whereas vfio.h declares vfio_file_set_kvm() unconditionally.
> 
> That warning I'm unsure about.

Ah, it's the same warning, I just missed the CONFIG_MODULES=n requirement.


Re: [PATCH v13 17/35] KVM: Add transparent hugepage support for dedicated guest memory

2023-11-29 Thread Sean Christopherson
On Mon, Nov 27, 2023, Vlastimil Babka wrote:
> On 11/2/23 16:46, Paolo Bonzini wrote:
> > On Thu, Nov 2, 2023 at 4:38 PM Sean Christopherson  
> > wrote:
> >> Actually, looking that this again, there's not actually a hard dependency 
> >> on THP.
> >> A THP-enabled kernel _probably_  gives a higher probability of using 
> >> hugepages,
> >> but mostly because THP selects COMPACTION, and I suppose because using THP 
> >> for
> >> other allocations reduces overall fragmentation.
> > 
> > Yes, that's why I didn't even bother enabling it unless THP is
> > enabled, but it makes even more sense to just try.
> > 
> >> So rather than honor KVM_GUEST_MEMFD_ALLOW_HUGEPAGE iff THP is enabled, I 
> >> think
> >> we should do the below (I verified KVM can create hugepages with THP=n).  
> >> We'll
> >> need another capability, but (a) we probably should have that anyways and 
> >> (b) it
> >> provides a cleaner path to adding PUD-sized hugepage support in the future.
> > 
> > I wonder if we need KVM_CAP_GUEST_MEMFD_HUGEPAGE_PMD_SIZE though. This
> > should be a generic kernel API and in fact the sizes are available in
> > a not-so-friendly format in /sys/kernel/mm/hugepages.
> > 
> > We should just add /sys/kernel/mm/hugepages/sizes that contains
> > "2097152 1073741824" on x86 (only the former if 1G pages are not
> > supported).
> > 
> > Plus: is this the best API if we need something else for 1G pages?
> > 
> > Let's drop *this* patch and proceed incrementally. (Again, this is
> > what I want to do with this final review: identify places that are
> > stil sticky, and don't let them block the rest).
> > 
> > Coincidentially we have an open spot next week at plumbers. Let's
> > extend Fuad's section to cover more guestmem work.
> 
> Hi,
> 
> was there any outcome wrt this one?

No, we punted on hugepage support for the initial guest_memfd merge.  We 
definitely
plan on adding hugeapge support sooner than later, but we haven't yet agreed on
exactly what that will look like.

> Based on my experience with THP's it would be best if userspace didn't have
> to opt-in, nor care about the supported size. If the given size is unaligned,
> provide a mix of large pages up to an aligned size, and for the rest fallback
> to base pages, which should be better than -EINVAL on creation (is it
> possible with the current implementation? I'd hope so?).

guest_memfd serves a different use case than THP.  For modern VMs, and 
especially
for slice-of-hardware VMs that are one of the main targets for guest_memfd, if 
not
_the_ main target, guest memory should _always_ be backed by hugepages in the
physical domain.  The actual guest mappings might not be huge, e.g. x86 needs to
do partial mappings to skip over (legacy) memory holes, but KVM already 
gracefully
handles that.

In other words, for most guest_memfd use cases, if userspace wants hugepages but
KVM can't provide hugepages, then it is much more desirable to return an error
than to silently fall back to small pages.

I 100% agree that having to opt-in is suboptimal, but IMO providing "error on an
incompatible configuration" semantics without requiring userspace to opt-in is 
an
even worse experience for userspace.

> A way to opt-out from huge pages could be useful although there's always the
> risk of some initial troubles resulting in various online sources cargo-cult
> recommending to opt-out forever.


Re: Ping? Re: [PATCH rc] kvm: Prevent compiling virt/kvm/vfio.c unless VFIO is selected

2023-11-29 Thread Alex Williamson
On Wed, 29 Nov 2023 10:31:03 -0800
Sean Christopherson  wrote:

> On Wed, Nov 29, 2023, Jason Gunthorpe wrote:
> > On Tue, Nov 28, 2023 at 06:21:42PM -0800, Sean Christopherson wrote:  
> > > diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> > > index 454e9295970c..a65b2513f8cd 100644
> > > --- a/include/linux/vfio.h
> > > +++ b/include/linux/vfio.h
> > > @@ -289,16 +289,12 @@ void vfio_combine_iova_ranges(struct rb_root_cached 
> > > *root, u32 cur_nodes,
> > >  /*
> > >   * External user API
> > >   */
> > > -#if IS_ENABLED(CONFIG_VFIO_GROUP)
> > >  struct iommu_group *vfio_file_iommu_group(struct file *file);
> > > +
> > > +#if IS_ENABLED(CONFIG_VFIO_GROUP)
> > >  bool vfio_file_is_group(struct file *file);
> > >  bool vfio_file_has_dev(struct file *file, struct vfio_device *device);
> > >  #else
> > > -static inline struct iommu_group *vfio_file_iommu_group(struct file 
> > > *file)
> > > -{
> > > -   return NULL;
> > > -}
> > > -
> > >  static inline bool vfio_file_is_group(struct file *file)
> > >  {
> > > return false;
> > >   
> > 
> > So you symbol get on a symbol that can never be defined? Still says to
> > me the kconfig needs fixing :|  
> 
> Yeah, I completely agree, and if KVM didn't already rely on this horrific
> behavior and there wasn't a more complete overhaul in-flight, I wouldn't 
> suggest
> this.
> 
> I'll send the KVM Kconfig/Makefile cleanups from my "Hide KVM internals from 
> others"
> series separately (which is still the bulk of the series) so as to prioritize
> getting the cleanups landed.
> 

Seems we have agreement and confirmation of the fix above as an
interim, do you want to post it formally and I can pick it up for
v6.7-rc?  Thanks,

Alex



[PATCH v4 4/4] PCI: layerscape: Add suspend/resume for ls1043a

2023-11-29 Thread Frank Li
In the suspend path, PME_Turn_Off message is sent to the endpoint to
transition the link to L2/L3_Ready state. In this SoC, there is no way to
check if the controller has received the PME_To_Ack from the endpoint or
not. So to be on the safer side, the driver just waits for
PCIE_PME_TO_L2_TIMEOUT_US before asserting the SoC specific PMXMTTURNOFF
bit to complete the PME_Turn_Off handshake. This link would then enter
L2/L3 state depending on the VAUX supply.

In the resume path, the link is brought back from L2 to L0 by doing a
software reset.

Signed-off-by: Frank Li 
---

Notes:
Change from v3 to v4
- Call scfg_pcie_send_turnoff_msg() shared with ls1021a
- update commit message

Change from v2 to v3
- Remove ls_pcie_lut_readl(writel) function

Change from v1 to v2
- Update subject 'a' to 'A'

 drivers/pci/controller/dwc/pci-layerscape.c | 63 -
 1 file changed, 62 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/controller/dwc/pci-layerscape.c b/drivers/pci/controller/dwc/pci-layerscape.c
index 590e07bb27002..d39700b3afaaa 100644
--- a/drivers/pci/controller/dwc/pci-layerscape.c
+++ b/drivers/pci/controller/dwc/pci-layerscape.c
@@ -41,6 +41,15 @@
 #define SCFG_PEXSFTRSTCR   0x190
 #define PEXSR(idx) BIT(idx)
 
+/* LS1043A PEX PME control register */
+#define SCFG_PEXPMECR  0x144
+#define PEXPME(idx)BIT(31 - (idx) * 4)
+
+/* LS1043A PEX LUT debug register */
+#define LS_PCIE_LDBG   0x7fc
+#define LDBG_SRBIT(30)
+#define LDBG_WEBIT(31)
+
 #define PCIE_IATU_NUM  6
 
 struct ls_pcie_drvdata {
@@ -225,6 +234,45 @@ static int ls1021a_pcie_exit_from_l2(struct dw_pcie_rp *pp)
return scfg_pcie_exit_from_l2(pcie->scfg, SCFG_PEXSFTRSTCR, 
PEXSR(pcie->index));
 }
 
+static void ls1043a_pcie_send_turnoff_msg(struct dw_pcie_rp *pp)
+{
+   struct dw_pcie *pci = to_dw_pcie_from_pp(pp);
+   struct ls_pcie *pcie = to_ls_pcie(pci);
+
+   scfg_pcie_send_turnoff_msg(pcie->scfg, SCFG_PEXPMECR, 
PEXPME(pcie->index));
+}
+
+static int ls1043a_pcie_exit_from_l2(struct dw_pcie_rp *pp)
+{
+   struct dw_pcie *pci = to_dw_pcie_from_pp(pp);
+   struct ls_pcie *pcie = to_ls_pcie(pci);
+   u32 val;
+
+   /*
+* The only way to let the PEX module exit L2 is to do a software reset.
+* LDBG_WE: allows the user to have write access to the PEXDBG[SR] for
+*  both setting and clearing the soft reset on the PEX module.
+* LDBG_SR: When SR is set to 1, the PEX module enters soft reset.
+*/
+   val = ls_pcie_pf_lut_readl(pcie, LS_PCIE_LDBG);
+   val |= LDBG_WE;
+   ls_pcie_pf_lut_writel(pcie, LS_PCIE_LDBG, val);
+
+   val = ls_pcie_pf_lut_readl(pcie, LS_PCIE_LDBG);
+   val |= LDBG_SR;
+   ls_pcie_pf_lut_writel(pcie, LS_PCIE_LDBG, val);
+
+   val = ls_pcie_pf_lut_readl(pcie, LS_PCIE_LDBG);
+   val &= ~LDBG_SR;
+   ls_pcie_pf_lut_writel(pcie, LS_PCIE_LDBG, val);
+
+   val = ls_pcie_pf_lut_readl(pcie, LS_PCIE_LDBG);
+   val &= ~LDBG_WE;
+   ls_pcie_pf_lut_writel(pcie, LS_PCIE_LDBG, val);
+
+   return 0;
+}
+
 static const struct dw_pcie_host_ops ls_pcie_host_ops = {
.host_init = ls_pcie_host_init,
.pme_turn_off = ls_pcie_send_turnoff_msg,
@@ -242,6 +290,19 @@ static const struct ls_pcie_drvdata ls1021a_drvdata = {
.exit_from_l2 = ls1021a_pcie_exit_from_l2,
 };
 
+static const struct dw_pcie_host_ops ls1043a_pcie_host_ops = {
+   .host_init = ls_pcie_host_init,
+   .pme_turn_off = ls1043a_pcie_send_turnoff_msg,
+};
+
+static const struct ls_pcie_drvdata ls1043a_drvdata = {
+   .pf_lut_off = 0x1,
+   .pm_support = true,
+   .scfg_support = true,
+   .ops = _pcie_host_ops,
+   .exit_from_l2 = ls1043a_pcie_exit_from_l2,
+};
+
 static const struct ls_pcie_drvdata layerscape_drvdata = {
.pf_lut_off = 0xc,
.pm_support = true,
@@ -252,7 +313,7 @@ static const struct of_device_id ls_pcie_of_match[] = {
{ .compatible = "fsl,ls1012a-pcie", .data = _drvdata },
{ .compatible = "fsl,ls1021a-pcie", .data = _drvdata },
{ .compatible = "fsl,ls1028a-pcie", .data = _drvdata },
-   { .compatible = "fsl,ls1043a-pcie", .data = _drvdata },
+   { .compatible = "fsl,ls1043a-pcie", .data = _drvdata },
{ .compatible = "fsl,ls1046a-pcie", .data = _drvdata },
{ .compatible = "fsl,ls2080a-pcie", .data = _drvdata },
{ .compatible = "fsl,ls2085a-pcie", .data = _drvdata },
-- 
2.34.1



[PATCH v4 3/4] PCI: layerscape: Rename pf_* as pf_lut_*

2023-11-29 Thread Frank Li
'pf' and 'lut' are just different names on different chips; basically it is
an MMIO base address plus an offset.

Rename it to avoid duplicating pf_* and lut_* in the driver.

Signed-off-by: Frank Li 
---

Notes:
pf_lut is better than pf_* or lut_* because some chips use 'pf' and some
chips use 'lut'.

change from v1 to v4
- new patch at v3

 drivers/pci/controller/dwc/pci-layerscape.c | 34 ++---
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/drivers/pci/controller/dwc/pci-layerscape.c b/drivers/pci/controller/dwc/pci-layerscape.c
index 42bca2c3b5c3e..590e07bb27002 100644
--- a/drivers/pci/controller/dwc/pci-layerscape.c
+++ b/drivers/pci/controller/dwc/pci-layerscape.c
@@ -44,7 +44,7 @@
 #define PCIE_IATU_NUM  6
 
 struct ls_pcie_drvdata {
-   const u32 pf_off;
+   const u32 pf_lut_off;
const struct dw_pcie_host_ops *ops;
int (*exit_from_l2)(struct dw_pcie_rp *pp);
bool scfg_support;
@@ -54,13 +54,13 @@ struct ls_pcie_drvdata {
 struct ls_pcie {
struct dw_pcie *pci;
const struct ls_pcie_drvdata *drvdata;
-   void __iomem *pf_base;
+   void __iomem *pf_lut_base;
struct regmap *scfg;
int index;
bool big_endian;
 };
 
-#define ls_pcie_pf_readl_addr(addr)ls_pcie_pf_readl(pcie, addr)
+#define ls_pcie_pf_lut_readl_addr(addr)ls_pcie_pf_lut_readl(pcie, addr)
 #define to_ls_pcie(x)  dev_get_drvdata((x)->dev)
 
 static bool ls_pcie_is_bridge(struct ls_pcie *pcie)
@@ -101,20 +101,20 @@ static void ls_pcie_fix_error_response(struct ls_pcie 
*pcie)
iowrite32(PCIE_ABSERR_SETTING, pci->dbi_base + PCIE_ABSERR);
 }
 
-static u32 ls_pcie_pf_readl(struct ls_pcie *pcie, u32 off)
+static u32 ls_pcie_pf_lut_readl(struct ls_pcie *pcie, u32 off)
 {
if (pcie->big_endian)
-   return ioread32be(pcie->pf_base + off);
+   return ioread32be(pcie->pf_lut_base + off);
 
-   return ioread32(pcie->pf_base + off);
+   return ioread32(pcie->pf_lut_base + off);
 }
 
-static void ls_pcie_pf_writel(struct ls_pcie *pcie, u32 off, u32 val)
+static void ls_pcie_pf_lut_writel(struct ls_pcie *pcie, u32 off, u32 val)
 {
if (pcie->big_endian)
-   iowrite32be(val, pcie->pf_base + off);
+   iowrite32be(val, pcie->pf_lut_base + off);
else
-   iowrite32(val, pcie->pf_base + off);
+   iowrite32(val, pcie->pf_lut_base + off);
 }
 
 static void ls_pcie_send_turnoff_msg(struct dw_pcie_rp *pp)
@@ -124,11 +124,11 @@ static void ls_pcie_send_turnoff_msg(struct dw_pcie_rp 
*pp)
u32 val;
int ret;
 
-   val = ls_pcie_pf_readl(pcie, LS_PCIE_PF_MCR);
+   val = ls_pcie_pf_lut_readl(pcie, LS_PCIE_PF_MCR);
val |= PF_MCR_PTOMR;
-   ls_pcie_pf_writel(pcie, LS_PCIE_PF_MCR, val);
+   ls_pcie_pf_lut_writel(pcie, LS_PCIE_PF_MCR, val);
 
-   ret = readx_poll_timeout(ls_pcie_pf_readl_addr, LS_PCIE_PF_MCR,
+   ret = readx_poll_timeout(ls_pcie_pf_lut_readl_addr, LS_PCIE_PF_MCR,
 val, !(val & PF_MCR_PTOMR),
 PCIE_PME_TO_L2_TIMEOUT_US/10,
 PCIE_PME_TO_L2_TIMEOUT_US);
@@ -147,15 +147,15 @@ static int ls_pcie_exit_from_l2(struct dw_pcie_rp *pp)
 * Set PF_MCR_EXL2S bit in LS_PCIE_PF_MCR register for the link
 * to exit L2 state.
 */
-   val = ls_pcie_pf_readl(pcie, LS_PCIE_PF_MCR);
+   val = ls_pcie_pf_lut_readl(pcie, LS_PCIE_PF_MCR);
val |= PF_MCR_EXL2S;
-   ls_pcie_pf_writel(pcie, LS_PCIE_PF_MCR, val);
+   ls_pcie_pf_lut_writel(pcie, LS_PCIE_PF_MCR, val);
 
/*
 * L2 exit timeout of 10ms is not defined in the specifications,
 * it was chosen based on empirical observations.
 */
-   ret = readx_poll_timeout(ls_pcie_pf_readl_addr, LS_PCIE_PF_MCR,
+   ret = readx_poll_timeout(ls_pcie_pf_lut_readl_addr, LS_PCIE_PF_MCR,
 val, !(val & PF_MCR_EXL2S),
 1000,
 1);
@@ -243,7 +243,7 @@ static const struct ls_pcie_drvdata ls1021a_drvdata = {
 };
 
 static const struct ls_pcie_drvdata layerscape_drvdata = {
-   .pf_off = 0xc,
+   .pf_lut_off = 0xc,
.pm_support = true,
.exit_from_l2 = ls_pcie_exit_from_l2,
 };
@@ -293,7 +293,7 @@ static int ls_pcie_probe(struct platform_device *pdev)
 
pcie->big_endian = of_property_read_bool(dev->of_node, "big-endian");
 
-   pcie->pf_base = pci->dbi_base + pcie->drvdata->pf_off;
+   pcie->pf_lut_base = pci->dbi_base + pcie->drvdata->pf_lut_off;
 
if (pcie->drvdata->scfg_support) {
	pcie->scfg = syscon_regmap_lookup_by_phandle(dev->of_node, "fsl,pcie-scfg");
-- 
2.34.1



[PATCH v4 2/4] PCI: layerscape: Add suspend/resume for ls1021a

2023-11-29 Thread Frank Li
Add suspend/resume support for ls1021a.

In the suspend path, the PME_Turn_Off message is sent to the endpoint to
transition the link to the L2/L3_Ready state. In this SoC, there is no
way to check whether the controller has received the PME_To_Ack from the
endpoint. So to be on the safe side, the driver just waits for
PCIE_PME_TO_L2_TIMEOUT_US before deasserting the SoC-specific
PMXMTTURNOFF bit to complete the PME_Turn_Off handshake. The link would
then enter L2/L3 state depending on the VAUX supply.

In the resume path, the link is brought back from L2 to L0 by doing a
software reset.
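
In code terms, the suspend-path handshake above reduces to three steps
(a sketch using the regmap helper and the register/bit names from the
diff below; "idx" stands for the controller index):

  /* 1. Send PME_Turn_Off by setting the SoC-specific bit */
  regmap_write_bits(scfg, SCFG_PEXPMWRCR(idx), PMXMTTURNOFF, PMXMTTURNOFF);

  /* 2. There is no PME_To_Ack status to poll, so wait the full timeout */
  mdelay(PCIE_PME_TO_L2_TIMEOUT_US / 1000);

  /* 3. Clear the bit to complete the handshake */
  regmap_write_bits(scfg, SCFG_PEXPMWRCR(idx), PMXMTTURNOFF, 0);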

Signed-off-by: Frank Li 
---

Notes:
Change from v3 to v4
- update commit message.
- the reset is in the glue logic part of the PCI controller.
- use regmap_write_bits() to reduce code change.

Change from v2 to v3
- update according to mani's feedback
change from v1 to v2
- change subject 'a' to 'A'

 drivers/pci/controller/dwc/pci-layerscape.c | 83 -
 1 file changed, 82 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/controller/dwc/pci-layerscape.c 
b/drivers/pci/controller/dwc/pci-layerscape.c
index aea89926bcc4f..42bca2c3b5c3e 100644
--- a/drivers/pci/controller/dwc/pci-layerscape.c
+++ b/drivers/pci/controller/dwc/pci-layerscape.c
@@ -35,11 +35,19 @@
 #define PF_MCR_PTOMR   BIT(0)
 #define PF_MCR_EXL2S   BIT(1)
 
+/* LS1021A PEXn PM Write Control Register */
+#define SCFG_PEXPMWRCR(idx)	(0x5c + (idx) * 0x64)
+#define PMXMTTURNOFF   BIT(31)
+#define SCFG_PEXSFTRSTCR   0x190
+#define PEXSR(idx) BIT(idx)
+
 #define PCIE_IATU_NUM  6
 
 struct ls_pcie_drvdata {
const u32 pf_off;
+   const struct dw_pcie_host_ops *ops;
int (*exit_from_l2)(struct dw_pcie_rp *pp);
+   bool scfg_support;
bool pm_support;
 };
 
@@ -47,6 +55,8 @@ struct ls_pcie {
struct dw_pcie *pci;
const struct ls_pcie_drvdata *drvdata;
void __iomem *pf_base;
+   struct regmap *scfg;
+   int index;
bool big_endian;
 };
 
@@ -171,13 +181,65 @@ static int ls_pcie_host_init(struct dw_pcie_rp *pp)
return 0;
 }
 
+static void scfg_pcie_send_turnoff_msg(struct regmap *scfg, u32 reg, u32 mask)
+{
+   /* Send PME_Turn_Off message */
+   regmap_write_bits(scfg, reg, mask, mask);
+
+   /*
+* There is no specific register to check for PME_To_Ack from endpoint.
+* So on the safe side, wait for PCIE_PME_TO_L2_TIMEOUT_US.
+*/
+   mdelay(PCIE_PME_TO_L2_TIMEOUT_US/1000);
+
+   /*
+* Layerscape hardware reference manual recommends clearing the
+* PMXMTTURNOFF bit to complete the PME_Turn_Off handshake.
+*/
+   regmap_write_bits(scfg, reg, mask, 0);
+}
+
+static void ls1021a_pcie_send_turnoff_msg(struct dw_pcie_rp *pp)
+{
+   struct dw_pcie *pci = to_dw_pcie_from_pp(pp);
+   struct ls_pcie *pcie = to_ls_pcie(pci);
+
+   scfg_pcie_send_turnoff_msg(pcie->scfg, SCFG_PEXPMWRCR(pcie->index), PMXMTTURNOFF);
+}
+
+static int scfg_pcie_exit_from_l2(struct regmap *scfg, u32 reg, u32 mask)
+{
+   /* The only way to exit from L2 is to do a software reset */
+   regmap_write_bits(scfg, reg, mask, mask);
+
+   regmap_write_bits(scfg, reg, mask, 0);
+
+   return 0;
+}
+
+static int ls1021a_pcie_exit_from_l2(struct dw_pcie_rp *pp)
+{
+   struct dw_pcie *pci = to_dw_pcie_from_pp(pp);
+   struct ls_pcie *pcie = to_ls_pcie(pci);
+
+   return scfg_pcie_exit_from_l2(pcie->scfg, SCFG_PEXSFTRSTCR, PEXSR(pcie->index));
+}
+
 static const struct dw_pcie_host_ops ls_pcie_host_ops = {
.host_init = ls_pcie_host_init,
.pme_turn_off = ls_pcie_send_turnoff_msg,
 };
 
+static const struct dw_pcie_host_ops ls1021a_pcie_host_ops = {
+   .host_init = ls_pcie_host_init,
+   .pme_turn_off = ls1021a_pcie_send_turnoff_msg,
+};
+
 static const struct ls_pcie_drvdata ls1021a_drvdata = {
-   .pm_support = false,
+   .pm_support = true,
+   .scfg_support = true,
+   .ops = &ls1021a_pcie_host_ops,
+   .exit_from_l2 = ls1021a_pcie_exit_from_l2,
 };
 
 static const struct ls_pcie_drvdata layerscape_drvdata = {
@@ -205,6 +267,8 @@ static int ls_pcie_probe(struct platform_device *pdev)
struct dw_pcie *pci;
struct ls_pcie *pcie;
struct resource *dbi_base;
+   u32 index[2];
+   int ret;
 
pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL);
if (!pcie)
@@ -220,6 +284,7 @@ static int ls_pcie_probe(struct platform_device *pdev)
pci->pp.ops = &ls_pcie_host_ops;
 
pcie->pci = pci;
+   pci->pp.ops = pcie->drvdata->ops ? pcie->drvdata->ops : &ls_pcie_host_ops;
 
dbi_base = platform_get_resource_byname(pdev, IORESOURCE_MEM, "regs");
pci->dbi_base = devm_pci_remap_cfg_resource(dev, dbi_base);
@@ -230,6 +295,22 @@ static int ls_pcie_probe(struct platform_device *pdev)
 
pcie->pf_base = pci->dbi_base + pcie->drvdata->pf_off;
 
+   

[PATCH v4 1/4] PCI: layerscape: Add function pointer for exit_from_l2()

2023-11-29 Thread Frank Li
Since different SoCs require different sequences for exiting L2, let's add
a separate "exit_from_l2()" callback. This callback can be used to execute
a SoC-specific sequence.

Change ls_pcie_exit_from_l2()'s return value from void to int, and return
an error if exit_from_l2() fails in the resume flow.

Reviewed-by: Manivannan Sadhasivam 
Signed-off-by: Frank Li 
---

Notes:
Change from v3 to v4
- update commit message
  Add mani's review by tag
Change from v2 to v3
- fixed according to mani's feedback
  1. update commit message
  2. move dw_pcie_host_ops to next patch
  3. check return value from exit_from_l2()
Change from v1 to v2
- change subject 'a' to 'A'


 drivers/pci/controller/dwc/pci-layerscape.c | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/drivers/pci/controller/dwc/pci-layerscape.c 
b/drivers/pci/controller/dwc/pci-layerscape.c
index 37956e09c65bd..aea89926bcc4f 100644
--- a/drivers/pci/controller/dwc/pci-layerscape.c
+++ b/drivers/pci/controller/dwc/pci-layerscape.c
@@ -39,6 +39,7 @@
 
 struct ls_pcie_drvdata {
const u32 pf_off;
+   int (*exit_from_l2)(struct dw_pcie_rp *pp);
bool pm_support;
 };
 
@@ -125,7 +126,7 @@ static void ls_pcie_send_turnoff_msg(struct dw_pcie_rp *pp)
dev_err(pcie->pci->dev, "PME_Turn_off timeout\n");
 }
 
-static void ls_pcie_exit_from_l2(struct dw_pcie_rp *pp)
+static int ls_pcie_exit_from_l2(struct dw_pcie_rp *pp)
 {
struct dw_pcie *pci = to_dw_pcie_from_pp(pp);
struct ls_pcie *pcie = to_ls_pcie(pci);
@@ -150,6 +151,8 @@ static void ls_pcie_exit_from_l2(struct dw_pcie_rp *pp)
 10000);
if (ret)
dev_err(pcie->pci->dev, "L2 exit timeout\n");
+
+   return ret;
 }
 
 static int ls_pcie_host_init(struct dw_pcie_rp *pp)
@@ -180,6 +183,7 @@ static const struct ls_pcie_drvdata ls1021a_drvdata = {
 static const struct ls_pcie_drvdata layerscape_drvdata = {
.pf_off = 0xc,
.pm_support = true,
+   .exit_from_l2 = ls_pcie_exit_from_l2,
 };
 
 static const struct of_device_id ls_pcie_of_match[] = {
@@ -247,11 +251,14 @@ static int ls_pcie_suspend_noirq(struct device *dev)
 static int ls_pcie_resume_noirq(struct device *dev)
 {
struct ls_pcie *pcie = dev_get_drvdata(dev);
+   int ret;
 
if (!pcie->drvdata->pm_support)
return 0;
 
-   ls_pcie_exit_from_l2(&pcie->pci->pp);
+   ret = pcie->drvdata->exit_from_l2(&pcie->pci->pp);
+   if (ret)
+   return ret;
 
return dw_pcie_resume_noirq(pcie->pci);
 }
-- 
2.34.1



[PATCH v4 0/4] dwc general suspend/resume functionality

2023-11-29 Thread Frank Li
Add an API for dwc suspend/resume.
Call this API in the layerscape platform driver.

Frank Li (4):
  PCI: layerscape: Add function pointer for exit_from_l2()
  PCI: layerscape: Add suspend/resume for ls1021a
  PCI: layerscape: Rename pf_* as pf_lut_*
  PCI: layerscape: Add suspend/resume for ls1043a

 drivers/pci/controller/dwc/pci-layerscape.c | 191 +---
 1 file changed, 170 insertions(+), 21 deletions(-)

-- 
2.34.1



Re: [PATCH] perf test record+probe_libc_inet_pton: Fix call chain match on powerpc

2023-11-29 Thread Arnaldo Carvalho de Melo
On Sun, Nov 26, 2023 at 02:09:14AM -0500, Likhitha Korrapati wrote:
> The perf test "probe libc's inet_pton & backtrace it with ping" fails on
> powerpc as below:
> 
> root@xxx perf]# perf test -v "probe libc's inet_pton & backtrace it with
> ping"
>  85: probe libc's inet_pton & backtrace it with ping :
> --- start ---
> test child forked, pid 96028
> ping 96056 [002] 127271.101961: probe_libc:inet_pton: (7fffa1779a60)
> 7fffa1779a60 __GI___inet_pton+0x0
> (/usr/lib64/glibc-hwcaps/power10/libc.so.6)
> 7fffa172a73c getaddrinfo+0x121c
> (/usr/lib64/glibc-hwcaps/power10/libc.so.6)
> FAIL: expected backtrace entry
> "gaih_inet.*\+0x[[:xdigit:]]+[[:space:]]\(/usr/lib64/glibc-hwcaps/power10/libc.so.6\)$"
> got "7fffa172a73c getaddrinfo+0x121c
> (/usr/lib64/glibc-hwcaps/power10/libc.so.6)"
> test child finished with -1
>  end 

Try to have the quoted output indented two spaces at the beginning of
each line, so as to avoid:

perf test record+probe_libc_inet_pton: Fix call chain match on powerpc

The perf test "probe libc's inet_pton & backtrace it with ping" fails on
powerpc as below:

root@xxx perf]# perf test -v "probe libc's inet_pton & backtrace it with
ping"
 85: probe libc's inet_pton & backtrace it with ping :

# Please enter the commit message for your changes. Lines starting
# with '#' will be ignored, and an empty message aborts the commit.
#
# Author:Likhitha Korrapati 
# Date:  Sun Nov 26 02:09:14 2023 -0500


I'm copy and pasting from the original post, thanks!
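
For example, with the quoted log indented two spaces it survives intact
in the commit message:

  root@xxx perf]# perf test -v "probe libc's inet_pton & backtrace it with ping"
   85: probe libc's inet_pton & backtrace it with ping :
  --- start ---
  test child forked, pid 96028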

- Arnaldo

> probe libc's inet_pton & backtrace it with ping: FAILED!
> 
> This test installs a probe on libc's inet_pton function, which will use
> uprobes, and then uses perf trace on a ping to localhost. It gets a
> backtrace 3 levels deep and checks whether it is what we expected or not.
> 
> The test started failing from RHEL 9.4, whereas it works in the previous
> distro version (RHEL 9.2). The test expects the gaih_inet function to be
> part of the backtrace. But in the glibc version (2.34-86) that is part of
> the distro where it fails, this function is missing, and hence the test
> fails.
> 
> From nm and ping command output we can confirm that gaih_inet function
> is not present in the expected backtrace for glibc version glibc-2.34-86
> 
> [root@xxx perf]# nm /usr/lib64/glibc-hwcaps/power10/libc.so.6 | grep gaih_inet
> 001273e0 t gaih_inet_serv
> 001cd8d8 r gaih_inet_typeproto
> 
> [root@xxx perf]# perf script -i /tmp/perf.data.6E8
> ping  104048 [000] 128582.508976: probe_libc:inet_pton: (7fff83779a60)
> 7fff83779a60 __GI___inet_pton+0x0
> (/usr/lib64/glibc-hwcaps/power10/libc.so.6)
> 7fff8372a73c getaddrinfo+0x121c
> (/usr/lib64/glibc-hwcaps/power10/libc.so.6)
>11dc73534 [unknown] (/usr/bin/ping)
> 7fff8362a8c4 __libc_start_call_main+0x84
> (/usr/lib64/glibc-hwcaps/power10/libc.so.6)
> 
> FAIL: expected backtrace entry
> "gaih_inet.*\+0x[[:xdigit:]]+[[:space:]]\(/usr/lib64/glibc-hwcaps/power10/libc.so.6\)$"
> got "7fff9d52a73c getaddrinfo+0x121c
> (/usr/lib64/glibc-hwcaps/power10/libc.so.6)"
> 
> With version glibc-2.34-60 gaih_inet function is present as part of the
> expected backtrace. So we cannot just remove the gaih_inet function from
> the backtrace.
> 
> [root@xxx perf]# nm /usr/lib64/glibc-hwcaps/power10/libc.so.6 | grep gaih_inet
> 00130490 t gaih_inet.constprop.0
> 0012e830 t gaih_inet_serv
> 001d45e4 r gaih_inet_typeproto
> 
> [root@xxx perf]# ./perf script -i /tmp/perf.data.b6S
> ping   67906 [000] 22699.591699: probe_libc:inet_pton_3: (7fffbdd80820)
> 7fffbdd80820 __GI___inet_pton+0x0
> (/usr/lib64/glibc-hwcaps/power10/libc.so.6)
> 7fffbdd31160 gaih_inet.constprop.0+0xcd0
> (/usr/lib64/glibc-hwcaps/power10/libc.so.6)
> 7fffbdd31c7c getaddrinfo+0x14c
> (/usr/lib64/glibc-hwcaps/power10/libc.so.6)
>1140d3558 [unknown] (/usr/bin/ping)
> 
> This patch solves the issue by doing a conditional skip: if the
> gaih_inet function is present in libc, it is added to the expected
> backtrace; otherwise it is skipped and not added to the expected
> backtrace.
> 
> Output with the patch
> 
> [root@xxx perf]# ./perf test -v "probe libc's inet_pton & backtrace it
> with ping"
>  83: probe libc's inet_pton & backtrace it with ping :
> --- start ---
> test child forked, pid 102662
> ping 102692 [000] 127935.549973: probe_libc:inet_pton: (7fff93379a60)
> 7fff93379a60 __GI___inet_pton+0x0
> (/usr/lib64/glibc-hwcaps/power10/libc.so.6)
> 7fff9332a73c getaddrinfo+0x121c
> (/usr/lib64/glibc-hwcaps/power10/libc.so.6)
> 11ef03534 [unknown] (/usr/bin/ping)
> test child finished with 0
>  end 
> probe libc's inet_pton & backtrace it with ping: Ok
> 
> Signed-off-by: Likhitha Korrapati 
> Reported-by: Disha Goel 
> ---
>  tools/perf/tests/shell/record+probe_libc_inet_pton.sh | 5 -
>  1 file 

Re: Ping? Re: [PATCH rc] kvm: Prevent compiling virt/kvm/vfio.c unless VFIO is selected

2023-11-29 Thread Sean Christopherson
On Wed, Nov 29, 2023, Jason Gunthorpe wrote:
> On Tue, Nov 28, 2023 at 06:21:42PM -0800, Sean Christopherson wrote:
> > diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> > index 454e9295970c..a65b2513f8cd 100644
> > --- a/include/linux/vfio.h
> > +++ b/include/linux/vfio.h
> > @@ -289,16 +289,12 @@ void vfio_combine_iova_ranges(struct rb_root_cached 
> > *root, u32 cur_nodes,
> >  /*
> >   * External user API
> >   */
> > -#if IS_ENABLED(CONFIG_VFIO_GROUP)
> >  struct iommu_group *vfio_file_iommu_group(struct file *file);
> > +
> > +#if IS_ENABLED(CONFIG_VFIO_GROUP)
> >  bool vfio_file_is_group(struct file *file);
> >  bool vfio_file_has_dev(struct file *file, struct vfio_device *device);
> >  #else
> > -static inline struct iommu_group *vfio_file_iommu_group(struct file *file)
> > -{
> > -   return NULL;
> > -}
> > -
> >  static inline bool vfio_file_is_group(struct file *file)
> >  {
> > return false;
> > 
> 
> So you symbol_get() on a symbol that can never be defined? Still says to
> me the Kconfig needs fixing :|

Yeah, I completely agree, and if KVM didn't already rely on this horrific
behavior and there wasn't a more complete overhaul in-flight, I wouldn't suggest
this.

I'll send the KVM Kconfig/Makefile cleanups from my "Hide KVM internals from 
others"
series separately (which is still the bulk of the series) so as to prioritize
getting the cleanups landed.


Re: [PATCH 00/17] Prepare the PowerQUICC QMC and TSA for the HDLC QMC driver

2023-11-29 Thread Arnd Bergmann
On Tue, Nov 28, 2023, at 15:07, Herve Codina wrote:
> Hi,
>
> This series updates PowerQUICC QMC and TSA drivers to prepare the
> support for the QMC HDLC driver.
>
> Patches were previously sent as part of a full feature series:
> "Add support for QMC HDLC, framer infrastructure and PEF2256 framer" [1]
>
> The full feature series reached the v9 iteration.
> The v1 was sent on 07/25/2023, followed by the other iterations
> (07/26/2023, 08/09/2023, 08/18/2023, 09/12/2023, 09/22/2023, 09/28/2023,
> 10/11/23, 11/15/2023), and was ready to be merged in its v8.
>   https://lore.kernel.org/linux-kernel/20231025123215.5caca...@kernel.org/
>
> The lack of feedback from the Freescale SoC and the Quicc Engine
> maintainers (i.e. drivers/soc/fsl/qe/ to which the QMC and TSA drivers
> belong) blocks the entire full feature series.
> These patches are fixes and improvements to TSA and QMC drivers.
> These drivers were previously acked by Li Yang, but without any feedback
> from Li Yang or Qiang Zhao the series cannot move forward.
>
> In order to ease the review/merge, the full feature series has been
> split and this series contains patches related to the PowerQUICC SoC
> part (QMC and TSA).
>  - Perform some fixes (patches 1 to 5)
>  - Add support for child devices (patch 6)
>  - Add QMC dynamic timeslot support (patches 7 to 17)
>
> The patches were extracted from the original full feature series without
> any modification.

I took a rough look at the entire series and only have a few
minor comments on one patch; otherwise it looks fine to me.

I would still prefer for Li Yang to merge the patches and
send the pull request to s...@kernel.org, but if this also
gets stalled because of lack of feedback and there are no
objections to the content, you can send a PR there yourself.

 Arnd


Re: [PATCH 15/17] soc: fsl: cpm1: qmc: Handle timeslot entries at channel start() and stop()

2023-11-29 Thread Arnd Bergmann
On Tue, Nov 28, 2023, at 15:08, Herve Codina wrote:
> @@ -272,6 +274,8 @@ int qmc_chan_get_info(struct qmc_chan *chan, struct 
> qmc_chan_info *info)
>   if (ret)
>   return ret;
> 
> + spin_lock_irqsave(&chan->ts_lock, flags);
> +
>   info->mode = chan->mode;
>   info->rx_fs_rate = tsa_info.rx_fs_rate;
>   info->rx_bit_rate = tsa_info.rx_bit_rate;
> @@ -280,6 +284,8 @@ int qmc_chan_get_info(struct qmc_chan *chan, struct 
> qmc_chan_info *info)
>   info->tx_bit_rate = tsa_info.tx_bit_rate;
>   info->nb_rx_ts = hweight64(chan->rx_ts_mask);
> 
> + spin_unlock_irqrestore(&chan->ts_lock, flags);
> +
>   return 0;
>  }

I would normally use spin_lock_irq() instead of spin_lock_irqsave()
in functions that are only called outside of atomic context.
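
A minimal sketch of that variant, assuming qmc_chan_get_info() is only
ever called from non-atomic context (names taken from the quoted patch;
this is an illustration, not the actual driver change):

  spin_lock_irq(&chan->ts_lock);

  info->mode = chan->mode;
  info->nb_rx_ts = hweight64(chan->rx_ts_mask);

  spin_unlock_irq(&chan->ts_lock);

spin_lock_irq() always re-enables interrupts on unlock, which is safe
precisely when the caller can never already hold them disabled, and it
lets you drop the flags variable.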

> +static int qmc_chan_start_rx(struct qmc_chan *chan);
> +
>  int qmc_chan_stop(struct qmc_chan *chan, int direction)
>  {
... 
> -static void qmc_chan_start_rx(struct qmc_chan *chan)
> +static int qmc_setup_chan_trnsync(struct qmc *qmc, struct qmc_chan *chan);
> +
> +static int qmc_chan_start_rx(struct qmc_chan *chan)
>  {

Can you reorder the static functions in a way that avoids the
forward declarations?
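
For illustration, the idea reduced to a self-contained C program
(hypothetical names, not the driver's): define each callee before its
first caller and the forward declarations become unnecessary:

  #include <stdio.h>

  static int helper(int x)        /* callee defined first */
  {
          return x * 2;
  }

  static int caller(int x)        /* caller follows its callee */
  {
          return helper(x) + 1;
  }

  int main(void)
  {
          printf("%d\n", caller(20));  /* prints 41 */
          return 0;
  }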

 Arnd


Re: [PATCH 1/5] powerpc/suspend: Add prototype for do_after_copyback()

2023-11-29 Thread Arnd Bergmann
On Wed, Nov 29, 2023, at 14:19, Michael Ellerman wrote:
> With HIBERNATION=y the build breaks with:
>
>   arch/powerpc/kernel/swsusp_64.c:14:6: error: no previous prototype 
> for ‘do_after_copyback’ [-Werror=missing-prototypes]
>   14 | void do_after_copyback(void)
>  |  ^
>
> do_after_copyback() is only called from asm, so there is no prototype,
> nor any header where it makes sense to place one. Just add a prototype
> in the C file to fix the build error.
>
> Signed-off-by: Michael Ellerman 

Thanks for helping out with the rest. For the series (aside from the
very minor comment):

Reviewed-by: Arnd Bergmann 


Re: [PATCH 5/5] powerpc/64s: Fix CONFIG_NUMA=n build

2023-11-29 Thread Arnd Bergmann
On Wed, Nov 29, 2023, at 14:19, Michael Ellerman wrote:
> diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
> index 7f9ff0640124..72341b9fb552 100644
> --- a/arch/powerpc/mm/mmu_decl.h
> +++ b/arch/powerpc/mm/mmu_decl.h
> +
> +#ifdef CONFIG_MEMORY_HOTPLUG
> +int create_section_mapping(unsigned long start, unsigned long end,
> +int nid, pgprot_t prot);
> +#endif

This one should probably go next to the remove_section_mapping()
declaration in arch/powerpc/include/asm/sparsemem.h for consistency.

I would personally remove the #ifdef as well, but there is
already one in that file.
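
A sketch of what that placement might look like (hypothetical,
abbreviated to the two related declarations):

  /* arch/powerpc/include/asm/sparsemem.h */
  #ifdef CONFIG_MEMORY_HOTPLUG
  int create_section_mapping(unsigned long start, unsigned long end,
                             int nid, pgprot_t prot);
  extern int remove_section_mapping(unsigned long start, unsigned long end);
  #endif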

 Arnd


Re: [PATCH v4 05/13] powerpc/rtas: Facilitate high-level call sequences

2023-11-29 Thread Michael Ellerman
Nathan Lynch  writes:
> Michael Ellerman  writes:
>
>> Nathan Lynch via B4 Relay 
>> writes:
>>> From: Nathan Lynch 
>>>
>>> On RTAS platforms there is a general restriction that the OS must not
>>> enter RTAS on more than one CPU at a time. This low-level
>>> serialization requirement is satisfied by holding a spin
>>> lock (rtas_lock) across most RTAS function invocations.
>> ...
>>> diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
>>> index 1fc0b3fffdd1..52f2242d0c28 100644
>>> --- a/arch/powerpc/kernel/rtas.c
>>> +++ b/arch/powerpc/kernel/rtas.c
>>> @@ -581,6 +652,28 @@ static const struct rtas_function 
>>> *rtas_token_to_function(s32 token)
>>> return NULL;
>>>  }
>>>  
>>> +static void __rtas_function_lock(struct rtas_function *func)
>>> +{
>>> +   if (func && func->lock)
>>> +   mutex_lock(func->lock);
>>> +}
>>
>> This is obviously going to defeat most static analysis tools.
>
> I guess it's not that obvious to me :-) Is it because the mutex_lock()
> is conditional? I'll improve this if it's possible.

Well maybe I'm not giving modern static analysis tools enough credit :)

But what I mean is that it's not easy to reason about what the function
does in isolation. ie. all you can say is that it may or may not lock a
mutex, and you can't say which mutex.
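
A hypothetical contrast, reduced to userspace C with pthreads (not the
RTAS code itself), of the two patterns:

  #include <pthread.h>

  static pthread_mutex_t cfg_lock = PTHREAD_MUTEX_INITIALIZER;
  static int cfg_val;

  /* Hard to reason about in isolation: may or may not lock, and
   * which mutex is only known from the argument at run time. */
  static void maybe_lock(pthread_mutex_t *m)
  {
          if (m)
                  pthread_mutex_lock(m);
  }

  /* Easy to reason about: always locks, and the mutex is named
   * statically, so lock/unlock pairing can be checked. */
  static void set_cfg(int v)
  {
          pthread_mutex_lock(&cfg_lock);
          cfg_val = v;
          pthread_mutex_unlock(&cfg_lock);
  }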

>> I assume lockdep is OK with it though?
>
> Seems to be, yes.

OK.

cheers


[PATCH 3/5] powerpc/512x: Fix missing prototype warnings

2023-11-29 Thread Michael Ellerman
The mpc512x_defconfig build fails with:

  arch/powerpc/platforms/512x/mpc5121_ads_cpld.c:142:1: error: no previous 
prototype for ‘mpc5121_ads_cpld_map’ [-Werror=missing-prototypes]
  142 | mpc5121_ads_cpld_map(void)
  | ^~~~
  arch/powerpc/platforms/512x/mpc5121_ads_cpld.c:157:1: error: no previous 
prototype for ‘mpc5121_ads_cpld_pic_init’ [-Werror=missing-prototypes]
  157 | mpc5121_ads_cpld_pic_init(void)
  | ^

There are prototypes for these functions but the header they are in is
not included by mpc5121_ads_cpld.c. Include it to fix the build error.

Signed-off-by: Michael Ellerman 
---
 arch/powerpc/platforms/512x/mpc5121_ads_cpld.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/powerpc/platforms/512x/mpc5121_ads_cpld.c 
b/arch/powerpc/platforms/512x/mpc5121_ads_cpld.c
index 6f08d07aee3b..e995eb30bf09 100644
--- a/arch/powerpc/platforms/512x/mpc5121_ads_cpld.c
+++ b/arch/powerpc/platforms/512x/mpc5121_ads_cpld.c
@@ -17,6 +17,8 @@
 #include 
 #include 
 
+#include "mpc5121_ads.h"
+
 static struct device_node *cpld_pic_node;
 static struct irq_domain *cpld_pic_host;
 
-- 
2.41.0



[PATCH 5/5] powerpc/64s: Fix CONFIG_NUMA=n build

2023-11-29 Thread Michael Ellerman
With CONFIG_NUMA=n the build fails with:

  arch/powerpc/mm/book3s64/pgtable.c:275:15: error: no previous prototype for 
‘create_section_mapping’ [-Werror=missing-prototypes]
  275 | int __meminit create_section_mapping(unsigned long start, unsigned long 
end,
  |   ^~

That happens because the prototype for create_section_mapping() is in
asm/mmzone.h, but asm/mmzone.h is only included by linux/mmzone.h
when CONFIG_NUMA=y.

In fact the prototype is only needed by arch/powerpc/mm code, so move
the prototype into arch/powerpc/mm/mmu_decl.h, which also fixes the
build error.

Signed-off-by: Michael Ellerman 
---
 arch/powerpc/include/asm/mmzone.h | 5 -
 arch/powerpc/mm/mmu_decl.h| 5 +
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/mmzone.h 
b/arch/powerpc/include/asm/mmzone.h
index 4c6c6dbd182f..4740ca230d36 100644
--- a/arch/powerpc/include/asm/mmzone.h
+++ b/arch/powerpc/include/asm/mmzone.h
@@ -46,10 +46,5 @@ u64 memory_hotplug_max(void);
 #define __HAVE_ARCH_RESERVED_KERNEL_PAGES
 #endif
 
-#ifdef CONFIG_MEMORY_HOTPLUG
-extern int create_section_mapping(unsigned long start, unsigned long end,
- int nid, pgprot_t prot);
-#endif
-
 #endif /* __KERNEL__ */
 #endif /* _ASM_MMZONE_H_ */
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index 7f9ff0640124..72341b9fb552 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -181,3 +181,8 @@ static inline bool debug_pagealloc_enabled_or_kfence(void)
 {
return IS_ENABLED(CONFIG_KFENCE) || debug_pagealloc_enabled();
 }
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+int create_section_mapping(unsigned long start, unsigned long end,
+  int nid, pgprot_t prot);
+#endif
-- 
2.41.0



[PATCH 4/5] powerpc/44x: Make ppc44x_idle_init() static

2023-11-29 Thread Michael Ellerman
The 44x/fsp2_defconfig build fails with:

  arch/powerpc/platforms/44x/idle.c:30:12: error: no previous prototype for 
‘ppc44x_idle_init’ [-Werror=missing-prototypes]
  30 | int __init ppc44x_idle_init(void)
 |^~~~

Fix it by making ppc44x_idle_init() static.

Signed-off-by: Michael Ellerman 
---
 arch/powerpc/platforms/44x/idle.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/44x/idle.c 
b/arch/powerpc/platforms/44x/idle.c
index f533b495e7db..e2eeef8dff78 100644
--- a/arch/powerpc/platforms/44x/idle.c
+++ b/arch/powerpc/platforms/44x/idle.c
@@ -27,7 +27,7 @@ static void ppc44x_idle(void)
isync();
 }
 
-int __init ppc44x_idle_init(void)
+static int __init ppc44x_idle_init(void)
 {
if (!mode_spin) {
/* If we are not setting spin mode 
-- 
2.41.0



[PATCH 2/5] powerpc/512x: Make pdm360ng_init() static

2023-11-29 Thread Michael Ellerman
The mpc512x_defconfig build fails with:

  arch/powerpc/platforms/512x/pdm360ng.c:104:13: error: no previous prototype 
for ‘pdm360ng_init’ [-Werror=missing-prototypes]
  104 | void __init pdm360ng_init(void)
  | ^

Fix it by making pdm360ng_init() static.

Signed-off-by: Michael Ellerman 
---
 arch/powerpc/platforms/512x/pdm360ng.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/512x/pdm360ng.c 
b/arch/powerpc/platforms/512x/pdm360ng.c
index ce51cfeeb066..8bbbf78bb42b 100644
--- a/arch/powerpc/platforms/512x/pdm360ng.c
+++ b/arch/powerpc/platforms/512x/pdm360ng.c
@@ -101,7 +101,7 @@ static inline void __init pdm360ng_touchscreen_init(void)
 }
 #endif /* CONFIG_TOUCHSCREEN_ADS7846 */
 
-void __init pdm360ng_init(void)
+static void __init pdm360ng_init(void)
 {
mpc512x_init();
pdm360ng_touchscreen_init();
-- 
2.41.0



[PATCH 1/5] powerpc/suspend: Add prototype for do_after_copyback()

2023-11-29 Thread Michael Ellerman
With HIBERNATION=y the build breaks with:

  arch/powerpc/kernel/swsusp_64.c:14:6: error: no previous prototype for 
‘do_after_copyback’ [-Werror=missing-prototypes]
  14 | void do_after_copyback(void)
 |  ^

do_after_copyback() is only called from asm, so there is no prototype,
nor any header where it makes sense to place one. Just add a prototype
in the C file to fix the build error.

Signed-off-by: Michael Ellerman 
---
 arch/powerpc/kernel/swsusp_64.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/powerpc/kernel/swsusp_64.c b/arch/powerpc/kernel/swsusp_64.c
index 16ee3baaf09a..50fa8fc9ef95 100644
--- a/arch/powerpc/kernel/swsusp_64.c
+++ b/arch/powerpc/kernel/swsusp_64.c
@@ -11,6 +11,8 @@
 #include 
 #include 
 
+void do_after_copyback(void);
+
 void do_after_copyback(void)
 {
iommu_restore();
-- 
2.41.0



Re: Ping? Re: [PATCH rc] kvm: Prevent compiling virt/kvm/vfio.c unless VFIO is selected

2023-11-29 Thread Jason Gunthorpe
On Tue, Nov 28, 2023 at 06:21:42PM -0800, Sean Christopherson wrote:
> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> index 454e9295970c..a65b2513f8cd 100644
> --- a/include/linux/vfio.h
> +++ b/include/linux/vfio.h
> @@ -289,16 +289,12 @@ void vfio_combine_iova_ranges(struct rb_root_cached 
> *root, u32 cur_nodes,
>  /*
>   * External user API
>   */
> -#if IS_ENABLED(CONFIG_VFIO_GROUP)
>  struct iommu_group *vfio_file_iommu_group(struct file *file);
> +
> +#if IS_ENABLED(CONFIG_VFIO_GROUP)
>  bool vfio_file_is_group(struct file *file);
>  bool vfio_file_has_dev(struct file *file, struct vfio_device *device);
>  #else
> -static inline struct iommu_group *vfio_file_iommu_group(struct file *file)
> -{
> -   return NULL;
> -}
> -
>  static inline bool vfio_file_is_group(struct file *file)
>  {
> return false;
> 

So you symbol_get() on a symbol that can never be defined? Still says to
me the Kconfig needs fixing :|

But sure, as a small patch it looks fine

Thanks,
Jason


Re: [PATCH] powerpc: Don't clobber fr0/vs0 during fp|altivec register save

2023-11-29 Thread Christophe Leroy


On 29/11/2023 at 11:30, Salvatore Bonaccorso wrote:
> Hi,
> 
> On Sun, Nov 19, 2023 at 06:14:50AM -0700, Jens Axboe wrote:
>> On 11/18/23 4:45 PM, Timothy Pearson wrote:
>>> During floating point and vector save to thread data fr0/vs0 are clobbered
>>> by the FPSCR/VSCR store routine.  This leads to userspace register 
>>> corruption
>>> and application data corruption / crash under the following rare condition:
>>>
>>>   * A userspace thread is executing with VSX/FP mode enabled
>>>   * The userspace thread is making active use of fr0 and/or vs0
>>>   * An IPI is taken in kernel mode, forcing the userspace thread to 
>>> reschedule
>>>   * The userspace thread is interrupted by the IPI before accessing data it
>>> previously stored in fr0/vs0
>>>   * The thread being switched in by the IPI has a pending signal
>>>
>>> If these exact criteria are met, then the following sequence happens:
>>>
>>>   * The existing thread FP storage is still valid before the IPI, due to a
>>> prior call to save_fpu() or store_fp_state().  Note that the current
>>> fr0/vs0 registers have been clobbered, so the FP/VSX state in registers
>>> is now invalid pending a call to restore_fp()/restore_altivec().
>>>   * IPI -- FP/VSX register state remains invalid
>>>   * interrupt_exit_user_prepare_main() calls do_notify_resume(),
>>> due to the pending signal
>>>   * do_notify_resume() eventually calls save_fpu() via giveup_fpu(), which
>>> merrily reads and saves the invalid FP/VSX state to thread local 
>>> storage.
>>>   * interrupt_exit_user_prepare_main() calls restore_math(), writing the 
>>> invalid
>>> FP/VSX state back to registers.
>>>   * Execution is released to userspace, and the application crashes or 
>>> corrupts
>>> data.
>>
>> What an epic bug hunt! Hats off to you for seeing it through and getting
>> to the bottom of it. Particularly difficult as the commit that made it
>> easier to trigger was in no way related to where the actual bug was.
>>
>> I ran this on the vm I have access to, and it survived 2x500 iterations.
>> Happy to call that good:
>>
>> Tested-by: Jens Axboe 
> 
> Thanks to all involved!
> 
> Is this going to land soon in mainline so it can be picked as well for
> the affected stable trees?
> 

This version of the patch has been superseded.

As said by Michael in the relevant thread, the plan is to have version 2 
of this patch in 6.7-rc4, see 
https://patchwork.ozlabs.org/project/linuxppc-dev/patch/1921539696.48534988.1700407082933.javamail.zim...@raptorengineeringinc.com/

Christophe


Re: [PATCH] powerpc: Don't clobber fr0/vs0 during fp|altivec register save

2023-11-29 Thread Salvatore Bonaccorso
Hi,

On Sun, Nov 19, 2023 at 06:14:50AM -0700, Jens Axboe wrote:
> On 11/18/23 4:45 PM, Timothy Pearson wrote:
> > During floating point and vector save to thread data fr0/vs0 are clobbered
> > by the FPSCR/VSCR store routine.  This leads to userspace register 
> > corruption
> > and application data corruption / crash under the following rare condition:
> > 
> >  * A userspace thread is executing with VSX/FP mode enabled
> >  * The userspace thread is making active use of fr0 and/or vs0
> >  * An IPI is taken in kernel mode, forcing the userspace thread to 
> > reschedule
> >  * The userspace thread is interrupted by the IPI before accessing data it
> >previously stored in fr0/vs0
> >  * The thread being switched in by the IPI has a pending signal
> > 
> > If these exact criteria are met, then the following sequence happens:
> > 
> >  * The existing thread FP storage is still valid before the IPI, due to a
> >prior call to save_fpu() or store_fp_state().  Note that the current
> >fr0/vs0 registers have been clobbered, so the FP/VSX state in registers
> >is now invalid pending a call to restore_fp()/restore_altivec().
> >  * IPI -- FP/VSX register state remains invalid
> >  * interrupt_exit_user_prepare_main() calls do_notify_resume(),
> >due to the pending signal
> >  * do_notify_resume() eventually calls save_fpu() via giveup_fpu(), which
> >merrily reads and saves the invalid FP/VSX state to thread local storage.
> >  * interrupt_exit_user_prepare_main() calls restore_math(), writing the 
> > invalid
> >FP/VSX state back to registers.
> >  * Execution is released to userspace, and the application crashes or 
> > corrupts
> >data.
> 
> What an epic bug hunt! Hats off to you for seeing it through and getting
> to the bottom of it. Particularly difficult as the commit that made it
> easier to trigger was in no way related to where the actual bug was.
> 
> I ran this on the vm I have access to, and it survived 2x500 iterations.
> Happy to call that good:
> 
> Tested-by: Jens Axboe 

Thanks to all involved!

Is this going to land soon in mainline so it can be picked as well for
the affected stable trees?

Regards,
Salvatore


Re: Ping? Re: [PATCH rc] kvm: Prevent compiling virt/kvm/vfio.c unless VFIO is selected

2023-11-29 Thread Michael Ellerman
Sean Christopherson  writes:
> On Fri, Nov 10, 2023, Michael Ellerman wrote:
>> Jason Gunthorpe  writes:
>> > There are a bunch of reported randconfig failures now because of this,
>> > something like:
>> >
>> >>> arch/powerpc/kvm/../../../virt/kvm/vfio.c:89:7: warning: attribute 
>> >>> declaration must precede definition [-Wignored-attributes]
>> >fn = symbol_get(vfio_file_iommu_group);
>> > ^
>> >include/linux/module.h:805:60: note: expanded from macro 'symbol_get'
>> >#define symbol_get(x) ({ extern typeof(x) x 
>> > __attribute__((weak,visibility("hidden"))); &(x); })
>> >
>> > It happens because the arch forces KVM_VFIO without knowing if VFIO is
>> > even enabled.
>> 
>> This is still breaking some builds. Can we get this fix in please?
>> 
>> cheers
>> 
>> > Split the kconfig so the arch selects the usual HAVE_KVM_ARCH_VFIO and
>> > then KVM_VFIO is only enabled if the arch wants it and VFIO is turned on.
>
> Heh, so I was trying to figure out why things like vfio_file_set_kvm() aren't
> problematic, i.e. why the existing mess didn't cause failures.  I can't repro 
> the
> warning (requires clang-16?), but IIUC the reason only the group code is 
> problematic
> is that vfio.h creates a stub for vfio_file_iommu_group() and thus there's no 
> symbol,
> whereas vfio.h declares vfio_file_set_kvm() unconditionally.

That warning I'm unsure about.

But the final report linked in Jason's mail shows a different one:

   In file included from arch/powerpc/kvm/../../../virt/kvm/vfio.c:17:
   include/linux/vfio.h: In function 'kvm_vfio_file_iommu_group':
   include/linux/vfio.h:294:35: error: weak declaration of 
'vfio_file_iommu_group' being applied to a already existing, static definition
 294 | static inline struct iommu_group *vfio_file_iommu_group(struct file 
*file)
 |   ^

Which is simple to reproduce, just build ppc64le_defconfig and then turn
off CONFIG_MODULES (I'm using GCC 13, the report is for GCC 12).
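
The failure mode can also be reduced to a few lines of C. A
hypothetical, standalone reproducer (not the kernel headers themselves),
assuming symbol_get() expands to the weak extern declaration quoted
earlier in the thread; gcc -c repro.c should fail the same way:

  /* repro.c */
  struct iommu_group;
  struct file;

  /* The stub that vfio.h provides when CONFIG_VFIO_GROUP=n */
  static inline struct iommu_group *vfio_file_iommu_group(struct file *file)
  {
          return (struct iommu_group *)0;
  }

  void kvm_vfio_probe(void)
  {
          /* Roughly what symbol_get(vfio_file_iommu_group) expands to */
          extern typeof(vfio_file_iommu_group) vfio_file_iommu_group
                  __attribute__((weak, visibility("hidden")));
          (void)&vfio_file_iommu_group;
  }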

> Because KVM is doing symbol_get() and not taking a direct dependency, the 
> lack of
> an exported symbol doesn't cause problems, i.e. simply declaring the symbol 
> makes
> the compiler happy.
>
> Given that the vfio_file_iommu_group() stub shouldn't exist (KVM is the only 
> user,
> and so if I'm correct the stub is worthless), what about this as a temporary 
> "fix"?
>
> I'm 100% on-board with fixing KVM properly, my motivation is purely to 
> minimize
> the total amount of churn.  E.g. if this works, then the only extra churn is 
> to
> move the declaration of vfio_file_iommu_group() back under the #if, versus 
> having
> to churn all of the KVM Kconfigs twice (once now, and again for the full 
> cleanup).

Fine by me.

> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> index 454e9295970c..a65b2513f8cd 100644
> --- a/include/linux/vfio.h
> +++ b/include/linux/vfio.h
> @@ -289,16 +289,12 @@ void vfio_combine_iova_ranges(struct rb_root_cached 
> *root, u32 cur_nodes,
>  /*
>   * External user API
>   */
> -#if IS_ENABLED(CONFIG_VFIO_GROUP)
>  struct iommu_group *vfio_file_iommu_group(struct file *file);
> +
> +#if IS_ENABLED(CONFIG_VFIO_GROUP)
>  bool vfio_file_is_group(struct file *file);
>  bool vfio_file_has_dev(struct file *file, struct vfio_device *device);
>  #else
> -static inline struct iommu_group *vfio_file_iommu_group(struct file *file)
> -{
> -   return NULL;
> -}
> -
>  static inline bool vfio_file_is_group(struct file *file)
>  {
> return false;

That fixes the build for me.

Tested-by: Michael Ellerman 


cheers


Re: linux-next: duplicate patches in the char-misc tree

2023-11-29 Thread Greg KH
On Wed, Nov 29, 2023 at 12:24:05PM +1100, Stephen Rothwell wrote:
> Hi all,
> 
> The following commits are also in the powerpc tree as different commits
> (but the same patches):
> 
>   bc1183a63057 ("misc: ocxl: main: Remove unnecessary ‘0’ values from rc")
>   29eb0dc7bd1e ("misc: ocxl: link: Remove unnecessary (void*) conversions")
>   0e425d703c30 ("misc: ocxl: afu_irq: Remove unnecessary (void*) conversions")
>   62df29a542f9 ("misc: ocxl: context: Remove unnecessary (void*) conversions")
> 
> These are commits
> 
>   29685ea5754f ("misc: ocxl: main: Remove unnecessary ‘0’ values from rc")
>   220f3ced8e42 ("misc: ocxl: link: Remove unnecessary (void*) conversions")
>   84ba5d3675e2 ("misc: ocxl: afu_irq: Remove unnecessary (void*) conversions")
>   82d30723d58f ("misc: ocxl: context: Remove unnecessary (void*) conversions")
> 
> in the powerpc tree.

Thanks, that should be fine, I didn't realize these ended up in the
powerpc tree already.

greg k-h


[PATCH v2] powerpc: Add PVN support for HeXin C2000 processor

2023-11-29 Thread Zhao Ke
HeXin Tech Co. has applied for a new PVN from the OpenPower Community
for its new processor C2000. The OpenPower Community has assigned the
new PVN 0x0066; add PVR register related support for this PVN.

Signed-off-by: Zhao Ke 
Link: 
https://discuss.openpower.foundation/t/how-to-get-a-new-pvr-for-processors-follow-power-isa/477/10
---
v1 -> v2:
- Fix pvr_mask and cpu_name
- Fix alignment pattern to match other lines
v0 -> v1:
- Fix .cpu_name with the correct description
---
---
 arch/powerpc/include/asm/reg.h|  1 +
 arch/powerpc/kernel/cpu_specs_book3s_64.h | 15 +++
 arch/powerpc/kvm/book3s_pr.c  |  1 +
 arch/powerpc/mm/book3s64/pkeys.c  |  3 ++-
 arch/powerpc/platforms/powernv/subcore.c  |  3 ++-
 drivers/misc/cxl/cxl.h|  3 ++-
 6 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 4ae4ab9090a2..7fd09f25452d 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -1361,6 +1361,7 @@
 #define PVR_POWER8E0x004B
 #define PVR_POWER8NVL  0x004C
 #define PVR_POWER8 0x004D
+#define PVR_HX_C2000   0x0066
 #define PVR_POWER9 0x004E
 #define PVR_POWER100x0080
 #define PVR_BE 0x0070
diff --git a/arch/powerpc/kernel/cpu_specs_book3s_64.h 
b/arch/powerpc/kernel/cpu_specs_book3s_64.h
index c370c1b804a9..3ff9757df4c0 100644
--- a/arch/powerpc/kernel/cpu_specs_book3s_64.h
+++ b/arch/powerpc/kernel/cpu_specs_book3s_64.h
@@ -238,6 +238,21 @@ static struct cpu_spec cpu_specs[] __initdata = {
.machine_check_early= __machine_check_early_realmode_p8,
.platform   = "power8",
},
+   {   /* 2.07-compliant processor, HeXin C2000 processor */
+   .pvr_mask   = 0x,
+   .pvr_value  = 0x0066,
+   .cpu_name   = "HX-C2000",
+   .cpu_features   = CPU_FTRS_POWER8,
+   .cpu_user_features  = COMMON_USER_POWER8,
+   .cpu_user_features2 = COMMON_USER2_POWER8,
+   .mmu_features   = MMU_FTRS_POWER8,
+   .icache_bsize   = 128,
+   .dcache_bsize   = 128,
+   .cpu_setup  = __setup_cpu_power8,
+   .cpu_restore= __restore_cpu_power8,
+   .machine_check_early= __machine_check_early_realmode_p8,
+   .platform   = "power8",
+   },
{   /* 3.00-compliant processor, i.e. Power9 "architected" mode */
.pvr_mask   = 0x,
.pvr_value  = 0x0f05,
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 9118242063fb..5b92619a05fd 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -604,6 +604,7 @@ static void kvmppc_set_pvr_pr(struct kvm_vcpu *vcpu, u32 
pvr)
case PVR_POWER8:
case PVR_POWER8E:
case PVR_POWER8NVL:
+   case PVR_HX_C2000:
case PVR_POWER9:
vcpu->arch.hflags |= BOOK3S_HFLAG_MULTI_PGSIZE |
BOOK3S_HFLAG_NEW_TLBIE;
diff --git a/arch/powerpc/mm/book3s64/pkeys.c b/arch/powerpc/mm/book3s64/pkeys.c
index 125733962033..a974baf8f327 100644
--- a/arch/powerpc/mm/book3s64/pkeys.c
+++ b/arch/powerpc/mm/book3s64/pkeys.c
@@ -89,7 +89,8 @@ static int __init scan_pkey_feature(void)
unsigned long pvr = mfspr(SPRN_PVR);
 
	if (PVR_VER(pvr) == PVR_POWER8 || PVR_VER(pvr) == PVR_POWER8E ||
-	    PVR_VER(pvr) == PVR_POWER8NVL || PVR_VER(pvr) == PVR_POWER9)
+	    PVR_VER(pvr) == PVR_POWER8NVL || PVR_VER(pvr) == PVR_POWER9 ||
+	    PVR_VER(pvr) == PVR_HX_C2000)
pkeys_total = 32;
}
}
diff --git a/arch/powerpc/platforms/powernv/subcore.c 
b/arch/powerpc/platforms/powernv/subcore.c
index 191424468f10..393e747541fb 100644
--- a/arch/powerpc/platforms/powernv/subcore.c
+++ b/arch/powerpc/platforms/powernv/subcore.c
@@ -425,7 +425,8 @@ static int subcore_init(void)
 
if (pvr_ver != PVR_POWER8 &&
pvr_ver != PVR_POWER8E &&
-   pvr_ver != PVR_POWER8NVL)
+   pvr_ver != PVR_POWER8NVL &&
+   pvr_ver != PVR_HX_C2000)
return 0;
 
/*
diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h
index 0562071cdd4a..6ad0ab892675 100644
--- a/drivers/misc/cxl/cxl.h
+++ b/drivers/misc/cxl/cxl.h
@@ -836,7 +836,8 @@ static inline bool cxl_is_power8(void)
 {
if ((pvr_version_is(PVR_POWER8E)) ||
(pvr_version_is(PVR_POWER8NVL)) ||
-   (pvr_version_is(PVR_POWER8)))
+   (pvr_version_is(PVR_POWER8)) ||
+