Re: [RFC PATCH] powerpc/lib: Fixing use a temporary mm for code patching

2020-04-14 Thread Christopher M Riedl
> On March 26, 2020 9:42 AM Christophe Leroy  wrote:
> 
>  
> This patch fixes the RFC series identified below.
> It fixes three points:
> - Failure with CONFIG_PPC_KUAP
> - Failure to write due to lack of DIRTY bit set on the 8xx
> - Inadequately complex WARN post-verification
> 
> However, it has an impact on the CPU load. Here is the time
> needed on an 8xx to run the ftrace selftests without and
> with this series:
> - Without CONFIG_STRICT_KERNEL_RWX ==> 38 seconds
> - With CONFIG_STRICT_KERNEL_RWX   ==> 40 seconds
> - With CONFIG_STRICT_KERNEL_RWX + this series ==> 43 seconds
> 
> Link: https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=166003
> Signed-off-by: Christophe Leroy 
> ---
>  arch/powerpc/lib/code-patching.c | 5 -
>  1 file changed, 4 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/lib/code-patching.c 
> b/arch/powerpc/lib/code-patching.c
> index f156132e8975..4ccff427592e 100644
> --- a/arch/powerpc/lib/code-patching.c
> +++ b/arch/powerpc/lib/code-patching.c
> @@ -97,6 +97,7 @@ static int map_patch(const void *addr, struct patch_mapping 
> *patch_mapping)
>   }
>  
>   pte = mk_pte(page, pgprot);
> + pte = pte_mkdirty(pte);
>   set_pte_at(patching_mm, patching_addr, ptep, pte);
>  
>   init_temp_mm(&patch_mapping->temp_mm, patching_mm);
> @@ -168,7 +169,9 @@ static int do_patch_instruction(unsigned int *addr, 
> unsigned int instr)
>   (offset_in_page((unsigned long)addr) /
>   sizeof(unsigned int));
>  
> + allow_write_to_user(patch_addr, sizeof(instr));
>   __patch_instruction(addr, instr, patch_addr);
> + prevent_write_to_user(patch_addr, sizeof(instr));
> 

On radix we can map the page with PAGE_KERNEL protection which ends up
setting EAA[0] in the radix PTE. This means the KUAP (AMR) protection is
ignored (ISA v3.0b Fig. 35) since we are accessing the page from MSR[PR]=0.

Can we employ a similar approach on the 8xx? I would prefer *not* to wrap
the __patch_instruction() with the allow_/prevent_write_to_user() KUAP things
because this is a temporary kernel mapping which really isn't userspace in
the usual sense.
 
>   err = unmap_patch(&patch_mapping);
>   if (err)
> @@ -179,7 +182,7 @@ static int do_patch_instruction(unsigned int *addr, 
> unsigned int instr)
>* think we just wrote.
>* XXX: BUG_ON() instead?
>*/
> - WARN_ON(memcmp(addr, &instr, sizeof(instr)));
> + WARN_ON(*addr != instr);
>  
>  out:
>   local_irq_restore(flags);
> -- 
> 2.25.0


Re: [RFC PATCH 3/3] powerpc/lib: Use a temporary mm for code patching

2020-04-14 Thread Christopher M Riedl
> On March 24, 2020 11:25 AM Christophe Leroy  wrote:
> 
>  
> Le 23/03/2020 à 05:52, Christopher M. Riedl a écrit :
> > Currently, code patching a STRICT_KERNEL_RWX kernel exposes the temporary
> > mappings to other CPUs. These mappings should be kept local to the CPU
> > doing the patching. Use the pre-initialized temporary mm and patching
> > address for this purpose. Also add a check after patching to ensure the
> > patch succeeded.
> > 
> > Based on x86 implementation:
> > 
> > commit b3fd8e83ada0
> > ("x86/alternatives: Use temporary mm for text poking")
> > 
> > Signed-off-by: Christopher M. Riedl 
> > ---
> >   arch/powerpc/lib/code-patching.c | 128 ++-
> >   1 file changed, 57 insertions(+), 71 deletions(-)
> > 
> > diff --git a/arch/powerpc/lib/code-patching.c 
> > b/arch/powerpc/lib/code-patching.c
> > index 18b88ecfc5a8..f156132e8975 100644
> > --- a/arch/powerpc/lib/code-patching.c
> > +++ b/arch/powerpc/lib/code-patching.c
> > @@ -19,6 +19,7 @@
> >   #include 
> >   #include 
> >   #include 
> > +#include 
> >   
> >   static int __patch_instruction(unsigned int *exec_addr, unsigned int 
> > instr,
> >unsigned int *patch_addr)
> > @@ -65,99 +66,79 @@ void __init poking_init(void)
> > pte_unmap_unlock(ptep, ptl);
> >   }
> >   
> > -static DEFINE_PER_CPU(struct vm_struct *, text_poke_area);
> > -
> > -static int text_area_cpu_up(unsigned int cpu)
> > -{
> > -   struct vm_struct *area;
> > -
> > -   area = get_vm_area(PAGE_SIZE, VM_ALLOC);
> > -   if (!area) {
> > -   WARN_ONCE(1, "Failed to create text area for cpu %d\n",
> > -   cpu);
> > -   return -1;
> > -   }
> > -   this_cpu_write(text_poke_area, area);
> > -
> > -   return 0;
> > -}
> > -
> > -static int text_area_cpu_down(unsigned int cpu)
> > -{
> > -   free_vm_area(this_cpu_read(text_poke_area));
> > -   return 0;
> > -}
> > -
> > -/*
> > - * Run as a late init call. This allows all the boot time patching to be 
> > done
> > - * simply by patching the code, and then we're called here prior to
> > - * mark_rodata_ro(), which happens after all init calls are run. Although
> > - * BUG_ON() is rude, in this case it should only happen if ENOMEM, and we 
> > judge
> > - * it as being preferable to a kernel that will crash later when someone 
> > tries
> > - * to use patch_instruction().
> > - */
> > -static int __init setup_text_poke_area(void)
> > -{
> > -   BUG_ON(!cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
> > -   "powerpc/text_poke:online", text_area_cpu_up,
> > -   text_area_cpu_down));
> > -
> > -   return 0;
> > -}
> > -late_initcall(setup_text_poke_area);
> > +struct patch_mapping {
> > +   spinlock_t *ptl; /* for protecting pte table */
> > +   struct temp_mm temp_mm;
> > +};
> >   
> >   /*
> >* This can be called for kernel text or a module.
> >*/
> > -static int map_patch_area(void *addr, unsigned long text_poke_addr)
> > +static int map_patch(const void *addr, struct patch_mapping *patch_mapping)
> 
> Why change the name ?
> 

It's not really an "area" anymore.

> >   {
> > -   unsigned long pfn;
> > -   int err;
> > +   struct page *page;
> > +   pte_t pte, *ptep;
> > +   pgprot_t pgprot;
> >   
> > if (is_vmalloc_addr(addr))
> > -   pfn = vmalloc_to_pfn(addr);
> > +   page = vmalloc_to_page(addr);
> > else
> > -   pfn = __pa_symbol(addr) >> PAGE_SHIFT;
> > +   page = virt_to_page(addr);
> >   
> > -   err = map_kernel_page(text_poke_addr, (pfn << PAGE_SHIFT), PAGE_KERNEL);
> > +   if (radix_enabled())
> > +   pgprot = __pgprot(pgprot_val(PAGE_KERNEL));
> > +   else
> > +   pgprot = PAGE_SHARED;
> 
> Can you explain the difference between radix and non radix ?
> 
> Why PAGE_KERNEL for a page that is mapped in userspace ?
> 
> Why do you need to do __pgprot(pgprot_val(PAGE_KERNEL)) instead of just 
> using PAGE_KERNEL ?
> 

On hash there is a manual check which prevents setting _PAGE_PRIVILEGED for
kernel to userspace access in __hash_page - hence we cannot access the mapping
if the page is mapped PAGE_KERNEL on hash. However, I would like to use
PAGE_KERNEL here as well and am working on understanding why this check is
done in hash and if this can change. On radix this works just fine.

The page is mapped PAGE_KERNEL because the address is technically a userspace
address - but only to keep the mapping local to this CPU doing the patching.
PAGE_KERNEL makes it clear both in intent and protection that this is a kernel
mapping.

I think the correct way is pgprot_val(PAGE_KERNEL) since PAGE_KERNEL is defined
as:

#define PAGE_KERNEL __pgprot(_PAGE_BASE | _PAGE_KERNEL_RW)

and __pgprot() is defined as:

typedef struct { unsigned long pgprot; } pgprot_t;
#define pgprot_val(x)   ((x).pgprot)
#define __pgprot(x) ((pgprot_t) { (x) })
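
Given those definitions, __pgprot(pgprot_val(x)) just unwraps and rewraps
the same bits, so plain PAGE_KERNEL would carry an identical value. A
minimal stand-alone sketch of the round-trip (the flag value below is a
made-up placeholder, not the real PAGE_KERNEL bits):

#include <assert.h>

typedef struct { unsigned long pgprot; } pgprot_t;
#define pgprot_val(x)   ((x).pgprot)
#define __pgprot(x)     ((pgprot_t) { (x) })
#define PAGE_KERNEL     __pgprot(0x1ffUL)   /* placeholder bits */

int main(void)
{
	pgprot_t prot = __pgprot(pgprot_val(PAGE_KERNEL));

	/* wrap/unwrap is an identity on the flag bits */
	assert(pgprot_val(prot) == pgprot_val(PAGE_KERNEL));
	return 0;
}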

> >   
> > -   pr_devel("Mapped addr %lx with pfn %lx:%d\n", text_poke_addr, pfn, err);
> > -   if (err)
> > +   ptep = 

Re: [PATCH 1/2] mm, treewide: Rename kzfree() to kfree_sensitive()

2020-04-14 Thread Johannes Weiner
On Mon, Apr 13, 2020 at 05:15:49PM -0400, Waiman Long wrote:
> As said by Linus:
> 
>   A symmetric naming is only helpful if it implies symmetries in use.
>   Otherwise it's actively misleading.

As the btrfs example proves - people can be tempted by this false
symmetry to pair kzalloc with kzfree, which isn't what we wanted.

>   In "kzalloc()", the z is meaningful and an important part of what the
>   caller wants.
> 
>   In "kzfree()", the z is actively detrimental, because maybe in the
>   future we really _might_ want to use that "memfill(0xdeadbeef)" or
>   something. The "zero" part of the interface isn't even _relevant_.
> 
> The main reason that kzfree() exists is to clear sensitive information
> that should not be leaked to other future users of the same memory
> objects.
> 
> Rename kzfree() to kfree_sensitive() to follow the example of the
> recently added kvfree_sensitive() and make the intention of the API
> more explicit. In addition, memzero_explicit() is used to clear the
> memory to make sure that it won't get optimized away by the compiler.
> 
> The renaming is done by using the command sequence:
> 
>   git grep -w --name-only kzfree |\
>   xargs sed -i 's/\bkzfree\b/kfree_sensitive/'
> 
> followed by some editing of the kfree_sensitive() kerneldoc and the
> use of memzero_explicit() instead of memset().
> 
> Suggested-by: Joe Perches 
> Signed-off-by: Waiman Long 

Looks good to me. Thanks for fixing this very old mistake.

Acked-by: Johannes Weiner 
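
For reference, the renamed helper boils down to the following pattern; a
minimal sketch (not necessarily the exact mm/slab_common.c code), assuming
ksize() reports the usable size of the allocation:

#include <linux/slab.h>
#include <linux/string.h>

void kfree_sensitive(const void *p)
{
	size_t ks;
	void *mem = (void *)p;

	ks = ksize(mem);
	if (ks)
		/* memzero_explicit() cannot be optimized away */
		memzero_explicit(mem, ks);
	kfree(mem);
}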


[PATCH] ASoC: fsl_micfil: Omit superfluous error message in fsl_micfil_probe()

2020-04-14 Thread Tang Bin
In the function fsl_micfil_probe(), platform_get_irq() already logs an
error message when getting the IRQ fails, so remove the redundant message
here.

Signed-off-by: Tang Bin 
Signed-off-by: Shengju Zhang 
---
 sound/soc/fsl/fsl_micfil.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/sound/soc/fsl/fsl_micfil.c b/sound/soc/fsl/fsl_micfil.c
index f7f2d29f1..e73bd6570 100644
--- a/sound/soc/fsl/fsl_micfil.c
+++ b/sound/soc/fsl/fsl_micfil.c
@@ -702,10 +702,8 @@ static int fsl_micfil_probe(struct platform_device *pdev)
for (i = 0; i < MICFIL_IRQ_LINES; i++) {
micfil->irq[i] = platform_get_irq(pdev, i);
dev_err(&pdev->dev, "GET IRQ: %d\n", micfil->irq[i]);
-   if (micfil->irq[i] < 0) {
-   dev_err(&pdev->dev, "no irq for node %s\n", pdev->name);
+   if (micfil->irq[i] < 0)
return micfil->irq[i];
-   }
}
 
if (of_property_read_bool(np, "fsl,shared-interrupt"))
-- 
2.20.1.windows.1





Re: [PATCH v5 18/21] powerpc64: Add prefixed instructions to instruction data type

2020-04-14 Thread Jordan Niethe
On Mon, Apr 13, 2020 at 10:04 PM Balamuruhan S  wrote:
>
> On Mon, 2020-04-06 at 18:09 +1000, Jordan Niethe wrote:
> > For powerpc64, redefine the ppc_inst type so both word and prefixed
> > instructions can be represented. On powerpc32 the type will remain the
> > same.  Update places which had assumed instructions to be 4 bytes long.
> >
> > Signed-off-by: Jordan Niethe 
> > ---
> > v4: New to series
> > v5:  - Distinguish normal instructions from prefixed instructions with a
> >0xff marker for the suffix.
> >  - __patch_instruction() using std for prefixed instructions
> > ---
> >  arch/powerpc/include/asm/inst.h  | 71 ++--
> >  arch/powerpc/include/asm/kprobes.h   |  2 +-
> >  arch/powerpc/include/asm/uaccess.h   | 31 ++--
> >  arch/powerpc/include/asm/uprobes.h   |  2 +-
> >  arch/powerpc/kernel/optprobes.c  | 42 
> >  arch/powerpc/kernel/optprobes_head.S |  3 ++
> >  arch/powerpc/kernel/trace/ftrace.c   | 26 +-
> >  arch/powerpc/lib/code-patching.c | 19 +---
> >  arch/powerpc/lib/feature-fixups.c|  5 +-
> >  arch/powerpc/lib/sstep.c |  4 +-
> >  arch/powerpc/xmon/xmon.c |  6 +--
> >  arch/powerpc/xmon/xmon_bpts.S|  4 +-
> >  12 files changed, 171 insertions(+), 44 deletions(-)
> >
> > diff --git a/arch/powerpc/include/asm/inst.h
> > b/arch/powerpc/include/asm/inst.h
> > index 70b37a35a91a..7e23e7146c66 100644
> > --- a/arch/powerpc/include/asm/inst.h
> > +++ b/arch/powerpc/include/asm/inst.h
> > @@ -8,23 +8,67 @@
> >
> >  struct ppc_inst {
> >  u32 val;
> > +#ifdef __powerpc64__
> > +u32 suffix;
> > +#endif /* __powerpc64__ */
> >  } __packed;
> >
> > -#define ppc_inst(x) ((struct ppc_inst){ .val = x })
> > +static inline int ppc_inst_opcode(struct ppc_inst x)
> > +{
> > + return x.val >> 26;
>
>
> why don't we wrap here and in `ppc_inst_opcode()` in patch 9 using
> `ppc_inst_val()` ?
Will do.
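
Presumably something like this (just a sketch of the suggested change):

static inline int ppc_inst_opcode(struct ppc_inst x)
{
	return ppc_inst_val(x) >> 26;
}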
>
>
> > +}
> >
> >  static inline u32 ppc_inst_val(struct ppc_inst x)
>
>
> There is another same definition below for the same function in
> #else part of __powerpc64__ ifdef.
Thanks
>
>
> >  {
> >   return x.val;
> >  }
> >
> > -static inline bool ppc_inst_len(struct ppc_inst x)
> > +#ifdef __powerpc64__
> > +#define ppc_inst(x) ((struct ppc_inst){ .val = (x), .suffix = 0xff })
> > +
> > +#define ppc_inst_prefix(x, y) ((struct ppc_inst){ .val = (x), .suffix = (y)
> > })
> > +
> > +static inline u32 ppc_inst_suffix(struct ppc_inst x)
> >  {
> > - return sizeof(struct ppc_inst);
> > + return x.suffix;
> >  }
> >
> > -static inline int ppc_inst_opcode(struct ppc_inst x)
> > +static inline bool ppc_inst_prefixed(struct ppc_inst x) {
> > + return ((ppc_inst_val(x) >> 26) == 1) && ppc_inst_suffix(x) != 0xff;
> > +}
> > +
> > +static inline struct ppc_inst ppc_inst_swab(struct ppc_inst x)
> >  {
> > - return x.val >> 26;
> > + return ppc_inst_prefix(swab32(ppc_inst_val(x)),
> > +swab32(ppc_inst_suffix(x)));
> > +}
> > +
> > +static inline struct ppc_inst ppc_inst_read(const struct ppc_inst *ptr)
> > +{
> > + u32 val, suffix = 0xff;
> > + val = *(u32 *)ptr;
> > + if ((val >> 26) == 1)
> > + suffix = *((u32 *)ptr + 1);
> > + return ppc_inst_prefix(val, suffix);
> > +}
> > +
> > +static inline void ppc_inst_write(struct ppc_inst *ptr, struct ppc_inst x)
> > +{
> > + if (ppc_inst_prefixed(x)) {
> > + *(u32 *)ptr = x.val;
> > + *((u32 *)ptr + 1) = x.suffix;
> > + } else {
> > + *(u32 *)ptr = x.val;
>
>
> can we wrap here as well with `ppc_inst_val()` and `ppc_inst_suffix()` ?
Yeah, no reason not to.
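
I.e., roughly (a sketch of the accessor-based version, not the posted v6):

static inline void ppc_inst_write(struct ppc_inst *ptr, struct ppc_inst x)
{
	*(u32 *)ptr = ppc_inst_val(x);
	if (ppc_inst_prefixed(x))
		*((u32 *)ptr + 1) = ppc_inst_suffix(x);
}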
>
>
> > + }
> > +}
> > +
> > +#else
> > +
> > +#define ppc_inst(x) ((struct ppc_inst){ .val = x })
> > +
> > +static inline bool ppc_inst_prefixed(ppc_inst x)
> > +{
> > + return 0;
>
>
> Is it return !!0 or return false ?
False probably will make more sense.
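
That is (a sketch; note the parameter also needs the struct keyword):

static inline bool ppc_inst_prefixed(struct ppc_inst x)
{
	return false;
}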
>
>
> >  }
> >
> >  static inline struct ppc_inst ppc_inst_swab(struct ppc_inst x)
> > @@ -32,14 +76,31 @@ static inline struct ppc_inst ppc_inst_swab(struct
> > ppc_inst x)
> >   return ppc_inst(swab32(ppc_inst_val(x)));
> >  }
> >
> > +static inline u32 ppc_inst_val(struct ppc_inst x)
>
>
> [...] duplicate definition that is defined outside __powerpc64__ above.
>
>
> > +{
> > + return x.val;
> > +}
> > +
> >  static inline struct ppc_inst ppc_inst_read(const struct ppc_inst *ptr)
> >  {
> >   return *ptr;
> >  }
> >
> > +static inline void ppc_inst_write(struct ppc_inst *ptr, struct ppc_inst x)
> > +{
> > + *ptr = x;
> > +}
> > +
> > +#endif /* __powerpc64__ */
> > +
> >  static inline bool ppc_inst_equal(struct ppc_inst x, struct ppc_inst y)
> >  {
> >   return !memcmp(&x, &y, sizeof(struct ppc_inst));
> >  }
> >
> > +static inline int ppc_inst_len(struct ppc_inst x)
> > +{
> > + return (ppc_inst_prefixed(x)) ? 8  : 4;
> > +}
> > +
> >  #endif /* _ASM_INST_H */
> > diff --git 

Re: [RFC PATCH 2/3] powerpc/lib: Initialize a temporary mm for code patching

2020-04-14 Thread Christopher M Riedl
> On April 8, 2020 6:01 AM Christophe Leroy  wrote:
> 
>  
> Le 31/03/2020 à 05:19, Christopher M Riedl a écrit :
> >> On March 24, 2020 11:10 AM Christophe Leroy  
> >> wrote:
> >>
> >>   
> >> Le 23/03/2020 à 05:52, Christopher M. Riedl a écrit :
> >>> When code patching a STRICT_KERNEL_RWX kernel the page containing the
> >>> address to be patched is temporarily mapped with permissive memory
> >>> protections. Currently, a per-cpu vmalloc patch area is used for this
> >>> purpose. While the patch area is per-cpu, the temporary page mapping is
> >>> inserted into the kernel page tables for the duration of the patching.
> >>> The mapping is exposed to CPUs other than the patching CPU - this is
> >>> undesirable from a hardening perspective.
> >>>
> >>> Use the `poking_init` init hook to prepare a temporary mm and patching
> >>> address. Initialize the temporary mm by copying the init mm. Choose a
> >>> randomized patching address inside the temporary mm userspace address
> >>> portion. The next patch uses the temporary mm and patching address for
> >>> code patching.
> >>>
> >>> Based on x86 implementation:
> >>>
> >>> commit 4fc19708b165
> >>> ("x86/alternatives: Initialize temporary mm for patching")
> >>>
> >>> Signed-off-by: Christopher M. Riedl 
> >>> ---
> >>>arch/powerpc/lib/code-patching.c | 26 ++
> >>>1 file changed, 26 insertions(+)
> >>>
> >>> diff --git a/arch/powerpc/lib/code-patching.c 
> >>> b/arch/powerpc/lib/code-patching.c
> >>> index 3345f039a876..18b88ecfc5a8 100644
> >>> --- a/arch/powerpc/lib/code-patching.c
> >>> +++ b/arch/powerpc/lib/code-patching.c
> >>> @@ -11,6 +11,8 @@
> >>>#include 
> >>>#include 
> >>>#include 
> >>> +#include 
> >>> +#include 
> >>>
> >>>#include 
> >>>#include 
> >>> @@ -39,6 +41,30 @@ int raw_patch_instruction(unsigned int *addr, unsigned 
> >>> int instr)
> >>>}
> >>>
> >>>#ifdef CONFIG_STRICT_KERNEL_RWX
> >>> +
> >>> +__ro_after_init struct mm_struct *patching_mm;
> >>> +__ro_after_init unsigned long patching_addr;
> >>
> >> Can we make those those static ?
> >>
> > 
> > Yes, makes sense to me.
> > 
> >>> +
> >>> +void __init poking_init(void)
> >>> +{
> >>> + spinlock_t *ptl; /* for protecting pte table */
> >>> + pte_t *ptep;
> >>> +
> >>> + patching_mm = copy_init_mm();
> >>> + BUG_ON(!patching_mm);
> >>
> >> Does it needs to be a BUG_ON() ? Can't we fail gracefully with just a
> >> WARN_ON ?
> >>
> > 
> > I'm not sure what failing gracefully means here? The main reason this could
> > fail is if there is not enough memory to allocate the patching_mm. The
> > previous implementation had this justification for BUG_ON():
> 
> But the system can continue running just fine after this failure.
> Only the things that make use of code patching will fail (ftrace, kgdb, ...)
> 
> Checkpatch tells: "Avoid crashing the kernel - try using WARN_ON & 
> recovery code rather than BUG() or BUG_ON()"
> 
> All vital code patching has already been done previously, so I think a 
> WARN_ON() should be enough, plus returning non 0 to indicate that the 
> late_initcall failed.
> 
> 

Got it, makes sense to me. I will make these changes in the next version.
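
Concretely, the graceful version would presumably look something like this
(a sketch, not the eventual v2):

void __init poking_init(void)
{
	spinlock_t *ptl; /* for protecting pte table */
	pte_t *ptep;

	patching_mm = copy_init_mm();
	if (WARN_ON(!patching_mm))
		return;

	patching_addr = (get_random_long() & PAGE_MASK) %
			(DEFAULT_MAP_WINDOW - PAGE_SIZE);

	ptep = get_locked_pte(patching_mm, patching_addr, &ptl);
	if (WARN_ON(!ptep))
		return;
	pte_unmap_unlock(ptep, ptl);
}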
Thanks!

> > 
> > /*
> >   * Run as a late init call. This allows all the boot time patching to be 
> > done
> >   * simply by patching the code, and then we're called here prior to
> >   * mark_rodata_ro(), which happens after all init calls are run. Although
> >   * BUG_ON() is rude, in this case it should only happen if ENOMEM, and we 
> > judge
> >   * it as being preferable to a kernel that will crash later when someone 
> > tries
> >   * to use patch_instruction().
> >   */
> > static int __init setup_text_poke_area(void)
> > {
> >  BUG_ON(!cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
> >  "powerpc/text_poke:online", text_area_cpu_up,
> >  text_area_cpu_down));
> > 
> >  return 0;
> > }
> > late_initcall(setup_text_poke_area);
> > 
> > I think the BUG_ON() is appropriate even if only to adhere to the previous
> > judgement call. I can add a similar comment explaining the reasoning if
> > that helps.
> > 
> >>> +
> >>> + /*
> >>> +  * In hash we cannot go above DEFAULT_MAP_WINDOW easily.
> >>> +  * XXX: Do we want additional bits of entropy for radix?
> >>> +  */
> >>> + patching_addr = (get_random_long() & PAGE_MASK) %
> >>> + (DEFAULT_MAP_WINDOW - PAGE_SIZE);
> >>> +
> >>> + ptep = get_locked_pte(patching_mm, patching_addr, &ptl);
> >>> + BUG_ON(!ptep);
> >>
> >> Same here, can we fail gracefully instead ?
> >>
> > 
> > Same reasoning as above.
> 
> Here as well, a WARN_ON() should be enough, the system will continue 
> running after that.
> 
> > 
> >>> + pte_unmap_unlock(ptep, ptl);
> >>> +}
> >>> +
> >>>static DEFINE_PER_CPU(struct vm_struct *, text_poke_area);
> >>>
> >>>static int text_area_cpu_up(unsigned int cpu)
> >>>
> >>
> >> Christophe
> 
> Christophe


Re: [PATCH 4/8] binfmt_elf: open code copy_siginfo_to_user to kernelspace buffer

2020-04-14 Thread Michael Ellerman
Christoph Hellwig  writes:

> Instead of messing with the address limit just open code the trivial
> memcpy + memset logic for the native version, and a call to
> to_compat_siginfo for the compat version.
>
> Signed-off-by: Christoph Hellwig 
> ---
>  fs/binfmt_elf.c| 9 +
>  fs/compat_binfmt_elf.c | 6 +-
>  2 files changed, 10 insertions(+), 5 deletions(-)
>
> diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
> index 13f25e241ac4..607c5a5f855e 100644
> --- a/fs/binfmt_elf.c
> +++ b/fs/binfmt_elf.c
> @@ -1553,15 +1553,16 @@ static void fill_auxv_note(struct memelfnote *note, 
> struct mm_struct *mm)
>   fill_note(note, "CORE", NT_AUXV, i * sizeof(elf_addr_t), auxv);
>  }
>  
> +#ifndef fill_siginfo_note
>  static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t 
> *csigdata,
>   const kernel_siginfo_t *siginfo)
>  {
> - mm_segment_t old_fs = get_fs();
> - set_fs(KERNEL_DS);
> - copy_siginfo_to_user((user_siginfo_t __user *) csigdata, siginfo);
> - set_fs(old_fs);
> + memcpy(csigdata, siginfo, sizeof(struct kernel_siginfo));
> + memset((char *)csigdata + sizeof(struct kernel_siginfo), 0,
> + SI_EXPANSION_SIZE);
>   fill_note(note, "CORE", NT_SIGINFO, sizeof(*csigdata), csigdata);
>  }
> +#endif
>  
>  #define MAX_FILE_NOTE_SIZE (4*1024*1024)
>  /*
> diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c
> index aaad4ca1217e..ab84e095618b 100644
> --- a/fs/compat_binfmt_elf.c
> +++ b/fs/compat_binfmt_elf.c
> @@ -39,7 +39,11 @@
>   */
>  #define user_long_t  compat_long_t
>  #define user_siginfo_t   compat_siginfo_t
> -#define copy_siginfo_to_user copy_siginfo_to_user32
> +#define fill_siginfo_note(note, csigdata, siginfo)   \
> +do { \
> + to_compat_siginfo(csigdata, siginfo, compat_siginfo_flags());   \
> + fill_note(note, "CORE", NT_SIGINFO, sizeof(*csigdata), csigdata); \
> +} while (0)

This doesn't build on ppc (cell_defconfig):

  ../fs/binfmt_elf.c: In function 'fill_note_info':
  ../fs/compat_binfmt_elf.c:44:39: error: implicit declaration of function 'compat_siginfo_flags'; did you mean 'to_compat_siginfo'? [-Werror=implicit-function-declaration]
to_compat_siginfo(csigdata, siginfo, compat_siginfo_flags()); \
 ^~~~
  ../fs/binfmt_elf.c:1846:2: note: in expansion of macro 'fill_siginfo_note'
    fill_siginfo_note(&info->signote, &info->csigdata, siginfo);
^
  cc1: some warnings being treated as errors
  make[2]: *** [../scripts/Makefile.build:266: fs/compat_binfmt_elf.o] Error 1


I guess the empty version from kernel/signal.c needs to move into a
header somewhere.
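
E.g., something like this in a shared header (a hypothetical sketch; the
name comes from the build error above, the signature is a guess, and
x86-x32 would be expected to override the stub):

#ifndef compat_siginfo_flags
static inline unsigned int compat_siginfo_flags(void)
{
	return 0;
}
#endif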

cheers



[PATCH kernel] powerpc/dma: Call indirect dma_ops when persistent memory present

2020-04-14 Thread Alexey Kardashevskiy
Unlike normal memory ("memory" compatible type in the FDT),
the persistent memory ("ibm,pmemory" in the FDT) can be mapped anywhere
in the guest physical space and it can be used for DMA.

In order to maintain 1:1 mapping via the huge DMA window, we need to
know the maximum physical address at the time of the window setup.
So far we've been looking at "memory" nodes but "ibm,pmemory" does not
have fixed addresses and the persistent memory may be mapped afterwards.

When the maximum window size is not big enough to fit persistent memory,
this clears the dma_ops_bypass flag to tell the generic code that indirect
dma_ops call is needed. This lets the platform code check the DMA
boundaries and call direct DMA API if DMA-ing to/from generic RAM
or call IOMMU API otherwise.

This adds dma_max to device::archdata to tell the direct DMA mapping
limit. At the moment only pseries sets the limit so powernv is
unaffected by this change.

As persistent memory is backed with page structs, this uses
MAX_PHYSMEM_BITS as the upper limit (rather than simple 64bit).

This should not change the existing behaviour when there is no persistent
memory.

Signed-off-by: Alexey Kardashevskiy 
---

This is based on
https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=170152

---
 arch/powerpc/include/asm/device.h  |  2 +
 arch/powerpc/kernel/dma-iommu.c| 68 +-
 arch/powerpc/platforms/pseries/iommu.c | 23 -
 3 files changed, 89 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/device.h 
b/arch/powerpc/include/asm/device.h
index 452402215e12..380e92684a16 100644
--- a/arch/powerpc/include/asm/device.h
+++ b/arch/powerpc/include/asm/device.h
@@ -18,6 +18,8 @@ struct iommu_table;
  * drivers/macintosh/macio_asic.c
  */
 struct dev_archdata {
+   /* Maximum DMA address for 1:1 mapping (when enabled) */
+   dma_addr_t  dma_max;
/*
 * These two used to be a union. However, with the hybrid ops we need
 * both so here we store both a DMA offset for direct mappings and
diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c
index 569fecd7b5b2..8c67bfffdef6 100644
--- a/arch/powerpc/kernel/dma-iommu.c
+++ b/arch/powerpc/kernel/dma-iommu.c
@@ -10,6 +10,16 @@
 #include 
 #include 
 
+static inline bool can_map_direct(struct device *dev, phys_addr_t addr)
+{
+   return dev->archdata.dma_max >= phys_to_dma(dev, addr);
+}
+
+static inline bool dma_handle_direct(struct device *dev, dma_addr_t dma_handle)
+{
+   return dma_handle >= dev->archdata.dma_offset;
+}
+
 /*
  * Generic iommu implementation
  */
@@ -44,6 +54,12 @@ static dma_addr_t dma_iommu_map_page(struct device *dev, 
struct page *page,
 enum dma_data_direction direction,
 unsigned long attrs)
 {
+   if (dev->archdata.dma_max &&
+   can_map_direct(dev, (phys_addr_t) page_to_phys(page) +
+  offset + size))
+   return dma_direct_map_page(dev, page, offset, size, direction,
+  attrs);
+
return iommu_map_page(dev, get_iommu_table_base(dev), page, offset,
  size, dma_get_mask(dev), direction, attrs);
 }
@@ -53,6 +69,12 @@ static void dma_iommu_unmap_page(struct device *dev, 
dma_addr_t dma_handle,
 size_t size, enum dma_data_direction direction,
 unsigned long attrs)
 {
+   if (dev->archdata.dma_max &&
+   dma_handle_direct(dev, dma_handle + size)) {
+   dma_direct_unmap_page(dev, dma_handle, size, direction, attrs);
+   return;
+   }
+
iommu_unmap_page(get_iommu_table_base(dev), dma_handle, size, direction,
 attrs);
 }
@@ -62,6 +84,22 @@ static int dma_iommu_map_sg(struct device *dev, struct 
scatterlist *sglist,
int nelems, enum dma_data_direction direction,
unsigned long attrs)
 {
+   if (dev->archdata.dma_max) {
+   struct scatterlist *s;
+   bool direct = true;
+   int i;
+
+   for_each_sg(sglist, s, nelems, i) {
+   direct = can_map_direct(dev,
+   sg_phys(s) + s->offset + s->length);
+   if (!direct)
+   break;
+   }
+   if (direct)
+   return dma_direct_map_sg(dev, sglist, nelems, direction,
+attrs);
+   }
+
return ppc_iommu_map_sg(dev, get_iommu_table_base(dev), sglist, nelems,
dma_get_mask(dev), direction, attrs);
 }
@@ -70,6 +108,24 @@ static void dma_iommu_unmap_sg(struct device *dev, struct 
scatterlist *sglist,
int nelems, enum dma_data_direction direction,

Re: [PATCH v6 6/7] ASoC: dt-bindings: fsl_easrc: Add document for EASRC

2020-04-14 Thread Shengjiu Wang
Hi Rob

On Tue, Apr 14, 2020 at 11:49 PM Rob Herring  wrote:
>
> On Wed, Apr 01, 2020 at 04:45:39PM +0800, Shengjiu Wang wrote:
> > EASRC (Enhanced Asynchronous Sample Rate Converter) is a new
> > IP module found on i.MX8MN.
> >
> > Signed-off-by: Shengjiu Wang 
> > ---
> >  .../devicetree/bindings/sound/fsl,easrc.yaml  | 101 ++
> >  1 file changed, 101 insertions(+)
> >  create mode 100644 Documentation/devicetree/bindings/sound/fsl,easrc.yaml
> >
> > diff --git a/Documentation/devicetree/bindings/sound/fsl,easrc.yaml 
> > b/Documentation/devicetree/bindings/sound/fsl,easrc.yaml
> > new file mode 100644
> > index ..14ea60084420
> > --- /dev/null
> > +++ b/Documentation/devicetree/bindings/sound/fsl,easrc.yaml
> > @@ -0,0 +1,101 @@
> > +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
> > +%YAML 1.2
> > +---
> > +$id: http://devicetree.org/schemas/sound/fsl,easrc.yaml#
> > +$schema: http://devicetree.org/meta-schemas/core.yaml#
> > +
> > +title: NXP Asynchronous Sample Rate Converter (ASRC) Controller
> > +
> > +maintainers:
> > +  - Shengjiu Wang 
> > +
> > +properties:
> > +  $nodename:
> > +pattern: "^easrc@.*"
> > +
> > +  compatible:
> > +const: fsl,imx8mn-easrc
> > +
> > +  reg:
> > +maxItems: 1
> > +
> > +  interrupts:
> > +maxItems: 1
> > +
> > +  clocks:
> > +items:
> > +  - description: Peripheral clock
> > +
> > +  clock-names:
> > +items:
> > +  - const: mem
> > +
> > +  dmas:
> > +maxItems: 8
> > +
> > +  dma-names:
> > +items:
> > +  - const: ctx0_rx
> > +  - const: ctx0_tx
> > +  - const: ctx1_rx
> > +  - const: ctx1_tx
> > +  - const: ctx2_rx
> > +  - const: ctx2_tx
> > +  - const: ctx3_rx
> > +  - const: ctx3_tx
> > +
> > +  firmware-name:
> > +allOf:
> > +  - $ref: /schemas/types.yaml#/definitions/string
> > +  - const: imx/easrc/easrc-imx8mn.bin
> > +description: The coefficient table for the filters
> > +
> > +  fsl,asrc-rate:
>
> fsl,asrc-rate-hz

Can we keep "fsl,asrc-rate"? I want this property to align with the
one in fsl,asrc.txt, so these two asrc modules can share the same
property name.

best regards
wang shengjiu


[PATCH V2] vhost: do not enable VHOST_MENU by default

2020-04-14 Thread Jason Wang
We try to keep the defconfig untouched after decoupling CONFIG_VHOST
out of CONFIG_VIRTUALIZATION in commit 20c384f1ea1a
("vhost: refine vhost and vringh kconfig") by enabling VHOST_MENU by
default. Then the defconfigs can keep enabling CONFIG_VHOST_NET
without the caring of CONFIG_VHOST.

But this will leave a "CONFIG_VHOST_MENU=y" in all defconfigs and even
for the ones that don't want vhost. So it actually shifts the
burden to the maintainers of all the other defconfigs to add
"# CONFIG_VHOST_MENU is not set". So this patch instead enables
CONFIG_VHOST explicitly in the defconfigs that enable
CONFIG_VHOST_NET and CONFIG_VHOST_VSOCK.

Acked-by: Christian Borntraeger  (s390)
Acked-by: Michael Ellerman  (powerpc)
Cc: Thomas Bogendoerfer 
Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Michael Ellerman 
Cc: Heiko Carstens 
Cc: Vasily Gorbik 
Cc: Christian Borntraeger 
Reported-by: Geert Uytterhoeven 
Signed-off-by: Jason Wang 
---
Change since V1:
- depends on EVENTFD for VHOST
---
 arch/mips/configs/malta_kvm_defconfig  |  1 +
 arch/powerpc/configs/powernv_defconfig |  1 +
 arch/powerpc/configs/ppc64_defconfig   |  1 +
 arch/powerpc/configs/pseries_defconfig |  1 +
 arch/s390/configs/debug_defconfig  |  1 +
 arch/s390/configs/defconfig|  1 +
 drivers/vhost/Kconfig  | 26 +-
 7 files changed, 15 insertions(+), 17 deletions(-)

diff --git a/arch/mips/configs/malta_kvm_defconfig 
b/arch/mips/configs/malta_kvm_defconfig
index 8ef612552a19..06f0c7a0ca87 100644
--- a/arch/mips/configs/malta_kvm_defconfig
+++ b/arch/mips/configs/malta_kvm_defconfig
@@ -18,6 +18,7 @@ CONFIG_PCI=y
 CONFIG_VIRTUALIZATION=y
 CONFIG_KVM=m
 CONFIG_KVM_MIPS_DEBUG_COP0_COUNTERS=y
+CONFIG_VHOST=m
 CONFIG_VHOST_NET=m
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
diff --git a/arch/powerpc/configs/powernv_defconfig 
b/arch/powerpc/configs/powernv_defconfig
index 71749377d164..404245b4594d 100644
--- a/arch/powerpc/configs/powernv_defconfig
+++ b/arch/powerpc/configs/powernv_defconfig
@@ -346,5 +346,6 @@ CONFIG_CRYPTO_DEV_VMX=y
 CONFIG_VIRTUALIZATION=y
 CONFIG_KVM_BOOK3S_64=m
 CONFIG_KVM_BOOK3S_64_HV=m
+CONFIG_VHOST=m
 CONFIG_VHOST_NET=m
 CONFIG_PRINTK_TIME=y
diff --git a/arch/powerpc/configs/ppc64_defconfig 
b/arch/powerpc/configs/ppc64_defconfig
index 7e68cb222c7b..4599fc7be285 100644
--- a/arch/powerpc/configs/ppc64_defconfig
+++ b/arch/powerpc/configs/ppc64_defconfig
@@ -61,6 +61,7 @@ CONFIG_ELECTRA_CF=y
 CONFIG_VIRTUALIZATION=y
 CONFIG_KVM_BOOK3S_64=m
 CONFIG_KVM_BOOK3S_64_HV=m
+CONFIG_VHOST=m
 CONFIG_VHOST_NET=m
 CONFIG_OPROFILE=m
 CONFIG_KPROBES=y
diff --git a/arch/powerpc/configs/pseries_defconfig 
b/arch/powerpc/configs/pseries_defconfig
index 6b68109e248f..4cad3901b5de 100644
--- a/arch/powerpc/configs/pseries_defconfig
+++ b/arch/powerpc/configs/pseries_defconfig
@@ -321,5 +321,6 @@ CONFIG_CRYPTO_DEV_VMX=y
 CONFIG_VIRTUALIZATION=y
 CONFIG_KVM_BOOK3S_64=m
 CONFIG_KVM_BOOK3S_64_HV=m
+CONFIG_VHOST=m
 CONFIG_VHOST_NET=m
 CONFIG_PRINTK_TIME=y
diff --git a/arch/s390/configs/debug_defconfig 
b/arch/s390/configs/debug_defconfig
index 0c86ba19fa2b..6ec6e69630d1 100644
--- a/arch/s390/configs/debug_defconfig
+++ b/arch/s390/configs/debug_defconfig
@@ -57,6 +57,7 @@ CONFIG_PROTECTED_VIRTUALIZATION_GUEST=y
 CONFIG_CMM=m
 CONFIG_APPLDATA_BASE=y
 CONFIG_KVM=m
+CONFIG_VHOST=m
 CONFIG_VHOST_NET=m
 CONFIG_VHOST_VSOCK=m
 CONFIG_OPROFILE=m
diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig
index 6b27d861a9a3..d1b3bf83d687 100644
--- a/arch/s390/configs/defconfig
+++ b/arch/s390/configs/defconfig
@@ -57,6 +57,7 @@ CONFIG_PROTECTED_VIRTUALIZATION_GUEST=y
 CONFIG_CMM=m
 CONFIG_APPLDATA_BASE=y
 CONFIG_KVM=m
+CONFIG_VHOST=m
 CONFIG_VHOST_NET=m
 CONFIG_VHOST_VSOCK=m
 CONFIG_OPROFILE=m
diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index e79cbbdfea45..29f171a53d8a 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -12,23 +12,19 @@ config VHOST_RING
  This option is selected by any driver which needs to access
  the host side of a virtio ring.
 
-config VHOST
-   tristate
+menuconfig VHOST
+   tristate "Vhost Devices"
+   depends on EVENTFD
select VHOST_IOTLB
help
- This option is selected by any driver which needs to access
- the core of vhost.
+ Enable option to support host kernel or hardware accelerator
+ for virtio device.
 
-menuconfig VHOST_MENU
-   bool "VHOST drivers"
-   default y
-
-if VHOST_MENU
+if VHOST
 
 config VHOST_NET
tristate "Host kernel accelerator for virtio net"
-   depends on NET && EVENTFD && (TUN || !TUN) && (TAP || !TAP)
-   select VHOST
+   depends on NET && (TUN || !TUN) && (TAP || !TAP)
---help---
  This kernel module can be loaded in host kernel to accelerate
  guest networking with virtio_net. Not to be confused with virtio_net
@@ -39,8 +35,7 @@ config VHOST_NET
 
 config VHOST_SCSI
tristate "VHOST_SCSI TCM 

Re: [PATCH] vhost: do not enable VHOST_MENU by default

2020-04-14 Thread Jason Wang



On 2020/4/15 5:15 AM, kbuild test robot wrote:

Hi Jason,

I love your patch! Yet something to improve:

[auto build test ERROR on vhost/linux-next]
[also build test ERROR on next-20200414]
[cannot apply to powerpc/next s390/features v5.7-rc1]
[if your patch is applied to the wrong git tree, please drop us a note to help
improve the system. BTW, we also suggest to use '--base' option to specify the
base tree in git format-patch, please see https://stackoverflow.com/a/37406982]

url:
https://github.com/0day-ci/linux/commits/Jason-Wang/vhost-do-not-enable-VHOST_MENU-by-default/20200414-110807
base:   https://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git linux-next
config: ia64-randconfig-a001-20200415 (attached as .config)
compiler: ia64-linux-gcc (GCC) 9.3.0
reproduce:
 wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
 chmod +x ~/bin/make.cross
 # save the attached .config to linux build tree
 GCC_VERSION=9.3.0 make.cross ARCH=ia64

If you fix the issue, kindly add following tag as appropriate
Reported-by: kbuild test robot 

All error/warnings (new ones prefixed by >>):

drivers/vhost/vhost.c: In function 'vhost_vring_ioctl':

drivers/vhost/vhost.c:1577:33: error: implicit declaration of function 'eventfd_fget'; did you mean 'eventfd_signal'? [-Werror=implicit-function-declaration]

 1577 |   eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd);
  | ^~~~
  | eventfd_signal

drivers/vhost/vhost.c:1577:31: warning: pointer/integer type mismatch in conditional expression



Forgot to make VHOST depend on EVENTFD.

Will send v2.

Thanks



 1577 |   eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd);
  |   ^
cc1: some warnings being treated as errors

vim +1577 drivers/vhost/vhost.c

feebcaeac79ad8 Jason Wang 2019-05-24  1493
feebcaeac79ad8 Jason Wang 2019-05-24  1494  static long vhost_vring_set_num_addr(struct vhost_dev *d,
feebcaeac79ad8 Jason Wang 2019-05-24  1495  				     struct vhost_virtqueue *vq,
feebcaeac79ad8 Jason Wang 2019-05-24  1496  				     unsigned int ioctl,
feebcaeac79ad8 Jason Wang 2019-05-24  1497  				     void __user *argp)
feebcaeac79ad8 Jason Wang 2019-05-24  1498  {
feebcaeac79ad8 Jason Wang 2019-05-24  1499  	long r;
feebcaeac79ad8 Jason Wang 2019-05-24  1500
feebcaeac79ad8 Jason Wang 2019-05-24  1501  	mutex_lock(&vq->mutex);
feebcaeac79ad8 Jason Wang 2019-05-24  1502
feebcaeac79ad8 Jason Wang 2019-05-24  1503  	switch (ioctl) {
feebcaeac79ad8 Jason Wang 2019-05-24  1504  	case VHOST_SET_VRING_NUM:
feebcaeac79ad8 Jason Wang 2019-05-24  1505  		r = vhost_vring_set_num(d, vq, argp);
feebcaeac79ad8 Jason Wang 2019-05-24  1506  		break;
feebcaeac79ad8 Jason Wang 2019-05-24  1507  	case VHOST_SET_VRING_ADDR:
feebcaeac79ad8 Jason Wang 2019-05-24  1508  		r = vhost_vring_set_addr(d, vq, argp);
feebcaeac79ad8 Jason Wang 2019-05-24  1509  		break;
feebcaeac79ad8 Jason Wang 2019-05-24  1510  	default:
feebcaeac79ad8 Jason Wang 2019-05-24  1511  		BUG();
feebcaeac79ad8 Jason Wang 2019-05-24  1512  	}
feebcaeac79ad8 Jason Wang 2019-05-24  1513
feebcaeac79ad8 Jason Wang 2019-05-24  1514  	mutex_unlock(&vq->mutex);
feebcaeac79ad8 Jason Wang 2019-05-24  1515
feebcaeac79ad8 Jason Wang 2019-05-24  1516  	return r;
feebcaeac79ad8 Jason Wang 2019-05-24  1517  }
26b36604523f4a Sonny Rao  2018-03-14  1518  long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
3a4d5c94e95935 Michael S. Tsirkin 2010-01-14  1519  {
cecb46f194460d Al Viro 2012-08-27  1520  	struct file *eventfp, *filep = NULL;
cecb46f194460d Al Viro 2012-08-27  1521  	bool pollstart = false, pollstop = false;
3a4d5c94e95935 Michael S. Tsirkin 2010-01-14  1522  	struct eventfd_ctx *ctx = NULL;
3a4d5c94e95935 Michael S. Tsirkin 2010-01-14  1523  	u32 __user *idxp = argp;
3a4d5c94e95935 Michael S. Tsirkin 2010-01-14  1524  	struct vhost_virtqueue *vq;
3a4d5c94e95935 Michael S. Tsirkin 2010-01-14  1525  	struct vhost_vring_state s;
3a4d5c94e95935 Michael S. Tsirkin 2010-01-14  1526  	struct vhost_vring_file f;
3a4d5c94e95935 Michael S. Tsirkin 2010-01-14  1527  	u32 idx;
3a4d5c94e95935 Michael S. Tsirkin 2010-01-14  1528  	long r;
3a4d5c94e95935 Michael S. Tsirkin 2010-01-14  1529
3a4d5c94e95935 Michael S. Tsirkin 2010-01-14  1530  	r = get_user(idx, idxp);
3a4d5c94e95935 Michael S. Tsirkin 2010-01-14  1531  	if (r < 0)
3a4d5c94e95935 Michael S. Tsirkin 2

Re: [PATCH 1/3] kexec: Prevent removal of memory in use by a loaded kexec image

2020-04-14 Thread Baoquan He
On 04/14/20 at 04:49pm, David Hildenbrand wrote:
> > The root cause is that the kexec-ed kernel is targeted at a hotpluggable
> > memory region. Just avoiding the movable area can fix it. In
> > kexec_file_load(), just checking for and picking unmovable regions to put
> > the kernel/initrd in, in locate_mem_hole_callback(), can fix it. Whether a
> > page or pageblock's zone is movable is easy to know. This fix doesn't need
> > to bother any other component.
> 
>  I don't fully agree. E.g., just because memory is onlined to ZONE_NORMAL
>  does not imply that it cannot get offlined and removed e.g., this is
>  heavily used on ppc64, with 16MB sections.
> >>>
> >>> Really? I just know there are two kinds of mem hotplug in ppc, but don't
> >>> know the details. So in this case, is there any flag or a way to know
> >>> those memory blocks are hotpluggable? I am curious how kernel data is
> >>> kept out of this area. Or does ppc just freely use it for kernel or user
> >>> space data, then try to migrate on hot remove?
> >>
> >> See
> >> arch/powerpc/platforms/pseries/hotplug-memory.c:dlpar_memory_remove_by_count()
> >>
> >> Under DLAPR, it can remove memory in LMB granularity, which is usually
> >> 16MB (== single section on ppc64). DLPAR will directly online all
> >> hotplugged memory (LMBs) from the kernel using device_online(), which
> >> will go to ZONE_NORMAL.
> >>
> >> When trying to remove memory, it simply scans for offlineable 16MB
> >> memory blocks (==section == LMB), offlines and removes them. No need for
> >> the movable zone and all the involved issues.
> > 
> > Yes, this is a different one, thanks for pointing it out. It sounds like
> > the balloon driver on virt platforms, doesn't it?
> 
> With DLPAR there is a hypervisor involved (which manages the actual HW
> DIMMs), so yes.
> 
> > 
> > Avoiding to put kexec kernel into movable zone can't solve this DLPAR
> > case as you said.
> > 
> >>
> >> Now, the interesting question is, can we have LMBs added during boot
> >> (not via add_memory()), that will later be removed via remove_memory().
> >> IIRC, we had BUGs related to that, so I think yes. If a section contains
> >> no unmovable allocations (after boot), it can get removed.
> > 
> > I do want to ask this question. If we can add LMBs into system RAM, then
> > reloading kexec can solve it.
> > 
> > Another, better way is adding a common function to filter out the
> > movable zone when searching for a position for the kexec kernel, plus an
> > arch-specific function to filter out DLPAR memory blocks for ppc only.
> > Over there, we can simply use for_each_drmem_lmb() to do that.
> 
> I was thinking about something similar. Maybe something like a notifier
> that can be used to test if selected memory can be used for kexec

Not sure if I get the notifier idea clearly. If you mean 

1) Add a common function to pick memory in unmovable zone;
2) Let DLPAR, balloon register with notifier;
3) In the common function, ask notified part to check if the picked
   unmovable memory is available for locating kexec kernel;

Sounds doable to me, and not complicated.

> images. It would apply to
> 
> - arm64 and filter out all hotadded memory (IIRC, only boot memory can
>   be used).

Do you mean that memory hot-added after boot can't be recognized and added
into system RAM on arm64?


> - powerpc to filter out all LMBs that can be removed (assuming not all
>   memory corresponds to LMBs that can be removed, otherwise we're in
>   trouble ... :) )
> - virtio-mem to filter out all memory it added.
> - hyper-v to filter out partially backed memory blocks (esp. the last
>   memory block it added and only partially backed it by memory).
> 
> This would make it work for kexec_file_load(), however, I do wonder how
> we would want to approach that from userspace kexec-tools when handling
> it from kexec_load().

Let's make kexec_file_load() work first, since this work is only the first
step toward making the kexec-ed kernel not break memory hotplug. After a
kexec reboot, KASLR may locate the kernel in a hotpluggable area too.
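
For illustration, the notifier idea could look roughly like this (a
hypothetical sketch; all names are invented, and real code would need
locking and error-handling decisions):

#include <linux/notifier.h>
#include <linux/types.h>

struct kexec_mem_range {
	u64 start;
	u64 end;
};

static BLOCKING_NOTIFIER_HEAD(kexec_mem_notifiers);

/* DLPAR, virtio-mem, hyper-v, ... each register a callback that
 * returns NOTIFY_BAD for ranges they may later hot-remove. */
int register_kexec_mem_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&kexec_mem_notifiers, nb);
}

/* Called from locate_mem_hole_callback(): accept a candidate range
 * only if no registered party objects to it. */
static bool kexec_range_is_stable(u64 start, u64 end)
{
	struct kexec_mem_range range = { .start = start, .end = end };

	return blocking_notifier_call_chain(&kexec_mem_notifiers, 0,
					    &range) != NOTIFY_BAD;
}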



Re: [PATCH 1/4] dma-mapping: move the remaining DMA API calls out of line

2020-04-14 Thread Alexey Kardashevskiy



On 14/04/2020 22:25, Christoph Hellwig wrote:
> For a long time the DMA API has been implemented inline in dma-mapping.h,
> but the function bodies can be quite large.  Move them all out of line.
> 
> Signed-off-by: Christoph Hellwig 
> ---
>  include/linux/dma-direct.h  |  58 +
>  include/linux/dma-mapping.h | 247 
>  kernel/dma/direct.c |   9 --
>  kernel/dma/mapping.c| 164 
>  4 files changed, 244 insertions(+), 234 deletions(-)
> 
> diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
> index 24b8684aa21d..da689ad5fffd 100644
> --- a/include/linux/dma-direct.h
> +++ b/include/linux/dma-direct.h
> @@ -85,4 +85,62 @@ int dma_direct_mmap(struct device *dev, struct 
> vm_area_struct *vma,
>   void *cpu_addr, dma_addr_t dma_addr, size_t size,
>   unsigned long attrs);
>  int dma_direct_supported(struct device *dev, u64 mask);
> +dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
> + unsigned long offset, size_t size, enum dma_data_direction dir,
> + unsigned long attrs);
> +int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
> + enum dma_data_direction dir, unsigned long attrs);
> +dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr,
> + size_t size, enum dma_data_direction dir, unsigned long attrs);
> +
> +#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
> +defined(CONFIG_SWIOTLB)
> +void dma_direct_sync_single_for_device(struct device *dev,
> + dma_addr_t addr, size_t size, enum dma_data_direction dir);
> +void dma_direct_sync_sg_for_device(struct device *dev,
> + struct scatterlist *sgl, int nents, enum dma_data_direction 
> dir);
> +#else
> +static inline void dma_direct_sync_single_for_device(struct device *dev,
> + dma_addr_t addr, size_t size, enum dma_data_direction dir)
> +{
> +}
> +static inline void dma_direct_sync_sg_for_device(struct device *dev,
> + struct scatterlist *sgl, int nents, enum dma_data_direction dir)
> +{
> +}
> +#endif
> +
> +#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
> +defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \
> +defined(CONFIG_SWIOTLB)
> +void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
> + size_t size, enum dma_data_direction dir, unsigned long attrs);
> +void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
> + int nents, enum dma_data_direction dir, unsigned long attrs);
> +void dma_direct_sync_single_for_cpu(struct device *dev,
> + dma_addr_t addr, size_t size, enum dma_data_direction dir);
> +void dma_direct_sync_sg_for_cpu(struct device *dev,
> + struct scatterlist *sgl, int nents, enum dma_data_direction 
> dir);
> +#else
> +static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
> + size_t size, enum dma_data_direction dir, unsigned long attrs)
> +{
> +}
> +static inline void dma_direct_unmap_sg(struct device *dev,
> + struct scatterlist *sgl, int nents, enum dma_data_direction dir,
> + unsigned long attrs)
> +{
> +}
> +static inline void dma_direct_sync_single_for_cpu(struct device *dev,
> + dma_addr_t addr, size_t size, enum dma_data_direction dir)
> +{
> +}
> +static inline void dma_direct_sync_sg_for_cpu(struct device *dev,
> + struct scatterlist *sgl, int nents, enum dma_data_direction dir)
> +{
> +}
> +#endif
> +
> +size_t dma_direct_max_mapping_size(struct device *dev);
> +
>  #endif /* _LINUX_DMA_DIRECT_H */
> diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
> index 330ad58fbf4d..793ad775cd54 100644
> --- a/include/linux/dma-mapping.h
> +++ b/include/linux/dma-mapping.h
> @@ -188,73 +188,6 @@ static inline int dma_mmap_from_global_coherent(struct 
> vm_area_struct *vma,
>  }
>  #endif /* CONFIG_DMA_DECLARE_COHERENT */
>  
> -static inline bool dma_is_direct(const struct dma_map_ops *ops)
> -{
> - return likely(!ops);
> -}
> -
> -/*
> - * All the dma_direct_* declarations are here just for the indirect call 
> bypass,
> - * and must not be used directly drivers!
> - */
> -dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
> - unsigned long offset, size_t size, enum dma_data_direction dir,
> - unsigned long attrs);
> -int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
> - enum dma_data_direction dir, unsigned long attrs);
> -dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr,
> - size_t size, enum dma_data_direction dir, unsigned long attrs);
> -
> -#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
> -defined(CONFIG_SWIOTLB)
> -void dma_direct_sync_single_for_device(struct device *dev,
> - dma_addr_t addr, size_t size, enum 

[PATCH 1/2] ocxl: Remove unnecessary externs

2020-04-14 Thread Alastair D'Silva
Function declarations don't need externs; remove the existing ones
so they are consistent with newer code.

Signed-off-by: Alastair D'Silva 
Acked-by: Andrew Donnellan 
Acked-by: Frederic Barrat 
---
 arch/powerpc/include/asm/pnv-ocxl.h | 40 ++---
 include/misc/ocxl.h |  6 ++---
 2 files changed, 22 insertions(+), 24 deletions(-)

diff --git a/arch/powerpc/include/asm/pnv-ocxl.h 
b/arch/powerpc/include/asm/pnv-ocxl.h
index 7de82647e761..ee79d2cd9fb6 100644
--- a/arch/powerpc/include/asm/pnv-ocxl.h
+++ b/arch/powerpc/include/asm/pnv-ocxl.h
@@ -9,28 +9,26 @@
 #define PNV_OCXL_TL_BITS_PER_RATE   4
 #define PNV_OCXL_TL_RATE_BUF_SIZE   ((PNV_OCXL_TL_MAX_TEMPLATE+1) * 
PNV_OCXL_TL_BITS_PER_RATE / 8)
 
-extern int pnv_ocxl_get_actag(struct pci_dev *dev, u16 *base, u16 *enabled,
-   u16 *supported);
-extern int pnv_ocxl_get_pasid_count(struct pci_dev *dev, int *count);
+int pnv_ocxl_get_actag(struct pci_dev *dev, u16 *base, u16 *enabled, u16 
*supported);
+int pnv_ocxl_get_pasid_count(struct pci_dev *dev, int *count);
 
-extern int pnv_ocxl_get_tl_cap(struct pci_dev *dev, long *cap,
+int pnv_ocxl_get_tl_cap(struct pci_dev *dev, long *cap,
char *rate_buf, int rate_buf_size);
-extern int pnv_ocxl_set_tl_conf(struct pci_dev *dev, long cap,
-   uint64_t rate_buf_phys, int rate_buf_size);
-
-extern int pnv_ocxl_get_xsl_irq(struct pci_dev *dev, int *hwirq);
-extern void pnv_ocxl_unmap_xsl_regs(void __iomem *dsisr, void __iomem *dar,
-   void __iomem *tfc, void __iomem *pe_handle);
-extern int pnv_ocxl_map_xsl_regs(struct pci_dev *dev, void __iomem **dsisr,
-   void __iomem **dar, void __iomem **tfc,
-   void __iomem **pe_handle);
-
-extern int pnv_ocxl_spa_setup(struct pci_dev *dev, void *spa_mem, int PE_mask,
-   void **platform_data);
-extern void pnv_ocxl_spa_release(void *platform_data);
-extern int pnv_ocxl_spa_remove_pe_from_cache(void *platform_data, int 
pe_handle);
-
-extern int pnv_ocxl_alloc_xive_irq(u32 *irq, u64 *trigger_addr);
-extern void pnv_ocxl_free_xive_irq(u32 irq);
+int pnv_ocxl_set_tl_conf(struct pci_dev *dev, long cap,
+uint64_t rate_buf_phys, int rate_buf_size);
+
+int pnv_ocxl_get_xsl_irq(struct pci_dev *dev, int *hwirq);
+void pnv_ocxl_unmap_xsl_regs(void __iomem *dsisr, void __iomem *dar,
+void __iomem *tfc, void __iomem *pe_handle);
+int pnv_ocxl_map_xsl_regs(struct pci_dev *dev, void __iomem **dsisr,
+ void __iomem **dar, void __iomem **tfc,
+ void __iomem **pe_handle);
+
+int pnv_ocxl_spa_setup(struct pci_dev *dev, void *spa_mem, int PE_mask, void 
**platform_data);
+void pnv_ocxl_spa_release(void *platform_data);
+int pnv_ocxl_spa_remove_pe_from_cache(void *platform_data, int pe_handle);
+
+int pnv_ocxl_alloc_xive_irq(u32 *irq, u64 *trigger_addr);
+void pnv_ocxl_free_xive_irq(u32 irq);
 
 #endif /* _ASM_PNV_OCXL_H */
diff --git a/include/misc/ocxl.h b/include/misc/ocxl.h
index 06dd5839e438..0a762e387418 100644
--- a/include/misc/ocxl.h
+++ b/include/misc/ocxl.h
@@ -173,7 +173,7 @@ int ocxl_context_detach(struct ocxl_context *ctx);
  *
  * Returns 0 on success, negative on failure
  */
-extern int ocxl_afu_irq_alloc(struct ocxl_context *ctx, int *irq_id);
+int ocxl_afu_irq_alloc(struct ocxl_context *ctx, int *irq_id);
 
 /**
  * Frees an IRQ associated with an AFU context
@@ -182,7 +182,7 @@ extern int ocxl_afu_irq_alloc(struct ocxl_context *ctx, int 
*irq_id);
  *
  * Returns 0 on success, negative on failure
  */
-extern int ocxl_afu_irq_free(struct ocxl_context *ctx, int irq_id);
+int ocxl_afu_irq_free(struct ocxl_context *ctx, int irq_id);
 
 /**
  * Gets the address of the trigger page for an IRQ
@@ -193,7 +193,7 @@ extern int ocxl_afu_irq_free(struct ocxl_context *ctx, int 
irq_id);
  *
  * returns the trigger page address, or 0 if the IRQ is not valid
  */
-extern u64 ocxl_afu_irq_get_addr(struct ocxl_context *ctx, int irq_id);
+u64 ocxl_afu_irq_get_addr(struct ocxl_context *ctx, int irq_id);
 
 /**
  * Provide a callback to be called when an IRQ is triggered
-- 
2.25.1



[PATCH 2/2] ocxl: Address kernel doc errors & warnings

2020-04-14 Thread Alastair D'Silva
This patch addresses warnings and errors from the kernel doc scripts for
the OpenCAPI driver.

It also makes minor tweaks to make the docs more consistent.

Signed-off-by: Alastair D'Silva 
Acked-by: Andrew Donnellan 
---
 drivers/misc/ocxl/config.c| 24 
 drivers/misc/ocxl/ocxl_internal.h |  9 +--
 include/misc/ocxl.h   | 96 ---
 3 files changed, 55 insertions(+), 74 deletions(-)

diff --git a/drivers/misc/ocxl/config.c b/drivers/misc/ocxl/config.c
index c8e19bfb5ef9..a62e3d7db2bf 100644
--- a/drivers/misc/ocxl/config.c
+++ b/drivers/misc/ocxl/config.c
@@ -273,16 +273,16 @@ static int read_afu_info(struct pci_dev *dev, struct 
ocxl_fn_config *fn,
 }
 
 /**
- * Read the template version from the AFU
- * dev: the device for the AFU
- * fn: the AFU offsets
- * len: outputs the template length
- * version: outputs the major<<8,minor version
+ * read_template_version() - Read the template version from the AFU
+ * @dev: the device for the AFU
+ * @fn: the AFU offsets
+ * @len: outputs the template length
+ * @version: outputs the major<<8,minor version
  *
  * Returns 0 on success, negative on failure
  */
 static int read_template_version(struct pci_dev *dev, struct ocxl_fn_config 
*fn,
-   u16 *len, u16 *version)
+u16 *len, u16 *version)
 {
u32 val32;
u8 major, minor;
@@ -476,16 +476,16 @@ static int validate_afu(struct pci_dev *dev, struct 
ocxl_afu_config *afu)
 }
 
 /**
- * Populate AFU metadata regarding LPC memory
- * dev: the device for the AFU
- * fn: the AFU offsets
- * afu: the AFU struct to populate the LPC metadata into
+ * read_afu_lpc_memory_info() - Populate AFU metadata regarding LPC memory
+ * @dev: the device for the AFU
+ * @fn: the AFU offsets
+ * @afu: the AFU struct to populate the LPC metadata into
  *
  * Returns 0 on success, negative on failure
  */
 static int read_afu_lpc_memory_info(struct pci_dev *dev,
-   struct ocxl_fn_config *fn,
-   struct ocxl_afu_config *afu)
+   struct ocxl_fn_config *fn,
+   struct ocxl_afu_config *afu)
 {
int rc;
u32 val32;
diff --git a/drivers/misc/ocxl/ocxl_internal.h 
b/drivers/misc/ocxl/ocxl_internal.h
index 345bf843a38e..198e4e4bc51d 100644
--- a/drivers/misc/ocxl/ocxl_internal.h
+++ b/drivers/misc/ocxl/ocxl_internal.h
@@ -122,11 +122,12 @@ int ocxl_config_check_afu_index(struct pci_dev *dev,
struct ocxl_fn_config *fn, int afu_idx);
 
 /**
- * Update values within a Process Element
+ * ocxl_link_update_pe() - Update values within a Process Element
+ * @link_handle: the link handle associated with the process element
+ * @pasid: the PASID for the AFU context
+ * @tid: the new thread id for the process element
  *
- * link_handle: the link handle associated with the process element
- * pasid: the PASID for the AFU context
- * tid: the new thread id for the process element
+ * Returns 0 on success
  */
 int ocxl_link_update_pe(void *link_handle, int pasid, __u16 tid);
 
diff --git a/include/misc/ocxl.h b/include/misc/ocxl.h
index 0a762e387418..357ef1aadbc0 100644
--- a/include/misc/ocxl.h
+++ b/include/misc/ocxl.h
@@ -62,8 +62,7 @@ struct ocxl_context;
 // Device detection & initialisation
 
 /**
- * Open an OpenCAPI function on an OpenCAPI device
- *
+ * ocxl_function_open() - Open an OpenCAPI function on an OpenCAPI device
  * @dev: The PCI device that contains the function
  *
  * Returns an opaque pointer to the function, or an error pointer (check with 
IS_ERR)
@@ -71,8 +70,7 @@ struct ocxl_context;
 struct ocxl_fn *ocxl_function_open(struct pci_dev *dev);
 
 /**
- * Get the list of AFUs associated with a PCI function device
- *
+ * ocxl_function_afu_list() - Get the list of AFUs associated with a PCI 
function device
  * Returns a list of struct ocxl_afu *
  *
  * @fn: The OpenCAPI function containing the AFUs
@@ -80,8 +78,7 @@ struct ocxl_fn *ocxl_function_open(struct pci_dev *dev);
 struct list_head *ocxl_function_afu_list(struct ocxl_fn *fn);
 
 /**
- * Fetch an AFU instance from an OpenCAPI function
- *
+ * ocxl_function_fetch_afu() - Fetch an AFU instance from an OpenCAPI function
  * @fn: The OpenCAPI function to get the AFU from
  * @afu_idx: The index of the AFU to get
  *
@@ -92,23 +89,20 @@ struct list_head *ocxl_function_afu_list(struct ocxl_fn 
*fn);
 struct ocxl_afu *ocxl_function_fetch_afu(struct ocxl_fn *fn, u8 afu_idx);
 
 /**
- * Take a reference to an AFU
- *
+ * ocxl_afu_get() - Take a reference to an AFU
  * @afu: The AFU to increment the reference count on
  */
 void ocxl_afu_get(struct ocxl_afu *afu);
 
 /**
- * Release a reference to an AFU
- *
+ * ocxl_afu_put() - Release a reference to an AFU
  * @afu: The AFU to decrement the reference count on
  */
 void ocxl_afu_put(struct ocxl_afu *afu);
 
 
 /**
- * Get the configuration information for an 

[PATCH 0/2] powerpc: OpenCAPI Cleanup

2020-04-14 Thread Alastair D'Silva
These patches address checkpatch & kernel doc warnings
in the OpenCAPI infrastructure.

Alastair D'Silva (2):
  ocxl: Remove unnecessary externs
  ocxl: Address kernel doc errors & warnings

 arch/powerpc/include/asm/pnv-ocxl.h |  40 ++-
 drivers/misc/ocxl/config.c  |  24 +++
 drivers/misc/ocxl/ocxl_internal.h   |   9 +--
 include/misc/ocxl.h | 102 +++-
 4 files changed, 77 insertions(+), 98 deletions(-)

-- 
2.25.1



Re: [PATCH v2 25/33] docs: powerpc: cxl.rst: mark two section titles as such

2020-04-14 Thread Andrew Donnellan

On 15/4/20 2:48 am, Mauro Carvalho Chehab wrote:

The User API chapter contains two sub-chapters. Mark them as
such.

Signed-off-by: Mauro Carvalho Chehab 


Acked-by: Andrew Donnellan 


--
Andrew Donnellan  OzLabs, ADL Canberra
a...@linux.ibm.com IBM Australia Limited



Re: [PATCH] target/ppc: Fix mtmsr(d) L=1 variant that loses interrupts

2020-04-14 Thread Nathan Chancellor
On Tue, Apr 14, 2020 at 09:11:31PM +1000, Nicholas Piggin wrote:
> If mtmsr L=1 sets MSR[EE] while there is a maskable exception pending,
> it does not cause an interrupt. This causes the test case to hang:
> 
> https://lists.gnu.org/archive/html/qemu-ppc/2019-10/msg00826.html
> 
> More recently, Linux reduced the occurrence of operations (e.g., rfi)
> which stop translation and allow pending interrupts to be processed.
> This started causing hangs in Linux boot in long-running kernel tests,
> running with '-d int' shows the decrementer stops firing despite DEC
> wrapping and MSR[EE]=1.
> 
> https://lists.ozlabs.org/pipermail/linuxppc-dev/2020-April/208301.html
> 
> The cause is the broken mtmsr L=1 behaviour, which is contrary to the
> architecture. From Power ISA v3.0B, p.977, Move To Machine State Register,
> Programming Note states:
> 
> If MSR[EE]=0 and an External, Decrementer, or Performance Monitor
> exception is pending, executing an mtmsrd instruction that sets
> MSR[EE] to 1 will cause the interrupt to occur before the next
> instruction is executed, if no higher priority exception exists
> 
> Fix this by handling L=1 exactly the same way as L=0, modulo the MSR
> bits altered.
> 
> The confusion arises from L=0 being "context synchronizing" whereas L=1
> is "execution synchronizing", which is a weaker semantic. However this
> is not a relaxation of the requirement that these exceptions cause
> interrupts when MSR[EE]=1 (e.g., when mtmsr executes to completion as
> TCG is doing here), rather it specifies how a pipelined processor can
> have multiple instructions in flight where one may influence how another
> behaves.
> 
> Cc: qemu-sta...@nongnu.org
> Reported-by: Anton Blanchard 
> Reported-by: Nathan Chancellor 
> Tested-by: Nathan Chancellor 
> Signed-off-by: Nicholas Piggin 
> ---
> Thanks very much to Nathan for reporting and testing it, I added his
> Tested-by tag despite a more polished patch, as the the basics are 
> still the same (and still fixes his test case here).

I did re-run the test with the updated version of your patch and it
still passed, so that tag can stand without any controversy :)

Thank you for the fix again!
Nathan

> This bug possibly goes back to early v2.04 / mtmsrd L=1 support around
> 2007, and the code has been changed several times since then so may
> require some backporting.
> 
> 32-bit / mtmsr untested at the moment, I don't have an environment
> handy.
> 
>  target/ppc/translate.c | 46 +-
>  1 file changed, 27 insertions(+), 19 deletions(-)
> 
> diff --git a/target/ppc/translate.c b/target/ppc/translate.c
> index b207fb5386..9959259dba 100644
> --- a/target/ppc/translate.c
> +++ b/target/ppc/translate.c
> @@ -4361,30 +4361,34 @@ static void gen_mtmsrd(DisasContext *ctx)
>  CHK_SV;
>  
>  #if !defined(CONFIG_USER_ONLY)
> +if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
> +gen_io_start();
> +}
>  if (ctx->opcode & 0x0001) {
> -/* Special form that does not need any synchronisation */
> +/* L=1 form only updates EE and RI */
>  TCGv t0 = tcg_temp_new();
> +TCGv t1 = tcg_temp_new();
>  tcg_gen_andi_tl(t0, cpu_gpr[rS(ctx->opcode)],
>  (1 << MSR_RI) | (1 << MSR_EE));
> -tcg_gen_andi_tl(cpu_msr, cpu_msr,
> +tcg_gen_andi_tl(t1, cpu_msr,
>  ~(target_ulong)((1 << MSR_RI) | (1 << MSR_EE)));
> -tcg_gen_or_tl(cpu_msr, cpu_msr, t0);
> +tcg_gen_or_tl(t1, t1, t0);
> +
> +gen_helper_store_msr(cpu_env, t1);
>  tcg_temp_free(t0);
> +tcg_temp_free(t1);
> +
>  } else {
>  /*
>   * XXX: we need to update nip before the store if we enter
>   *  power saving mode, we will exit the loop directly from
>   *  ppc_store_msr
>   */
> -if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
> -gen_io_start();
> -}
>  gen_update_nip(ctx, ctx->base.pc_next);
>  gen_helper_store_msr(cpu_env, cpu_gpr[rS(ctx->opcode)]);
> -/* Must stop the translation as machine state (may have) changed */
> -/* Note that mtmsr is not always defined as context-synchronizing */
> -gen_stop_exception(ctx);
>  }
> +/* Must stop the translation as machine state (may have) changed */
> +gen_stop_exception(ctx);
>  #endif /* !defined(CONFIG_USER_ONLY) */
>  }
>  #endif /* defined(TARGET_PPC64) */
> @@ -4394,15 +4398,23 @@ static void gen_mtmsr(DisasContext *ctx)
>  CHK_SV;
>  
>  #if !defined(CONFIG_USER_ONLY)
> -   if (ctx->opcode & 0x0001) {
> -/* Special form that does not need any synchronisation */
> +if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
> +gen_io_start();
> +}
> +if (ctx->opcode & 0x0001) {
> +/* L=1 form only updates EE and RI */
>  TCGv t0 = tcg_temp_new();
> +TCGv t1 = tcg_temp_new();

[PATCH] powerpc/powernv/pci: Add an explanation for PNV_IODA_PE_BUS_ALL

2020-04-14 Thread Oliver O'Halloran
It's pretty obscure and confused me for a long time, so I figured it's
worth documenting properly.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/powernv/pci.h | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/pci.h 
b/arch/powerpc/platforms/powernv/pci.h
index a8cde70..51c254f2 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -33,6 +33,24 @@ enum pnv_phb_model {
 #define PNV_IODA_PE_SLAVE  (1 << 4)/* Slave PE in compound case
*/
 #define PNV_IODA_PE_VF (1 << 5)/* PE for one VF
*/
 
+/*
+ * A brief note on PNV_IODA_PE_BUS_ALL
+ *
+ * This is needed because of the behaviour of PCIe-to-PCI bridges. The PHB uses
+ * the Requester ID field of the PCIe request header to determine the device
+ * (and PE) that initiated a DMA. In legacy PCI individual memory read/write
+ * requests aren't tagged with the RID. To work around this the PCIe-to-PCI
+ * bridge will use (secondary_bus_no << 8) | 0x00 as the RID on the PCIe side.
+ *
+ * PCIe-to-X bridges have a similar issue even though PCI-X requests also have
+ * a RID in the transaction header. The PCIe-to-X bridge is permitted to "take
+ * ownership" of a transaction by a PCI-X device when forwarding it to the PCIe
+ * side of the bridge.
+ *
+ * To work around these problems we use the BUS_ALL flag since every 
subordinate
+ * bus of the bridge should go into the same PE.
+ */
+
 /* Indicates operations are frozen for a PE: MMIO in PESTA & DMA in PESTB. */
 #define PNV_IODA_STOPPED_STATE 0x8000
 
-- 
2.9.5



Re: [PATCH 3/8] fs: wrap simple_pin_fs/simple_release_fs arguments in a struct

2020-04-14 Thread Greg Kroah-Hartman
On Tue, Apr 14, 2020 at 02:42:57PM +0200, Emanuele Giuseppe Esposito wrote:
> @@ -676,9 +674,9 @@ static void __debugfs_file_removed(struct dentry *dentry)
>  
>  static void remove_one(struct dentry *victim)
>  {
> -if (d_is_reg(victim))
> +if (d_is_reg(victim))
>   __debugfs_file_removed(victim);
> - simple_release_fs(_mount, _mount_count);
> + simple_release_fs();
>  }
>  
>  /**

Always run checkpatch.pl on your patches so you do not get grumpy
maintainers telling you to run checkpatch.pl on your patches...
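
For reference, a sketch of the usual checkpatch invocation, run from the top
of the kernel tree against the patches produced by git format-patch (file
name illustrative):

    ./scripts/checkpatch.pl 0003-fs-wrap-simple_pin_fs-simple_release_fs-arguments.patch

or simply ./scripts/checkpatch.pl *.patch for the whole series.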




Re: [PATCH 4/8] fs: introduce simple_new_inode

2020-04-14 Thread Greg Kroah-Hartman
On Tue, Apr 14, 2020 at 02:42:58PM +0200, Emanuele Giuseppe Esposito wrote:
> It is a common special case for new_inode to initialize the
> times to the current time and the inode number to get_next_ino().
> Introduce a core function that does it and use it throughout
> Linux.

Shouldn't this just be called new_inode_current_time()?

How is anyone going to remember what simple_new_inode() does to the
inode structure?

> --- a/fs/libfs.c
> +++ b/fs/libfs.c
> @@ -595,6 +595,18 @@ int simple_write_end(struct file *file, struct 
> address_space *mapping,
>  }
>  EXPORT_SYMBOL(simple_write_end);
>  
> +struct inode *simple_new_inode(struct super_block *sb)
> +{
> + struct inode *inode = new_inode(sb);
> + if (inode) {
> + inode->i_ino = get_next_ino();
> + inode->i_atime = inode->i_mtime =
> + inode->i_ctime = current_time(inode);
> + }
> + return inode;
> +}
> +EXPORT_SYMBOL(simple_new_inode);

No kernel doc explaining that get_next_ino() is called already?

Please document new global functions like this so we have a chance to
know how to use them.
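
(A sketch of the kind of kernel-doc block being asked for; wording mine,
based on the code in the patch:

/**
 * simple_new_inode - allocate an inode with i_ino and timestamps filled in
 * @sb: superblock to allocate the inode from
 *
 * Like new_inode(), but additionally assigns the inode number via
 * get_next_ino() and sets atime/mtime/ctime to the current time.
 *
 * Return: the new inode, or NULL if allocation fails.
 */
)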

Also, it is almost always easier to introduce a common function, get it
merged, and _THEN_ send out cleanup patches to all of the different
subsystems to convert over to it.  Yes, it takes longer, but it makes it
possible to do this in a way that can be reviewed properly, unlike this
patch series :(

thanks,

greg k-h


Re: [PATCH 6/8] simplefs: add file creation functions

2020-04-14 Thread Greg Kroah-Hartman
On Tue, Apr 14, 2020 at 02:43:00PM +0200, Emanuele Giuseppe Esposito wrote:
> A bunch of code is duplicated between debugfs and tracefs, unify it to the
> simplefs library.
> 
> The code is very similar, except that dentry and inode creation are unified
> into a single function (unlike start_creating in debugfs and tracefs, which
> only takes care of dentries).  This adds an output parameter to the creation
> functions, but pushes all error recovery into fs/simplefs.c.
> 
> Signed-off-by: Emanuele Giuseppe Esposito 
> ---
>  fs/simplefs.c| 150 +++
>  include/linux/simplefs.h |  19 +
>  2 files changed, 169 insertions(+)

What's wrong with libfs, isn't that supposed to be for these types of
"common" filesystem interactions?

Why create a whole "new" fs for this?

thanks,

greg k-h


Re: [PATCH 5/8] simplefs: add alloc_anon_inode wrapper

2020-04-14 Thread Greg Kroah-Hartman
On Tue, Apr 14, 2020 at 02:42:59PM +0200, Emanuele Giuseppe Esposito wrote:
> Start adding file creation wrappers; the simplest returns an anonymous
> inode.

This changelog text does not make much sense on its own.  Please say why
you are doing something, not just what you are doing.

thanks,

greg k-h


Re: [PATCH 2/8] fs: extract simple_pin/release_fs to separate files

2020-04-14 Thread Greg Kroah-Hartman
On Tue, Apr 14, 2020 at 02:42:56PM +0200, Emanuele Giuseppe Esposito wrote:
> We will augment this family of functions with inode management.  To avoid
> littering include/linux/fs.h and fs/libfs.c, move them to a separate header,
> with a Kconfig symbol to enable them.
> 
> Signed-off-by: Emanuele Giuseppe Esposito 

You have a lot of people on cc:, this is going to be hard for everyone
to review...


> diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
> index d1398cef3b18..fc38a6f0fc11 100644
> --- a/lib/Kconfig.debug
> +++ b/lib/Kconfig.debug
> @@ -288,12 +288,16 @@ config STRIP_ASM_SYMS
>  
>  config READABLE_ASM
>   bool "Generate readable assembler code"
> - depends on DEBUG_KERNEL
> - help
> -   Disable some compiler optimizations that tend to generate human 
> unreadable
> -   assembler output. This may make the kernel slightly slower, but it 
> helps
> -   to keep kernel developers who have to stare a lot at assembler 
> listings
> -   sane.
> +depends on DEBUG_KERNEL
> +help
> +  Disable some compiler optimizations that tend to generate human 
> unreadable
> +  assembler output. This may make the kernel slightly slower, but it 
> helps
> +  to keep kernel developers who have to stare a lot at assembler listings
> +  sane.
> +   

Why did you lose the indentation here and add trailing whitespace?

> +config DEBUG_FS
> + bool "Debug Filesystem"
> + select SIMPLEFS
>  

We already have a DEBUG_FS config option in this file, why another one?
And what happened to the help text?

I think you need to rework your patch series to do smaller things on
each step, which would make it reviewable much easier, and prevent
mistakes like this one.

thanks,

greg k-h


Re: [PATCH] vhost: do not enable VHOST_MENU by default

2020-04-14 Thread kbuild test robot
Hi Jason,

I love your patch! Yet something to improve:

[auto build test ERROR on vhost/linux-next]
[also build test ERROR on next-20200414]
[cannot apply to powerpc/next s390/features v5.7-rc1]
[if your patch is applied to the wrong git tree, please drop us a note to help
improve the system. BTW, we also suggest to use '--base' option to specify the
base tree in git format-patch, please see https://stackoverflow.com/a/37406982]

url:
https://github.com/0day-ci/linux/commits/Jason-Wang/vhost-do-not-enable-VHOST_MENU-by-default/20200414-110807
base:   https://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git linux-next
config: ia64-randconfig-a001-20200415 (attached as .config)
compiler: ia64-linux-gcc (GCC) 9.3.0
reproduce:
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
GCC_VERSION=9.3.0 make.cross ARCH=ia64 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kbuild test robot 

All error/warnings (new ones prefixed by >>):

   drivers/vhost/vhost.c: In function 'vhost_vring_ioctl':
>> drivers/vhost/vhost.c:1577:33: error: implicit declaration of function 
>> 'eventfd_fget'; did you mean 'eventfd_signal'? 
>> [-Werror=implicit-function-declaration]
1577 |   eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd);
 | ^~~~
 | eventfd_signal
>> drivers/vhost/vhost.c:1577:31: warning: pointer/integer type mismatch in 
>> conditional expression
1577 |   eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd);
 |   ^
   cc1: some warnings being treated as errors

vim +1577 drivers/vhost/vhost.c

feebcaeac79ad8 Jason Wang 2019-05-24  1493  
feebcaeac79ad8 Jason Wang 2019-05-24  1494  static long 
vhost_vring_set_num_addr(struct vhost_dev *d,
feebcaeac79ad8 Jason Wang 2019-05-24  1495  
 struct vhost_virtqueue *vq,
feebcaeac79ad8 Jason Wang 2019-05-24  1496  
 unsigned int ioctl,
feebcaeac79ad8 Jason Wang 2019-05-24  1497  
 void __user *argp)
feebcaeac79ad8 Jason Wang 2019-05-24  1498  {
feebcaeac79ad8 Jason Wang 2019-05-24  1499  long r;
feebcaeac79ad8 Jason Wang 2019-05-24  1500  
feebcaeac79ad8 Jason Wang 2019-05-24  1501  mutex_lock(>mutex);
feebcaeac79ad8 Jason Wang 2019-05-24  1502  
feebcaeac79ad8 Jason Wang 2019-05-24  1503  switch (ioctl) {
feebcaeac79ad8 Jason Wang 2019-05-24  1504  case 
VHOST_SET_VRING_NUM:
feebcaeac79ad8 Jason Wang 2019-05-24  1505  r = 
vhost_vring_set_num(d, vq, argp);
feebcaeac79ad8 Jason Wang 2019-05-24  1506  break;
feebcaeac79ad8 Jason Wang 2019-05-24  1507  case 
VHOST_SET_VRING_ADDR:
feebcaeac79ad8 Jason Wang 2019-05-24  1508  r = 
vhost_vring_set_addr(d, vq, argp);
feebcaeac79ad8 Jason Wang 2019-05-24  1509  break;
feebcaeac79ad8 Jason Wang 2019-05-24  1510  default:
feebcaeac79ad8 Jason Wang 2019-05-24  1511  BUG();
feebcaeac79ad8 Jason Wang 2019-05-24  1512  }
feebcaeac79ad8 Jason Wang 2019-05-24  1513  
feebcaeac79ad8 Jason Wang 2019-05-24  1514  
mutex_unlock(>mutex);
feebcaeac79ad8 Jason Wang 2019-05-24  1515  
feebcaeac79ad8 Jason Wang 2019-05-24  1516  return r;
feebcaeac79ad8 Jason Wang 2019-05-24  1517  }
26b36604523f4a Sonny Rao  2018-03-14  1518  long 
vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
3a4d5c94e95935 Michael S. Tsirkin 2010-01-14  1519  {
cecb46f194460d Al Viro2012-08-27  1520  struct file *eventfp, 
*filep = NULL;
cecb46f194460d Al Viro2012-08-27  1521  bool pollstart = false, 
pollstop = false;
3a4d5c94e95935 Michael S. Tsirkin 2010-01-14  1522  struct eventfd_ctx *ctx 
= NULL;
3a4d5c94e95935 Michael S. Tsirkin 2010-01-14  1523  u32 __user *idxp = argp;
3a4d5c94e95935 Michael S. Tsirkin 2010-01-14  1524  struct vhost_virtqueue 
*vq;
3a4d5c94e95935 Michael S. Tsirkin 2010-01-14  1525  struct 
vhost_vring_state s;
3a4d5c94e95935 Michael S. Tsirkin 2010-01-14  1526  struct vhost_vring_file 
f;
3a4d5c94e95935 Michael S. Tsirkin 2010-01-14  1527  u32 idx;
3a4d5c94e95935 Michael S. Tsirkin 2010-01-14  1528  long r;
3a4d5c94e95935 Michael S. Tsirkin 2010-01-14  1529  
3a4d5c94e95935 Michael S. Tsirkin 2010-01-14  1530  r = get_user(idx, idxp);
3a4d5c94e95935 Michael S. Tsirkin 2010-01-14  1531  if (r < 0)
3a4d5c94e95935 Michael S. Tsirkin 2010-01-14  1532  return r;
0f3d9a17469d71 Krishna Kumar 

Re: [PATCH v2 2/2] crypto: Remove unnecessary memzero_explicit()

2020-04-14 Thread Joe Perches
On Tue, 2020-04-14 at 15:37 -0400, Waiman Long wrote:
> OK, I can change it to clear the key length when the allocation fails,
> which isn't likely.


Perhaps:

kfree_sensitive(op->key);
op->key = NULL;
op->keylen = 0;

but I don't know that it impacts any possible state.




Re: [PATCH v2 2/2] crypto: Remove unnecessary memzero_explicit()

2020-04-14 Thread Waiman Long
On 4/14/20 3:16 PM, Michal Suchánek wrote:
> On Tue, Apr 14, 2020 at 12:24:36PM -0400, Waiman Long wrote:
>> On 4/14/20 2:08 AM, Christophe Leroy wrote:
>>>
>>> Le 14/04/2020 à 00:28, Waiman Long a écrit :
 Since kfree_sensitive() will do an implicit memzero_explicit(), there
 is no need to call memzero_explicit() before it. Eliminate those
 memzero_explicit() and simplify the call sites. For better correctness,
 the setting of keylen is also moved down after the key pointer check.

 Signed-off-by: Waiman Long 
 ---
   .../allwinner/sun8i-ce/sun8i-ce-cipher.c  | 19 +-
   .../allwinner/sun8i-ss/sun8i-ss-cipher.c  | 20 +--
   drivers/crypto/amlogic/amlogic-gxl-cipher.c   | 12 +++
   drivers/crypto/inside-secure/safexcel_hash.c  |  3 +--
   4 files changed, 14 insertions(+), 40 deletions(-)

 diff --git a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-cipher.c
 b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-cipher.c
 index aa4e8fdc2b32..8358fac98719 100644
 --- a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-cipher.c
 +++ b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-cipher.c
 @@ -366,10 +366,7 @@ void sun8i_ce_cipher_exit(struct crypto_tfm *tfm)
   {
   struct sun8i_cipher_tfm_ctx *op = crypto_tfm_ctx(tfm);
   -    if (op->key) {
 -    memzero_explicit(op->key, op->keylen);
 -    kfree(op->key);
 -    }
 +    kfree_sensitive(op->key);
   crypto_free_sync_skcipher(op->fallback_tfm);
   pm_runtime_put_sync_suspend(op->ce->dev);
   }
 @@ -391,14 +388,11 @@ int sun8i_ce_aes_setkey(struct crypto_skcipher
 *tfm, const u8 *key,
   dev_dbg(ce->dev, "ERROR: Invalid keylen %u\n", keylen);
   return -EINVAL;
   }
 -    if (op->key) {
 -    memzero_explicit(op->key, op->keylen);
 -    kfree(op->key);
 -    }
 -    op->keylen = keylen;
 +    kfree_sensitive(op->key);
   op->key = kmemdup(key, keylen, GFP_KERNEL | GFP_DMA);
   if (!op->key)
   return -ENOMEM;
 +    op->keylen = keylen;
>>> Does it matter at all to ensure op->keylen is not set when op->key is
>>> NULL ? I'm not sure.
>>>
>>> But if it does, then op->keylen should be set to 0 when freeing op->key. 
>> My thinking is that if memory allocation fails, we just don't touch
>> anything and return an error code. I will not explicitly set keylen to 0
>> in this case unless it is specified in the API documentation.
> You already freed the key by now so not touching anything is not
> possible. The key is set to NULL on allocation failure so setting keylen
> to 0 should be redundant. However, setting keylen to 0 is consistent with
> not having a key, and it avoids the possibility of leaking the length
> later should that ever cause any problem.

OK, I can change it to clear the key length when the allocation fails,
which isn't likely.

Cheers,
Longman




Re: [PATCH v2 2/2] crypto: Remove unnecessary memzero_explicit()

2020-04-14 Thread Michal Suchánek
On Tue, Apr 14, 2020 at 12:24:36PM -0400, Waiman Long wrote:
> On 4/14/20 2:08 AM, Christophe Leroy wrote:
> >
> >
> > Le 14/04/2020 à 00:28, Waiman Long a écrit :
> >> Since kfree_sensitive() will do an implicit memzero_explicit(), there
> >> is no need to call memzero_explicit() before it. Eliminate those
> >> memzero_explicit() and simplify the call sites. For better correctness,
> >> the setting of keylen is also moved down after the key pointer check.
> >>
> >> Signed-off-by: Waiman Long 
> >> ---
> >>   .../allwinner/sun8i-ce/sun8i-ce-cipher.c  | 19 +-
> >>   .../allwinner/sun8i-ss/sun8i-ss-cipher.c  | 20 +--
> >>   drivers/crypto/amlogic/amlogic-gxl-cipher.c   | 12 +++
> >>   drivers/crypto/inside-secure/safexcel_hash.c  |  3 +--
> >>   4 files changed, 14 insertions(+), 40 deletions(-)
> >>
> >> diff --git a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-cipher.c
> >> b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-cipher.c
> >> index aa4e8fdc2b32..8358fac98719 100644
> >> --- a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-cipher.c
> >> +++ b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-cipher.c
> >> @@ -366,10 +366,7 @@ void sun8i_ce_cipher_exit(struct crypto_tfm *tfm)
> >>   {
> >>   struct sun8i_cipher_tfm_ctx *op = crypto_tfm_ctx(tfm);
> >>   -    if (op->key) {
> >> -    memzero_explicit(op->key, op->keylen);
> >> -    kfree(op->key);
> >> -    }
> >> +    kfree_sensitive(op->key);
> >>   crypto_free_sync_skcipher(op->fallback_tfm);
> >>   pm_runtime_put_sync_suspend(op->ce->dev);
> >>   }
> >> @@ -391,14 +388,11 @@ int sun8i_ce_aes_setkey(struct crypto_skcipher
> >> *tfm, const u8 *key,
> >>   dev_dbg(ce->dev, "ERROR: Invalid keylen %u\n", keylen);
> >>   return -EINVAL;
> >>   }
> >> -    if (op->key) {
> >> -    memzero_explicit(op->key, op->keylen);
> >> -    kfree(op->key);
> >> -    }
> >> -    op->keylen = keylen;
> >> +    kfree_sensitive(op->key);
> >>   op->key = kmemdup(key, keylen, GFP_KERNEL | GFP_DMA);
> >>   if (!op->key)
> >>   return -ENOMEM;
> >> +    op->keylen = keylen;
> >
> > Does it matter at all to ensure op->keylen is not set when op->key is
> > NULL ? I'm not sure.
> >
> > But if it does, then op->keylen should be set to 0 when freeing op->key. 
> 
> My thinking is that if memory allocation fails, we just don't touch
> anything and return an error code. I will not explicitly set keylen to 0
> in this case unless it is specified in the API documentation.
You already freed the key by now so not touching anything is not
possible. The key is set to NULL on allocation failure so setting keylen
to 0 should be redundant. However, setting keylen to 0 is consistent with
not having a key, and it avoids the possibility of leaking the length
later should that ever cause any problem.
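
A sketch of the resulting setkey error path (illustrative, not the literal
patch):

        kfree_sensitive(op->key);
        op->key = kmemdup(key, keylen, GFP_KERNEL | GFP_DMA);
        if (!op->key) {
                op->keylen = 0; /* no key left, keep the length consistent */
                return -ENOMEM;
        }
        op->keylen = keylen;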

Thanks

Michal


[PATCH v2] Fix: buffer overflow during hvc_alloc().

2020-04-14 Thread andrew
From: Andrew Melnychenko 

If there are a lot (more than 16) of virtio-console devices,
or the virtio_console module is reloaded,
the buffers 'vtermnos' and 'cons_ops' are overflowed.
In older kernels this overruns a spinlock, which leads to the kernel freezing:
https://bugzilla.redhat.com/show_bug.cgi?id=1786239

To reproduce the issue, you can try a simple script that
loads/unloads the module. Something like this:
while [ 1 ]
do
  modprobe virtio_console
  sleep 2
  modprobe -r virtio_console
  sleep 2
done
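
The overflow itself is hvc_alloc() writing past the fixed-size arrays when no
free slot is found. Conceptually, the fix is a bounds check around the two
array writes; a sketch using the names from drivers/tty/hvc/hvc_console.c
(the exact placement and error handling differ in the real patch):

        /* find 'empty' slot for console */
        for (i = 0; i < MAX_NR_HVC_CONSOLES && vtermnos[i] != -1; i++)
                ;

        if (i < MAX_NR_HVC_CONSOLES) {
                hp->index = i;
                vtermnos[i] = vtermno;
                cons_ops[i] = ops;
        }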

Description of problem:
The guest gets a 'Call Trace' when loading the module "virtio_console"
and unloading it frequently - clearly reproduced on kernel-4.18.0:

[   81.498208] [ cut here ]
[   81.499263] pvqspinlock: lock 0x92080020 has corrupted value 
0xc0774ca0!
[   81.501000] WARNING: CPU: 0 PID: 785 at 
kernel/locking/qspinlock_paravirt.h:500 
__pv_queued_spin_unlock_slowpath+0xc0/0xd0
[   81.503173] Modules linked in: virtio_console fuse xt_CHECKSUM 
ipt_MASQUERADE xt_conntrack ipt_REJECT nft_counter nf_nat_tftp nft_objref 
nf_conntrack_tftp tun bridge stp llc nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 
nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct 
nf_tables_set nft_chain_nat_ipv6 nf_conntrack_ipv6 nf_defrag_ipv6 nf_nat_ipv6 
nft_chain_route_ipv6 nft_chain_nat_ipv4 nf_conntrack_ipv4 nf_defrag_ipv4 
nf_nat_ipv4 nf_nat nf_conntrack nft_chain_route_ipv4 ip6_tables nft_compat 
ip_set nf_tables nfnetlink sunrpc bochs_drm drm_vram_helper ttm drm_kms_helper 
syscopyarea sysfillrect sysimgblt fb_sys_fops drm i2c_piix4 pcspkr 
crct10dif_pclmul crc32_pclmul joydev ghash_clmulni_intel ip_tables xfs 
libcrc32c sd_mod sg ata_generic ata_piix virtio_net libata crc32c_intel 
net_failover failover serio_raw virtio_scsi dm_mirror dm_region_hash dm_log 
dm_mod [last unloaded: virtio_console]
[   81.517019] CPU: 0 PID: 785 Comm: kworker/0:2 Kdump: loaded Not tainted 
4.18.0-167.el8.x86_64 #1
[   81.518639] Hardware name: Red Hat KVM, BIOS 
1.12.0-5.scrmod+el8.2.0+5159+d8aa4d83 04/01/2014
[   81.520205] Workqueue: events control_work_handler [virtio_console]
[   81.521354] RIP: 0010:__pv_queued_spin_unlock_slowpath+0xc0/0xd0
[   81.522450] Code: 07 00 48 63 7a 10 e8 bf 64 f5 ff 66 90 c3 8b 05 e6 cf d6 
01 85 c0 74 01 c3 8b 17 48 89 fe 48 c7 c7 38 4b 29 91 e8 3a 6c fa ff <0f> 0b c3 
0f 0b 90 90 90 90 90 90 90 90 90 90 90 0f 1f 44 00 00 48
[   81.525830] RSP: 0018:b51a01ffbd70 EFLAGS: 00010282
[   81.526798] RAX:  RBX: 0010 RCX: 
[   81.528110] RDX: 9e66f1826480 RSI: 9e66f1816a08 RDI: 9e66f1816a08
[   81.529437] RBP: 9153ff10 R08: 026c R09: 0053
[   81.530732] R10:  R11: b51a01ffbc18 R12: 9e66cd682200
[   81.532133] R13: 9153ff10 R14: 9e6685569500 R15: 9e66cd682000
[   81.533442] FS:  () GS:9e66f180() 
knlGS:
[   81.534914] CS:  0010 DS:  ES:  CR0: 80050033
[   81.535971] CR2: 5624c55b14d0 CR3: 0003a023c000 CR4: 003406f0
[   81.537283] Call Trace:
[   81.537763]  __raw_callee_save___pv_queued_spin_unlock_slowpath+0x11/0x20
[   81.539011]  .slowpath+0x9/0xe
[   81.539585]  hvc_alloc+0x25e/0x300
[   81.540237]  init_port_console+0x28/0x100 [virtio_console]
[   81.541251]  handle_control_message.constprop.27+0x1c4/0x310 [virtio_console]
[   81.542546]  control_work_handler+0x70/0x10c [virtio_console]
[   81.543601]  process_one_work+0x1a7/0x3b0
[   81.544356]  worker_thread+0x30/0x390
[   81.545025]  ? create_worker+0x1a0/0x1a0
[   81.545749]  kthread+0x112/0x130
[   81.546358]  ? kthread_flush_work_fn+0x10/0x10
[   81.547183]  ret_from_fork+0x22/0x40
[   81.547842] ---[ end trace aa97649bd16c8655 ]---
[   83.546539] general protection fault:  [#1] SMP NOPTI
[   83.547422] CPU: 5 PID: 3225 Comm: modprobe Kdump: loaded Tainted: G
W- -  - 4.18.0-167.el8.x86_64 #1
[   83.549191] Hardware name: Red Hat KVM, BIOS 
1.12.0-5.scrmod+el8.2.0+5159+d8aa4d83 04/01/2014
[   83.550544] RIP: 0010:__pv_queued_spin_lock_slowpath+0x19a/0x2a0
[   83.551504] Code: c4 c1 ea 12 41 be 01 00 00 00 4c 8d 6d 14 41 83 e4 03 8d 
42 ff 49 c1 e4 05 48 98 49 81 c4 40 a5 02 00 4c 03 24 c5 60 48 34 91 <49> 89 2c 
24 b8 00 80 00 00 eb 15 84 c0 75 0a 41 0f b6 54 24 14 84
[   83.554449] RSP: 0018:b51a0323fdb0 EFLAGS: 00010202
[   83.555290] RAX: 301c RBX: 92080020 RCX: 0001
[   83.556426] RDX: 301d RSI:  RDI: 
[   83.557556] RBP: 9e66f196a540 R08: 028a R09: 9e66d2757788
[   83.558688] R10:  R11:  R12: 646e61725f770b07
[   83.559821] R13: 9e66f196a554 R14: 0001 R15: 0018
[   83.560958] FS:  7fd5032e8740() GS:9e66f194() 
knlGS:
[   83.562233] CS:  0010 DS:  ES:  CR0: 80050033
[   83.563149] 

Re: [PATCH 1/2] mm, treewide: Rename kzfree() to kfree_sensitive()

2020-04-14 Thread Waiman Long
On 4/14/20 8:48 AM, David Sterba wrote:
> On Mon, Apr 13, 2020 at 05:15:49PM -0400, Waiman Long wrote:
>>  fs/btrfs/ioctl.c  |  2 +-
>
>> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
>> index 40b729dce91c..eab3f8510426 100644
>> --- a/fs/btrfs/ioctl.c
>> +++ b/fs/btrfs/ioctl.c
>> @@ -2691,7 +2691,7 @@ static int btrfs_ioctl_get_subvol_info(struct file 
>> *file, void __user *argp)
>>  btrfs_put_root(root);
>>  out_free:
>>  btrfs_free_path(path);
>> -kzfree(subvol_info);
>> +kfree_sensitive(subvol_info);
> This is not in a sensitive context so please switch it to plain kfree.
> With that you have my acked-by. Thanks.
>
Thanks for letting me know about. I think I will send it out as a
separate patch.

Cheers,
Longman



Re: -Wincompatible-pointer-types in arch/powerpc/platforms/embedded6xx/mvme5100.c

2020-04-14 Thread Scott Wood
On Tue, 2020-04-14 at 17:33 +1000, Michael Ellerman wrote:
> I'm not sure TBH. This is all ancient history as far as I can tell, none
> of it's been touched for ~7 years.
> 
> Your config has:
> 
> CONFIG_EMBEDDED6xx=y
> CONFIG_PPC_BOOK3S_32=y
> CONFIG_PPC_BOOK3S_6xx=y
> CONFIG_PPC_MPC52xx=y
> CONFIG_PPC_86xx=y
> 
> 
> Which I'm not sure really makes sense at all, ie. it's trying to build a
> kernel for multiple platforms at once (EMBEDDED6xx, MPC52xx, 86xx), but
> the Kconfig doesn't exclude that so I guess we have to live with it for
> now.

I thought supporting multiple platforms in a kernel was something we tried
to do when practical?

> So I'm going to guess it should have also excluded embedded6xx, and this
> seems to fix it:
> 
> diff --git a/arch/powerpc/platforms/Kconfig.cputype
> b/arch/powerpc/platforms/Kconfig.cputype
> index 0c3c1902135c..134fc383daf7 100644
> --- a/arch/powerpc/platforms/Kconfig.cputype
> +++ b/arch/powerpc/platforms/Kconfig.cputype
> @@ -278,7 +278,7 @@ config PTE_64BIT
>  
>  config PHYS_64BIT
>   bool 'Large physical address support' if E500 || PPC_86xx
> - depends on (44x || E500 || PPC_86xx) && !PPC_83xx && !PPC_82xx
> + depends on (44x || E500 || PPC_86xx) && !PPC_83xx && !PPC_82xx &&
> !EMBEDDED6xx
>   select PHYS_ADDR_T_64BIT
>   ---help---
> This option enables kernel support for larger than 32-bit physical
> 
> 
> So unless anyone can tell me otherwise I'm inclined to commit that ^

This could silently break the config of someone who's depending on PHYS_64BIT
(e.g. an 86xx kernel that happens to include an embedded6xx target as well,
even if just by accident).  It'd be better to have the MVME5100 depend on
!CONFIG_PHYS_ADDR_T_64BIT as Nathan suggested (if there's nobody around to
test a fix to the actual bug), which shouldn't break anyone since it already
didn't build.
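
A sketch of what that would look like in
arch/powerpc/platforms/embedded6xx/Kconfig (untested; the existing select
lines would stay as they are):

config MVME5100
	bool "Motorola/Emerson MVME5100"
	depends on EMBEDDED6xx && !PHYS_ADDR_T_64BIT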

-Scott




[PATCH] powerpc/uaccess: Implement unsafe_put_user() using 'asm goto'

2020-04-14 Thread Christophe Leroy
unsafe_put_user() is designed to take advantage of 'asm goto'.

Instead of using the standard __put_user() approach and branch
based on the returned error, use 'asm goto' to branch directly
to the error label from the .fixup code.

This change significantly simplifies functions using
unsafe_put_user().

A small example of the benefit, with the following code:

struct test {
unsigned long item1;
unsigned long item2;
unsigned long item3;
};

int set_test_to_user(struct test __user *test, unsigned long item1,
 unsigned long item2, unsigned long item3)
{
unsafe_put_user(item1, >item1, failed);
unsafe_put_user(item2, >item2, failed);
unsafe_put_user(item3, >item3, failed);
return 0;
failed:
return -EFAULT;
}

Before the patch:

0d94 :
 d94:   39 20 00 00 li  r9,0
 d98:   90 83 00 00 stw r4,0(r3)
 d9c:   2f 89 00 00 cmpwi   cr7,r9,0
 da0:   40 9e 00 30 bne cr7,dd0 
 da4:   39 43 00 04 addir10,r3,4
 da8:   90 aa 00 00 stw r5,0(r10)
 dac:   2f 89 00 00 cmpwi   cr7,r9,0
 db0:   40 9e 00 20 bne cr7,dd0 
 db4:   38 63 00 08 addir3,r3,8
 db8:   90 c3 00 00 stw r6,0(r3)
 dbc:   21 29 00 00 subfic  r9,r9,0
 dc0:   7d 29 49 10 subfe   r9,r9,r9
 dc4:   38 60 ff f2 li  r3,-14
 dc8:   7d 23 18 38 and r3,r9,r3
 dcc:   4e 80 00 20 blr
 dd0:   38 60 ff f2 li  r3,-14
 dd4:   4e 80 00 20 blr

 <.fixup>:
...
  b8:   39 20 ff f2 li  r9,-14
  bc:   48 00 00 00 b   bc <.fixup+0xbc>
bc: R_PPC_REL24 .text+0xd9c
  c0:   39 20 ff f2 li  r9,-14
  c4:   48 00 00 00 b   c4 <.fixup+0xc4>
c4: R_PPC_REL24 .text+0xdac
  c8:   39 20 ff f2 li  r9,-14
  cc:   48 00 00 00 b   cc <.fixup+0xcc>
cc: R_PPC_REL24 .text+0xdbc

After the patch:

0d94 :
 d94:   90 83 00 00 stw r4,0(r3)
 d98:   39 23 00 04 addir9,r3,4
 d9c:   90 a9 00 00 stw r5,0(r9)
 da0:   38 63 00 08 addir3,r3,8
 da4:   90 c3 00 00 stw r6,0(r3)
 da8:   38 60 00 00 li  r3,0
 dac:   4e 80 00 20 blr
 db0:   38 60 ff f2 li  r3,-14
 db4:   4e 80 00 20 blr

 <.fixup>:
...
  b8:   48 00 00 00 b   b8 <.fixup+0xb8>
b8: R_PPC_REL24 .text+0xdb0
  bc:   48 00 00 00 b   bc <.fixup+0xbc>
bc: R_PPC_REL24 .text+0xdb0
  c0:   48 00 00 00 b   c0 <.fixup+0xc0>
c0: R_PPC_REL24 .text+0xdb0

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/uaccess.h | 63 +-
 1 file changed, 54 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/uaccess.h 
b/arch/powerpc/include/asm/uaccess.h
index 2f500debae21..b904f3c56463 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -93,12 +93,10 @@ static inline int __access_ok(unsigned long addr, unsigned 
long size,
 #define __get_user(x, ptr) \
__get_user_nocheck((x), (ptr), sizeof(*(ptr)), true)
 #define __put_user(x, ptr) \
-   __put_user_nocheck((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)), true)
+   __put_user_nocheck((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)))
 
 #define __get_user_allowed(x, ptr) \
__get_user_nocheck((x), (ptr), sizeof(*(ptr)), false)
-#define __put_user_allowed(x, ptr) \
-   __put_user_nocheck((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)), 
false)
 
 #define __get_user_inatomic(x, ptr) \
__get_user_nosleep((x), (ptr), sizeof(*(ptr)))
@@ -124,9 +122,24 @@ extern long __put_user_bad(void);
: "=r" (err)\
: "r" (x), "b" (addr), "i" (-EFAULT), "0" (err))
 
+#define __put_user_asm_goto(x, addr, label, op)\
+   asm volatile goto(  \
+   "1: " op " %0,0(%1) # put_user\n"   \
+   "2:\n"  \
+   ".section .fixup,\"ax\"\n"  \
+   "3: b %l2\n"\
+   ".previous\n"   \
+   EX_TABLE(1b, 3b)\
+   :   \
+   : "r" (x), "b" (addr)   \
+   :   \
+   : label)
+
 #ifdef __powerpc64__
 #define __put_user_asm2(x, ptr, retval)\
  __put_user_asm(x, ptr, retval, "std")
+#define __put_user_asm2_goto(x, ptr, label)\
+   __put_user_asm_goto(x, ptr, label, "std")
 #else /* __powerpc64__ */
 #define __put_user_asm2(x, addr, err)  \
 

[PATCH v2 11/33] docs: filesystems: fix renamed references

2020-04-14 Thread Mauro Carvalho Chehab
Some filesystem references got broken by a previous patch
series I submitted. Address those.

Signed-off-by: Mauro Carvalho Chehab 
Acked-by: David Sterba  # fs/affs/Kconfig
---
 Documentation/ABI/stable/sysfs-devices-node | 2 +-
 Documentation/ABI/testing/procfs-smaps_rollup   | 2 +-
 Documentation/admin-guide/cpu-load.rst  | 2 +-
 Documentation/admin-guide/nfs/nfsroot.rst   | 2 +-
 Documentation/driver-api/driver-model/device.rst| 4 ++--
 Documentation/driver-api/driver-model/overview.rst  | 2 +-
 Documentation/filesystems/dax.txt   | 2 +-
 Documentation/filesystems/dnotify.txt   | 2 +-
 Documentation/filesystems/ramfs-rootfs-initramfs.rst| 2 +-
 Documentation/filesystems/sysfs.rst | 2 +-
 Documentation/powerpc/firmware-assisted-dump.rst| 2 +-
 Documentation/process/adding-syscalls.rst   | 2 +-
 .../translations/it_IT/process/adding-syscalls.rst  | 2 +-
 Documentation/translations/zh_CN/filesystems/sysfs.txt  | 6 +++---
 drivers/base/core.c | 2 +-
 drivers/gpu/drm/msm/disp/dpu1/dpu_kms.h | 2 +-
 fs/Kconfig  | 2 +-
 fs/Kconfig.binfmt   | 2 +-
 fs/adfs/Kconfig | 2 +-
 fs/affs/Kconfig | 2 +-
 fs/afs/Kconfig  | 6 +++---
 fs/bfs/Kconfig  | 2 +-
 fs/cramfs/Kconfig   | 2 +-
 fs/ecryptfs/Kconfig | 2 +-
 fs/hfs/Kconfig  | 2 +-
 fs/hpfs/Kconfig | 2 +-
 fs/isofs/Kconfig| 2 +-
 fs/namespace.c  | 2 +-
 fs/notify/inotify/Kconfig   | 2 +-
 fs/ntfs/Kconfig | 2 +-
 fs/ocfs2/Kconfig| 2 +-
 fs/proc/Kconfig | 4 ++--
 fs/romfs/Kconfig| 2 +-
 fs/sysfs/dir.c  | 2 +-
 fs/sysfs/file.c | 2 +-
 fs/sysfs/mount.c| 2 +-
 fs/sysfs/symlink.c  | 2 +-
 fs/sysv/Kconfig | 2 +-
 fs/udf/Kconfig  | 2 +-
 include/linux/kobject.h | 2 +-
 include/linux/kobject_ns.h  | 2 +-
 include/linux/relay.h   | 2 +-
 include/linux/sysfs.h   | 2 +-
 kernel/relay.c  | 2 +-
 lib/kobject.c   | 4 ++--
 45 files changed, 52 insertions(+), 52 deletions(-)

diff --git a/Documentation/ABI/stable/sysfs-devices-node 
b/Documentation/ABI/stable/sysfs-devices-node
index df8413cf1468..484fc04bcc25 100644
--- a/Documentation/ABI/stable/sysfs-devices-node
+++ b/Documentation/ABI/stable/sysfs-devices-node
@@ -54,7 +54,7 @@ Date: October 2002
 Contact:   Linux Memory Management list 
 Description:
Provides information about the node's distribution and memory
-   utilization. Similar to /proc/meminfo, see 
Documentation/filesystems/proc.txt
+   utilization. Similar to /proc/meminfo, see 
Documentation/filesystems/proc.rst
 
 What:  /sys/devices/system/node/nodeX/numastat
 Date:  October 2002
diff --git a/Documentation/ABI/testing/procfs-smaps_rollup 
b/Documentation/ABI/testing/procfs-smaps_rollup
index 274df44d8b1b..046978193368 100644
--- a/Documentation/ABI/testing/procfs-smaps_rollup
+++ b/Documentation/ABI/testing/procfs-smaps_rollup
@@ -11,7 +11,7 @@ Description:
Additionally, the fields Pss_Anon, Pss_File and Pss_Shmem
are not present in /proc/pid/smaps.  These fields represent
the sum of the Pss field of each type (anon, file, shmem).
-   For more details, see Documentation/filesystems/proc.txt
+   For more details, see Documentation/filesystems/proc.rst
and the procfs man page.
 
Typical output looks like this:
diff --git a/Documentation/admin-guide/cpu-load.rst 
b/Documentation/admin-guide/cpu-load.rst
index 2d01ce43d2a2..ebdecf864080 100644
--- a/Documentation/admin-guide/cpu-load.rst
+++ b/Documentation/admin-guide/cpu-load.rst
@@ -105,7 +105,7 @@ 

[PATCH v2 25/33] docs: powerpc: cxl.rst: mark two section titles as such

2020-04-14 Thread Mauro Carvalho Chehab
The User API chapter contains two sub-chapters. Mark them as
such.

Signed-off-by: Mauro Carvalho Chehab 
---
 Documentation/powerpc/cxl.rst | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Documentation/powerpc/cxl.rst b/Documentation/powerpc/cxl.rst
index 920546d81326..d2d77057610e 100644
--- a/Documentation/powerpc/cxl.rst
+++ b/Documentation/powerpc/cxl.rst
@@ -133,6 +133,7 @@ User API
 
 
 1. AFU character devices
+
 
 For AFUs operating in AFU directed mode, two character device
 files will be created. /dev/cxl/afu0.0m will correspond to a
@@ -395,6 +396,7 @@ read
 
 
 2. Card character device (powerVM guest only)
+^
 
 In a powerVM guest, an extra character device is created for the
 card. The device is only used to write (flash) a new image on the
-- 
2.25.2



[PATCH v2 00/33] Documentation fixes for Kernel 5.8

2020-04-14 Thread Mauro Carvalho Chehab
Patches 1 to 5 contain changes to the documentation toolset:

- The first 3 patches help to reduce a lot the number of reported
  kernel-doc issues, by making the tool more smart.

- Patches 4 and 5 are meant to partially address the PDF
  build, which now requires Sphinx version 2.4 or later.

The remaining patches fix broken references detected by
this tool:

./scripts/documentation-file-ref-check

and address other random errors due to tags being mis-interpreted
or mis-used.

They are independent of each other, but some may depend on
the kernel-doc improvements.

PS: Due to the large number of Cc:s, I opted to keep a smaller
Cc: list on this first e-mail (only e-mails with "L:" tag from
the MAINTAINERS file).

Jon,

Those patches should apply cleanly at docs-next, once you
pull from v5.7-rc1.


-

v2:

- patches re-ordered;
- added reviewed/acked-by tags;
- rebased on the top of docs-next + v5.7-rc1.


Mauro Carvalho Chehab (33):
  scripts: kernel-doc: proper handle @foo->bar()
  scripts: kernel-doc: accept negation like !@var
  scripts: kernel-doc: accept blank lines on parameter description
  docs: update recommended Sphinx version to 2.4.4
  docs: LaTeX/PDF: drop list of documents
  MAINTAINERS: dt: update display/allwinner file entry
  MAINTAINERS: dt: fix pointers for ARM Integrator, Versatile and
RealView
  docs: dt: fix broken reference to phy-cadence-torrent.yaml
  docs: fix broken references to text files
  docs: fix broken references for ReST files that moved around
  docs: filesystems: fix renamed references
  docs: amu: supress some Sphinx warnings
  docs: arm64: booting.rst: get rid of some warnings
  docs: pci: boot-interrupts.rst: improve html output
  docs: ras: get rid of some warnings
  docs: ras: don't need to repeat twice the same thing
  docs: infiniband: verbs.c: fix some documentation warnings
  docs: spi: spi.h: fix a doc building warning
  docs: drivers: fix some warnings at base/platform.c when building docs
  docs: mm: userfaultfd.rst: use ``foo`` for literals
  docs: mm: userfaultfd.rst: use a cross-reference for a section
  docs: vm: index.rst: add an orphan doc to the building system
  docs: dt: qcom,dwc3.txt: fix cross-reference for a converted file
  docs: dt: fix a broken reference for a file converted to json
  docs: powerpc: cxl.rst: mark two section titles as such
  docs: i2c: rename i2c.svg to i2c_bus.svg
  docs: Makefile: place final pdf docs on a separate dir
  docs: dt: rockchip,dwc3.txt: fix a pointer to a renamed file
  ata: libata-core: fix a doc warning
  firewire: firewire-cdev.h: get rid of a docs warning
  fs: inode.c: get rid of docs warnings
  futex: get rid of a kernel-docs build warning
  lib: bitmap.c: get rid of some doc warnings

 Documentation/ABI/stable/sysfs-devices-node   |   2 +-
 Documentation/ABI/testing/procfs-smaps_rollup |   2 +-
 Documentation/Makefile|   6 +-
 Documentation/PCI/boot-interrupts.rst |  34 +--
 Documentation/admin-guide/cpu-load.rst|   2 +-
 Documentation/admin-guide/mm/userfaultfd.rst  | 209 +-
 Documentation/admin-guide/nfs/nfsroot.rst |   2 +-
 Documentation/admin-guide/ras.rst |  18 +-
 Documentation/arm64/amu.rst   |   5 +
 Documentation/arm64/booting.rst   |  36 +--
 Documentation/conf.py |  38 
 .../bindings/net/qualcomm-bluetooth.txt   |   2 +-
 .../bindings/phy/ti,phy-j721e-wiz.yaml|   2 +-
 .../devicetree/bindings/usb/qcom,dwc3.txt |   4 +-
 .../devicetree/bindings/usb/rockchip,dwc3.txt |   2 +-
 .../doc-guide/maintainer-profile.rst  |   2 +-
 .../driver-api/driver-model/device.rst|   4 +-
 .../driver-api/driver-model/overview.rst  |   2 +-
 Documentation/filesystems/dax.txt |   2 +-
 Documentation/filesystems/dnotify.txt |   2 +-
 .../filesystems/ramfs-rootfs-initramfs.rst|   2 +-
 Documentation/filesystems/sysfs.rst   |   2 +-
 Documentation/i2c/{i2c.svg => i2c_bus.svg}|   2 +-
 Documentation/i2c/summary.rst |   2 +-
 Documentation/memory-barriers.txt |   2 +-
 Documentation/powerpc/cxl.rst |   2 +
 .../powerpc/firmware-assisted-dump.rst|   2 +-
 Documentation/process/adding-syscalls.rst |   2 +-
 Documentation/process/submit-checklist.rst|   2 +-
 Documentation/sphinx/requirements.txt |   2 +-
 .../it_IT/process/adding-syscalls.rst |   2 +-
 .../it_IT/process/submit-checklist.rst|   2 +-
 .../translations/ko_KR/memory-barriers.txt|   2 +-
 .../translations/zh_CN/filesystems/sysfs.txt  |   8 +-
 .../zh_CN/process/submit-checklist.rst|   2 +-
 Documentation/virt/kvm/arm/pvtime.rst |   2 +-
 Documentation/virt/kvm/devices/vcpu.rst   |   2 +-
 Documentation/virt/kvm/hypercalls.rst |   4 +-
 Documentation/virt/kvm/mmu.rst|   2 +-
 Documentation/virt/kvm/review-checklist.rst   |   2 +-

[PATCH v2 09/33] docs: fix broken references to text files

2020-04-14 Thread Mauro Carvalho Chehab
Several references got broken due to txt to ReST conversion.

Several of them can be automatically fixed with:

scripts/documentation-file-ref-check --fix

Reviewed-by: Mathieu Poirier  # 
hwtracing/coresight/Kconfig
Reviewed-by: Paul E. McKenney  # memory-barrier.txt
Acked-by: Alex Shi  # translations/zh_CN
Acked-by: Federico Vaga  # translations/it_IT
Acked-by: Marc Zyngier  # kvm/arm64
Signed-off-by: Mauro Carvalho Chehab 
---
 Documentation/memory-barriers.txt|  2 +-
 Documentation/process/submit-checklist.rst   |  2 +-
 .../translations/it_IT/process/submit-checklist.rst  |  2 +-
 Documentation/translations/ko_KR/memory-barriers.txt |  2 +-
 .../translations/zh_CN/filesystems/sysfs.txt |  2 +-
 .../translations/zh_CN/process/submit-checklist.rst  |  2 +-
 Documentation/virt/kvm/arm/pvtime.rst|  2 +-
 Documentation/virt/kvm/devices/vcpu.rst  |  2 +-
 Documentation/virt/kvm/hypercalls.rst|  4 ++--
 arch/powerpc/include/uapi/asm/kvm_para.h |  2 +-
 drivers/gpu/drm/Kconfig  |  2 +-
 drivers/gpu/drm/drm_ioctl.c  |  2 +-
 drivers/hwtracing/coresight/Kconfig  |  2 +-
 fs/fat/Kconfig   |  8 
 fs/fuse/Kconfig  |  2 +-
 fs/fuse/dev.c|  2 +-
 fs/overlayfs/Kconfig |  6 +++---
 include/linux/mm.h   |  4 ++--
 include/uapi/linux/ethtool_netlink.h |  2 +-
 include/uapi/rdma/rdma_user_ioctl_cmds.h |  2 +-
 mm/gup.c | 12 ++--
 virt/kvm/arm/vgic/vgic-mmio-v3.c |  2 +-
 virt/kvm/arm/vgic/vgic.h |  4 ++--
 23 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/Documentation/memory-barriers.txt 
b/Documentation/memory-barriers.txt
index e1c355e84edd..eaabc3134294 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -620,7 +620,7 @@ because the CPUs that the Linux kernel supports don't do 
writes
 until they are certain (1) that the write will actually happen, (2)
 of the location of the write, and (3) of the value to be written.
 But please carefully read the "CONTROL DEPENDENCIES" section and the
-Documentation/RCU/rcu_dereference.txt file:  The compiler can and does
+Documentation/RCU/rcu_dereference.rst file:  The compiler can and does
 break dependencies in a great many highly creative ways.
 
CPU 1 CPU 2
diff --git a/Documentation/process/submit-checklist.rst 
b/Documentation/process/submit-checklist.rst
index 8e56337d422d..3f8e9d5d95c2 100644
--- a/Documentation/process/submit-checklist.rst
+++ b/Documentation/process/submit-checklist.rst
@@ -107,7 +107,7 @@ and elsewhere regarding submitting Linux kernel patches.
 and why.
 
 26) If any ioctl's are added by the patch, then also update
-``Documentation/ioctl/ioctl-number.rst``.
+``Documentation/userspace-api/ioctl/ioctl-number.rst``.
 
 27) If your modified source code depends on or uses any of the kernel
 APIs or features that are related to the following ``Kconfig`` symbols,
diff --git a/Documentation/translations/it_IT/process/submit-checklist.rst 
b/Documentation/translations/it_IT/process/submit-checklist.rst
index 995ee69fab11..3e575502690f 100644
--- a/Documentation/translations/it_IT/process/submit-checklist.rst
+++ b/Documentation/translations/it_IT/process/submit-checklist.rst
@@ -117,7 +117,7 @@ sottomissione delle patch, in particolare
 sorgenti che ne spieghi la logica: cosa fanno e perché.
 
 25) Se la patch aggiunge nuove chiamate ioctl, allora aggiornate
-``Documentation/ioctl/ioctl-number.rst``.
+``Documentation/userspace-api/ioctl/ioctl-number.rst``.
 
 26) Se il codice che avete modificato dipende o usa una qualsiasi interfaccia o
 funzionalità del kernel che è associata a uno dei seguenti simboli
diff --git a/Documentation/translations/ko_KR/memory-barriers.txt 
b/Documentation/translations/ko_KR/memory-barriers.txt
index 2e831ece6e26..e50fe6541335 100644
--- a/Documentation/translations/ko_KR/memory-barriers.txt
+++ b/Documentation/translations/ko_KR/memory-barriers.txt
@@ -641,7 +641,7 @@ P 는 짝수 번호 캐시 라인에 저장되어 있고, 변수 B 는 홀수 
 리눅스 커널이 지원하는 CPU 들은 (1) 쓰기가 정말로 일어날지, (2) 쓰기가 어디에
 이루어질지, 그리고 (3) 쓰여질 값을 확실히 알기 전까지는 쓰기를 수행하지 않기
 때문입니다.  하지만 "컨트롤 의존성" 섹션과
-Documentation/RCU/rcu_dereference.txt 파일을 주의 깊게 읽어 주시기 바랍니다:
+Documentation/RCU/rcu_dereference.rst 파일을 주의 깊게 읽어 주시기 바랍니다:
 컴파일러는 매우 창의적인 많은 방법으로 종속성을 깰 수 있습니다.
 
CPU 1 CPU 2
diff --git a/Documentation/translations/zh_CN/filesystems/sysfs.txt 
b/Documentation/translations/zh_CN/filesystems/sysfs.txt
index ee1f37da5b23..a15c3ebdfa82 100644
--- a/Documentation/translations/zh_CN/filesystems/sysfs.txt
+++ 

Re: [PATCH v2 2/2] crypto: Remove unnecessary memzero_explicit()

2020-04-14 Thread Waiman Long
On 4/14/20 2:08 AM, Christophe Leroy wrote:
>
>
> Le 14/04/2020 à 00:28, Waiman Long a écrit :
>> Since kfree_sensitive() will do an implicit memzero_explicit(), there
>> is no need to call memzero_explicit() before it. Eliminate those
>> memzero_explicit() and simplify the call sites. For better correctness,
>> the setting of keylen is also moved down after the key pointer check.
>>
>> Signed-off-by: Waiman Long 
>> ---
>>   .../allwinner/sun8i-ce/sun8i-ce-cipher.c  | 19 +-
>>   .../allwinner/sun8i-ss/sun8i-ss-cipher.c  | 20 +--
>>   drivers/crypto/amlogic/amlogic-gxl-cipher.c   | 12 +++
>>   drivers/crypto/inside-secure/safexcel_hash.c  |  3 +--
>>   4 files changed, 14 insertions(+), 40 deletions(-)
>>
>> diff --git a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-cipher.c
>> b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-cipher.c
>> index aa4e8fdc2b32..8358fac98719 100644
>> --- a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-cipher.c
>> +++ b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-cipher.c
>> @@ -366,10 +366,7 @@ void sun8i_ce_cipher_exit(struct crypto_tfm *tfm)
>>   {
>>   struct sun8i_cipher_tfm_ctx *op = crypto_tfm_ctx(tfm);
>>   -    if (op->key) {
>> -    memzero_explicit(op->key, op->keylen);
>> -    kfree(op->key);
>> -    }
>> +    kfree_sensitive(op->key);
>>   crypto_free_sync_skcipher(op->fallback_tfm);
>>   pm_runtime_put_sync_suspend(op->ce->dev);
>>   }
>> @@ -391,14 +388,11 @@ int sun8i_ce_aes_setkey(struct crypto_skcipher
>> *tfm, const u8 *key,
>>   dev_dbg(ce->dev, "ERROR: Invalid keylen %u\n", keylen);
>>   return -EINVAL;
>>   }
>> -    if (op->key) {
>> -    memzero_explicit(op->key, op->keylen);
>> -    kfree(op->key);
>> -    }
>> -    op->keylen = keylen;
>> +    kfree_sensitive(op->key);
>>   op->key = kmemdup(key, keylen, GFP_KERNEL | GFP_DMA);
>>   if (!op->key)
>>   return -ENOMEM;
>> +    op->keylen = keylen;
>
> Does it matter at all to ensure op->keylen is not set when op->key is
> NULL ? I'm not sure.
>
> But if it does, then op->keylen should be set to 0 when freeing op->key. 

My thinking is that if memory allocation fails, we just don't touch
anything and return an error code. I will not explicitly set keylen to 0
in this case unless it is specified in the API documentation.

Cheers,
Longman



Re: [PATCH] kvm_host: unify VM_STAT and VCPU_STAT definitions in a single place

2020-04-14 Thread Emanuele Giuseppe Esposito




On 4/14/20 10:18 AM, Paolo Bonzini wrote:

On 13/04/20 23:34, Philippe Mathieu-Daudé wrote:

+#define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## 
__VA_ARGS__
+#define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## 
__VA_ARGS__

I find this macro expanding into multiple fields odd... Maybe a matter
of taste. Suggestion: have the macro define the full structure, as in
the arm64 arch:

#define VM_STAT(n, x, ...) { n, offsetof(struct kvm, stat.x),
KVM_STAT_VM, ## __VA_ARGS__ }

Ditto for VCPU_STAT().
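
For reference, call sites would then look something like this (a sketch based
on the existing debugfs_entries tables; the stat fields are illustrative):

struct kvm_stats_debugfs_item debugfs_entries[] = {
	VM_STAT("remote_tlb_flush", remote_tlb_flush),
	VCPU_STAT("halt_wakeup", halt_wakeup),
	{ NULL }
};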


Hi Philippe and Paolo,


Yes, that's a good idea.  Emanuele, can you switch it to this format?


Sure, I just submitted the v2 version.

Thanks,

Emanuele



Re: [PATCH v6 6/7] ASoC: dt-bindings: fsl_easrc: Add document for EASRC

2020-04-14 Thread Rob Herring
On Wed, Apr 01, 2020 at 04:45:39PM +0800, Shengjiu Wang wrote:
> EASRC (Enhanced Asynchronous Sample Rate Converter) is a new
> IP module found on i.MX8MN.
> 
> Signed-off-by: Shengjiu Wang 
> ---
>  .../devicetree/bindings/sound/fsl,easrc.yaml  | 101 ++
>  1 file changed, 101 insertions(+)
>  create mode 100644 Documentation/devicetree/bindings/sound/fsl,easrc.yaml
> 
> diff --git a/Documentation/devicetree/bindings/sound/fsl,easrc.yaml 
> b/Documentation/devicetree/bindings/sound/fsl,easrc.yaml
> new file mode 100644
> index ..14ea60084420
> --- /dev/null
> +++ b/Documentation/devicetree/bindings/sound/fsl,easrc.yaml
> @@ -0,0 +1,101 @@
> +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
> +%YAML 1.2
> +---
> +$id: http://devicetree.org/schemas/sound/fsl,easrc.yaml#
> +$schema: http://devicetree.org/meta-schemas/core.yaml#
> +
> +title: NXP Asynchronous Sample Rate Converter (ASRC) Controller
> +
> +maintainers:
> +  - Shengjiu Wang 
> +
> +properties:
> +  $nodename:
> +pattern: "^easrc@.*"
> +
> +  compatible:
> +const: fsl,imx8mn-easrc
> +
> +  reg:
> +maxItems: 1
> +
> +  interrupts:
> +maxItems: 1
> +
> +  clocks:
> +items:
> +  - description: Peripheral clock
> +
> +  clock-names:
> +items:
> +  - const: mem
> +
> +  dmas:
> +maxItems: 8
> +
> +  dma-names:
> +items:
> +  - const: ctx0_rx
> +  - const: ctx0_tx
> +  - const: ctx1_rx
> +  - const: ctx1_tx
> +  - const: ctx2_rx
> +  - const: ctx2_tx
> +  - const: ctx3_rx
> +  - const: ctx3_tx
> +
> +  firmware-name:
> +allOf:
> +  - $ref: /schemas/types.yaml#/definitions/string
> +  - const: imx/easrc/easrc-imx8mn.bin
> +description: The coefficient table for the filters
> +
> +  fsl,asrc-rate:

fsl,asrc-rate-hz

> +allOf:
> +  - $ref: /schemas/types.yaml#/definitions/uint32

And then you can drop this.

> +  - minimum: 8000
> +  - maximum: 192000
> +description: Defines a mutual sample rate used by DPCM Back Ends
> +
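
Putting both of the comments above together, the property would become
something like this (untested sketch):

  fsl,asrc-rate-hz:
    minimum: 8000
    maximum: 192000
    description: Defines a mutual sample rate used by DPCM Back Ends
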
> +  fsl,asrc-format:
> +allOf:
> +  - $ref: /schemas/types.yaml#/definitions/uint32
> +  - enum: [2, 6, 10, 32, 36]
> +default: 2
> +description:
> +  Defines a mutual sample format used by DPCM Back Ends
> +
> +required:
> +  - compatible
> +  - reg
> +  - interrupts
> +  - clocks
> +  - clock-names
> +  - dmas
> +  - dma-names
> +  - firmware-name
> +  - fsl,asrc-rate
> +  - fsl,asrc-format
> +
> +examples:
> +  - |
> +#include 
> +
> +easrc: easrc@300C {

Lowercase hex

> +   compatible = "fsl,imx8mn-easrc";
> +   reg = <0x0 0x300C 0x0 0x1>;
> +   interrupts = <0x0 122 0x4>;
> +   clocks = < IMX8MN_CLK_ASRC_ROOT>;
> +   clock-names = "mem";
> +   dmas = < 16 23 0> , < 17 23 0>,
> +  < 18 23 0> , < 19 23 0>,
> +  < 20 23 0> , < 21 23 0>,
> +  < 22 23 0> , < 23 23 0>;
> +   dma-names = "ctx0_rx", "ctx0_tx",
> +   "ctx1_rx", "ctx1_tx",
> +   "ctx2_rx", "ctx2_tx",
> +   "ctx3_rx", "ctx3_tx";
> +   firmware-name = "imx/easrc/easrc-imx8mn.bin";
> +   fsl,asrc-rate  = <8000>;
> +   fsl,asrc-format = <2>;
> +};
> -- 
> 2.21.0
> 


[PATCH v4 14/14] mm: remove __ARCH_HAS_5LEVEL_HACK and include/asm-generic/5level-fixup.h

2020-04-14 Thread Mike Rapoport
From: Mike Rapoport 

There are no architectures that use include/asm-generic/5level-fixup.h,
therefore it can be removed along with the __ARCH_HAS_5LEVEL_HACK define
and the code it surrounds.

Signed-off-by: Mike Rapoport 
---
 include/asm-generic/5level-fixup.h | 58 --
 include/linux/mm.h |  6 
 mm/kasan/init.c| 11 --
 mm/memory.c|  8 -
 4 files changed, 83 deletions(-)
 delete mode 100644 include/asm-generic/5level-fixup.h

diff --git a/include/asm-generic/5level-fixup.h 
b/include/asm-generic/5level-fixup.h
deleted file mode 100644
index 4c74b1c1d13b..
--- a/include/asm-generic/5level-fixup.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _5LEVEL_FIXUP_H
-#define _5LEVEL_FIXUP_H
-
-#define __ARCH_HAS_5LEVEL_HACK
-#define __PAGETABLE_P4D_FOLDED 1
-
-#define P4D_SHIFT  PGDIR_SHIFT
-#define P4D_SIZE   PGDIR_SIZE
-#define P4D_MASK   PGDIR_MASK
-#define MAX_PTRS_PER_P4D   1
-#define PTRS_PER_P4D   1
-
-#define p4d_t  pgd_t
-
-#define pud_alloc(mm, p4d, address) \
-   ((unlikely(pgd_none(*(p4d))) && __pud_alloc(mm, p4d, address)) ? \
-   NULL : pud_offset(p4d, address))
-
-#define p4d_alloc(mm, pgd, address)(pgd)
-#define p4d_offset(pgd, start) (pgd)
-
-#ifndef __ASSEMBLY__
-static inline int p4d_none(p4d_t p4d)
-{
-   return 0;
-}
-
-static inline int p4d_bad(p4d_t p4d)
-{
-   return 0;
-}
-
-static inline int p4d_present(p4d_t p4d)
-{
-   return 1;
-}
-#endif
-
-#define p4d_ERROR(p4d) do { } while (0)
-#define p4d_clear(p4d) pgd_clear(p4d)
-#define p4d_val(p4d)   pgd_val(p4d)
-#define p4d_populate(mm, p4d, pud) pgd_populate(mm, p4d, pud)
-#define p4d_populate_safe(mm, p4d, pud)pgd_populate(mm, p4d, pud)
-#define p4d_page(p4d)  pgd_page(p4d)
-#define p4d_page_vaddr(p4d)pgd_page_vaddr(p4d)
-
-#define __p4d(x)   __pgd(x)
-#define set_p4d(p4dp, p4d) set_pgd(p4dp, p4d)
-
-#undef p4d_free_tlb
-#define p4d_free_tlb(tlb, x, addr) do { } while (0)
-#define p4d_free(mm, x)do { } while (0)
-
-#undef  p4d_addr_end
-#define p4d_addr_end(addr, end)(end)
-
-#endif
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5a323422d783..f794b77df1ca 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2060,11 +2060,6 @@ int __pte_alloc_kernel(pmd_t *pmd);
 
 #if defined(CONFIG_MMU)
 
-/*
- * The following ifdef needed to get the 5level-fixup.h header to work.
- * Remove it when 5level-fixup.h has been removed.
- */
-#ifndef __ARCH_HAS_5LEVEL_HACK
 static inline p4d_t *p4d_alloc(struct mm_struct *mm, pgd_t *pgd,
unsigned long address)
 {
@@ -2078,7 +2073,6 @@ static inline pud_t *pud_alloc(struct mm_struct *mm, 
p4d_t *p4d,
return (unlikely(p4d_none(*p4d)) && __pud_alloc(mm, p4d, address)) ?
NULL : pud_offset(p4d, address);
 }
-#endif /* !__ARCH_HAS_5LEVEL_HACK */
 
 static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long 
address)
 {
diff --git a/mm/kasan/init.c b/mm/kasan/init.c
index ce45c491ebcd..fe6be0be1f76 100644
--- a/mm/kasan/init.c
+++ b/mm/kasan/init.c
@@ -250,20 +250,9 @@ int __ref kasan_populate_early_shadow(const void 
*shadow_start,
 * 3,2 - level page tables where we don't have
 * puds,pmds, so pgd_populate(), pud_populate()
 * is noops.
-*
-* The ifndef is required to avoid build breakage.
-*
-* With 5level-fixup.h, pgd_populate() is not nop and
-* we reference kasan_early_shadow_p4d. It's not defined
-* unless 5-level paging enabled.
-*
-* The ifndef can be dropped once all KASAN-enabled
-* architectures will switch to pgtable-nop4d.h.
 */
-#ifndef __ARCH_HAS_5LEVEL_HACK
pgd_populate(_mm, pgd,
lm_alias(kasan_early_shadow_p4d));
-#endif
p4d = p4d_offset(pgd, addr);
p4d_populate(&init_mm, p4d,
lm_alias(kasan_early_shadow_pud));
diff --git a/mm/memory.c b/mm/memory.c
index f703fe8c8346..379277c631b4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4434,19 +4434,11 @@ int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, 
unsigned long address)
smp_wmb(); /* See comment in __pte_alloc */
 
spin_lock(&mm->page_table_lock);
-#ifndef __ARCH_HAS_5LEVEL_HACK
if (!p4d_present(*p4d)) {
mm_inc_nr_puds(mm);
  

[PATCH v4 13/14] asm-generic: remove pgtable-nop4d-hack.h

2020-04-14 Thread Mike Rapoport
From: Mike Rapoport 

No architecture defines __ARCH_USE_5LEVEL_HACK and therefore
pgtable-nop4d-hack.h will never actually be included.

Remove it.

Signed-off-by: Mike Rapoport 
---
 include/asm-generic/pgtable-nop4d-hack.h | 64 
 include/asm-generic/pgtable-nopud.h  |  4 --
 2 files changed, 68 deletions(-)
 delete mode 100644 include/asm-generic/pgtable-nop4d-hack.h

diff --git a/include/asm-generic/pgtable-nop4d-hack.h 
b/include/asm-generic/pgtable-nop4d-hack.h
deleted file mode 100644
index 829bdb0d6327..
--- a/include/asm-generic/pgtable-nop4d-hack.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _PGTABLE_NOP4D_HACK_H
-#define _PGTABLE_NOP4D_HACK_H
-
-#ifndef __ASSEMBLY__
-#include 
-
-#define __PAGETABLE_PUD_FOLDED 1
-
-/*
- * Having the pud type consist of a pgd gets the size right, and allows
- * us to conceptually access the pgd entry that this pud is folded into
- * without casting.
- */
-typedef struct { pgd_t pgd; } pud_t;
-
-#define PUD_SHIFT  PGDIR_SHIFT
-#define PTRS_PER_PUD   1
-#define PUD_SIZE   (1UL << PUD_SHIFT)
-#define PUD_MASK   (~(PUD_SIZE-1))
-
-/*
- * The "pgd_xxx()" functions here are trivial for a folded two-level
- * setup: the pud is never bad, and a pud always exists (as it's folded
- * into the pgd entry)
- */
-static inline int pgd_none(pgd_t pgd)  { return 0; }
-static inline int pgd_bad(pgd_t pgd)   { return 0; }
-static inline int pgd_present(pgd_t pgd)   { return 1; }
-static inline void pgd_clear(pgd_t *pgd)   { }
-#define pud_ERROR(pud) (pgd_ERROR((pud).pgd))
-
-#define pgd_populate(mm, pgd, pud) do { } while (0)
-#define pgd_populate_safe(mm, pgd, pud)do { } while (0)
-/*
- * (puds are folded into pgds so this doesn't get actually called,
- * but the define is needed for a generic inline function.)
- */
-#define set_pgd(pgdptr, pgdval)set_pud((pud_t *)(pgdptr), (pud_t) { 
pgdval })
-
-static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)
-{
-   return (pud_t *)pgd;
-}
-
-#define pud_val(x) (pgd_val((x).pgd))
-#define __pud(x)   ((pud_t) { __pgd(x) })
-
-#define pgd_page(pgd)  (pud_page((pud_t){ pgd }))
-#define pgd_page_vaddr(pgd)(pud_page_vaddr((pud_t){ pgd }))
-
-/*
- * allocating and freeing a pud is trivial: the 1-entry pud is
- * inside the pgd, so has no extra memory associated with it.
- */
-#define pud_alloc_one(mm, address) NULL
-#define pud_free(mm, x)do { } while (0)
-#define __pud_free_tlb(tlb, x, a)  do { } while (0)
-
-#undef  pud_addr_end
-#define pud_addr_end(addr, end)(end)
-
-#endif /* __ASSEMBLY__ */
-#endif /* _PGTABLE_NOP4D_HACK_H */
diff --git a/include/asm-generic/pgtable-nopud.h 
b/include/asm-generic/pgtable-nopud.h
index d3776cb494c0..ad05c1684bfc 100644
--- a/include/asm-generic/pgtable-nopud.h
+++ b/include/asm-generic/pgtable-nopud.h
@@ -4,9 +4,6 @@
 
 #ifndef __ASSEMBLY__
 
-#ifdef __ARCH_USE_5LEVEL_HACK
-#include 
-#else
 #include 
 
 #define __PAGETABLE_PUD_FOLDED 1
@@ -65,5 +62,4 @@ static inline pud_t *pud_offset(p4d_t *p4d, unsigned long 
address)
 #define pud_addr_end(addr, end)(end)
 
 #endif /* __ASSEMBLY__ */
-#endif /* !__ARCH_USE_5LEVEL_HACK */
 #endif /* _PGTABLE_NOPUD_H */
-- 
2.25.1



[PATCH v4 12/14] unicore32: remove __ARCH_USE_5LEVEL_HACK

2020-04-14 Thread Mike Rapoport
From: Mike Rapoport 

The unicore32 architecture has 2-level page tables and uses
asm-generic/pgtable-nopmd.h plus explicit casts from pud_t to pgd_t for
page table folding.

Add p4d walk in the only place that actually unfolds the pud level and
remove __ARCH_USE_5LEVEL_HACK.

Signed-off-by: Mike Rapoport 
---
 arch/unicore32/include/asm/pgtable.h | 1 -
 arch/unicore32/kernel/hibernate.c| 4 +++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/unicore32/include/asm/pgtable.h 
b/arch/unicore32/include/asm/pgtable.h
index 3b8731b3a937..826f49edd94e 100644
--- a/arch/unicore32/include/asm/pgtable.h
+++ b/arch/unicore32/include/asm/pgtable.h
@@ -9,7 +9,6 @@
 #ifndef __UNICORE_PGTABLE_H__
 #define __UNICORE_PGTABLE_H__
 
-#define __ARCH_USE_5LEVEL_HACK
 #include 
 #include 
 
diff --git a/arch/unicore32/kernel/hibernate.c 
b/arch/unicore32/kernel/hibernate.c
index f3812245cc00..ccad051a79b6 100644
--- a/arch/unicore32/kernel/hibernate.c
+++ b/arch/unicore32/kernel/hibernate.c
@@ -33,9 +33,11 @@ struct swsusp_arch_regs swsusp_arch_regs_cpu0;
 static pmd_t *resume_one_md_table_init(pgd_t *pgd)
 {
pud_t *pud;
+   p4d_t *p4d;
pmd_t *pmd_table;
 
-   pud = pud_offset(pgd, 0);
+   p4d = p4d_offset(pgd, 0);
+   pud = pud_offset(p4d, 0);
pmd_table = pmd_offset(pud, 0);
 
return pmd_table;
-- 
2.25.1



[PATCH v4 11/14] sh: add support for folded p4d page tables

2020-04-14 Thread Mike Rapoport
From: Mike Rapoport 

Implement primitives necessary for the 4th level folding, add walks of p4d
level where appropriate and remove usage of __ARCH_USE_5LEVEL_HACK.

Signed-off-by: Mike Rapoport 
---
 arch/sh/include/asm/pgtable-2level.h |  1 -
 arch/sh/include/asm/pgtable-3level.h |  1 -
 arch/sh/kernel/io_trapped.c  |  7 ++-
 arch/sh/mm/cache-sh4.c   |  4 +++-
 arch/sh/mm/cache-sh5.c   |  7 ++-
 arch/sh/mm/fault.c   | 26 +++---
 arch/sh/mm/hugetlbpage.c | 28 ++--
 arch/sh/mm/init.c|  9 -
 arch/sh/mm/kmap.c|  2 +-
 arch/sh/mm/tlbex_32.c|  6 +-
 arch/sh/mm/tlbex_64.c|  7 ++-
 11 files changed, 76 insertions(+), 22 deletions(-)

diff --git a/arch/sh/include/asm/pgtable-2level.h 
b/arch/sh/include/asm/pgtable-2level.h
index bf1eb51c3ee5..08bff93927ff 100644
--- a/arch/sh/include/asm/pgtable-2level.h
+++ b/arch/sh/include/asm/pgtable-2level.h
@@ -2,7 +2,6 @@
 #ifndef __ASM_SH_PGTABLE_2LEVEL_H
 #define __ASM_SH_PGTABLE_2LEVEL_H
 
-#define __ARCH_USE_5LEVEL_HACK
 #include 
 
 /*
diff --git a/arch/sh/include/asm/pgtable-3level.h 
b/arch/sh/include/asm/pgtable-3level.h
index 779260b721ca..0f80097e5c9c 100644
--- a/arch/sh/include/asm/pgtable-3level.h
+++ b/arch/sh/include/asm/pgtable-3level.h
@@ -2,7 +2,6 @@
 #ifndef __ASM_SH_PGTABLE_3LEVEL_H
 #define __ASM_SH_PGTABLE_3LEVEL_H
 
-#define __ARCH_USE_5LEVEL_HACK
 #include 
 
 /*
diff --git a/arch/sh/kernel/io_trapped.c b/arch/sh/kernel/io_trapped.c
index 60c828a2b8a2..037aab2708b7 100644
--- a/arch/sh/kernel/io_trapped.c
+++ b/arch/sh/kernel/io_trapped.c
@@ -136,6 +136,7 @@ EXPORT_SYMBOL_GPL(match_trapped_io_handler);
 static struct trapped_io *lookup_tiop(unsigned long address)
 {
pgd_t *pgd_k;
+   p4d_t *p4d_k;
pud_t *pud_k;
pmd_t *pmd_k;
pte_t *pte_k;
@@ -145,7 +146,11 @@ static struct trapped_io *lookup_tiop(unsigned long 
address)
if (!pgd_present(*pgd_k))
return NULL;
 
-   pud_k = pud_offset(pgd_k, address);
+   p4d_k = p4d_offset(pgd_k, address);
+   if (!p4d_present(*p4d_k))
+   return NULL;
+
+   pud_k = pud_offset(p4d_k, address);
if (!pud_present(*pud_k))
return NULL;
 
diff --git a/arch/sh/mm/cache-sh4.c b/arch/sh/mm/cache-sh4.c
index eee911422cf9..45943bcb7042 100644
--- a/arch/sh/mm/cache-sh4.c
+++ b/arch/sh/mm/cache-sh4.c
@@ -209,6 +209,7 @@ static void sh4_flush_cache_page(void *args)
unsigned long address, pfn, phys;
int map_coherent = 0;
pgd_t *pgd;
+   p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
@@ -224,7 +225,8 @@ static void sh4_flush_cache_page(void *args)
return;
 
pgd = pgd_offset(vma->vm_mm, address);
-   pud = pud_offset(pgd, address);
+   p4d = p4d_offset(pgd, address);
+   pud = pud_offset(p4d, address);
pmd = pmd_offset(pud, address);
pte = pte_offset_kernel(pmd, address);
 
diff --git a/arch/sh/mm/cache-sh5.c b/arch/sh/mm/cache-sh5.c
index 445b5e69b73c..442a77cc2957 100644
--- a/arch/sh/mm/cache-sh5.c
+++ b/arch/sh/mm/cache-sh5.c
@@ -383,6 +383,7 @@ static void sh64_dcache_purge_user_pages(struct mm_struct 
*mm,
unsigned long addr, unsigned long end)
 {
pgd_t *pgd;
+   p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
@@ -397,7 +398,11 @@ static void sh64_dcache_purge_user_pages(struct mm_struct 
*mm,
if (pgd_bad(*pgd))
return;
 
-   pud = pud_offset(pgd, addr);
+   p4d = p4d_offset(pgd, addr);
+   if (p4d_none(*p4d) || p4d_bad(*p4d))
+   return;
+
+   pud = pud_offset(p4d, addr);
if (pud_none(*pud) || pud_bad(*pud))
return;
 
diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c
index 7b74e18b71d7..8b3ab65c81c4 100644
--- a/arch/sh/mm/fault.c
+++ b/arch/sh/mm/fault.c
@@ -53,6 +53,7 @@ static void show_pte(struct mm_struct *mm, unsigned long addr)
 (u64)pgd_val(*pgd));
 
do {
+   p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
@@ -65,7 +66,20 @@ static void show_pte(struct mm_struct *mm, unsigned long 
addr)
break;
}
 
-   pud = pud_offset(pgd, addr);
+   p4d = p4d_offset(pgd, addr);
+   if (PTRS_PER_P4D != 1)
+   pr_cont(", *p4d=%0*Lx", (u32)(sizeof(*p4d) * 2),
+   (u64)p4d_val(*p4d));
+
+   if (p4d_none(*p4d))
+   break;
+
+   if (p4d_bad(*p4d)) {
+   pr_cont("(bad)");
+   break;
+   }
+
+   pud = pud_offset(p4d, addr);
if (PTRS_PER_PUD != 1)
  

[PATCH v4 10/14] sh: drop __pXd_offset() macros that duplicate pXd_index() ones

2020-04-14 Thread Mike Rapoport
From: Mike Rapoport 

The __pXd_offset() macros are identical to the pXd_index() macros and there
is no point in keeping both of them. All architectures define and use
pXd_index(), so let's keep only those to make sh consistent with the rest
of the kernel.

Signed-off-by: Mike Rapoport 
---
 arch/sh/include/asm/pgtable_32.h | 5 ++---
 arch/sh/include/asm/pgtable_64.h | 5 ++---
 arch/sh/mm/init.c| 6 +++---
 3 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/arch/sh/include/asm/pgtable_32.h b/arch/sh/include/asm/pgtable_32.h
index 29274f0e428e..4acce5f2cbf9 100644
--- a/arch/sh/include/asm/pgtable_32.h
+++ b/arch/sh/include/asm/pgtable_32.h
@@ -407,13 +407,12 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t 
newprot)
 /* to find an entry in a page-table-directory. */
 #define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
 #define pgd_offset(mm, address)((mm)->pgd + pgd_index(address))
-#define __pgd_offset(address)  pgd_index(address)
 
 /* to find an entry in a kernel page-table-directory */
#define pgd_offset_k(address)  pgd_offset(&init_mm, address)
 
-#define __pud_offset(address)  (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
-#define __pmd_offset(address)  (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
+#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
+#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
 
 /* Find an entry in the third-level page table.. */
 #define pte_index(address) ((address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
diff --git a/arch/sh/include/asm/pgtable_64.h b/arch/sh/include/asm/pgtable_64.h
index 1778bc5971e7..27cc282ec6c0 100644
--- a/arch/sh/include/asm/pgtable_64.h
+++ b/arch/sh/include/asm/pgtable_64.h
@@ -46,14 +46,13 @@ static __inline__ void set_pte(pte_t *pteptr, pte_t pteval)
 
 /* To find an entry in a generic PGD. */
 #define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
-#define __pgd_offset(address) pgd_index(address)
 #define pgd_offset(mm, address) ((mm)->pgd+pgd_index(address))
 
 /* To find an entry in a kernel PGD. */
#define pgd_offset_k(address) pgd_offset(&init_mm, address)
 
-#define __pud_offset(address)  (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
-#define __pmd_offset(address)  (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
+#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
+/* #define pmd_index(address)  (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1)) */
 
 /*
  * PMD level access routines. Same notes as above.
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index b9de2d4fa57e..f445ba630790 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -172,9 +172,9 @@ void __init page_table_range_init(unsigned long start, 
unsigned long end,
unsigned long vaddr;
 
vaddr = start;
-   i = __pgd_offset(vaddr);
-   j = __pud_offset(vaddr);
-   k = __pmd_offset(vaddr);
+   i = pgd_index(vaddr);
+   j = pud_index(vaddr);
+   k = pmd_index(vaddr);
pgd = pgd_base + i;
 
for ( ; (i < PTRS_PER_PGD) && (vaddr != end); pgd++, i++) {
-- 
2.25.1



[PATCH v4 09/14] sh: fault: Modernize printing of kernel messages

2020-04-14 Thread Mike Rapoport
From: Geert Uytterhoeven 

  - Convert from printk() to pr_*(),
  - Add missing continuations,
  - Use "%llx" to format u64,
  - Join multiple prints in show_fault_oops() into a single print.

Signed-off-by: Geert Uytterhoeven 
Signed-off-by: Mike Rapoport 
---
 arch/sh/mm/fault.c | 39 ++-
 1 file changed, 18 insertions(+), 21 deletions(-)

diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c
index 5f23d7907597..7b74e18b71d7 100644
--- a/arch/sh/mm/fault.c
+++ b/arch/sh/mm/fault.c
@@ -47,10 +47,10 @@ static void show_pte(struct mm_struct *mm, unsigned long 
addr)
pgd = swapper_pg_dir;
}
 
-   printk(KERN_ALERT "pgd = %p\n", pgd);
+   pr_alert("pgd = %p\n", pgd);
pgd += pgd_index(addr);
-   printk(KERN_ALERT "[%08lx] *pgd=%0*Lx", addr,
-  (u32)(sizeof(*pgd) * 2), (u64)pgd_val(*pgd));
+   pr_alert("[%08lx] *pgd=%0*llx", addr, (u32)(sizeof(*pgd) * 2),
+(u64)pgd_val(*pgd));
 
do {
pud_t *pud;
@@ -61,33 +61,33 @@ static void show_pte(struct mm_struct *mm, unsigned long 
addr)
break;
 
if (pgd_bad(*pgd)) {
-   printk("(bad)");
+   pr_cont("(bad)");
break;
}
 
pud = pud_offset(pgd, addr);
if (PTRS_PER_PUD != 1)
-   printk(", *pud=%0*Lx", (u32)(sizeof(*pud) * 2),
-  (u64)pud_val(*pud));
+   pr_cont(", *pud=%0*llx", (u32)(sizeof(*pud) * 2),
+   (u64)pud_val(*pud));
 
if (pud_none(*pud))
break;
 
if (pud_bad(*pud)) {
-   printk("(bad)");
+   pr_cont("(bad)");
break;
}
 
pmd = pmd_offset(pud, addr);
if (PTRS_PER_PMD != 1)
-   printk(", *pmd=%0*Lx", (u32)(sizeof(*pmd) * 2),
-  (u64)pmd_val(*pmd));
+   pr_cont(", *pmd=%0*llx", (u32)(sizeof(*pmd) * 2),
+   (u64)pmd_val(*pmd));
 
if (pmd_none(*pmd))
break;
 
if (pmd_bad(*pmd)) {
-   printk("(bad)");
+   pr_cont("(bad)");
break;
}
 
@@ -96,11 +96,11 @@ static void show_pte(struct mm_struct *mm, unsigned long 
addr)
break;
 
pte = pte_offset_kernel(pmd, addr);
-   printk(", *pte=%0*Lx", (u32)(sizeof(*pte) * 2),
-  (u64)pte_val(*pte));
+   pr_cont(", *pte=%0*llx", (u32)(sizeof(*pte) * 2),
+   (u64)pte_val(*pte));
} while (0);
 
-   printk("\n");
+   pr_cont("\n");
 }
 
 static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
@@ -188,14 +188,11 @@ show_fault_oops(struct pt_regs *regs, unsigned long 
address)
if (!oops_may_print())
return;
 
-   printk(KERN_ALERT "BUG: unable to handle kernel ");
-   if (address < PAGE_SIZE)
-   printk(KERN_CONT "NULL pointer dereference");
-   else
-   printk(KERN_CONT "paging request");
-
-   printk(KERN_CONT " at %08lx\n", address);
-   printk(KERN_ALERT "PC:");
+   pr_alert("BUG: unable to handle kernel %s at %08lx\n",
+address < PAGE_SIZE ? "NULL pointer dereference"
+: "paging request",
+address);
+   pr_alert("PC:");
printk_address(regs->pc, 1);
 
show_pte(NULL, address);
-- 
2.25.1



[PATCH v4 08/14] powerpc: add support for folded p4d page tables

2020-04-14 Thread Mike Rapoport
From: Mike Rapoport 

Implement primitives necessary for the 4th level folding, add walks of p4d
level where appropriate and replace 5level-fixup.h with pgtable-nop4d.h.

Signed-off-by: Mike Rapoport 
Tested-by: Christophe Leroy  # 8xx and 83xx
---
 arch/powerpc/include/asm/book3s/32/pgtable.h  |  1 -
 arch/powerpc/include/asm/book3s/64/hash.h |  4 +-
 arch/powerpc/include/asm/book3s/64/pgalloc.h  |  4 +-
 arch/powerpc/include/asm/book3s/64/pgtable.h  | 60 ++-
 arch/powerpc/include/asm/book3s/64/radix.h|  6 +-
 arch/powerpc/include/asm/nohash/32/pgtable.h  |  1 -
 arch/powerpc/include/asm/nohash/64/pgalloc.h  |  2 +-
 .../include/asm/nohash/64/pgtable-4k.h| 32 +-
 arch/powerpc/include/asm/nohash/64/pgtable.h  |  6 +-
 arch/powerpc/include/asm/pgtable.h| 10 ++--
 arch/powerpc/kvm/book3s_64_mmu_radix.c| 32 ++
 arch/powerpc/lib/code-patching.c  |  7 ++-
 arch/powerpc/mm/book3s64/hash_pgtable.c   |  4 +-
 arch/powerpc/mm/book3s64/radix_pgtable.c  | 26 +---
 arch/powerpc/mm/book3s64/subpage_prot.c   |  6 +-
 arch/powerpc/mm/hugetlbpage.c | 28 +
 arch/powerpc/mm/nohash/book3e_pgtable.c   | 15 ++---
 arch/powerpc/mm/pgtable.c | 30 ++
 arch/powerpc/mm/pgtable_64.c  | 10 ++--
 arch/powerpc/mm/ptdump/hashpagetable.c| 20 ++-
 arch/powerpc/mm/ptdump/ptdump.c   | 14 +++--
 arch/powerpc/xmon/xmon.c  | 18 +++---
 22 files changed, 197 insertions(+), 139 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h 
b/arch/powerpc/include/asm/book3s/32/pgtable.h
index 7549393c4c43..6052b72216a6 100644
--- a/arch/powerpc/include/asm/book3s/32/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/32/pgtable.h
@@ -2,7 +2,6 @@
 #ifndef _ASM_POWERPC_BOOK3S_32_PGTABLE_H
 #define _ASM_POWERPC_BOOK3S_32_PGTABLE_H
 
-#define __ARCH_USE_5LEVEL_HACK
 #include 
 
 #include 
diff --git a/arch/powerpc/include/asm/book3s/64/hash.h 
b/arch/powerpc/include/asm/book3s/64/hash.h
index 6fc4520092c7..73ad038ed10b 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -134,9 +134,9 @@ static inline int get_region_id(unsigned long ea)
 
 #define hash__pmd_bad(pmd)  (pmd_val(pmd) & H_PMD_BAD_BITS)
 #define hash__pud_bad(pud)  (pud_val(pud) & H_PUD_BAD_BITS)
-static inline int hash__pgd_bad(pgd_t pgd)
+static inline int hash__p4d_bad(p4d_t p4d)
 {
-   return (pgd_val(pgd) == 0);
+   return (p4d_val(p4d) == 0);
 }
 #ifdef CONFIG_STRICT_KERNEL_RWX
 extern void hash__mark_rodata_ro(void);
diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h 
b/arch/powerpc/include/asm/book3s/64/pgalloc.h
index a41e91bd0580..69c5b051734f 100644
--- a/arch/powerpc/include/asm/book3s/64/pgalloc.h
+++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h
@@ -85,9 +85,9 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
kmem_cache_free(PGT_CACHE(PGD_INDEX_SIZE), pgd);
 }
 
-static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
+static inline void p4d_populate(struct mm_struct *mm, p4d_t *pgd, pud_t *pud)
 {
-   *pgd =  __pgd(__pgtable_ptr_val(pud) | PGD_VAL_BITS);
+   *pgd =  __p4d(__pgtable_ptr_val(pud) | PGD_VAL_BITS);
 }
 
 static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 368b136517e0..bc047514724c 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -2,7 +2,7 @@
 #ifndef _ASM_POWERPC_BOOK3S_64_PGTABLE_H_
 #define _ASM_POWERPC_BOOK3S_64_PGTABLE_H_
 
-#include 
+#include 
 
 #ifndef __ASSEMBLY__
 #include 
@@ -251,7 +251,7 @@ extern unsigned long __pmd_frag_size_shift;
 /* Bits to mask out from a PUD to get to the PMD page */
 #define PUD_MASKED_BITS0xc0ffUL
 /* Bits to mask out from a PGD to get to the PUD page */
-#define PGD_MASKED_BITS0xc0ffUL
+#define P4D_MASKED_BITS0xc0ffUL
 
 /*
  * Used as an indicator for rcu callback functions
@@ -949,54 +949,60 @@ static inline bool pud_access_permitted(pud_t pud, bool 
write)
return pte_access_permitted(pud_pte(pud), write);
 }
 
-#define pgd_write(pgd) pte_write(pgd_pte(pgd))
+#define __p4d_raw(x)   ((p4d_t) { __pgd_raw(x) })
+static inline __be64 p4d_raw(p4d_t x)
+{
+   return pgd_raw(x.pgd);
+}
+
+#define p4d_write(p4d) pte_write(p4d_pte(p4d))
 
-static inline void pgd_clear(pgd_t *pgdp)
+static inline void p4d_clear(p4d_t *p4dp)
 {
-   *pgdp = __pgd(0);
+   *p4dp = __p4d(0);
 }
 
-static inline int pgd_none(pgd_t pgd)
+static inline int p4d_none(p4d_t p4d)
 {
-   return !pgd_raw(pgd);
+   return !p4d_raw(p4d);
 }
 
-static inline int 

[PATCH v4 07/14] openrisc: add support for folded p4d page tables

2020-04-14 Thread Mike Rapoport
From: Mike Rapoport 

Implement primitives necessary for the 4th level folding, add walks of p4d
level where appropriate and remove usage of __ARCH_USE_5LEVEL_HACK.

Signed-off-by: Mike Rapoport 
---
 arch/openrisc/include/asm/pgtable.h |  1 -
 arch/openrisc/mm/fault.c| 10 --
 arch/openrisc/mm/init.c |  4 +++-
 3 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/arch/openrisc/include/asm/pgtable.h 
b/arch/openrisc/include/asm/pgtable.h
index 7f3fb9ceb083..219979e57790 100644
--- a/arch/openrisc/include/asm/pgtable.h
+++ b/arch/openrisc/include/asm/pgtable.h
@@ -21,7 +21,6 @@
 #ifndef __ASM_OPENRISC_PGTABLE_H
 #define __ASM_OPENRISC_PGTABLE_H
 
-#define __ARCH_USE_5LEVEL_HACK
 #include 
 
 #ifndef __ASSEMBLY__
diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c
index 8af1cc78c4fb..6e0a11ac4c00 100644
--- a/arch/openrisc/mm/fault.c
+++ b/arch/openrisc/mm/fault.c
@@ -295,6 +295,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, 
unsigned long address,
 
int offset = pgd_index(address);
pgd_t *pgd, *pgd_k;
+   p4d_t *p4d, *p4d_k;
pud_t *pud, *pud_k;
pmd_t *pmd, *pmd_k;
pte_t *pte_k;
@@ -321,8 +322,13 @@ asmlinkage void do_page_fault(struct pt_regs *regs, 
unsigned long address,
 * it exists.
 */
 
-   pud = pud_offset(pgd, address);
-   pud_k = pud_offset(pgd_k, address);
+   p4d = p4d_offset(pgd, address);
+   p4d_k = p4d_offset(pgd_k, address);
+   if (!p4d_present(*p4d_k))
+   goto no_context;
+
+   pud = pud_offset(p4d, address);
+   pud_k = pud_offset(p4d_k, address);
if (!pud_present(*pud_k))
goto no_context;
 
diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c
index 1f87b524db78..2536aeae0975 100644
--- a/arch/openrisc/mm/init.c
+++ b/arch/openrisc/mm/init.c
@@ -71,6 +71,7 @@ static void __init map_ram(void)
unsigned long v, p, e;
pgprot_t prot;
pgd_t *pge;
+   p4d_t *p4e;
pud_t *pue;
pmd_t *pme;
pte_t *pte;
@@ -90,7 +91,8 @@ static void __init map_ram(void)
 
while (p < e) {
int j;
-   pue = pud_offset(pge, v);
+   p4e = p4d_offset(pge, v);
+   pue = pud_offset(p4e, v);
pme = pmd_offset(pue, v);
 
if ((u32) pue != (u32) pge || (u32) pme != (u32) pge) {
-- 
2.25.1



[PATCH v4 06/14] nios2: add support for folded p4d page tables

2020-04-14 Thread Mike Rapoport
From: Mike Rapoport 

Implement primitives necessary for the 4th level folding, add walks of p4d
level where appropriate and remove usage of __ARCH_USE_5LEVEL_HACK.

Signed-off-by: Mike Rapoport 
---
 arch/nios2/include/asm/pgtable.h | 3 +--
 arch/nios2/mm/fault.c| 9 +++--
 arch/nios2/mm/ioremap.c  | 6 +-
 3 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h
index f98b7f4519ba..47a1a3ea5734 100644
--- a/arch/nios2/include/asm/pgtable.h
+++ b/arch/nios2/include/asm/pgtable.h
@@ -22,7 +22,6 @@
 #include 
 
 #include 
-#define __ARCH_USE_5LEVEL_HACK
 #include 
 
 #define FIRST_USER_ADDRESS 0UL
@@ -100,7 +99,7 @@ extern pte_t invalid_pte_table[PAGE_SIZE/sizeof(pte_t)];
  */
 static inline void set_pmd(pmd_t *pmdptr, pmd_t pmdval)
 {
-   pmdptr->pud.pgd.pgd = pmdval.pud.pgd.pgd;
+   *pmdptr = pmdval;
 }
 
 /* to find an entry in a page-table-directory */
diff --git a/arch/nios2/mm/fault.c b/arch/nios2/mm/fault.c
index ec9d8a9c426f..964eac1a21d0 100644
--- a/arch/nios2/mm/fault.c
+++ b/arch/nios2/mm/fault.c
@@ -242,6 +242,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, 
unsigned long cause,
 */
int offset = pgd_index(address);
pgd_t *pgd, *pgd_k;
+   p4d_t *p4d, *p4d_k;
pud_t *pud, *pud_k;
pmd_t *pmd, *pmd_k;
pte_t *pte_k;
@@ -253,8 +254,12 @@ asmlinkage void do_page_fault(struct pt_regs *regs, 
unsigned long cause,
goto no_context;
set_pgd(pgd, *pgd_k);
 
-   pud = pud_offset(pgd, address);
-   pud_k = pud_offset(pgd_k, address);
+   p4d = p4d_offset(pgd, address);
+   p4d_k = p4d_offset(pgd_k, address);
+   if (!p4d_present(*p4d_k))
+   goto no_context;
+   pud = pud_offset(p4d, address);
+   pud_k = pud_offset(p4d_k, address);
if (!pud_present(*pud_k))
goto no_context;
pmd = pmd_offset(pud, address);
diff --git a/arch/nios2/mm/ioremap.c b/arch/nios2/mm/ioremap.c
index 819bdfcc2e71..fe821efb9a99 100644
--- a/arch/nios2/mm/ioremap.c
+++ b/arch/nios2/mm/ioremap.c
@@ -86,11 +86,15 @@ static int remap_area_pages(unsigned long address, unsigned 
long phys_addr,
if (address >= end)
BUG();
do {
+   p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
 
error = -ENOMEM;
-   pud = pud_alloc(&init_mm, dir, address);
+   p4d = p4d_alloc(&init_mm, dir, address);
+   if (!p4d)
+   break;
+   pud = pud_alloc(&init_mm, p4d, address);
if (!pud)
break;
pmd = pmd_alloc(&init_mm, pud, address);
-- 
2.25.1



[PATCH v4 05/14] ia64: add support for folded p4d page tables

2020-04-14 Thread Mike Rapoport
From: Mike Rapoport 

Implement primitives necessary for the 4th level folding, add walks of p4d
level where appropriate, remove usage of __ARCH_USE_5LEVEL_HACK and replace
5level-fixup.h with pgtable-nop4d.h

Signed-off-by: Mike Rapoport 
---
 arch/ia64/include/asm/pgalloc.h |  4 ++--
 arch/ia64/include/asm/pgtable.h | 17 -
 arch/ia64/mm/fault.c|  7 ++-
 arch/ia64/mm/hugetlbpage.c  | 18 --
 arch/ia64/mm/init.c | 28 
 5 files changed, 52 insertions(+), 22 deletions(-)

diff --git a/arch/ia64/include/asm/pgalloc.h b/arch/ia64/include/asm/pgalloc.h
index f4c491044882..2a3050345099 100644
--- a/arch/ia64/include/asm/pgalloc.h
+++ b/arch/ia64/include/asm/pgalloc.h
@@ -36,9 +36,9 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 
 #if CONFIG_PGTABLE_LEVELS == 4
 static inline void
-pgd_populate(struct mm_struct *mm, pgd_t * pgd_entry, pud_t * pud)
+p4d_populate(struct mm_struct *mm, p4d_t * p4d_entry, pud_t * pud)
 {
-   pgd_val(*pgd_entry) = __pa(pud);
+   p4d_val(*p4d_entry) = __pa(pud);
 }
 
 static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h
index 0e7b645b76c6..787b0a91d255 100644
--- a/arch/ia64/include/asm/pgtable.h
+++ b/arch/ia64/include/asm/pgtable.h
@@ -283,12 +283,12 @@ extern unsigned long VMALLOC_END;
 #define pud_page(pud)  virt_to_page((pud_val(pud) + 
PAGE_OFFSET))
 
 #if CONFIG_PGTABLE_LEVELS == 4
-#define pgd_none(pgd)  (!pgd_val(pgd))
-#define pgd_bad(pgd)   (!ia64_phys_addr_valid(pgd_val(pgd)))
-#define pgd_present(pgd)   (pgd_val(pgd) != 0UL)
-#define pgd_clear(pgdp)(pgd_val(*(pgdp)) = 0UL)
-#define pgd_page_vaddr(pgd)((unsigned long) __va(pgd_val(pgd) & 
_PFN_MASK))
-#define pgd_page(pgd)  virt_to_page((pgd_val(pgd) + 
PAGE_OFFSET))
+#define p4d_none(p4d)  (!p4d_val(p4d))
+#define p4d_bad(p4d)   (!ia64_phys_addr_valid(p4d_val(p4d)))
+#define p4d_present(p4d)   (p4d_val(p4d) != 0UL)
+#define p4d_clear(p4dp)(p4d_val(*(p4dp)) = 0UL)
+#define p4d_page_vaddr(p4d)((unsigned long) __va(p4d_val(p4d) & 
_PFN_MASK))
+#define p4d_page(p4d)  virt_to_page((p4d_val(p4d) + 
PAGE_OFFSET))
 #endif
 
 /*
@@ -386,7 +386,7 @@ pgd_offset (const struct mm_struct *mm, unsigned long 
address)
 #if CONFIG_PGTABLE_LEVELS == 4
 /* Find an entry in the second-level page table.. */
 #define pud_offset(dir,addr) \
-   ((pud_t *) pgd_page_vaddr(*(dir)) + (((addr) >> PUD_SHIFT) & 
(PTRS_PER_PUD - 1)))
+   ((pud_t *) p4d_page_vaddr(*(dir)) + (((addr) >> PUD_SHIFT) & 
(PTRS_PER_PUD - 1)))
 #endif
 
 /* Find an entry in the third-level page table.. */
@@ -580,10 +580,9 @@ extern struct page *zero_page_memmap_ptr;
 
 
 #if CONFIG_PGTABLE_LEVELS == 3
-#define __ARCH_USE_5LEVEL_HACK
 #include 
 #endif
-#include 
+#include 
 #include 
 
 #endif /* _ASM_IA64_PGTABLE_H */
diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c
index 30d0c1fca99e..12242aa0dad1 100644
--- a/arch/ia64/mm/fault.c
+++ b/arch/ia64/mm/fault.c
@@ -29,6 +29,7 @@ static int
 mapped_kernel_page_is_present (unsigned long address)
 {
pgd_t *pgd;
+   p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *ptep, pte;
@@ -37,7 +38,11 @@ mapped_kernel_page_is_present (unsigned long address)
if (pgd_none(*pgd) || pgd_bad(*pgd))
return 0;
 
-   pud = pud_offset(pgd, address);
+   p4d = p4d_offset(pgd, address);
+   if (p4d_none(*p4d) || p4d_bad(*p4d))
+   return 0;
+
+   pud = pud_offset(p4d, address);
if (pud_none(*pud) || pud_bad(*pud))
return 0;
 
diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
index d16e419fd712..32352a73df0c 100644
--- a/arch/ia64/mm/hugetlbpage.c
+++ b/arch/ia64/mm/hugetlbpage.c
@@ -30,12 +30,14 @@ huge_pte_alloc(struct mm_struct *mm, unsigned long addr, 
unsigned long sz)
 {
unsigned long taddr = htlbpage_to_page(addr);
pgd_t *pgd;
+   p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte = NULL;
 
pgd = pgd_offset(mm, taddr);
-   pud = pud_alloc(mm, pgd, taddr);
+   p4d = p4d_offset(pgd, taddr);
+   pud = pud_alloc(mm, p4d, taddr);
if (pud) {
pmd = pmd_alloc(mm, pud, taddr);
if (pmd)
@@ -49,17 +51,21 @@ huge_pte_offset (struct mm_struct *mm, unsigned long addr, 
unsigned long sz)
 {
unsigned long taddr = htlbpage_to_page(addr);
pgd_t *pgd;
+   p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte = NULL;
 
pgd = pgd_offset(mm, taddr);
if (pgd_present(*pgd)) {
-   pud = pud_offset(pgd, taddr);
-   if (pud_present(*pud)) {

[PATCH v4 04/14] hexagon: remove __ARCH_USE_5LEVEL_HACK

2020-04-14 Thread Mike Rapoport
From: Mike Rapoport 

The hexagon architecture has 2 level page tables and as such most of the
page table folding is already implemented in asm-generic/pgtable-nopmd.h.

Fixup the only place in arch/hexagon to unfold the p4d level and remove
__ARCH_USE_5LEVEL_HACK.

Signed-off-by: Mike Rapoport 
---
 arch/hexagon/include/asm/fixmap.h  | 4 ++--
 arch/hexagon/include/asm/pgtable.h | 1 -
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/hexagon/include/asm/fixmap.h 
b/arch/hexagon/include/asm/fixmap.h
index 933dac167504..97b1b062e750 100644
--- a/arch/hexagon/include/asm/fixmap.h
+++ b/arch/hexagon/include/asm/fixmap.h
@@ -16,7 +16,7 @@
 #include 
 
 #define kmap_get_fixmap_pte(vaddr) \
-   pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), \
-   (vaddr)), (vaddr)), (vaddr))
+   pte_offset_kernel(pmd_offset(pud_offset(p4d_offset(pgd_offset_k(vaddr), 
\
+   (vaddr)), (vaddr)), (vaddr)), (vaddr))
 
 #endif
diff --git a/arch/hexagon/include/asm/pgtable.h 
b/arch/hexagon/include/asm/pgtable.h
index d383e8bea5b2..2a17d4eb2fa4 100644
--- a/arch/hexagon/include/asm/pgtable.h
+++ b/arch/hexagon/include/asm/pgtable.h
@@ -12,7 +12,6 @@
  * Page table definitions for Qualcomm Hexagon processor.
  */
 #include 
-#define __ARCH_USE_5LEVEL_HACK
 #include 
 
 /* A handy thing to have if one has the RAM. Declared in head.S */
-- 
2.25.1



[PATCH v4 03/14] arm64: add support for folded p4d page tables

2020-04-14 Thread Mike Rapoport
From: Mike Rapoport 

Implement primitives necessary for the 4th level folding, add walks of p4d
level where appropriate, replace 5level-fixup.h with pgtable-nop4d.h and
remove __ARCH_USE_5LEVEL_HACK.

Signed-off-by: Mike Rapoport 
---
 arch/arm64/include/asm/kvm_mmu.h|  10 +-
 arch/arm64/include/asm/pgalloc.h|  10 +-
 arch/arm64/include/asm/pgtable-types.h  |   5 +-
 arch/arm64/include/asm/pgtable.h|  37 +++--
 arch/arm64/include/asm/stage2_pgtable.h |  48 --
 arch/arm64/kernel/hibernate.c   |  44 -
 arch/arm64/mm/fault.c   |   9 +-
 arch/arm64/mm/hugetlbpage.c |  15 +-
 arch/arm64/mm/kasan_init.c  |  26 ++-
 arch/arm64/mm/mmu.c |  52 --
 arch/arm64/mm/pageattr.c|   7 +-
 virt/kvm/arm/mmu.c  | 209 
 12 files changed, 368 insertions(+), 104 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 30b0e8d6b895..8255fab2e441 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -172,8 +172,8 @@ void kvm_clear_hyp_idmap(void);
__pmd(__phys_to_pmd_val(__pa(ptep)) | PMD_TYPE_TABLE)
 #define kvm_mk_pud(pmdp)   \
__pud(__phys_to_pud_val(__pa(pmdp)) | PMD_TYPE_TABLE)
-#define kvm_mk_pgd(pudp)   \
-   __pgd(__phys_to_pgd_val(__pa(pudp)) | PUD_TYPE_TABLE)
+#define kvm_mk_p4d(pmdp)   \
+   __p4d(__phys_to_p4d_val(__pa(pmdp)) | PUD_TYPE_TABLE)
 
 #define kvm_set_pud(pudp, pud) set_pud(pudp, pud)
 
@@ -299,6 +299,12 @@ static inline bool kvm_s2pud_young(pud_t pud)
 #define hyp_pud_table_empty(pudp) kvm_page_empty(pudp)
 #endif
 
+#ifdef __PAGETABLE_P4D_FOLDED
+#define hyp_p4d_table_empty(p4dp) (0)
+#else
+#define hyp_p4d_table_empty(p4dp) kvm_page_empty(p4dp)
+#endif
+
 struct kvm;
 
 #define kvm_flush_dcache_to_poc(a,l)   __flush_dcache_area((a), (l))
diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h
index 172d76fa0245..58e93583ddb6 100644
--- a/arch/arm64/include/asm/pgalloc.h
+++ b/arch/arm64/include/asm/pgalloc.h
@@ -73,17 +73,17 @@ static inline void pud_free(struct mm_struct *mm, pud_t 
*pudp)
free_page((unsigned long)pudp);
 }
 
-static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t pudp, pgdval_t prot)
+static inline void __p4d_populate(p4d_t *p4dp, phys_addr_t pudp, p4dval_t prot)
 {
-   set_pgd(pgdp, __pgd(__phys_to_pgd_val(pudp) | prot));
+   set_p4d(p4dp, __p4d(__phys_to_p4d_val(pudp) | prot));
 }
 
-static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgdp, pud_t *pudp)
+static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4dp, pud_t *pudp)
 {
-   __pgd_populate(pgdp, __pa(pudp), PUD_TYPE_TABLE);
+   __p4d_populate(p4dp, __pa(pudp), PUD_TYPE_TABLE);
 }
 #else
-static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t pudp, pgdval_t prot)
+static inline void __p4d_populate(p4d_t *p4dp, phys_addr_t pudp, p4dval_t prot)
 {
BUILD_BUG();
 }
diff --git a/arch/arm64/include/asm/pgtable-types.h 
b/arch/arm64/include/asm/pgtable-types.h
index acb0751a6606..b8f158ae2527 100644
--- a/arch/arm64/include/asm/pgtable-types.h
+++ b/arch/arm64/include/asm/pgtable-types.h
@@ -14,6 +14,7 @@
 typedef u64 pteval_t;
 typedef u64 pmdval_t;
 typedef u64 pudval_t;
+typedef u64 p4dval_t;
 typedef u64 pgdval_t;
 
 /*
@@ -44,13 +45,11 @@ typedef struct { pteval_t pgprot; } pgprot_t;
 #define __pgprot(x)((pgprot_t) { (x) } )
 
 #if CONFIG_PGTABLE_LEVELS == 2
-#define __ARCH_USE_5LEVEL_HACK
 #include 
 #elif CONFIG_PGTABLE_LEVELS == 3
-#define __ARCH_USE_5LEVEL_HACK
 #include 
 #elif CONFIG_PGTABLE_LEVELS == 4
-#include 
+#include 
 #endif
 
 #endif /* __ASM_PGTABLE_TYPES_H */
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 538c85e62f86..c23c5a4e6dc6 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -298,6 +298,11 @@ static inline pte_t pgd_pte(pgd_t pgd)
return __pte(pgd_val(pgd));
 }
 
+static inline pte_t p4d_pte(p4d_t p4d)
+{
+   return __pte(p4d_val(p4d));
+}
+
 static inline pte_t pud_pte(pud_t pud)
 {
return __pte(pud_val(pud));
@@ -401,6 +406,9 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd)
 
 #define set_pmd_at(mm, addr, pmdp, pmd)set_pte_at(mm, addr, (pte_t 
*)pmdp, pmd_pte(pmd))
 
+#define __p4d_to_phys(p4d) __pte_to_phys(p4d_pte(p4d))
+#define __phys_to_p4d_val(phys)__phys_to_pte_val(phys)
+
 #define __pgd_to_phys(pgd) __pte_to_phys(pgd_pte(pgd))
 #define __phys_to_pgd_val(phys)__phys_to_pte_val(phys)
 
@@ -588,49 +596,50 @@ static inline phys_addr_t pud_page_paddr(pud_t pud)
 
 #define pud_ERROR(pud) __pud_error(__FILE__, __LINE__, pud_val(pud))
 
-#define pgd_none(pgd)  (!pgd_val(pgd))
-#define pgd_bad(pgd) 

[PATCH v4 02/14] arm: add support for folded p4d page tables

2020-04-14 Thread Mike Rapoport
From: Mike Rapoport 

Implement primitives necessary for the 4th level folding, add walks of p4d
level where appropriate, and remove __ARCH_USE_5LEVEL_HACK.

Signed-off-by: Mike Rapoport 
---
 arch/arm/include/asm/pgtable.h |  1 -
 arch/arm/lib/uaccess_with_memcpy.c |  7 +-
 arch/arm/mach-sa1100/assabet.c |  2 +-
 arch/arm/mm/dump.c | 29 +-
 arch/arm/mm/fault-armv.c   |  7 +-
 arch/arm/mm/fault.c| 22 ++--
 arch/arm/mm/idmap.c|  3 ++-
 arch/arm/mm/init.c |  2 +-
 arch/arm/mm/ioremap.c  | 12 ++---
 arch/arm/mm/mm.h   |  2 +-
 arch/arm/mm/mmu.c  | 35 +-
 arch/arm/mm/pgd.c  | 40 --
 12 files changed, 125 insertions(+), 37 deletions(-)

diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h
index befc8fcec98f..fba20607c53c 100644
--- a/arch/arm/include/asm/pgtable.h
+++ b/arch/arm/include/asm/pgtable.h
@@ -17,7 +17,6 @@
 
 #else
 
-#define __ARCH_USE_5LEVEL_HACK
 #include 
 #include 
 #include 
diff --git a/arch/arm/lib/uaccess_with_memcpy.c 
b/arch/arm/lib/uaccess_with_memcpy.c
index c9450982a155..d72b14c96670 100644
--- a/arch/arm/lib/uaccess_with_memcpy.c
+++ b/arch/arm/lib/uaccess_with_memcpy.c
@@ -24,6 +24,7 @@ pin_page_for_write(const void __user *_addr, pte_t **ptep, 
spinlock_t **ptlp)
 {
unsigned long addr = (unsigned long)_addr;
pgd_t *pgd;
+   p4d_t *p4d;
pmd_t *pmd;
pte_t *pte;
pud_t *pud;
@@ -33,7 +34,11 @@ pin_page_for_write(const void __user *_addr, pte_t **ptep, 
spinlock_t **ptlp)
if (unlikely(pgd_none(*pgd) || pgd_bad(*pgd)))
return 0;
 
-   pud = pud_offset(pgd, addr);
+   p4d = p4d_offset(pgd, addr);
+   if (unlikely(p4d_none(*p4d) || p4d_bad(*p4d)))
+   return 0;
+
+   pud = pud_offset(p4d, addr);
if (unlikely(pud_none(*pud) || pud_bad(*pud)))
return 0;
 
diff --git a/arch/arm/mach-sa1100/assabet.c b/arch/arm/mach-sa1100/assabet.c
index d96a101e5504..0631a7b02678 100644
--- a/arch/arm/mach-sa1100/assabet.c
+++ b/arch/arm/mach-sa1100/assabet.c
@@ -633,7 +633,7 @@ static void __init map_sa1100_gpio_regs( void )
int prot = PMD_TYPE_SECT | PMD_SECT_AP_WRITE | PMD_DOMAIN(DOMAIN_IO);
pmd_t *pmd;
 
-   pmd = pmd_offset(pud_offset(pgd_offset_k(virt), virt), virt);
+   pmd = pmd_offset(pud_offset(p4d_offset(pgd_offset_k(virt), virt), 
virt), virt);
*pmd = __pmd(phys | prot);
flush_pmd_entry(pmd);
 }
diff --git a/arch/arm/mm/dump.c b/arch/arm/mm/dump.c
index 7d6291f23251..677549d6854c 100644
--- a/arch/arm/mm/dump.c
+++ b/arch/arm/mm/dump.c
@@ -207,6 +207,7 @@ struct pg_level {
 static struct pg_level pg_level[] = {
{
}, { /* pgd */
+   }, { /* p4d */
}, { /* pud */
}, { /* pmd */
.bits   = section_bits,
@@ -308,7 +309,7 @@ static void walk_pte(struct pg_state *st, pmd_t *pmd, 
unsigned long start,
 
for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
addr = start + i * PAGE_SIZE;
-   note_page(st, addr, 4, pte_val(*pte), domain);
+   note_page(st, addr, 5, pte_val(*pte), domain);
}
 }
 
@@ -350,14 +351,14 @@ static void walk_pmd(struct pg_state *st, pud_t *pud, 
unsigned long start)
addr += SECTION_SIZE;
pmd++;
domain = get_domain_name(pmd);
-   note_page(st, addr, 3, pmd_val(*pmd), domain);
+   note_page(st, addr, 4, pmd_val(*pmd), domain);
}
}
 }
 
-static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start)
+static void walk_pud(struct pg_state *st, p4d_t *p4d, unsigned long start)
 {
-   pud_t *pud = pud_offset(pgd, 0);
+   pud_t *pud = pud_offset(p4d, 0);
unsigned long addr;
unsigned i;
 
@@ -366,7 +367,23 @@ static void walk_pud(struct pg_state *st, pgd_t *pgd, 
unsigned long start)
if (!pud_none(*pud)) {
walk_pmd(st, pud, addr);
} else {
-   note_page(st, addr, 2, pud_val(*pud), NULL);
+   note_page(st, addr, 3, pud_val(*pud), NULL);
+   }
+   }
+}
+
+static void walk_p4d(struct pg_state *st, pgd_t *pgd, unsigned long start)
+{
+   p4d_t *p4d = p4d_offset(pgd, 0);
+   unsigned long addr;
+   unsigned i;
+
+   for (i = 0; i < PTRS_PER_P4D; i++, p4d++) {
+   addr = start + i * P4D_SIZE;
+   if (!p4d_none(*p4d)) {
+   walk_pud(st, p4d, addr);
+   } else {
+   note_page(st, addr, 2, p4d_val(*p4d), NULL);
}
}
 }
@@ -381,7 +398,7 @@ static void walk_pgd(struct pg_state *st, struct mm_struct 

[PATCH v4 01/14] h8300: remove usage of __ARCH_USE_5LEVEL_HACK

2020-04-14 Thread Mike Rapoport
From: Mike Rapoport 

h8300 is a nommu architecture and does not require fixup for upper layers
of the page tables because it is already handled by the generic nommu
implementation.

Remove definition of __ARCH_USE_5LEVEL_HACK in
arch/h8300/include/asm/pgtable.h

Signed-off-by: Mike Rapoport 
---
 arch/h8300/include/asm/pgtable.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/h8300/include/asm/pgtable.h b/arch/h8300/include/asm/pgtable.h
index 4d00152fab58..f00828720dc4 100644
--- a/arch/h8300/include/asm/pgtable.h
+++ b/arch/h8300/include/asm/pgtable.h
@@ -1,7 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #ifndef _H8300_PGTABLE_H
 #define _H8300_PGTABLE_H
-#define __ARCH_USE_5LEVEL_HACK
 #include 
 #include 
 extern void paging_init(void);
-- 
2.25.1



[PATCH v4 00/14] mm: remove __ARCH_HAS_5LEVEL_HACK

2020-04-14 Thread Mike Rapoport
From: Mike Rapoport 

Hi,

These patches convert several architectures to use page table folding and
remove __ARCH_HAS_5LEVEL_HACK along with include/asm-generic/5level-fixup.h
and include/asm-generic/pgtable-nop4d-hack.h. With that we'll have a single
and consistent way of dealing with page table folding instead of a mix of
three existing options.

The changes are mostly about mechanical replacement of pgd accessors with
p4d ones and the addition of higher levels to page table traversals.
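
For illustration, the common shape of the conversion is roughly the
following (a condensed sketch of the pattern, not code taken verbatim from
any single patch):

	pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte;

	/* before: pud_offset() took a pgd_t * via the 5level-fixup hack */
	pgd = pgd_offset(mm, addr);
	pud = pud_offset(pgd, addr);
	pmd = pmd_offset(pud, addr);
	pte = pte_offset_kernel(pmd, addr);

	/* after: the p4d level is walked explicitly; with pgtable-nop4d.h
	 * it folds into the pgd, so the extra step compiles down to nothing
	 */
	pgd = pgd_offset(mm, addr);
	p4d = p4d_offset(pgd, addr);
	if (p4d_none(*p4d) || p4d_bad(*p4d))
		return;
	pud = pud_offset(p4d, addr);
	pmd = pmd_offset(pud, addr);
	pte = pte_offset_kernel(pmd, addr);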

v4 is about rebasing on top of v5.7-rc1:
* split arm and arm64 changes as there is no KVM host on arm anymore
* update powerpc patches to reflect its recent changes in page table handling

v3:
* add Christophe's patch that removes ppc32 get_pteptr()
* reduce amount of upper layer walks in powerpc

v2:
* collect per-arch patches into a single set
* include Geert's update of 'sh' printing messages
* rebase on v5.6-rc1+

Geert Uytterhoeven (1):
  sh: fault: Modernize printing of kernel messages

Mike Rapoport (13):
  h8300: remove usage of __ARCH_USE_5LEVEL_HACK
  arm: add support for folded p4d page tables
  arm64: add support for folded p4d page tables
  hexagon: remove __ARCH_USE_5LEVEL_HACK
  ia64: add support for folded p4d page tables
  nios2: add support for folded p4d page tables
  openrisc: add support for folded p4d page tables
  powerpc: add support for folded p4d page tables
  sh: drop __pXd_offset() macros that duplicate pXd_index() ones
  sh: add support for folded p4d page tables
  unicore32: remove __ARCH_USE_5LEVEL_HACK
  asm-generic: remove pgtable-nop4d-hack.h
  mm: remove __ARCH_HAS_5LEVEL_HACK and include/asm-generic/5level-fixup.h

 arch/arm/include/asm/pgtable.h|   1 -
 arch/arm/lib/uaccess_with_memcpy.c|   7 +-
 arch/arm/mach-sa1100/assabet.c|   2 +-
 arch/arm/mm/dump.c|  29 ++-
 arch/arm/mm/fault-armv.c  |   7 +-
 arch/arm/mm/fault.c   |  22 +-
 arch/arm/mm/idmap.c   |   3 +-
 arch/arm/mm/init.c|   2 +-
 arch/arm/mm/ioremap.c |  12 +-
 arch/arm/mm/mm.h  |   2 +-
 arch/arm/mm/mmu.c |  35 ++-
 arch/arm/mm/pgd.c |  40 +++-
 arch/arm64/include/asm/kvm_mmu.h  |  10 +-
 arch/arm64/include/asm/pgalloc.h  |  10 +-
 arch/arm64/include/asm/pgtable-types.h|   5 +-
 arch/arm64/include/asm/pgtable.h  |  37 ++--
 arch/arm64/include/asm/stage2_pgtable.h   |  48 +++-
 arch/arm64/kernel/hibernate.c |  44 +++-
 arch/arm64/mm/fault.c |   9 +-
 arch/arm64/mm/hugetlbpage.c   |  15 +-
 arch/arm64/mm/kasan_init.c|  26 ++-
 arch/arm64/mm/mmu.c   |  52 +++--
 arch/arm64/mm/pageattr.c  |   7 +-
 arch/h8300/include/asm/pgtable.h  |   1 -
 arch/hexagon/include/asm/fixmap.h |   4 +-
 arch/hexagon/include/asm/pgtable.h|   1 -
 arch/ia64/include/asm/pgalloc.h   |   4 +-
 arch/ia64/include/asm/pgtable.h   |  17 +-
 arch/ia64/mm/fault.c  |   7 +-
 arch/ia64/mm/hugetlbpage.c|  18 +-
 arch/ia64/mm/init.c   |  28 ++-
 arch/nios2/include/asm/pgtable.h  |   3 +-
 arch/nios2/mm/fault.c |   9 +-
 arch/nios2/mm/ioremap.c   |   6 +-
 arch/openrisc/include/asm/pgtable.h   |   1 -
 arch/openrisc/mm/fault.c  |  10 +-
 arch/openrisc/mm/init.c   |   4 +-
 arch/powerpc/include/asm/book3s/32/pgtable.h  |   1 -
 arch/powerpc/include/asm/book3s/64/hash.h |   4 +-
 arch/powerpc/include/asm/book3s/64/pgalloc.h  |   4 +-
 arch/powerpc/include/asm/book3s/64/pgtable.h  |  60 ++---
 arch/powerpc/include/asm/book3s/64/radix.h|   6 +-
 arch/powerpc/include/asm/nohash/32/pgtable.h  |   1 -
 arch/powerpc/include/asm/nohash/64/pgalloc.h  |   2 +-
 .../include/asm/nohash/64/pgtable-4k.h|  32 +--
 arch/powerpc/include/asm/nohash/64/pgtable.h  |   6 +-
 arch/powerpc/include/asm/pgtable.h|  10 +-
 arch/powerpc/kvm/book3s_64_mmu_radix.c|  32 +--
 arch/powerpc/lib/code-patching.c  |   7 +-
 arch/powerpc/mm/book3s64/hash_pgtable.c   |   4 +-
 arch/powerpc/mm/book3s64/radix_pgtable.c  |  26 ++-
 arch/powerpc/mm/book3s64/subpage_prot.c   |   6 +-
 arch/powerpc/mm/hugetlbpage.c |  28 ++-
 arch/powerpc/mm/nohash/book3e_pgtable.c   |  15 +-
 arch/powerpc/mm/pgtable.c |  30 ++-
 arch/powerpc/mm/pgtable_64.c  |  10 +-
 arch/powerpc/mm/ptdump/hashpagetable.c|  20 +-
 arch/powerpc/mm/ptdump/ptdump.c   |  14 +-
 arch/powerpc/xmon/xmon.c  |  18 +-
 

Re: [PATCH v2 4/4] hugetlbfs: clean up command line processing

2020-04-14 Thread Peter Xu
On Mon, Apr 13, 2020 at 10:59:26AM -0700, Mike Kravetz wrote:
> On 4/10/20 1:37 PM, Peter Xu wrote:
> > On Wed, Apr 01, 2020 at 11:38:19AM -0700, Mike Kravetz wrote:
> >> With all hugetlb page processing done in a single file, clean up the code.
> >> - Make code match desired semantics
> >>   - Update documentation with semantics
> >> - Make all warning and error messages start with 'HugeTLB:'.
> >> - Consistently name command line parsing routines.
> >> - Check for hugepages_supported() before processing parameters.
> >> - Add comments to code
> >>   - Describe some of the subtle interactions
> >>   - Describe semantics of command line arguments
> >>
> >> Signed-off-by: Mike Kravetz 
> >> ---
> >>  .../admin-guide/kernel-parameters.txt | 35 ---
> >>  Documentation/admin-guide/mm/hugetlbpage.rst  | 44 +
> >>  mm/hugetlb.c  | 96 +++
> >>  3 files changed, 142 insertions(+), 33 deletions(-)
> >>
> >> diff --git a/Documentation/admin-guide/kernel-parameters.txt 
> >> b/Documentation/admin-guide/kernel-parameters.txt
> >> index 1bd5454b5e5f..de653cfe1726 100644
> >> --- a/Documentation/admin-guide/kernel-parameters.txt
> >> +++ b/Documentation/admin-guide/kernel-parameters.txt
> >> @@ -832,12 +832,15 @@
> >>See also Documentation/networking/decnet.txt.
> >>  
> >>default_hugepagesz=
> >> -  [same as hugepagesz=] The size of the default
> >> -  HugeTLB page size. This is the size represented by
> >> -  the legacy /proc/ hugepages APIs, used for SHM, and
> >> -  default size when mounting hugetlbfs filesystems.
> >> -  Defaults to the default architecture's huge page size
> >> -  if not specified.
> >> +  [HW] The size of the default HugeTLB page size. This
> > 
> > Could I ask what's "HW"?  Sorry this is not a comment at all but
> > really a pure question I wanted to ask... :)
> 
> kernel-parameters.rst includes kernel-parameters.txt and defines the meaning
> of these codes.
> 
>HW  Appropriate hardware is enabled.
> 
> Previously, it listed an obsolete list of architectures.

I see. It was a bit confusing since hugepages are not real hardware;
"CAP (capability)" might be easier, but I get the point now, thanks!

[...]

> >> diff --git a/Documentation/admin-guide/mm/hugetlbpage.rst 
> >> b/Documentation/admin-guide/mm/hugetlbpage.rst
> >> index 1cc0bc78d10e..de340c586995 100644
> >> --- a/Documentation/admin-guide/mm/hugetlbpage.rst
> >> +++ b/Documentation/admin-guide/mm/hugetlbpage.rst
> >> @@ -100,6 +100,50 @@ with a huge page size selection parameter 
> >> "hugepagesz=".   must
> >>  be specified in bytes with optional scale suffix [kKmMgG].  The default 
> >> huge
> >>  page size may be selected with the "default_hugepagesz=<size>" boot
> >> parameter.
> >>  
> >> +Hugetlb boot command line parameter semantics
> >> +hugepagesz - Specify a huge page size.  Used in conjunction with hugepages
> >> +  parameter to preallocate a number of huge pages of the specified
> >> +  size.  Hence, hugepagesz and hugepages are typically specified in
> >> +  pairs such as:
> >> +  hugepagesz=2M hugepages=512
> >> +  hugepagesz can only be specified once on the command line for a
> >> +  specific huge page size.  Valid huge page sizes are architecture
> >> +  dependent.
> >> +hugepages - Specify the number of huge pages to preallocate.  This 
> >> typically
> >> +  follows a valid hugepagesz parameter.  However, if hugepages is the
> >> +  first or only hugetlb command line parameter it specifies the number
> >> +  of huge pages of default size to allocate.  The number of huge pages
> >> +  of default size specified in this manner can be overwritten by a
> >> +  hugepagesz,hugepages parameter pair for the default size.
> >> +  For example, on an architecture with 2M default huge page size:
> >> +  hugepages=256 hugepagesz=2M hugepages=512
> >> +  will result in 512 2M huge pages being allocated.  If a hugepages
> >> +  parameter is preceded by an invalid hugepagesz parameter, it will
> >> +  be ignored.
> >> +default_hugepagesz - Specify the default huge page size.  This parameter 
> >> can
> >> +  only be specified once on the command line.  No other hugetlb command
> >> +  line parameter is associated with default_hugepagesz.  Therefore, it
> >> +  can appear anywhere on the command line.  If hugepages= is the first
> >> +  hugetlb command line parameter, the specified number of huge pages
> >> +  will apply to the default huge page size specified with
> >> +  default_hugepagesz.  For example,
> >> +  hugepages=512 default_hugepagesz=2M
> > 
> > No strong opinion, but considering to the special case of gigantic
> > huge page mentioned below, I'm thinking maybe it's easier to just ask
> > the user to always use "hugepagesz=X hugepages=Y" pair when people
> > want to reserve huge pages.
> 
> 

Re: [PATCH v6 2/7] ASoC: dt-bindings: fsl_asrc: Add new property fsl,asrc-format

2020-04-14 Thread Rob Herring
On Wed,  1 Apr 2020 16:45:35 +0800, Shengjiu Wang wrote:
> In order to support the new EASRC and simplify the code structure,
> we decided to share a common structure between them. This brings
> a problem: EASRC accepts a format directly from the devicetree, but
> ASRC accepts a width from the devicetree.
> 
> In order to align with the new EASRC, we add a new property, fsl,asrc-format.
> The fsl,asrc-format property can replace fsl,asrc-width, so the driver
> can accept the format from the devicetree and does not need to derive
> the format from the width.
> 
> Signed-off-by: Shengjiu Wang 
> ---
>  Documentation/devicetree/bindings/sound/fsl,asrc.txt | 4 
>  1 file changed, 4 insertions(+)
> 

Acked-by: Rob Herring 


Re: [PATCH 21/29] mm: remove the pgprot argument to __vmalloc

2020-04-14 Thread Wei Liu
On Tue, Apr 14, 2020 at 03:13:40PM +0200, Christoph Hellwig wrote:
> The pgprot argument to __vmalloc is always PAGE_KERNEL now, so remove
> it.
> 
> Signed-off-by: Christoph Hellwig 
> Reviewed-by: Michael Kelley  [hyperv]
> Acked-by: Gao Xiang  [erofs]
> Acked-by: Peter Zijlstra (Intel) 
> ---
>  arch/x86/hyperv/hv_init.c  |  3 +--
[...]
> 
> diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
> index 5a4b363ba67b..a3d689dfc745 100644
> --- a/arch/x86/hyperv/hv_init.c
> +++ b/arch/x86/hyperv/hv_init.c
> @@ -95,8 +95,7 @@ static int hv_cpu_init(unsigned int cpu)
>* not be stopped in the case of CPU offlining and the VM will hang.
>*/
>   if (!*hvp) {
> - *hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO,
> -  PAGE_KERNEL);
> + *hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO);
>   }

Acked-by: Wei Liu 


Re: [PATCH 1/3] kexec: Prevent removal of memory in use by a loaded kexec image

2020-04-14 Thread David Hildenbrand
On 14.04.20 16:39, Baoquan He wrote:
> On 04/14/20 at 11:37am, David Hildenbrand wrote:
>> On 14.04.20 11:22, Baoquan He wrote:
>>> On 04/14/20 at 10:00am, David Hildenbrand wrote:
 On 14.04.20 08:40, Baoquan He wrote:
> On 04/13/20 at 08:15am, Eric W. Biederman wrote:
>> Baoquan He  writes:
>>
>>> On 04/12/20 at 02:52pm, Eric W. Biederman wrote:

 The only benefit of kexec_file_load is that it is simple enough from a
 kernel perspective that signatures can be checked.
>>>
>>> We don't have this restriction anymore with the commit below:
>>>
>>> commit 99d5cadfde2b ("kexec_file: split KEXEC_VERIFY_SIG into KEXEC_SIG
>>> and KEXEC_SIG_FORCE")
>>>
>>> With KEXEC_SIG_FORCE not set, we can use kexec_load_file to cover both
>>> secure boot or legacy system for kexec/kdump. Being simple enough is
>>> enough to attract and convince us to use it instead. And kexec_file_load
>>> has been in use for several years on systems with secure boot, since
>>> added in 2014, on x86_64.
>>
>> No.  Actually kexec_file_load is the less capable interface, and less
>> flexible interface.  Which is why it is appropriate for signature
>> verification.
>
> Well, everyone has a stance and the corresponding view. You could have
> a wider view from long-time maintenance and an upstream position, and think
> kexec_file_load is horrible. But I can only see from our work as a front
> line engineer to maintain/develop kexec/kdump in RHEL, and think
> kexec_file_load is easier to maintain.
>
> Surely except of multiple kernel image format support. No matter it is
> kexec_load and kexec_file_load, e.g in x86_64, we only support bzImage.
> This is produced from the kernel build by default. We have no way to
> support it in our distros and add it into kexec_file_load.
>
> [RFC PATCH] x86/boot: make ELF kernel multiboot-able
> https://lkml.org/lkml/2017/2/15/654
>
>>
 kexec_load in every other respect is the more capable and functional
 interface.  It makes no sense to get rid of it.

 It does make sense to reload with a loaded kernel on memory hotplug.
 That is simple and easy.  If we are going to handle something in the
 kernel it should simply be an automated unloading of the kernel on memory
 hotplug.


 I think it would be irresponsible to deprecate kexec_load on any
 platform.

 I also suspect that kexec_file_load could be taught to copy the dtb
 on arm32 if someone wants to deal with signatures.

 We definitely can not even think of deprecating kexec_load until
 architecture that supports it also supports kexec_file_load and 
 everyone
 is happy with that interface.  That is Linus's no regression rule.
>>>
>>> I should pick a milder word to express our tendency and tell our plan
>>> then 'obsolete'. Even though I added 'gradually', seems it doesn't help
>>> much. I didn't mean to say 'deprecate' at all when replied.
>>>
>>> The situation and trend I understand about kexec_load and 
>>> kexec_file_load
>>> are:
>>>
>>> 1) Supporting kexec_file_load is suggested to add in ARCHes which don't
>>> have it yet, just as x86_64, arm64 and s390 have done;
>>>  
>>> 2) kexec_file_load is suggested to use, and take precedence over
>>> kexec_load in the future, if both are supported in one ARCH.
>>
>> The deep problem is that kexec_file_load is distinctly less expressive
>> than kexec_load.
>>
>>> 3) Kexec_load is kept being used by ARCHes w/o kexec_file_load support,
>>> and by ARCHes for back compatibility w/ kexec_file_load support.
>>>
>>> For 1) and 2), I think the reason is obvious as Eric said,
>>> kexec_file_load is simple enough. And currently, whenever we get a bug
>>> report, we may need to fix it twice, for kexec_load and kexec_file_load.
>>> If kexec_file_load is made the default, e.g. on x86_64, we will change it
>>> in kernel space only, for kexec_file_load. This is what I meant about
>>> 'obsolete gradually'. I think arm64 and s390 will do this too,
>>> unless there's some critical/blocker bug in kexec_load that corrupts
>>> the old kexec_load interface in old products.
>>
>> Maybe.  The code that kexec_file_load sucked into the kernel is quite
>> stable and rarely needs changes except during a port of kexec to
>> another architecture.
>>
>> Last I looked the real maintenance effort of kexec and kexec on panic was
>> in the drivers.  So I don't think we can use maintenance to do anything.
>
> Not sure if I got it. But if you check Lianbo's patches, a lot of effort has
> been taken to make SEV work well on kexec_file_load. And we have
> switched to use kexec_file_load in the newly 

Re: [PATCH v2 4/4] mm/vmalloc: Hugepage vmalloc mappings

2020-04-14 Thread Nicholas Piggin
Excerpts from Christoph Hellwig's message of April 14, 2020 11:02 pm:
> On Tue, Apr 14, 2020 at 10:13:44PM +1000, Nicholas Piggin wrote:
>> Which case? Usually the answer would be because you don't want to use
>> contiguous physical memory and/or you don't want to use the linear 
>> mapping.
> 
> But with huge pages you do by definition already use large contiguous
> areas.  So you want allocations larger than "small" huge pages but not
> using gigantic pages using vmalloc?

Yes.

Thanks,
Nick


Re: [PATCH 1/3] kexec: Prevent removal of memory in use by a loaded kexec image

2020-04-14 Thread Baoquan He
On 04/14/20 at 11:37am, David Hildenbrand wrote:
> On 14.04.20 11:22, Baoquan He wrote:
> > On 04/14/20 at 10:00am, David Hildenbrand wrote:
> >> On 14.04.20 08:40, Baoquan He wrote:
> >>> On 04/13/20 at 08:15am, Eric W. Biederman wrote:
>  Baoquan He  writes:
> 
> > On 04/12/20 at 02:52pm, Eric W. Biederman wrote:
> >>
> >> The only benefit of kexec_file_load is that it is simple enough from a
> >> kernel perspective that signatures can be checked.
> >
> > We don't have this restriction any more with below commit:
> >
> > commit 99d5cadfde2b ("kexec_file: split KEXEC_VERIFY_SIG into KEXEC_SIG
> > and KEXEC_SIG_FORCE")
> >
> > With KEXEC_SIG_FORCE not set, we can use kexec_file_load to cover both
> > secure boot and legacy systems for kexec/kdump. Being simple enough is
> > enough to attract and convince us to use it instead. And kexec_file_load
> > has been in use for several years on systems with secure boot, since
> > added in 2014, on x86_64.
> 
>  No.  Actually kexec_file_load is the less capable interface, and less
>  flexible interface.  Which is why it is appropriate for signature
>  verification.
> >>>
> >>> Well, everyone has a stance and the corresponding view. You could have
> >>> wider view from long-time maintenance and an upstream position, and think
> >>> kexec_file_load is horrible. But I can only see from our work as a front
> >>> line engineer to maintain/develop kexec/kdump in RHEL, and think
> >>> kexec_file_load is easier to maintain.
> >>>
> >>> Except, of course, for multiple kernel image format support. Whether it is
> >>> kexec_load or kexec_file_load, e.g. on x86_64, we only support bzImage.
> >>> This is produced by the kernel build by default. We have no way to
> >>> support it in our distros and add it into kexec_file_load.
> >>>
> >>> [RFC PATCH] x86/boot: make ELF kernel multiboot-able
> >>> https://lkml.org/lkml/2017/2/15/654
> >>>
> 
> >> kexec_load in every other respect is the more capable and functional
> >> interface.  It makes no sense to get rid of it.
> >>
> >> It does make sense to reload with a loaded kernel on memory hotplug.
> >> That is simple and easy.  If we are going to handle something in the
> >> kernel it should simply be an automated unloading of the kernel on memory
> >> hotplug.
> >>
> >>
> >> I think it would be irresponsible to deprecate kexec_load on any
> >> platform.
> >>
> >> I also suspect that kexec_file_load could be taught to copy the dtb
> >> on arm32 if someone wants to deal with signatures.
> >>
> >> We definitely can not even think of deprecating kexec_load until
> >> every architecture that supports it also supports kexec_file_load and 
> >> everyone
> >> is happy with that interface.  That is Linus's no regression rule.
> >
> > I should pick a milder word to express our tendency and tell our plan
> > than 'obsolete'. Even though I added 'gradually', it seems it doesn't help
> > much. I didn't mean to say 'deprecate' at all when replied.
> >
> > The situation and trend I understand about kexec_load and 
> > kexec_file_load
> > are:
> >
> > 1) Support for kexec_file_load is suggested to be added in ARCHes which
> > don't have it yet, just as x86_64, arm64 and s390 have done;
> >  
> > 2) kexec_file_load is the suggested interface to use, and should take
> > precedence over kexec_load in the future, if both are supported in one ARCH.
> 
>  The deep problem is that kexec_file_load is distinctly less expressive
>  than kexec_load.
> 
> > 3) Kexec_load is kept being used by ARCHes w/o kexec_file_load support,
> > and by ARCHes for backward compatibility w/ kexec_file_load support.
> >
> > For 1) and 2), I think the reason is obvious as Eric said,
> > kexec_file_load is simple enough. And currently, whenever we get a bug
> > report, we may need to fix it twice, for kexec_load and kexec_file_load.
> > If kexec_file_load is made the default, e.g. on x86_64, we will change it
> > in kernel space only, for kexec_file_load. This is what I meant by
> > 'obsolete gradually'. I think for arm64, s390, they will do these too.
> > Unless there's some critical/blocker bug in kexec_load that corrupts the
> > old kexec_load interface in old products.
> 
>  Maybe.  The code that kexec_file_load sucked into the kernel is quite
>  stable and rarely needs changes except during a port of kexec to
>  another architecture.
> 
>  Last I looked the real maintenance effort of kexec and kexec on panic was
>  in the drivers.  So I don't think we can use maintenance to do anything.
> >>>
> >>> Not sure if I got it. But if you check Lianbo's patches, a lot of effort has
> >>> been taken to make SEV work well on kexec_file_load. And we have
> >>> switched to use kexec_file_load in the newly published  Fedora release
> >>> on 

[PATCH] iommu: spapr_tce: Disable compile testing to fix build on book3s_32 config

2020-04-14 Thread Krzysztof Kozlowski
Although SPAPR_TCE_IOMMU itself can be compile-tested on certain PowerPC
configurations, its presence makes arch/powerpc/kvm/Makefile select
modules which do not build in such a configuration.

The arch/powerpc/kvm/ modules use kvm_arch.spapr_tce_tables, which exists
only with CONFIG_PPC_BOOK3S_64.  However, these modules are selected when
COMPILE_TEST and SPAPR_TCE_IOMMU are chosen, leading to build failures:

In file included from arch/powerpc/include/asm/book3s/64/mmu-hash.h:20:0,
 from arch/powerpc/kvm/book3s_64_vio_hv.c:22:
arch/powerpc/include/asm/book3s/64/pgtable.h:17:0: error: "_PAGE_EXEC" 
redefined [-Werror]
 #define _PAGE_EXEC  0x1 /* execute permission */

In file included from arch/powerpc/include/asm/book3s/32/pgtable.h:8:0,
 from arch/powerpc/include/asm/book3s/pgtable.h:8,
 from arch/powerpc/include/asm/pgtable.h:18,
 from include/linux/mm.h:95,
 from arch/powerpc/include/asm/io.h:29,
 from include/linux/io.h:13,
 from include/linux/irq.h:20,
 from arch/powerpc/include/asm/hardirq.h:6,
 from include/linux/hardirq.h:9,
 from include/linux/kvm_host.h:7,
 from arch/powerpc/kvm/book3s_64_vio_hv.c:12:
arch/powerpc/include/asm/book3s/32/hash.h:29:0: note: this is the location 
of the previous definition
 #define _PAGE_EXEC 0x200 /* software: exec allowed */

Reported-by: Geert Uytterhoeven 
Fixes: e93a1695d7fb ("iommu: Enable compile testing for some of drivers")
Signed-off-by: Krzysztof Kozlowski 
---
 drivers/iommu/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 58b4a4dbfc78..3532b1ead19d 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -362,7 +362,7 @@ config IPMMU_VMSA
 
 config SPAPR_TCE_IOMMU
bool "sPAPR TCE IOMMU Support"
-   depends on PPC_POWERNV || PPC_PSERIES || (PPC && COMPILE_TEST)
+   depends on PPC_POWERNV || PPC_PSERIES
select IOMMU_API
help
  Enables bits of IOMMU API required by VFIO. The iommu_ops
-- 
2.17.1



Re: [PATCH v2 4/4] mm/vmalloc: Hugepage vmalloc mappings

2020-04-14 Thread Matthew Wilcox
On Tue, Apr 14, 2020 at 02:28:35PM +0200, Christophe Leroy wrote:
> On 13/04/2020 at 15:41, Matthew Wilcox wrote:
> > On Mon, Apr 13, 2020 at 10:53:03PM +1000, Nicholas Piggin wrote:
> > > +static int vmap_pages_range_noflush(unsigned long start, unsigned long 
> > > end,
> > > + pgprot_t prot, struct page **pages,
> > > + unsigned int page_shift)
> > > +{
> > > + if (page_shift == PAGE_SIZE) {
> > 
> > ... I think you meant 'page_shift == PAGE_SHIFT'
> > 
> > Overall I like this series, although it's a bit biased towards CPUs
> > which have page sizes which match PMD/PUD sizes.  It doesn't offer the
> > possibility of using 64kB page sizes on ARM, for example.  But it's a
> > step in the right direction.
> 
> I was going to ask more or less the same question: I would have liked to use
> 512kB hugepages on powerpc 8xx.
> 
> Even the 8M hugepages (still on the 8xx), can they be used as well, taking
> into account that two PGD entries have to point to the same 8M page?
> 
> I sent out a series which aims to make the management of 512k and 8M pages
> closer to what Linux expects, in order to use them inside the kernel, for
> linear mappings and KASAN mappings for the moment. See
> https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=164620
> It would be nice if we could amplify it and use it for ioremaps and vmallocs
> as well.

I haven't been looking at vmalloc at all; I've been looking at the page
cache.  See:
https://lore.kernel.org/linux-mm/20200212041845.25879-1-wi...@infradead.org/

Once we have large pages in the page cache, I want to sort out the API
for asking the CPU to insert a TLB entry.  Right now, we use set_pte_at(),
set_pmd_at() and set_pud_at().  I'm thinking something along the lines of:

vm_fault_t vmf_set_page_at(struct vm_fault *vmf, struct page *page);

and the architecture can insert whatever PTEs and/or TLB entries it
likes based on compound_order(page) -- if, say, it's a 1MB page, it might
choose to insert 2 * 512kB entries, or just the upper or lower 512kB entry
(depending which half of the 1MB page the address sits in).
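
As an illustration only, such a helper might dispatch on compound_order()
along these lines; vmf_set_page_at() is the name proposed above, while the
do_set_*() branch helpers are hypothetical placeholders, not existing
kernel functions:

/*
 * Purely illustrative sketch of the proposed API: pick the largest
 * hardware mapping the architecture can use for this page.
 */
vm_fault_t vmf_set_page_at(struct vm_fault *vmf, struct page *page)
{
	unsigned int order = compound_order(page);

	if (order >= PUD_SHIFT - PAGE_SHIFT)
		return do_set_pud_at(vmf, page);	/* hypothetical */
	if (order >= PMD_SHIFT - PAGE_SHIFT)
		return do_set_pmd_at(vmf, page);	/* hypothetical */
	/*
	 * Below PMD size the architecture could still insert multiple
	 * intermediate-size TLB entries, e.g. 2 * 512kB for a 1MB page,
	 * or fall back to base-page PTEs.
	 */
	return do_set_ptes_at(vmf, page);		/* hypothetical */
}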



Re: [PATCH 3/8] signal: replace __copy_siginfo_to_user32 with to_compat_siginfo

2020-04-14 Thread Arnd Bergmann
On Tue, Apr 14, 2020 at 9:01 AM Christoph Hellwig  wrote:
>
> Move copying the siginfo to userspace into the callers, so that the
> compat_siginfo conversion can be reused by the ELF coredump code without
> set_fs magic.
>
> Signed-off-by: Christoph Hellwig 

Looks all good to me, but I noticed that the naming is now a bit
inconsistent. to_compat_siginfo() is basically the reverse of
post_copy_siginfo_from_user32(), but the names are very different.

I suppose this can always be cleaned up later though, as your
naming choice is more consistent with how things are in the
rest of the kernel these days.

Arnd


[PATCH 29/29] s390: use __vmalloc_node in stack_alloc

2020-04-14 Thread Christoph Hellwig
stack_alloc can use a slightly higher level vmalloc function.

Signed-off-by: Christoph Hellwig 
Acked-by: Christian Borntraeger 
Acked-by: Peter Zijlstra (Intel) 
---
 arch/s390/kernel/setup.c | 9 +++--
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index 36445dd40fdb..0f0b140b5558 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -305,12 +305,9 @@ void *restart_stack __section(.data);
 unsigned long stack_alloc(void)
 {
 #ifdef CONFIG_VMAP_STACK
-   return (unsigned long)
-   __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE,
-VMALLOC_START, VMALLOC_END,
-THREADINFO_GFP,
-PAGE_KERNEL, 0, NUMA_NO_NODE,
-__builtin_return_address(0));
+   return (unsigned long)__vmalloc_node(THREAD_SIZE, THREAD_SIZE,
+   THREADINFO_GFP, NUMA_NO_NODE,
+   __builtin_return_address(0));
 #else
return __get_free_pages(GFP_KERNEL, THREAD_SIZE_ORDER);
 #endif
-- 
2.25.1
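
For reference, the conversion here (and in the powerpc and arm64 patches
that follow) is purely mechanical: __vmalloc_node() fills in exactly the
defaults the open-coded call passed. A sketch of the equivalence, taken
from the __vmalloc_node definition quoted elsewhere in this series:

/* The wrapper expands to the former call: */
void *__vmalloc_node(unsigned long size, unsigned long align,
		gfp_t gfp_mask, int node, const void *caller)
{
	return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
			gfp_mask, PAGE_KERNEL, 0, node, caller);
}
/* so stack_alloc() requests the same range, protection and vm_flags
 * before and after this patch. */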



[PATCH 28/29] powerpc: use __vmalloc_node in alloc_vm_stack

2020-04-14 Thread Christoph Hellwig
alloc_vm_stack can use a slightly higher level vmalloc function.

Signed-off-by: Christoph Hellwig 
Acked-by: Peter Zijlstra (Intel) 
---
 arch/powerpc/kernel/irq.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 1f1169856dc8..112d150354b2 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -748,9 +748,8 @@ void do_IRQ(struct pt_regs *regs)
 
 static void *__init alloc_vm_stack(void)
 {
-   return __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN, VMALLOC_START,
-   VMALLOC_END, THREADINFO_GFP, PAGE_KERNEL,
-0, NUMA_NO_NODE, (void*)_RET_IP_);
+   return __vmalloc_node(THREAD_SIZE, THREAD_ALIGN, THREADINFO_GFP,
+ NUMA_NO_NODE, (void *)_RET_IP_);
 }
 
 static void __init vmap_irqstack_init(void)
-- 
2.25.1



[PATCH 27/29] arm64: use __vmalloc_node in arch_alloc_vmap_stack

2020-04-14 Thread Christoph Hellwig
arch_alloc_vmap_stack can use a slightly higher level vmalloc function.

Signed-off-by: Christoph Hellwig 
Acked-by: Peter Zijlstra (Intel) 
---
 arch/arm64/include/asm/vmap_stack.h | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/include/asm/vmap_stack.h 
b/arch/arm64/include/asm/vmap_stack.h
index 0a12115d9638..0cc6636e3f15 100644
--- a/arch/arm64/include/asm/vmap_stack.h
+++ b/arch/arm64/include/asm/vmap_stack.h
@@ -19,10 +19,8 @@ static inline unsigned long *arch_alloc_vmap_stack(size_t 
stack_size, int node)
 {
BUILD_BUG_ON(!IS_ENABLED(CONFIG_VMAP_STACK));
 
-   return __vmalloc_node_range(stack_size, THREAD_ALIGN,
-   VMALLOC_START, VMALLOC_END,
-   THREADINFO_GFP, PAGE_KERNEL, 0, node,
-   __builtin_return_address(0));
+   return __vmalloc_node(stack_size, THREAD_ALIGN, THREADINFO_GFP, node,
+   __builtin_return_address(0));
 }
 
 #endif /* __ASM_VMAP_STACK_H */
-- 
2.25.1



[PATCH 26/29] mm: remove vmalloc_user_node_flags

2020-04-14 Thread Christoph Hellwig
Open code it in __bpf_map_area_alloc, which is the only caller.  Also
clean up __bpf_map_area_alloc to have a single vmalloc call with
slightly different flags instead of the current two different calls.

For this to compile in the nommu case, add a __vmalloc_node_range stub
to nommu.c.

Signed-off-by: Christoph Hellwig 
Acked-by: Peter Zijlstra (Intel) 
Acked-by: Johannes Weiner 
---
 include/linux/vmalloc.h |  1 -
 kernel/bpf/syscall.c| 24 ++--
 mm/nommu.c  | 14 --
 mm/vmalloc.c| 20 
 4 files changed, 22 insertions(+), 37 deletions(-)

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 108f49b47756..f90f2946aac2 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -106,7 +106,6 @@ extern void *vzalloc(unsigned long size);
 extern void *vmalloc_user(unsigned long size);
 extern void *vmalloc_node(unsigned long size, int node);
 extern void *vzalloc_node(unsigned long size, int node);
-extern void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t 
flags);
 extern void *vmalloc_exec(unsigned long size);
 extern void *vmalloc_32(unsigned long size);
 extern void *vmalloc_32_user(unsigned long size);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 48d98ea8fad6..dd30b334c554 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -25,6 +25,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
@@ -281,26 +282,29 @@ static void *__bpf_map_area_alloc(u64 size, int 
numa_node, bool mmapable)
 * __GFP_RETRY_MAYFAIL to avoid such situations.
 */
 
-   const gfp_t flags = __GFP_NOWARN | __GFP_ZERO;
+   const gfp_t gfp = __GFP_NOWARN | __GFP_ZERO;
+   unsigned int flags = 0;
+   unsigned long align = 1;
void *area;
 
if (size >= SIZE_MAX)
return NULL;
 
/* kmalloc()'ed memory can't be mmap()'ed */
-   if (!mmapable && size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
-   area = kmalloc_node(size, GFP_USER | __GFP_NORETRY | flags,
+   if (mmapable) {
+   BUG_ON(!PAGE_ALIGNED(size));
+   align = SHMLBA;
+   flags = VM_USERMAP;
+   } else if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
+   area = kmalloc_node(size, gfp | GFP_USER | __GFP_NORETRY,
numa_node);
if (area != NULL)
return area;
}
-   if (mmapable) {
-   BUG_ON(!PAGE_ALIGNED(size));
-   return vmalloc_user_node_flags(size, numa_node, GFP_KERNEL |
-  __GFP_RETRY_MAYFAIL | flags);
-   }
-   return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_RETRY_MAYFAIL | flags,
- numa_node, __builtin_return_address(0));
+
+   return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
+   gfp | GFP_KERNEL | __GFP_RETRY_MAYFAIL, PAGE_KERNEL,
+   flags, numa_node, __builtin_return_address(0));
 }
 
 void *bpf_map_area_alloc(u64 size, int numa_node)
diff --git a/mm/nommu.c b/mm/nommu.c
index 81a86cd85893..b42cd6003d7d 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -150,6 +150,14 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(__vmalloc);
 
+void *__vmalloc_node_range(unsigned long size, unsigned long align,
+   unsigned long start, unsigned long end, gfp_t gfp_mask,
+   pgprot_t prot, unsigned long vm_flags, int node,
+   const void *caller)
+{
+   return __vmalloc(size, gfp_mask);
+}
+
 void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask,
int node, const void *caller)
 {
@@ -180,12 +188,6 @@ void *vmalloc_user(unsigned long size)
 }
 EXPORT_SYMBOL(vmalloc_user);
 
-void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags)
-{
-   return __vmalloc_user_flags(size, flags | __GFP_ZERO);
-}
-EXPORT_SYMBOL(vmalloc_user_node_flags);
-
 struct page *vmalloc_to_page(const void *addr)
 {
return virt_to_page(addr);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 333fbe77255a..f6f2acdaf70c 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2658,26 +2658,6 @@ void *vzalloc_node(unsigned long size, int node)
 }
 EXPORT_SYMBOL(vzalloc_node);
 
-/**
- * vmalloc_user_node_flags - allocate memory for userspace on a specific node
- * @size: allocation size
- * @node: numa node
- * @flags: flags for the page level allocator
- *
- * The resulting memory area is zeroed so it can be mapped to userspace
- * without leaking data.
- *
- * Return: pointer to the allocated memory or %NULL on error
- */
-void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags)
-{
-   return __vmalloc_node_range(size, SHMLBA,  VMALLOC_START, VMALLOC_END,
- 

[PATCH 25/29] mm: switch the test_vmalloc module to use __vmalloc_node

2020-04-14 Thread Christoph Hellwig
No need to export the very low-level __vmalloc_node_range when the
test module can use a slightly higher level variant.

Signed-off-by: Christoph Hellwig 
Acked-by: Peter Zijlstra (Intel) 
---
 lib/test_vmalloc.c | 26 +++---
 mm/vmalloc.c   | 17 -
 2 files changed, 15 insertions(+), 28 deletions(-)

diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c
index 8bbefcaddfe8..cd6aef05dfb4 100644
--- a/lib/test_vmalloc.c
+++ b/lib/test_vmalloc.c
@@ -91,12 +91,8 @@ static int random_size_align_alloc_test(void)
 */
size = ((rnd % 10) + 1) * PAGE_SIZE;
 
-   ptr = __vmalloc_node_range(size, align,
-  VMALLOC_START, VMALLOC_END,
-  GFP_KERNEL | __GFP_ZERO,
-  PAGE_KERNEL,
-  0, 0, __builtin_return_address(0));
-
+   ptr = __vmalloc_node(size, align, GFP_KERNEL | __GFP_ZERO,
+   __builtin_return_address(0));
if (!ptr)
return -1;
 
@@ -118,12 +114,8 @@ static int align_shift_alloc_test(void)
for (i = 0; i < BITS_PER_LONG; i++) {
align = ((unsigned long) 1) << i;
 
-   ptr = __vmalloc_node_range(PAGE_SIZE, align,
-   VMALLOC_START, VMALLOC_END,
-   GFP_KERNEL | __GFP_ZERO,
-   PAGE_KERNEL,
-   0, 0, __builtin_return_address(0));
-
+   ptr = __vmalloc_node(PAGE_SIZE, align, GFP_KERNEL | __GFP_ZERO,
+   __builtin_return_address(0));
if (!ptr)
return -1;
 
@@ -139,13 +131,9 @@ static int fix_align_alloc_test(void)
int i;
 
for (i = 0; i < test_loop_count; i++) {
-   ptr = __vmalloc_node_range(5 * PAGE_SIZE,
-   THREAD_ALIGN << 1,
-   VMALLOC_START, VMALLOC_END,
-   GFP_KERNEL | __GFP_ZERO,
-   PAGE_KERNEL,
-   0, 0, __builtin_return_address(0));
-
+   ptr = __vmalloc_node(5 * PAGE_SIZE, THREAD_ALIGN << 1,
+   GFP_KERNEL | __GFP_ZERO,
+   __builtin_return_address(0));
if (!ptr)
return -1;
 
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ae8249ef5821..333fbe77255a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2522,15 +2522,6 @@ void *__vmalloc_node_range(unsigned long size, unsigned 
long align,
return NULL;
 }
 
-/*
- * This is only for performance analysis of vmalloc and stress purpose.
- * It is required by vmalloc test module, therefore do not use it other
- * than that.
- */
-#ifdef CONFIG_TEST_VMALLOC_MODULE
-EXPORT_SYMBOL_GPL(__vmalloc_node_range);
-#endif
-
 /**
  * __vmalloc_node - allocate virtually contiguous memory
  * @size:  allocation size
@@ -2556,6 +2547,14 @@ void *__vmalloc_node(unsigned long size, unsigned long 
align,
return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
gfp_mask, PAGE_KERNEL, 0, node, caller);
 }
+/*
+ * This is only for performance analysis of vmalloc and stress purpose.
+ * It is required by vmalloc test module, therefore do not use it other
+ * than that.
+ */
+#ifdef CONFIG_TEST_VMALLOC_MODULE
+EXPORT_SYMBOL_GPL(__vmalloc_node);
+#endif
 
 void *__vmalloc(unsigned long size, gfp_t gfp_mask)
 {
-- 
2.25.1



Re: [PATCH 4/8] binfmt_elf: open code copy_siginfo_to_user to kernelspace buffer

2020-04-14 Thread Arnd Bergmann
On Tue, Apr 14, 2020 at 9:02 AM Christoph Hellwig  wrote:
>
> Instead of messing with the address limit just open code the trivial
> memcpy + memset logic for the native version, and a call to
> to_compat_siginfo for the compat version.
>
> Signed-off-by: Christoph Hellwig 

Nice!

>   */
>  #define user_long_tcompat_long_t
>  #define user_siginfo_t compat_siginfo_t
> -#define copy_siginfo_to_user   copy_siginfo_to_user32
> +#define fill_siginfo_note(note, csigdata, siginfo) \
> +do {   \
> +   to_compat_siginfo(csigdata, siginfo, compat_siginfo_flags());   \
> +   fill_note(note, "CORE", NT_SIGINFO, sizeof(*csigdata), csigdata); \
> +} while (0)

I don't think you are changing the behavior here, but I still wonder if it
is in fact correct for x32: is in_x32_syscall() true here when dumping an
x32 compat elf process, or should this rather be set according to which
binfmt_elf copy is being used?

 Arnd
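
For readers without patch 4/8 at hand, the open-coded native path described
above amounts to something like the following sketch; the exact padding
computation is an assumption here, stated in terms of the two siginfo sizes
rather than any particular constant:

/* Sketch: fill the NT_SIGINFO core-dump note from a kernel siginfo
 * without set_fs(), mirroring what copy_siginfo_to_user() does. */
static void fill_siginfo_note(struct memelfnote *note,
		user_siginfo_t *csigdata, kernel_siginfo_t *siginfo)
{
	memcpy(csigdata, siginfo, sizeof(*siginfo));
	/* zero the tail userspace would otherwise see uninitialized */
	memset((char *)csigdata + sizeof(*siginfo), 0,
	       sizeof(*csigdata) - sizeof(*siginfo));
	fill_note(note, "CORE", NT_SIGINFO, sizeof(*csigdata), csigdata);
}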


[PATCH 22/29] mm: remove the prot argument to __vmalloc_node

2020-04-14 Thread Christoph Hellwig
This is always PAGE_KERNEL now.

Signed-off-by: Christoph Hellwig 
Acked-by: Peter Zijlstra (Intel) 
---
 mm/vmalloc.c | 35 ++-
 1 file changed, 14 insertions(+), 21 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 466a449b3a15..de7952959e82 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2401,8 +2401,7 @@ void *vmap(struct page **pages, unsigned int count,
 EXPORT_SYMBOL(vmap);
 
 static void *__vmalloc_node(unsigned long size, unsigned long align,
-   gfp_t gfp_mask, pgprot_t prot,
-   int node, const void *caller);
+   gfp_t gfp_mask, int node, const void *caller);
 static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 pgprot_t prot, int node)
 {
@@ -2420,7 +2419,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, 
gfp_t gfp_mask,
/* Please note that the recursion is strictly bounded. */
if (array_size > PAGE_SIZE) {
pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask,
-   PAGE_KERNEL, node, area->caller);
+   node, area->caller);
} else {
pages = kmalloc_node(array_size, nested_gfp, node);
}
@@ -2539,13 +2538,11 @@ EXPORT_SYMBOL_GPL(__vmalloc_node_range);
  * @size:  allocation size
  * @align: desired alignment
  * @gfp_mask:  flags for the page level allocator
- * @prot:  protection mask for the allocated pages
  * @node:  node to use for allocation or NUMA_NO_NODE
  * @caller:caller's return address
  *
- * Allocate enough pages to cover @size from the page level
- * allocator with @gfp_mask flags.  Map them into contiguous
- * kernel virtual space, using a pagetable protection of @prot.
+ * Allocate enough pages to cover @size from the page level allocator with
+ * @gfp_mask flags.  Map them into contiguous kernel virtual space.
  *
  * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL
  * and __GFP_NOFAIL are not supported
@@ -2556,16 +2553,15 @@ EXPORT_SYMBOL_GPL(__vmalloc_node_range);
  * Return: pointer to the allocated memory or %NULL on error
  */
 static void *__vmalloc_node(unsigned long size, unsigned long align,
-   gfp_t gfp_mask, pgprot_t prot,
-   int node, const void *caller)
+   gfp_t gfp_mask, int node, const void *caller)
 {
return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
-   gfp_mask, prot, 0, node, caller);
+   gfp_mask, PAGE_KERNEL, 0, node, caller);
 }
 
 void *__vmalloc(unsigned long size, gfp_t gfp_mask)
 {
-   return __vmalloc_node(size, 1, gfp_mask, PAGE_KERNEL, NUMA_NO_NODE,
+   return __vmalloc_node(size, 1, gfp_mask, NUMA_NO_NODE,
__builtin_return_address(0));
 }
 EXPORT_SYMBOL(__vmalloc);
@@ -2573,15 +2569,15 @@ EXPORT_SYMBOL(__vmalloc);
 static inline void *__vmalloc_node_flags(unsigned long size,
int node, gfp_t flags)
 {
-   return __vmalloc_node(size, 1, flags, PAGE_KERNEL,
-   node, __builtin_return_address(0));
+   return __vmalloc_node(size, 1, flags, node,
+   __builtin_return_address(0));
 }
 
 
 void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags,
  void *caller)
 {
-   return __vmalloc_node(size, 1, flags, PAGE_KERNEL, node, caller);
+   return __vmalloc_node(size, 1, flags, node, caller);
 }
 
 /**
@@ -2656,8 +2652,8 @@ EXPORT_SYMBOL(vmalloc_user);
  */
 void *vmalloc_node(unsigned long size, int node)
 {
-   return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL,
-   node, __builtin_return_address(0));
+   return __vmalloc_node(size, 1, GFP_KERNEL, node,
+   __builtin_return_address(0));
 }
 EXPORT_SYMBOL(vmalloc_node);
 
@@ -2670,9 +2666,6 @@ EXPORT_SYMBOL(vmalloc_node);
  * allocator and map them into contiguous kernel virtual space.
  * The memory allocated is set to zero.
  *
- * For tight control over page level allocator and protection flags
- * use __vmalloc_node() instead.
- *
  * Return: pointer to the allocated memory or %NULL on error
  */
 void *vzalloc_node(unsigned long size, int node)
@@ -2745,8 +2738,8 @@ void *vmalloc_exec(unsigned long size)
  */
 void *vmalloc_32(unsigned long size)
 {
-   return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL,
- NUMA_NO_NODE, __builtin_return_address(0));
+   return __vmalloc_node(size, 1, GFP_VMALLOC32, NUMA_NO_NODE,
+   __builtin_return_address(0));
 }
 EXPORT_SYMBOL(vmalloc_32);
 
-- 
2.25.1



[PATCH 24/29] mm: remove __vmalloc_node_flags_caller

2020-04-14 Thread Christoph Hellwig
Just use __vmalloc_node instead, which gets an extra argument.  To be
able to use __vmalloc_node in all callers, make it available outside
of vmalloc.c and implement it in nommu.c.

Signed-off-by: Christoph Hellwig 
Acked-by: Peter Zijlstra (Intel) 
---
 include/linux/vmalloc.h |  4 ++--
 kernel/bpf/syscall.c|  5 ++---
 mm/nommu.c  |  4 ++--
 mm/util.c   |  2 +-
 mm/vmalloc.c| 10 +-
 5 files changed, 8 insertions(+), 17 deletions(-)

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 4a46d296e70d..108f49b47756 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -115,8 +115,8 @@ extern void *__vmalloc_node_range(unsigned long size, 
unsigned long align,
unsigned long start, unsigned long end, gfp_t gfp_mask,
pgprot_t prot, unsigned long vm_flags, int node,
const void *caller);
-extern void *__vmalloc_node_flags_caller(unsigned long size,
-int node, gfp_t flags, void *caller);
+void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask,
+   int node, const void *caller);
 
 extern void vfree(const void *addr);
 extern void vfree_atomic(const void *addr);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 64783da34202..48d98ea8fad6 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -299,9 +299,8 @@ static void *__bpf_map_area_alloc(u64 size, int numa_node, 
bool mmapable)
return vmalloc_user_node_flags(size, numa_node, GFP_KERNEL |
   __GFP_RETRY_MAYFAIL | flags);
}
-   return __vmalloc_node_flags_caller(size, numa_node,
-  GFP_KERNEL | __GFP_RETRY_MAYFAIL |
-  flags, __builtin_return_address(0));
+   return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_RETRY_MAYFAIL | flags,
+ numa_node, __builtin_return_address(0));
 }
 
 void *bpf_map_area_alloc(u64 size, int numa_node)
diff --git a/mm/nommu.c b/mm/nommu.c
index 9553efa59787..81a86cd85893 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -150,8 +150,8 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(__vmalloc);
 
-void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags,
-   void *caller)
+void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask,
+   int node, const void *caller)
 {
-   return __vmalloc(size, flags);
+   return __vmalloc(size, gfp_mask);
 }
diff --git a/mm/util.c b/mm/util.c
index 988d11e6c17c..6d5868adbe18 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -580,7 +580,7 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node)
if (ret || size <= PAGE_SIZE)
return ret;
 
-   return __vmalloc_node_flags_caller(size, node, flags,
+   return __vmalloc_node(size, 1, flags, node,
__builtin_return_address(0));
 }
 EXPORT_SYMBOL(kvmalloc_node);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 3d59d848ad48..ae8249ef5821 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2400,8 +2400,6 @@ void *vmap(struct page **pages, unsigned int count,
 }
 EXPORT_SYMBOL(vmap);
 
-static void *__vmalloc_node(unsigned long size, unsigned long align,
-   gfp_t gfp_mask, int node, const void *caller);
 static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 pgprot_t prot, int node)
 {
@@ -2552,7 +2550,7 @@ EXPORT_SYMBOL_GPL(__vmalloc_node_range);
  *
  * Return: pointer to the allocated memory or %NULL on error
  */
-static void *__vmalloc_node(unsigned long size, unsigned long align,
+void *__vmalloc_node(unsigned long size, unsigned long align,
gfp_t gfp_mask, int node, const void *caller)
 {
return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
@@ -2566,12 +2564,6 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(__vmalloc);
 
-void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags,
- void *caller)
-{
-   return __vmalloc_node(size, 1, flags, node, caller);
-}
-
 /**
  * vmalloc - allocate virtually contiguous memory
  * @size:allocation size
-- 
2.25.1



[PATCH 23/29] mm: remove both instances of __vmalloc_node_flags

2020-04-14 Thread Christoph Hellwig
The real version just had a few callers that can open code it and
remove one layer of indirection.  The nommu stub was public but only
had a single caller, so remove it and avoid a CONFIG_MMU ifdef in
vmalloc.h.

Signed-off-by: Christoph Hellwig 
Acked-by: Peter Zijlstra (Intel) 
---
 include/linux/vmalloc.h |  9 -
 mm/nommu.c  |  3 ++-
 mm/vmalloc.c| 20 ++--
 3 files changed, 8 insertions(+), 24 deletions(-)

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index c1b9d6eca05f..4a46d296e70d 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -115,17 +115,8 @@ extern void *__vmalloc_node_range(unsigned long size, 
unsigned long align,
unsigned long start, unsigned long end, gfp_t gfp_mask,
pgprot_t prot, unsigned long vm_flags, int node,
const void *caller);
-#ifndef CONFIG_MMU
-extern void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags);
-static inline void *__vmalloc_node_flags_caller(unsigned long size, int node,
-   gfp_t flags, void *caller)
-{
-   return __vmalloc_node_flags(size, node, flags);
-}
-#else
 extern void *__vmalloc_node_flags_caller(unsigned long size,
 int node, gfp_t flags, void *caller);
-#endif
 
 extern void vfree(const void *addr);
 extern void vfree_atomic(const void *addr);
diff --git a/mm/nommu.c b/mm/nommu.c
index 2df549adb22b..9553efa59787 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -150,7 +150,8 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(__vmalloc);
 
-void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags)
+void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags,
+   void *caller)
 {
return __vmalloc(size, flags);
 }
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index de7952959e82..3d59d848ad48 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2566,14 +2566,6 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(__vmalloc);
 
-static inline void *__vmalloc_node_flags(unsigned long size,
-   int node, gfp_t flags)
-{
-   return __vmalloc_node(size, 1, flags, node,
-   __builtin_return_address(0));
-}
-
-
 void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags,
  void *caller)
 {
@@ -2594,8 +2586,8 @@ void *__vmalloc_node_flags_caller(unsigned long size, int 
node, gfp_t flags,
  */
 void *vmalloc(unsigned long size)
 {
-   return __vmalloc_node_flags(size, NUMA_NO_NODE,
-   GFP_KERNEL);
+   return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE,
+   __builtin_return_address(0));
 }
 EXPORT_SYMBOL(vmalloc);
 
@@ -2614,8 +2606,8 @@ EXPORT_SYMBOL(vmalloc);
  */
 void *vzalloc(unsigned long size)
 {
-   return __vmalloc_node_flags(size, NUMA_NO_NODE,
-   GFP_KERNEL | __GFP_ZERO);
+   return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE,
+   __builtin_return_address(0));
 }
 EXPORT_SYMBOL(vzalloc);
 
@@ -2670,8 +2662,8 @@ EXPORT_SYMBOL(vmalloc_node);
  */
 void *vzalloc_node(unsigned long size, int node)
 {
-   return __vmalloc_node_flags(size, node,
-GFP_KERNEL | __GFP_ZERO);
+   return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, node,
+   __builtin_return_address(0));
 }
 EXPORT_SYMBOL(vzalloc_node);
 
-- 
2.25.1



[PATCH 21/29] mm: remove the pgprot argument to __vmalloc

2020-04-14 Thread Christoph Hellwig
The pgprot argument to __vmalloc is always PAGE_KERNEL now, so remove
it.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Michael Kelley  [hyperv]
Acked-by: Gao Xiang  [erofs]
Acked-by: Peter Zijlstra (Intel) 
---
 arch/x86/hyperv/hv_init.c  |  3 +--
 arch/x86/include/asm/kvm_host.h|  3 +--
 arch/x86/kvm/svm/sev.c |  3 +--
 drivers/block/drbd/drbd_bitmap.c   |  4 +---
 drivers/gpu/drm/etnaviv/etnaviv_dump.c |  4 ++--
 drivers/lightnvm/pblk-init.c   |  5 ++---
 drivers/md/dm-bufio.c  |  4 ++--
 drivers/mtd/ubi/io.c   |  4 ++--
 drivers/scsi/sd_zbc.c  |  3 +--
 fs/gfs2/dir.c  |  9 -
 fs/gfs2/quota.c|  2 +-
 fs/nfs/blocklayout/extent_tree.c   |  2 +-
 fs/ntfs/malloc.h   |  2 +-
 fs/ubifs/debug.c   |  2 +-
 fs/ubifs/lprops.c  |  2 +-
 fs/ubifs/lpt_commit.c  |  4 ++--
 fs/ubifs/orphan.c  |  2 +-
 fs/xfs/kmem.c  |  2 +-
 include/linux/vmalloc.h|  2 +-
 kernel/bpf/core.c  |  6 +++---
 kernel/groups.c|  2 +-
 kernel/module.c|  3 +--
 mm/nommu.c | 15 +++
 mm/page_alloc.c|  2 +-
 mm/percpu.c|  2 +-
 mm/vmalloc.c   |  4 ++--
 net/bridge/netfilter/ebtables.c|  6 ++
 sound/core/memalloc.c  |  2 +-
 sound/core/pcm_memory.c|  2 +-
 29 files changed, 47 insertions(+), 59 deletions(-)

diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 5a4b363ba67b..a3d689dfc745 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -95,8 +95,7 @@ static int hv_cpu_init(unsigned int cpu)
 * not be stopped in the case of CPU offlining and the VM will hang.
 */
if (!*hvp) {
-   *hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO,
-PAGE_KERNEL);
+   *hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO);
}
 
if (*hvp) {
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 42a2d0d3984a..71bc09bff01a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1280,8 +1280,7 @@ extern struct kmem_cache *x86_fpu_cache;
 #define __KVM_HAVE_ARCH_VM_ALLOC
 static inline struct kvm *kvm_arch_alloc_vm(void)
 {
-   return __vmalloc(kvm_x86_ops.vm_size,
-GFP_KERNEL_ACCOUNT | __GFP_ZERO, PAGE_KERNEL);
+   return __vmalloc(kvm_x86_ops.vm_size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
 }
 void kvm_arch_free_vm(struct kvm *kvm);
 
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 0e3fc311d7da..b699e40573ad 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -335,8 +335,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, 
unsigned long uaddr,
/* Avoid using vmalloc for smaller buffers. */
size = npages * sizeof(struct page *);
if (size > PAGE_SIZE)
-   pages = __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO,
- PAGE_KERNEL);
+   pages = __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
else
pages = kmalloc(size, GFP_KERNEL_ACCOUNT);
 
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index 15e99697234a..df53dca5d02c 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -396,9 +396,7 @@ static struct page **bm_realloc_pages(struct drbd_bitmap 
*b, unsigned long want)
bytes = sizeof(struct page *)*want;
new_pages = kzalloc(bytes, GFP_NOIO | __GFP_NOWARN);
if (!new_pages) {
-   new_pages = __vmalloc(bytes,
-   GFP_NOIO | __GFP_ZERO,
-   PAGE_KERNEL);
+   new_pages = __vmalloc(bytes, GFP_NOIO | __GFP_ZERO);
if (!new_pages)
return NULL;
}
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_dump.c 
b/drivers/gpu/drm/etnaviv/etnaviv_dump.c
index 648cf0207309..706af0304ca4 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_dump.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_dump.c
@@ -154,8 +154,8 @@ void etnaviv_core_dump(struct etnaviv_gem_submit *submit)
file_size += sizeof(*iter.hdr) * n_obj;
 
/* Allocate the file in vmalloc memory, it's likely to be big */
-   iter.start = __vmalloc(file_size, GFP_KERNEL | __GFP_NOWARN | 
__GFP_NORETRY,
-  PAGE_KERNEL);
+   iter.start = __vmalloc(file_size, GFP_KERNEL | __GFP_NOWARN |
+   __GFP_NORETRY);
if (!iter.start) {
mutex_unlock(>mmu_context->lock);

[PATCH 20/29] gpu/drm: remove the powerpc hack in drm_legacy_sg_alloc

2020-04-14 Thread Christoph Hellwig
The non-cached vmalloc mapping was initially added as a hack for the
first-gen amigaone platform (6xx/book3s32), which isn't fully supported
upstream and used the legacy radeon driver together with
non-coherent DMA. However, this only ever worked reliably for DRI.

Remove the hack as it is the last user of __vmalloc passing a page
protection flag other than PAGE_KERNEL and didn't do anything for
other platforms with non-coherent DMA.

Signed-off-by: Christoph Hellwig 
Acked-by: Daniel Vetter 
Acked-by: Peter Zijlstra (Intel) 
---
 drivers/gpu/drm/drm_scatter.c | 11 +--
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/drm_scatter.c b/drivers/gpu/drm/drm_scatter.c
index ca520028b2cb..f4e6184d1877 100644
--- a/drivers/gpu/drm/drm_scatter.c
+++ b/drivers/gpu/drm/drm_scatter.c
@@ -43,15 +43,6 @@
 
 #define DEBUG_SCATTER 0
 
-static inline void *drm_vmalloc_dma(unsigned long size)
-{
-#if defined(__powerpc__) && defined(CONFIG_NOT_COHERENT_CACHE)
-   return __vmalloc(size, GFP_KERNEL, pgprot_noncached_wc(PAGE_KERNEL));
-#else
-   return vmalloc_32(size);
-#endif
-}
-
 static void drm_sg_cleanup(struct drm_sg_mem * entry)
 {
struct page *page;
@@ -126,7 +117,7 @@ int drm_legacy_sg_alloc(struct drm_device *dev, void *data,
return -ENOMEM;
}
 
-   entry->virtual = drm_vmalloc_dma(pages << PAGE_SHIFT);
+   entry->virtual = vmalloc_32(pages << PAGE_SHIFT);
if (!entry->virtual) {
kfree(entry->busaddr);
kfree(entry->pagelist);
-- 
2.25.1



[PATCH 19/29] mm: enforce that vmap can't map pages executable

2020-04-14 Thread Christoph Hellwig
To help enforce the W^X protection, don't allow remapping existing
pages as executable.

x86 bits from Peter Zijlstra ,
arm64 bits from Mark Rutland .

Signed-off-by: Christoph Hellwig 
Acked-by: Peter Zijlstra (Intel) 
---
 arch/arm64/include/asm/pgtable.h | 3 +++
 arch/x86/include/asm/pgtable_types.h | 6 ++
 include/asm-generic/pgtable.h| 4 
 mm/vmalloc.c | 2 +-
 4 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 538c85e62f86..47095216d6a8 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -407,6 +407,9 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd)
 #define __pgprot_modify(prot,mask,bits) \
__pgprot((pgprot_val(prot) & ~(mask)) | (bits))
 
+#define pgprot_nx(prot) \
+   __pgprot_modify(prot, 0, PTE_PXN)
+
 /*
  * Mark the prot value as uncacheable and unbufferable.
  */
diff --git a/arch/x86/include/asm/pgtable_types.h 
b/arch/x86/include/asm/pgtable_types.h
index 947867f112ea..2e7c442cc618 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -282,6 +282,12 @@ typedef struct pgprot { pgprotval_t pgprot; } pgprot_t;
 
 typedef struct { pgdval_t pgd; } pgd_t;
 
+static inline pgprot_t pgprot_nx(pgprot_t prot)
+{
+   return __pgprot(pgprot_val(prot) | _PAGE_NX);
+}
+#define pgprot_nx pgprot_nx
+
 #ifdef CONFIG_X86_PAE
 
 /*
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 329b8c8ca703..8c5f9c29698b 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -491,6 +491,10 @@ static inline int arch_unmap_one(struct mm_struct *mm,
 #define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address)
 #endif
 
+#ifndef pgprot_nx
+#define pgprot_nx(prot)(prot)
+#endif
+
 #ifndef pgprot_noncached
 #define pgprot_noncached(prot) (prot)
 #endif
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 7356b3f07bd8..334c75251ddb 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2390,7 +2390,7 @@ void *vmap(struct page **pages, unsigned int count,
if (!area)
return NULL;
 
-   if (map_kernel_range((unsigned long)area->addr, size, prot,
+   if (map_kernel_range((unsigned long)area->addr, size, pgprot_nx(prot),
pages) < 0) {
vunmap(area->addr);
return NULL;
-- 
2.25.1
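
As a hypothetical example of what the enforcement means for a vmap()
caller (not code from the patch):

/* Before this patch a driver could create an executable alias: */
void *va = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL_EXEC);

/* After it, vmap() applies pgprot_nx() to the passed protection, so
 * x86 sets _PAGE_NX and arm64 sets PTE_PXN: the alias stays readable
 * and writable but is no longer executable.  Callers that genuinely
 * need executable mappings (e.g. module loading) have to go through
 * __vmalloc_node_range() instead. */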



[PATCH 18/29] mm: remove the prot argument from vm_map_ram

2020-04-14 Thread Christoph Hellwig
This is always PAGE_KERNEL; for long-term mappings with other
properties, vmap should be used.

Signed-off-by: Christoph Hellwig 
Acked-by: Peter Zijlstra (Intel) 
---
 drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c   | 2 +-
 drivers/media/common/videobuf2/videobuf2-dma-sg.c  | 3 +--
 drivers/media/common/videobuf2/videobuf2-vmalloc.c | 3 +--
 fs/erofs/decompressor.c| 2 +-
 fs/xfs/xfs_buf.c   | 2 +-
 include/linux/vmalloc.h| 3 +--
 mm/nommu.c | 2 +-
 mm/vmalloc.c   | 4 ++--
 8 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c 
b/drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c
index 9272bef57092..debaf7b18ab5 100644
--- a/drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c
+++ b/drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c
@@ -66,7 +66,7 @@ static void *mock_dmabuf_vmap(struct dma_buf *dma_buf)
 {
struct mock_dmabuf *mock = to_mock(dma_buf);
 
-   return vm_map_ram(mock->pages, mock->npages, 0, PAGE_KERNEL);
+   return vm_map_ram(mock->pages, mock->npages, 0);
 }
 
 static void mock_dmabuf_vunmap(struct dma_buf *dma_buf, void *vaddr)
diff --git a/drivers/media/common/videobuf2/videobuf2-dma-sg.c 
b/drivers/media/common/videobuf2/videobuf2-dma-sg.c
index 6db60e9d5183..92072a08af25 100644
--- a/drivers/media/common/videobuf2/videobuf2-dma-sg.c
+++ b/drivers/media/common/videobuf2/videobuf2-dma-sg.c
@@ -309,8 +309,7 @@ static void *vb2_dma_sg_vaddr(void *buf_priv)
if (buf->db_attach)
buf->vaddr = dma_buf_vmap(buf->db_attach->dmabuf);
else
-   buf->vaddr = vm_map_ram(buf->pages,
-   buf->num_pages, -1, PAGE_KERNEL);
+   buf->vaddr = vm_map_ram(buf->pages, buf->num_pages, -1);
}
 
/* add offset in case userptr is not page-aligned */
diff --git a/drivers/media/common/videobuf2/videobuf2-vmalloc.c 
b/drivers/media/common/videobuf2/videobuf2-vmalloc.c
index 1a4f0ca87c7c..c66fda4a65e4 100644
--- a/drivers/media/common/videobuf2/videobuf2-vmalloc.c
+++ b/drivers/media/common/videobuf2/videobuf2-vmalloc.c
@@ -107,8 +107,7 @@ static void *vb2_vmalloc_get_userptr(struct device *dev, 
unsigned long vaddr,
buf->vaddr = (__force void *)
ioremap(__pfn_to_phys(nums[0]), size + offset);
} else {
-   buf->vaddr = vm_map_ram(frame_vector_pages(vec), n_pages, -1,
-   PAGE_KERNEL);
+   buf->vaddr = vm_map_ram(frame_vector_pages(vec), n_pages, -1);
}
 
if (!buf->vaddr)
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index 5d2d81940679..7628816f2453 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -274,7 +274,7 @@ static int z_erofs_decompress_generic(struct 
z_erofs_decompress_req *rq,
 
i = 0;
while (1) {
-   dst = vm_map_ram(rq->out, nrpages_out, -1, PAGE_KERNEL);
+   dst = vm_map_ram(rq->out, nrpages_out, -1);
 
/* retry two more times (totally 3 times) */
if (dst || ++i >= 3)
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 9ec3eaf1c618..65538d18e64f 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -477,7 +477,7 @@ _xfs_buf_map_pages(
nofs_flag = memalloc_nofs_save();
do {
bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
-   -1, PAGE_KERNEL);
+   -1);
if (bp->b_addr)
break;
vm_unmap_aliases();
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 15ffbd8e8e65..9273b1a91ca5 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -88,8 +88,7 @@ struct vmap_area {
  * Highlevel APIs for driver use
  */
 extern void vm_unmap_ram(const void *mem, unsigned int count);
-extern void *vm_map_ram(struct page **pages, unsigned int count,
-   int node, pgprot_t prot);
+extern void *vm_map_ram(struct page **pages, unsigned int count, int node);
 extern void vm_unmap_aliases(void);
 
 #ifdef CONFIG_MMU
diff --git a/mm/nommu.c b/mm/nommu.c
index 318df4e236c9..4f07b7ef0297 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -351,7 +351,7 @@ void vunmap(const void *addr)
 }
 EXPORT_SYMBOL(vunmap);
 
-void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t 
prot)
+void *vm_map_ram(struct page **pages, unsigned int count, int node)
 {
BUG();
return NULL;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 258220b203f1..7356b3f07bd8 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1834,7 +1834,7 @@ 
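
A usage sketch of the resulting split between the two interfaces
(illustrative, not from the patch):

/* Short-lived mapping with the default protection: vm_map_ram(),
 * now implicitly PAGE_KERNEL. */
void *va = vm_map_ram(pages, nr_pages, -1);
/* ... use it ... */
vm_unmap_ram(va, nr_pages);

/* Long-term mapping with special properties: use vmap() and pass
 * the protection explicitly, e.g. write-combining. */
void *wc = vmap(pages, nr_pages, VM_MAP,
		pgprot_writecombine(PAGE_KERNEL));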

[PATCH 17/29] mm: remove unmap_vmap_area

2020-04-14 Thread Christoph Hellwig
This function has just a single caller; open code it there.

Signed-off-by: Christoph Hellwig 
Acked-by: Peter Zijlstra (Intel) 
---
 mm/vmalloc.c | 10 +-
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index b0c7cdc8701a..258220b203f1 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1247,14 +1247,6 @@ int unregister_vmap_purge_notifier(struct notifier_block 
*nb)
 }
 EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);
 
-/*
- * Clear the pagetable entries of a given vmap_area
- */
-static void unmap_vmap_area(struct vmap_area *va)
-{
-   unmap_kernel_range_noflush(va->va_start, va->va_end - va->va_start);
-}
-
 /*
  * lazy_max_pages is the maximum amount of virtual address space we gather up
  * before attempting to purge with a TLB flush.
@@ -1416,7 +1408,7 @@ static void free_vmap_area_noflush(struct vmap_area *va)
 static void free_unmap_vmap_area(struct vmap_area *va)
 {
flush_cache_vunmap(va->va_start, va->va_end);
-   unmap_vmap_area(va);
+   unmap_kernel_range_noflush(va->va_start, va->va_end - va->va_start);
if (debug_pagealloc_enabled_static())
flush_tlb_kernel_range(va->va_start, va->va_end);
 
-- 
2.25.1



[PATCH 16/29] mm: remove map_vm_range

2020-04-14 Thread Christoph Hellwig
Switch all callers to map_kernel_range, which is symmetric to the unmap
side (as well as the _noflush versions).

Signed-off-by: Christoph Hellwig 
Acked-by: Peter Zijlstra (Intel) 
---
 Documentation/core-api/cachetlb.rst |  2 +-
 include/linux/vmalloc.h | 10 --
 mm/vmalloc.c| 21 +++--
 mm/zsmalloc.c   |  4 +++-
 net/ceph/ceph_common.c  |  3 +--
 5 files changed, 16 insertions(+), 24 deletions(-)

diff --git a/Documentation/core-api/cachetlb.rst 
b/Documentation/core-api/cachetlb.rst
index 93cb65d52720..a1582cc79f0f 100644
--- a/Documentation/core-api/cachetlb.rst
+++ b/Documentation/core-api/cachetlb.rst
@@ -213,7 +213,7 @@ Here are the routines, one by one:
there will be no entries in the cache for the kernel address
space for virtual addresses in the range 'start' to 'end-1'.
 
-   The first of these two routines is invoked after map_vm_area()
+   The first of these two routines is invoked after map_kernel_range()
has installed the page table entries.  The second is invoked
before unmap_kernel_range() deletes the page table entries.
 
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 3070b4dbc2d9..15ffbd8e8e65 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -168,11 +168,11 @@ extern struct vm_struct *__get_vm_area_caller(unsigned 
long size,
 extern struct vm_struct *remove_vm_area(const void *addr);
 extern struct vm_struct *find_vm_area(const void *addr);
 
-extern int map_vm_area(struct vm_struct *area, pgprot_t prot,
-   struct page **pages);
 #ifdef CONFIG_MMU
 extern int map_kernel_range_noflush(unsigned long start, unsigned long size,
pgprot_t prot, struct page **pages);
+int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot,
+   struct page **pages);
 extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size);
 extern void unmap_kernel_range(unsigned long addr, unsigned long size);
 static inline void set_vm_flush_reset_perms(void *addr)
@@ -189,14 +189,12 @@ map_kernel_range_noflush(unsigned long start, unsigned 
long size,
 {
return size >> PAGE_SHIFT;
 }
+#define map_kernel_range map_kernel_range_noflush
 static inline void
 unmap_kernel_range_noflush(unsigned long addr, unsigned long size)
 {
 }
-static inline void
-unmap_kernel_range(unsigned long addr, unsigned long size)
-{
-}
+#define unmap_kernel_range unmap_kernel_range_noflush
 static inline void set_vm_flush_reset_perms(void *addr)
 {
 }
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ca8dc5d42580..b0c7cdc8701a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -272,8 +272,8 @@ int map_kernel_range_noflush(unsigned long addr, unsigned 
long size,
return 0;
 }
 
-static int map_kernel_range(unsigned long start, unsigned long size,
-  pgprot_t prot, struct page **pages)
+int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot,
+   struct page **pages)
 {
int ret;
 
@@ -2027,16 +2027,6 @@ void unmap_kernel_range(unsigned long addr, unsigned 
long size)
flush_tlb_kernel_range(addr, end);
 }
 
-int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages)
-{
-   unsigned long addr = (unsigned long)area->addr;
-   int err;
-
-   err = map_kernel_range(addr, get_vm_area_size(area), prot, pages);
-
-   return err > 0 ? 0 : err;
-}
-
 static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
struct vmap_area *va, unsigned long flags, const void *caller)
 {
@@ -2408,7 +2398,8 @@ void *vmap(struct page **pages, unsigned int count,
if (!area)
return NULL;
 
-   if (map_vm_area(area, prot, pages)) {
+   if (map_kernel_range((unsigned long)area->addr, size, prot,
+   pages) < 0) {
vunmap(area->addr);
return NULL;
}
@@ -2471,8 +2462,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, 
gfp_t gfp_mask,
}
atomic_long_add(area->nr_pages, _vmalloc_pages);
 
-   if (map_vm_area(area, prot, pages))
+   if (map_kernel_range((unsigned long)area->addr, get_vm_area_size(area),
+   prot, pages) < 0)
goto fail;
+
return area->addr;
 
 fail:
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index ac0524330b9b..f6dc0673e62c 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1138,7 +1138,9 @@ static inline void __zs_cpu_down(struct mapping_area 
*area)
 static inline void *__zs_map_object(struct mapping_area *area,
struct page *pages[2], int off, int size)
 {
-   BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, pages));
+   unsigned long addr = (unsigned long)area->vm->addr;
+
+   BUG_ON(map_kernel_range(addr, PAGE_SIZE * 2, PAGE_KERNEL, pages) < 0);
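
With this, the call sites read symmetrically; a sketch of the pairing
(error handling assumed):

if (map_kernel_range(addr, size, PAGE_KERNEL, pages) < 0)
	goto fail;
/* ... */
unmap_kernel_range(addr, size);

/* The _noflush variants pair the same way for callers that do their
 * own cache/TLB flushing. */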

[PATCH 15/29] mm: don't return the number of pages from map_kernel_range{, _noflush}

2020-04-14 Thread Christoph Hellwig
None of the callers needs the number of pages, and a 0 / -errno return
value is a lot more intuitive.

Signed-off-by: Christoph Hellwig 
Acked-by: Peter Zijlstra (Intel) 
---
 mm/vmalloc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index a3d810def567..ca8dc5d42580 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -249,7 +249,7 @@ static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
  * function.
  *
  * RETURNS:
- * The number of pages mapped on success, -errno on failure.
+ * 0 on success, -errno on failure.
  */
 int map_kernel_range_noflush(unsigned long addr, unsigned long size,
 pgprot_t prot, struct page **pages)
@@ -269,7 +269,7 @@ int map_kernel_range_noflush(unsigned long addr, unsigned 
long size,
return err;
} while (pgd++, addr = next, addr != end);
 
-   return nr;
+   return 0;
 }
 
 static int map_kernel_range(unsigned long start, unsigned long size,
-- 
2.25.1
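
The effect on callers, sketched; the "before" pattern is the one
map_vm_area() used, visible in the map_vm_range removal patch:

/* before: a positive page count meant success */
err = map_kernel_range(addr, size, prot, pages);
return err > 0 ? 0 : err;

/* after: conventional 0 / -errno handling */
err = map_kernel_range(addr, size, prot, pages);
if (err)
	return err;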



[PATCH 13/29] mm: remove vmap_page_range_noflush and vunmap_page_range

2020-04-14 Thread Christoph Hellwig
These have non-static aliases called map_kernel_range_noflush and
unmap_kernel_range_noflush that differ only slightly in their calling
conventions: they pass addr + size instead of an end.

Signed-off-by: Christoph Hellwig 
Acked-by: Peter Zijlstra (Intel) 
---
 mm/vmalloc.c | 98 +---
 1 file changed, 40 insertions(+), 58 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index aada9e9144bd..55df5dc6a9fc 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -127,10 +127,24 @@ static void vunmap_p4d_range(pgd_t *pgd, unsigned long 
addr, unsigned long end)
} while (p4d++, addr = next, addr != end);
 }
 
-static void vunmap_page_range(unsigned long addr, unsigned long end)
+/**
+ * unmap_kernel_range_noflush - unmap kernel VM area
+ * @addr: start of the VM area to unmap
+ * @size: size of the VM area to unmap
+ *
+ * Unmap PFN_UP(@size) pages at @addr.  The VM area @addr and @size specify
+ * should have been allocated using get_vm_area() and its friends.
+ *
+ * NOTE:
+ * This function does NOT do any cache flushing.  The caller is responsible
+ * for calling flush_cache_vunmap() on to-be-unmapped areas before calling this
+ * function and flush_tlb_kernel_range() after.
+ */
+void unmap_kernel_range_noflush(unsigned long addr, unsigned long size)
 {
-   pgd_t *pgd;
+   unsigned long end = addr + size;
unsigned long next;
+   pgd_t *pgd;
 
BUG_ON(addr >= end);
pgd = pgd_offset_k(addr);
@@ -219,18 +233,30 @@ static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
return 0;
 }
 
-/*
- * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and
- * will have pfns corresponding to the "pages" array.
+/**
+ * map_kernel_range_noflush - map kernel VM area with the specified pages
+ * @addr: start of the VM area to map
+ * @size: size of the VM area to map
+ * @prot: page protection flags to use
+ * @pages: pages to map
  *
- * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N]
+ * Map PFN_UP(@size) pages at @addr.  The VM area @addr and @size specify 
should
+ * have been allocated using get_vm_area() and its friends.
+ *
+ * NOTE:
+ * This function does NOT do any cache flushing.  The caller is responsible for
+ * calling flush_cache_vmap() on to-be-mapped areas before calling this
+ * function.
+ *
+ * RETURNS:
+ * The number of pages mapped on success, -errno on failure.
  */
-static int vmap_page_range_noflush(unsigned long start, unsigned long end,
-  pgprot_t prot, struct page **pages)
+int map_kernel_range_noflush(unsigned long addr, unsigned long size,
+pgprot_t prot, struct page **pages)
 {
-   pgd_t *pgd;
+   unsigned long end = addr + size;
unsigned long next;
-   unsigned long addr = start;
+   pgd_t *pgd;
int err = 0;
int nr = 0;
 
@@ -251,7 +277,7 @@ static int vmap_page_range(unsigned long start, unsigned 
long end,
 {
int ret;
 
-   ret = vmap_page_range_noflush(start, end, prot, pages);
+   ret = map_kernel_range_noflush(start, end - start, prot, pages);
flush_cache_vmap(start, end);
return ret;
 }
@@ -1226,7 +1252,7 @@ EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);
  */
 static void unmap_vmap_area(struct vmap_area *va)
 {
-   vunmap_page_range(va->va_start, va->va_end);
+   unmap_kernel_range_noflush(va->va_start, va->va_end - va->va_start);
 }
 
 /*
@@ -1686,7 +1712,7 @@ static void vb_free(unsigned long addr, unsigned long 
size)
rcu_read_unlock();
BUG_ON(!vb);
 
-   vunmap_page_range(addr, addr + size);
+   unmap_kernel_range_noflush(addr, size);
 
if (debug_pagealloc_enabled_static())
flush_tlb_kernel_range(addr, addr + size);
@@ -1984,50 +2010,6 @@ void __init vmalloc_init(void)
vmap_initialized = true;
 }
 
-/**
- * map_kernel_range_noflush - map kernel VM area with the specified pages
- * @addr: start of the VM area to map
- * @size: size of the VM area to map
- * @prot: page protection flags to use
- * @pages: pages to map
- *
- * Map PFN_UP(@size) pages at @addr.  The VM area @addr and @size
- * specify should have been allocated using get_vm_area() and its
- * friends.
- *
- * NOTE:
- * This function does NOT do any cache flushing.  The caller is
- * responsible for calling flush_cache_vmap() on to-be-mapped areas
- * before calling this function.
- *
- * RETURNS:
- * The number of pages mapped on success, -errno on failure.
- */
-int map_kernel_range_noflush(unsigned long addr, unsigned long size,
-pgprot_t prot, struct page **pages)
-{
-   return vmap_page_range_noflush(addr, addr + size, prot, pages);
-}
-
-/**
- * unmap_kernel_range_noflush - unmap kernel VM area
- * @addr: start of the VM area to unmap
- * @size: size of the VM area to unmap
- *
- * Unmap PFN_UP(@size) pages at @addr.  The VM area @addr and @size
- * 
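
The flushing contract spelled out in the kernel-doc above, as a caller
honors it; free_unmap_vmap_area() in this patch follows exactly this
pattern (modulo the debug_pagealloc condition on the TLB flush):

flush_cache_vunmap(va_start, va_end);
unmap_kernel_range_noflush(va_start, va_end - va_start);
flush_tlb_kernel_range(va_start, va_end);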

[PATCH 14/29] mm: rename vmap_page_range to map_kernel_range

2020-04-14 Thread Christoph Hellwig
This matches the map_kernel_range_noflush API.  Also change it to pass
a size instead of an end, similar to the noflush version.

Signed-off-by: Christoph Hellwig 
Acked-by: Peter Zijlstra (Intel) 
---
 mm/vmalloc.c | 11 +--
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 55df5dc6a9fc..a3d810def567 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -272,13 +272,13 @@ int map_kernel_range_noflush(unsigned long addr, unsigned 
long size,
return nr;
 }
 
-static int vmap_page_range(unsigned long start, unsigned long end,
+static int map_kernel_range(unsigned long start, unsigned long size,
   pgprot_t prot, struct page **pages)
 {
int ret;
 
-   ret = map_kernel_range_noflush(start, end - start, prot, pages);
-   flush_cache_vmap(start, end);
+   ret = map_kernel_range_noflush(start, size, prot, pages);
+   flush_cache_vmap(start, start + size);
return ret;
 }
 
@@ -1866,7 +1866,7 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot
 
kasan_unpoison_vmalloc(mem, size);
 
-   if (vmap_page_range(addr, addr + size, prot, pages) < 0) {
+   if (map_kernel_range(addr, size, prot, pages) < 0) {
vm_unmap_ram(mem, count);
return NULL;
}
@@ -2030,10 +2030,9 @@ void unmap_kernel_range(unsigned long addr, unsigned long size)
 int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages)
 {
unsigned long addr = (unsigned long)area->addr;
-   unsigned long end = addr + get_vm_area_size(area);
int err;
 
-   err = vmap_page_range(addr, end, prot, pages);
+   err = map_kernel_range(addr, get_vm_area_size(area), prot, pages);
 
return err > 0 ? 0 : err;
 }
-- 
2.25.1



[PATCH 12/29] mm: pass addr as unsigned long to vb_free

2020-04-14 Thread Christoph Hellwig
Every use of addr in vb_free casts to unsigned long first, and the caller
has an unsigned long version of the address available anyway.  Just pass
that and avoid all the casts.

Signed-off-by: Christoph Hellwig 
Acked-by: Peter Zijlstra (Intel) 
---
 mm/vmalloc.c | 16 +++-
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 9183fc0d365a..aada9e9144bd 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1664,7 +1664,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
return vaddr;
 }
 
-static void vb_free(const void *addr, unsigned long size)
+static void vb_free(unsigned long addr, unsigned long size)
 {
unsigned long offset;
unsigned long vb_idx;
@@ -1674,24 +1674,22 @@ static void vb_free(const void *addr, unsigned long size)
BUG_ON(offset_in_page(size));
BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
 
-   flush_cache_vunmap((unsigned long)addr, (unsigned long)addr + size);
+   flush_cache_vunmap(addr, addr + size);
 
order = get_order(size);
 
-   offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1);
-   offset >>= PAGE_SHIFT;
+   offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
 
-   vb_idx = addr_to_vb_idx((unsigned long)addr);
+   vb_idx = addr_to_vb_idx(addr);
rcu_read_lock();
vb = radix_tree_lookup(&vmap_block_tree, vb_idx);
rcu_read_unlock();
BUG_ON(!vb);
 
-   vunmap_page_range((unsigned long)addr, (unsigned long)addr + size);
+   vunmap_page_range(addr, addr + size);
 
if (debug_pagealloc_enabled_static())
-   flush_tlb_kernel_range((unsigned long)addr,
-   (unsigned long)addr + size);
+   flush_tlb_kernel_range(addr, addr + size);
 
spin_lock(&vb->lock);
 
@@ -1791,7 +1789,7 @@ void vm_unmap_ram(const void *mem, unsigned int count)
 
if (likely(count <= VMAP_MAX_ALLOC)) {
debug_check_no_locks_freed(mem, size);
-   vb_free(mem, size);
+   vb_free(addr, size);
return;
}
 
-- 
2.25.1



[PATCH 11/29] mm: only allow page table mappings for built-in zsmalloc

2020-04-14 Thread Christoph Hellwig
This allows unexporting map_vm_area and unmap_kernel_range, which are
rather deep internals and should not be available to modules: they
allow, for example, fine-grained control of mapping permissions, and
also allow splitting the setup of a vmalloc area from the actual
mapping, and thus expose vmalloc internals.

zsmalloc is typically built-in and continues to work (just like the
percpu-vm code, which uses a similar pattern), while modular zsmalloc
also continues to work, but must use copies.

Signed-off-by: Christoph Hellwig 
Acked-by: Peter Zijlstra (Intel) 
---
 mm/Kconfig   | 2 +-
 mm/vmalloc.c | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/mm/Kconfig b/mm/Kconfig
index 09a9edfb8461..5c0362bd8d56 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -707,7 +707,7 @@ config ZSMALLOC
 
 config ZSMALLOC_PGTABLE_MAPPING
bool "Use page table mapping to access object in zsmalloc"
-   depends on ZSMALLOC
+   depends on ZSMALLOC=y
help
  By default, zsmalloc uses a copy-based object mapping method to
  access allocations that span two pages. However, if a particular
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 3375f9508ef6..9183fc0d365a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2046,7 +2046,6 @@ void unmap_kernel_range(unsigned long addr, unsigned long size)
vunmap_page_range(addr, end);
flush_tlb_kernel_range(addr, end);
 }
-EXPORT_SYMBOL_GPL(unmap_kernel_range);
 
 int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages)
 {
@@ -2058,7 +2057,6 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages)
 
return err > 0 ? 0 : err;
 }
-EXPORT_SYMBOL_GPL(map_vm_area);
 
 static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
struct vmap_area *va, unsigned long flags, const void *caller)
-- 
2.25.1



[PATCH 09/29] mm: unexport unmap_kernel_range_noflush

2020-04-14 Thread Christoph Hellwig
There are no modular users of this function.

Signed-off-by: Christoph Hellwig 
Acked-by: Peter Zijlstra (Intel) 
---
 mm/vmalloc.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d1534d610b48..3375f9508ef6 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2029,7 +2029,6 @@ void unmap_kernel_range_noflush(unsigned long addr, unsigned long size)
 {
vunmap_page_range(addr, addr + size);
 }
-EXPORT_SYMBOL_GPL(unmap_kernel_range_noflush);
 
 /**
  * unmap_kernel_range - unmap kernel VM area and flush cache and TLB
-- 
2.25.1



[PATCH 10/29] mm: rename CONFIG_PGTABLE_MAPPING to CONFIG_ZSMALLOC_PGTABLE_MAPPING

2020-04-14 Thread Christoph Hellwig
Rename the Kconfig variable to clarify the scope.

Signed-off-by: Christoph Hellwig 
Acked-by: Minchan Kim 
Acked-by: Peter Zijlstra (Intel) 
---
 arch/arm/configs/omap2plus_defconfig | 2 +-
 include/linux/zsmalloc.h | 2 +-
 mm/Kconfig   | 2 +-
 mm/zsmalloc.c| 8 
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/arch/arm/configs/omap2plus_defconfig b/arch/arm/configs/omap2plus_defconfig
index 3cc3ca5fa027..583d8abd80a4 100644
--- a/arch/arm/configs/omap2plus_defconfig
+++ b/arch/arm/configs/omap2plus_defconfig
@@ -81,7 +81,7 @@ CONFIG_PARTITION_ADVANCED=y
 CONFIG_BINFMT_MISC=y
 CONFIG_CMA=y
 CONFIG_ZSMALLOC=m
-CONFIG_PGTABLE_MAPPING=y
+CONFIG_ZSMALLOC_PGTABLE_MAPPING=y
 CONFIG_NET=y
 CONFIG_PACKET=y
 CONFIG_UNIX=y
diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h
index 2219cce81ca4..0fdbf653b173 100644
--- a/include/linux/zsmalloc.h
+++ b/include/linux/zsmalloc.h
@@ -20,7 +20,7 @@
  * zsmalloc mapping modes
  *
  * NOTE: These only make a difference when a mapped object spans pages.
- * They also have no effect when PGTABLE_MAPPING is selected.
+ * They also have no effect when ZSMALLOC_PGTABLE_MAPPING is selected.
  */
 enum zs_mapmode {
ZS_MM_RW, /* normal read-write mapping */
diff --git a/mm/Kconfig b/mm/Kconfig
index c1acc34c1c35..09a9edfb8461 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -705,7 +705,7 @@ config ZSMALLOC
  returned by an alloc().  This handle must be mapped in order to
  access the allocated space.
 
-config PGTABLE_MAPPING
+config ZSMALLOC_PGTABLE_MAPPING
bool "Use page table mapping to access object in zsmalloc"
depends on ZSMALLOC
help
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 2f836a2b993f..ac0524330b9b 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -293,7 +293,7 @@ struct zspage {
 };
 
 struct mapping_area {
-#ifdef CONFIG_PGTABLE_MAPPING
+#ifdef CONFIG_ZSMALLOC_PGTABLE_MAPPING
struct vm_struct *vm; /* vm area for mapping object that span pages */
 #else
char *vm_buf; /* copy buffer for objects that span pages */
@@ -1113,7 +1113,7 @@ static struct zspage *find_get_zspage(struct size_class *class)
return zspage;
 }
 
-#ifdef CONFIG_PGTABLE_MAPPING
+#ifdef CONFIG_ZSMALLOC_PGTABLE_MAPPING
 static inline int __zs_cpu_up(struct mapping_area *area)
 {
/*
@@ -1151,7 +1151,7 @@ static inline void __zs_unmap_object(struct mapping_area *area,
unmap_kernel_range(addr, PAGE_SIZE * 2);
 }
 
-#else /* CONFIG_PGTABLE_MAPPING */
+#else /* CONFIG_ZSMALLOC_PGTABLE_MAPPING */
 
 static inline int __zs_cpu_up(struct mapping_area *area)
 {
@@ -1233,7 +1233,7 @@ static void __zs_unmap_object(struct mapping_area *area,
pagefault_enable();
 }
 
-#endif /* CONFIG_PGTABLE_MAPPING */
+#endif /* CONFIG_ZSMALLOC_PGTABLE_MAPPING */
 
 static int zs_cpu_prepare(unsigned int cpu)
 {
-- 
2.25.1



[PATCH 08/29] mm: remove __get_vm_area

2020-04-14 Thread Christoph Hellwig
Switch the two remaining callers to use __get_vm_area_caller instead.

Signed-off-by: Christoph Hellwig 
Acked-by: Peter Zijlstra (Intel) 
---
 arch/powerpc/kernel/pci_64.c | 3 ++-
 arch/sh/kernel/cpu/sh4/sq.c  | 3 ++-
 include/linux/vmalloc.h  | 2 --
 mm/vmalloc.c | 8 
 4 files changed, 4 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c
index 2a976314f169..d9ac980c398c 100644
--- a/arch/powerpc/kernel/pci_64.c
+++ b/arch/powerpc/kernel/pci_64.c
@@ -132,7 +132,8 @@ void __iomem *ioremap_phb(phys_addr_t paddr, unsigned long size)
 * address decoding but I'd rather not deal with those outside of the
 * reserved 64K legacy region.
 */
-   area = __get_vm_area(size, 0, PHB_IO_BASE, PHB_IO_END);
+   area = __get_vm_area_caller(size, 0, PHB_IO_BASE, PHB_IO_END,
+   __builtin_return_address(0));
if (!area)
return NULL;
 
diff --git a/arch/sh/kernel/cpu/sh4/sq.c b/arch/sh/kernel/cpu/sh4/sq.c
index 934ff84844fa..d432164b23b7 100644
--- a/arch/sh/kernel/cpu/sh4/sq.c
+++ b/arch/sh/kernel/cpu/sh4/sq.c
@@ -103,7 +103,8 @@ static int __sq_remap(struct sq_mapping *map, pgprot_t prot)
 #if defined(CONFIG_MMU)
struct vm_struct *vma;
 
-   vma = __get_vm_area(map->size, VM_ALLOC, map->sq_addr, SQ_ADDRMAX);
+   vma = __get_vm_area_caller(map->size, VM_ALLOC, map->sq_addr,
+   SQ_ADDRMAX, __builtin_return_address(0));
if (!vma)
return -ENOMEM;
 
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 0507a162ccd0..3070b4dbc2d9 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -161,8 +161,6 @@ static inline size_t get_vm_area_size(const struct vm_struct *area)
 extern struct vm_struct *get_vm_area(unsigned long size, unsigned long flags);
 extern struct vm_struct *get_vm_area_caller(unsigned long size,
unsigned long flags, const void *caller);
-extern struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
-   unsigned long start, unsigned long end);
 extern struct vm_struct *__get_vm_area_caller(unsigned long size,
unsigned long flags,
unsigned long start, unsigned long end,
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 399f219544f7..d1534d610b48 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2127,14 +2127,6 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
return area;
 }
 
-struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
-   unsigned long start, unsigned long end)
-{
-   return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE,
- GFP_KERNEL, __builtin_return_address(0));
-}
-EXPORT_SYMBOL_GPL(__get_vm_area);
-
 struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
   unsigned long start, unsigned long end,
   const void *caller)
-- 
2.25.1



[PATCH 07/29] powerpc: remove __ioremap_at and __iounmap_at

2020-04-14 Thread Christoph Hellwig
These helpers are only used for remapping the ISA I/O base.  Replace
the mapping side with a remap_isa_base helper in isa-bridge.c that
hard-codes all the known arguments, and just remove __iounmap_at in
favour of open-coding it in the only caller.

Signed-off-by: Christoph Hellwig 
Acked-by: Peter Zijlstra (Intel) 
---
 arch/powerpc/include/asm/io.h|  8 -
 arch/powerpc/kernel/isa-bridge.c | 28 +-
 arch/powerpc/mm/ioremap_64.c | 50 
 3 files changed, 21 insertions(+), 65 deletions(-)

diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h
index 91320985d33f..13f90dd03450 100644
--- a/arch/powerpc/include/asm/io.h
+++ b/arch/powerpc/include/asm/io.h
@@ -699,10 +699,6 @@ static inline void iosync(void)
  *
  * * iounmap undoes such a mapping and can be hooked
  *
- * * __ioremap_at (and the pending __iounmap_at) are low level functions to
- *   create hand-made mappings for use only by the PCI code and cannot
- *   currently be hooked. Must be page aligned.
- *
  * * __ioremap_caller is the same as above but takes an explicit caller
  *   reference rather than using __builtin_return_address(0)
  *
@@ -729,10 +725,6 @@ void __iomem *do_ioremap(phys_addr_t pa, phys_addr_t offset, unsigned long size,
 extern void __iomem *__ioremap_caller(phys_addr_t, unsigned long size,
  pgprot_t prot, void *caller);
 
-extern void __iomem * __ioremap_at(phys_addr_t pa, void *ea,
-  unsigned long size, pgprot_t prot);
-extern void __iounmap_at(void *ea, unsigned long size);
-
 /*
  * When CONFIG_PPC_INDIRECT_PIO is set, we use the generic iomap implementation
  * which needs some additional definitions here. They basically allow PIO
diff --git a/arch/powerpc/kernel/isa-bridge.c b/arch/powerpc/kernel/isa-bridge.c
index 773671b512df..2257d24e6a26 100644
--- a/arch/powerpc/kernel/isa-bridge.c
+++ b/arch/powerpc/kernel/isa-bridge.c
@@ -18,6 +18,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -38,6 +39,22 @@ EXPORT_SYMBOL_GPL(isa_bridge_pcidev);
 #define ISA_SPACE_MASK 0x1
 #define ISA_SPACE_IO 0x1
 
+static void remap_isa_base(phys_addr_t pa, unsigned long size)
+{
+   WARN_ON_ONCE(ISA_IO_BASE & ~PAGE_MASK);
+   WARN_ON_ONCE(pa & ~PAGE_MASK);
+   WARN_ON_ONCE(size & ~PAGE_MASK);
+
+   if (slab_is_available()) {
+   if (ioremap_page_range(ISA_IO_BASE, ISA_IO_BASE + size, pa,
+   pgprot_noncached(PAGE_KERNEL)))
+   unmap_kernel_range(ISA_IO_BASE, size);
+   } else {
+   early_ioremap_range(ISA_IO_BASE, pa, size,
+   pgprot_noncached(PAGE_KERNEL));
+   }
+}
+
 static void pci_process_ISA_OF_ranges(struct device_node *isa_node,
  unsigned long phb_io_base_phys)
 {
@@ -105,15 +122,13 @@ static void pci_process_ISA_OF_ranges(struct device_node *isa_node,
if (size > 0x10000)
size = 0x10000;
 
-   __ioremap_at(phb_io_base_phys, (void *)ISA_IO_BASE,
-size, pgprot_noncached(PAGE_KERNEL));
+   remap_isa_base(phb_io_base_phys, size);
return;
 
 inval_range:
printk(KERN_ERR "no ISA IO ranges or unexpected isa range, "
   "mapping 64k\n");
-   __ioremap_at(phb_io_base_phys, (void *)ISA_IO_BASE,
-0x10000, pgprot_noncached(PAGE_KERNEL));
+   remap_isa_base(phb_io_base_phys, 0x10000);
 }
 
 
@@ -248,8 +263,7 @@ void __init isa_bridge_init_non_pci(struct device_node *np)
 * and map it
 */
isa_io_base = ISA_IO_BASE;
-   __ioremap_at(pbase, (void *)ISA_IO_BASE,
-size, pgprot_noncached(PAGE_KERNEL));
+   remap_isa_base(pbase, size);
 
pr_debug("ISA: Non-PCI bridge is %pOF\n", np);
 }
@@ -297,7 +311,7 @@ static void isa_bridge_remove(void)
isa_bridge_pcidev = NULL;
 
/* Unmap the ISA area */
-   __iounmap_at((void *)ISA_IO_BASE, 0x10000);
+   unmap_kernel_range(ISA_IO_BASE, 0x10000);
 }
 
 /**
diff --git a/arch/powerpc/mm/ioremap_64.c b/arch/powerpc/mm/ioremap_64.c
index 50a99d9684f7..ba5cbb0d66bd 100644
--- a/arch/powerpc/mm/ioremap_64.c
+++ b/arch/powerpc/mm/ioremap_64.c
@@ -4,56 +4,6 @@
 #include 
 #include 
 
-/**
- * Low level function to establish the page tables for an IO mapping
- */
-void __iomem *__ioremap_at(phys_addr_t pa, void *ea, unsigned long size, pgprot_t prot)
-{
-   int ret;
-   unsigned long va = (unsigned long)ea;
-
-   /* We don't support the 4K PFN hack with ioremap */
-   if (pgprot_val(prot) & H_PAGE_4K_PFN)
-   return NULL;
-
-   if ((ea + size) >= (void *)IOREMAP_END) {
-   pr_warn("Outside the supported range\n");
-   return NULL;
-   }
-
-   WARN_ON(pa & ~PAGE_MASK);
-   WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
-   

[PATCH 06/29] powerpc: add an ioremap_phb helper

2020-04-14 Thread Christoph Hellwig
Factor the code shared between pci_64 and electra_cf into an
ioremap_phb helper that follows the normal ioremap semantics and
returns a useful __iomem pointer.  Note that it open-codes
__ioremap_at, as we know from the callers that the slab allocator is
available.  Switch pci_64 to also store the result as a __iomem
pointer, and unmap the result using iounmap instead of force-casting
and using vmalloc APIs.
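
A hypothetical usage sketch of the new helper (phys_base and size are
placeholder values, not taken from this patch); since it follows the
normal ioremap semantics, a plain iounmap undoes the mapping:

	void __iomem *io = ioremap_phb(phys_base, size);

	if (!io)
		return -ENOMEM;
	/* ... access the window with readb()/writeb()/etc. ... */
	iounmap(io);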

Signed-off-by: Christoph Hellwig 
Acked-by: Peter Zijlstra (Intel) 
---
 arch/powerpc/include/asm/io.h |  2 +
 arch/powerpc/include/asm/pci-bridge.h |  2 +-
 arch/powerpc/kernel/pci_64.c  | 53 ++-
 drivers/pcmcia/electra_cf.c   | 45 ---
 4 files changed, 54 insertions(+), 48 deletions(-)

diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h
index 635969b5b58e..91320985d33f 100644
--- a/arch/powerpc/include/asm/io.h
+++ b/arch/powerpc/include/asm/io.h
@@ -719,6 +719,8 @@ void __iomem *ioremap_coherent(phys_addr_t address, unsigned long size);
 
 extern void iounmap(volatile void __iomem *addr);
 
+void __iomem *ioremap_phb(phys_addr_t paddr, unsigned long size);
+
 int early_ioremap_range(unsigned long ea, phys_addr_t pa,
unsigned long size, pgprot_t prot);
 void __iomem *do_ioremap(phys_addr_t pa, phys_addr_t offset, unsigned long size,
diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index 69f4cb3b7c56..b92e81b256e5 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -66,7 +66,7 @@ struct pci_controller {
 
void __iomem *io_base_virt;
 #ifdef CONFIG_PPC64
-   void *io_base_alloc;
+   void __iomem *io_base_alloc;
 #endif
resource_size_t io_base_phys;
resource_size_t pci_io_size;
diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c
index f83d1f69b1dd..2a976314f169 100644
--- a/arch/powerpc/kernel/pci_64.c
+++ b/arch/powerpc/kernel/pci_64.c
@@ -109,23 +109,46 @@ int pcibios_unmap_io_space(struct pci_bus *bus)
/* Get the host bridge */
hose = pci_bus_to_host(bus);
 
-   /* Check if we have IOs allocated */
-   if (hose->io_base_alloc == NULL)
-   return 0;
-
pr_debug("IO unmapping for PHB %pOF\n", hose->dn);
pr_debug("  alloc=0x%p\n", hose->io_base_alloc);
 
-   /* This is a PHB, we fully unmap the IO area */
-   vunmap(hose->io_base_alloc);
-
+   iounmap(hose->io_base_alloc);
return 0;
 }
 EXPORT_SYMBOL_GPL(pcibios_unmap_io_space);
 
-static int pcibios_map_phb_io_space(struct pci_controller *hose)
+void __iomem *ioremap_phb(phys_addr_t paddr, unsigned long size)
 {
struct vm_struct *area;
+   unsigned long addr;
+
+   WARN_ON_ONCE(paddr & ~PAGE_MASK);
+   WARN_ON_ONCE(size & ~PAGE_MASK);
+
+   /*
+* Let's allocate some IO space for that guy. We don't pass VM_IOREMAP
+* because we don't care about alignment tricks that the core does in
+* that case.  Maybe we should due to stupid card with incomplete
+* address decoding but I'd rather not deal with those outside of the
+* reserved 64K legacy region.
+*/
+   area = __get_vm_area(size, 0, PHB_IO_BASE, PHB_IO_END);
+   if (!area)
+   return NULL;
+
+   addr = (unsigned long)area->addr;
+   if (ioremap_page_range(addr, addr + size, paddr,
+   pgprot_noncached(PAGE_KERNEL))) {
+   unmap_kernel_range(addr, size);
+   return NULL;
+   }
+
+   return (void __iomem *)addr;
+}
+EXPORT_SYMBOL_GPL(ioremap_phb);
+
+static int pcibios_map_phb_io_space(struct pci_controller *hose)
+{
unsigned long phys_page;
unsigned long size_page;
unsigned long io_virt_offset;
@@ -146,12 +169,11 @@ static int pcibios_map_phb_io_space(struct pci_controller *hose)
 * with incomplete address decoding but I'd rather not deal with
 * those outside of the reserved 64K legacy region.
 */
-   area = __get_vm_area(size_page, 0, PHB_IO_BASE, PHB_IO_END);
-   if (area == NULL)
+   hose->io_base_alloc = ioremap_phb(phys_page, size_page);
+   if (!hose->io_base_alloc)
return -ENOMEM;
-   hose->io_base_alloc = area->addr;
-   hose->io_base_virt = (void __iomem *)(area->addr +
- hose->io_base_phys - phys_page);
+   hose->io_base_virt = hose->io_base_alloc +
+   hose->io_base_phys - phys_page;
 
pr_debug("IO mapping for PHB %pOF\n", hose->dn);
pr_debug("  phys=0x%016llx, virt=0x%p (alloc=0x%p)\n",
@@ -159,11 +181,6 @@ static int pcibios_map_phb_io_space(struct pci_controller *hose)
pr_debug("  size=0x%016llx (alloc=0x%016lx)\n",
 hose->pci_io_size, size_page);
 
-   /* Establish the mapping */
-   if (__ioremap_at(phys_page, area->addr, 

[PATCH 05/29] dma-mapping: use vmap instead of reimplementing it

2020-04-14 Thread Christoph Hellwig
Replace the open-coded instance of vmap with the actual function.  In
the non-contiguous (IOMMU) case this requires an extra find_vm_area,
but given that this isn't a fast-path function, that is a small price
to pay.

Signed-off-by: Christoph Hellwig 
Acked-by: Peter Zijlstra (Intel) 
---
 kernel/dma/remap.c | 48 --
 1 file changed, 12 insertions(+), 36 deletions(-)

diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c
index d14cbc83986a..914ff5a58dd5 100644
--- a/kernel/dma/remap.c
+++ b/kernel/dma/remap.c
@@ -20,23 +20,6 @@ struct page **dma_common_find_pages(void *cpu_addr)
return area->pages;
 }
 
-static struct vm_struct *__dma_common_pages_remap(struct page **pages,
-   size_t size, pgprot_t prot, const void *caller)
-{
-   struct vm_struct *area;
-
-   area = get_vm_area_caller(size, VM_DMA_COHERENT, caller);
-   if (!area)
-   return NULL;
-
-   if (map_vm_area(area, prot, pages)) {
-   vunmap(area->addr);
-   return NULL;
-   }
-
-   return area;
-}
-
 /*
  * Remaps an array of PAGE_SIZE pages into another vm_area.
  * Cannot be used in non-sleeping contexts
@@ -44,15 +27,12 @@ static struct vm_struct *__dma_common_pages_remap(struct page **pages,
 void *dma_common_pages_remap(struct page **pages, size_t size,
 pgprot_t prot, const void *caller)
 {
-   struct vm_struct *area;
+   void *vaddr;
 
-   area = __dma_common_pages_remap(pages, size, prot, caller);
-   if (!area)
-   return NULL;
-
-   area->pages = pages;
-
-   return area->addr;
+   vaddr = vmap(pages, size >> PAGE_SHIFT, VM_DMA_COHERENT, prot);
+   if (vaddr)
+   find_vm_area(vaddr)->pages = pages;
+   return vaddr;
 }
 
 /*
@@ -62,24 +42,20 @@ void *dma_common_pages_remap(struct page **pages, size_t size,
 void *dma_common_contiguous_remap(struct page *page, size_t size,
pgprot_t prot, const void *caller)
 {
-   int i;
+   int count = size >> PAGE_SHIFT;
struct page **pages;
-   struct vm_struct *area;
+   void *vaddr;
+   int i;
 
-   pages = kmalloc(sizeof(struct page *) << get_order(size), GFP_KERNEL);
+   pages = kmalloc_array(count, sizeof(struct page *), GFP_KERNEL);
if (!pages)
return NULL;
-
-   for (i = 0; i < (size >> PAGE_SHIFT); i++)
+   for (i = 0; i < count; i++)
pages[i] = nth_page(page, i);
-
-   area = __dma_common_pages_remap(pages, size, prot, caller);
-
+   vaddr = vmap(pages, count, VM_DMA_COHERENT, prot);
kfree(pages);
 
-   if (!area)
-   return NULL;
-   return area->addr;
+   return vaddr;
 }
 
 /*
-- 
2.25.1



[PATCH 04/29] staging: media: ipu3: use vmap instead of reimplementing it

2020-04-14 Thread Christoph Hellwig
Just use vmap instead of messing with vmalloc internals.

Signed-off-by: Christoph Hellwig 
Acked-by: Peter Zijlstra (Intel) 
---
 drivers/staging/media/ipu3/ipu3-css-pool.h |  4 +--
 drivers/staging/media/ipu3/ipu3-dmamap.c   | 30 ++
 2 files changed, 9 insertions(+), 25 deletions(-)

diff --git a/drivers/staging/media/ipu3/ipu3-css-pool.h b/drivers/staging/media/ipu3/ipu3-css-pool.h
index f4a60b41401b..a8ccd4f70320 100644
--- a/drivers/staging/media/ipu3/ipu3-css-pool.h
+++ b/drivers/staging/media/ipu3/ipu3-css-pool.h
@@ -15,14 +15,12 @@ struct imgu_device;
  * @size:  size of the buffer in bytes.
  * @vaddr: kernel virtual address.
  * @daddr: iova dma address to access IPU3.
- * @vma:   private, a pointer to &struct vm_struct,
- * used for imgu_dmamap_free.
  */
 struct imgu_css_map {
size_t size;
void *vaddr;
dma_addr_t daddr;
-   struct vm_struct *vma;
+   struct page **pages;
 };
 
 /**
diff --git a/drivers/staging/media/ipu3/ipu3-dmamap.c b/drivers/staging/media/ipu3/ipu3-dmamap.c
index 7431322379f6..8a19b0024152 100644
--- a/drivers/staging/media/ipu3/ipu3-dmamap.c
+++ b/drivers/staging/media/ipu3/ipu3-dmamap.c
@@ -96,6 +96,7 @@ void *imgu_dmamap_alloc(struct imgu_device *imgu, struct imgu_css_map *map,
unsigned long shift = iova_shift(&imgu->iova_domain);
struct device *dev = >pci_dev->dev;
size_t size = PAGE_ALIGN(len);
+   int count = size >> PAGE_SHIFT;
struct page **pages;
dma_addr_t iovaddr;
struct iova *iova;
@@ -114,7 +115,7 @@ void *imgu_dmamap_alloc(struct imgu_device *imgu, struct imgu_css_map *map,
 
/* Call IOMMU driver to setup pgt */
iovaddr = iova_dma_addr(&imgu->iova_domain, iova);
-   for (i = 0; i < size / PAGE_SIZE; ++i) {
+   for (i = 0; i < count; ++i) {
rval = imgu_mmu_map(imgu->mmu, iovaddr,
page_to_phys(pages[i]), PAGE_SIZE);
if (rval)
@@ -123,33 +124,23 @@ void *imgu_dmamap_alloc(struct imgu_device *imgu, struct imgu_css_map *map,
iovaddr += PAGE_SIZE;
}
 
-   /* Now grab a virtual region */
-   map->vma = __get_vm_area(size, VM_USERMAP, VMALLOC_START, VMALLOC_END);
-   if (!map->vma)
+   map->vaddr = vmap(pages, count, VM_USERMAP, PAGE_KERNEL);
+   if (!map->vaddr)
goto out_unmap;
 
-   map->vma->pages = pages;
-   /* And map it in KVA */
-   if (map_vm_area(map->vma, PAGE_KERNEL, pages))
-   goto out_vunmap;
-
+   map->pages = pages;
map->size = size;
map->daddr = iova_dma_addr(&imgu->iova_domain, iova);
-   map->vaddr = map->vma->addr;
 
dev_dbg(dev, "%s: allocated %zu @ IOVA %pad @ VA %p\n", __func__,
-   size, &map->daddr, map->vma->addr);
-
-   return map->vma->addr;
+   size, &map->daddr, map->vaddr);
 
-out_vunmap:
-   vunmap(map->vma->addr);
+   return map->vaddr;
 
 out_unmap:
imgu_dmamap_free_buffer(pages, size);
imgu_mmu_unmap(imgu->mmu, iova_dma_addr(&imgu->iova_domain, iova),
   i * PAGE_SIZE);
-   map->vma = NULL;
 
 out_free_iova:
__free_iova(&imgu->iova_domain, iova);
@@ -177,8 +168,6 @@ void imgu_dmamap_unmap(struct imgu_device *imgu, struct imgu_css_map *map)
  */
 void imgu_dmamap_free(struct imgu_device *imgu, struct imgu_css_map *map)
 {
-   struct vm_struct *area = map->vma;
-
dev_dbg(&imgu->pci_dev->dev, "%s: freeing %zu @ IOVA %pad @ VA %p\n",
__func__, map->size, &map->daddr, map->vaddr);
 
@@ -187,11 +176,8 @@ void imgu_dmamap_free(struct imgu_device *imgu, struct imgu_css_map *map)
 
imgu_dmamap_unmap(imgu, map);
 
-   if (WARN_ON(!area) || WARN_ON(!area->pages))
-   return;
-
-   imgu_dmamap_free_buffer(area->pages, map->size);
vunmap(map->vaddr);
+   imgu_dmamap_free_buffer(map->pages, map->size);
map->vaddr = NULL;
 }
 
-- 
2.25.1



[PATCH 03/29] staging: android: ion: use vmap instead of vm_map_ram

2020-04-14 Thread Christoph Hellwig
vm_map_ram can keep mappings around after vm_unmap_ram.  Using that
with non-PAGE_KERNEL mappings can lead to all kinds of aliasing issues.
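
A sketch of the hazard being avoided (illustrative only, not code from
this patch):

	void *addr = vm_map_ram(pages, num, -1,
				pgprot_writecombine(PAGE_KERNEL));

	memset(addr, 0, num * PAGE_SIZE);
	vm_unmap_ram(addr, num);
	/*
	 * The mapping (or at least stale TLB entries for it) can outlive
	 * vm_unmap_ram(); with a non-PAGE_KERNEL pgprot the pages may then
	 * be aliased with conflicting attributes.  vmap()/vunmap() does
	 * not cache mappings this way.
	 */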

Signed-off-by: Christoph Hellwig 
Acked-by: Greg Kroah-Hartman 
Acked-by: Peter Zijlstra (Intel) 
---
 drivers/staging/android/ion/ion_heap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/staging/android/ion/ion_heap.c b/drivers/staging/android/ion/ion_heap.c
index 473b465724f1..0755b11348ed 100644
--- a/drivers/staging/android/ion/ion_heap.c
+++ b/drivers/staging/android/ion/ion_heap.c
@@ -99,12 +99,12 @@ int ion_heap_map_user(struct ion_heap *heap, struct ion_buffer *buffer,
 
 static int ion_heap_clear_pages(struct page **pages, int num, pgprot_t pgprot)
 {
-   void *addr = vm_map_ram(pages, num, -1, pgprot);
+   void *addr = vmap(pages, num, VM_MAP, pgprot);
 
if (!addr)
return -ENOMEM;
memset(addr, 0, PAGE_SIZE * num);
-   vm_unmap_ram(addr, num);
+   vunmap(addr);
 
return 0;
 }
-- 
2.25.1



[PATCH 02/29] x86: fix vmap arguments in map_irq_stack

2020-04-14 Thread Christoph Hellwig
vmap() does not take a gfp_t; the flags argument is for VM_* flags.
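
For reference, the prototype in include/linux/vmalloc.h is:

	void *vmap(struct page **pages, unsigned int count,
		   unsigned long flags, pgprot_t prot);

vmap() only maps caller-provided pages and performs no allocation, so
the GFP_KERNEL passed here was silently interpreted as a set of
meaningless VM_* bits rather than VM_MAP.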

Signed-off-by: Christoph Hellwig 
---
 arch/x86/kernel/irq_64.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 12df3a4abfdd..6b32ab009c19 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -43,7 +43,7 @@ static int map_irq_stack(unsigned int cpu)
pages[i] = pfn_to_page(pa >> PAGE_SHIFT);
}
 
-   va = vmap(pages, IRQ_STACK_SIZE / PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL);
+   va = vmap(pages, IRQ_STACK_SIZE / PAGE_SIZE, VM_MAP, PAGE_KERNEL);
if (!va)
return -ENOMEM;
 
-- 
2.25.1



[PATCH 01/29] x86/hyperv: use vmalloc_exec for the hypercall page

2020-04-14 Thread Christoph Hellwig
Use the designated helper for allocating executable kernel memory, and
remove the now unused PAGE_KERNEL_RX define.
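
For reference, the designated helper already exists in
include/linux/vmalloc.h and returns page-aligned memory mapped with
PAGE_KERNEL_EXEC:

	extern void *vmalloc_exec(unsigned long size);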

Signed-off-by: Christoph Hellwig 
Reviewed-by: Michael Kelley 
Acked-by: Wei Liu 
Acked-by: Peter Zijlstra (Intel) 
---
 arch/x86/hyperv/hv_init.c| 2 +-
 arch/x86/include/asm/pgtable_types.h | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index b0da5320bcff..5a4b363ba67b 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -355,7 +355,7 @@ void __init hyperv_init(void)
guest_id = generate_guest_id(0, LINUX_VERSION_CODE, 0);
wrmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id);
 
-   hv_hypercall_pg  = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL_RX);
+   hv_hypercall_pg = vmalloc_exec(PAGE_SIZE);
if (hv_hypercall_pg == NULL) {
wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0);
goto remove_cpuhp_state;
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index b6606fe6cfdf..947867f112ea 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -194,7 +194,6 @@ enum page_cache_mode {
 #define _PAGE_TABLE_NOENC   (__PP|__RW|_USR|___A|   0|___D|   0|   0)
 #define _PAGE_TABLE (__PP|__RW|_USR|___A|   0|___D|   0|   0| _ENC)
 #define __PAGE_KERNEL_RO(__PP|   0|   0|___A|__NX|___D|   0|___G)
-#define __PAGE_KERNEL_RX(__PP|   0|   0|___A|   0|___D|   0|___G)
 #define __PAGE_KERNEL_NOCACHE   (__PP|__RW|   0|___A|__NX|___D|   0|___G| __NC)
 #define __PAGE_KERNEL_VVAR  (__PP|   0|_USR|___A|__NX|___D|   0|___G)
 #define __PAGE_KERNEL_LARGE (__PP|__RW|   0|___A|__NX|___D|_PSE|___G)
@@ -220,7 +219,6 @@ enum page_cache_mode {
 #define PAGE_KERNEL_RO __pgprot_mask(__PAGE_KERNEL_RO | _ENC)
 #define PAGE_KERNEL_EXEC   __pgprot_mask(__PAGE_KERNEL_EXEC   | _ENC)
 #define PAGE_KERNEL_EXEC_NOENC __pgprot_mask(__PAGE_KERNEL_EXEC   |0)
-#define PAGE_KERNEL_RX __pgprot_mask(__PAGE_KERNEL_RX | _ENC)
 #define PAGE_KERNEL_NOCACHE__pgprot_mask(__PAGE_KERNEL_NOCACHE| _ENC)
 #define PAGE_KERNEL_LARGE  __pgprot_mask(__PAGE_KERNEL_LARGE  | _ENC)
 #define PAGE_KERNEL_LARGE_EXEC __pgprot_mask(__PAGE_KERNEL_LARGE_EXEC | _ENC)
-- 
2.25.1



decruft the vmalloc API v2

2020-04-14 Thread Christoph Hellwig
Hi all,

Peter noticed that with some dumb luck you can toast the kernel address
space with exported vmalloc symbols.

I used this as an opportunity to decruft the vmalloc.c API and make it
much more systematic.  This also removes any chance to create vmalloc
mappings outside the designated areas or using executable permissions
from modules.  Besides that it removes more than 300 lines of code.

A git tree is also available here:

git://git.infradead.org/users/hch/misc.git sanitize-vmalloc-api.2

Gitweb:


http://git.infradead.org/users/hch/misc.git/shortlog/refs/heads/sanitize-vmalloc-api.2

Changes since v1:
 - implement pgprot_nx for arm64 (Mark Rutland)
 - fix a patch description
 - properly pass pgprot to vmap in ion
 - add a new patch to fix vmap() API misuse
 - fix a vmap argument in x86
 - two more vmalloc cleanups
 - cleanup use of the unmap_kernel_range API
 - rename ioremap_pbh to ioremap_phb


Re: [PATCH 1/2] mm, treewide: Rename kzfree() to kfree_sensitive()

2020-04-14 Thread David Howells
Waiman Long  wrote:

> As said by Linus:
> 
>   A symmetric naming is only helpful if it implies symmetries in use.
>   Otherwise it's actively misleading.
> 
>   In "kzalloc()", the z is meaningful and an important part of what the
>   caller wants.
> 
>   In "kzfree()", the z is actively detrimental, because maybe in the
>   future we really _might_ want to use that "memfill(0xdeadbeef)" or
>   something. The "zero" part of the interface isn't even _relevant_.
> 
> The main reason that kzfree() exists is to clear sensitive information
> that should not be leaked to other future users of the same memory
> objects.
> 
> Rename kzfree() to kfree_sensitive() to follow the example of the
> recently added kvfree_sensitive() and make the intention of the API
> more explicit. In addition, memzero_explicit() is used to clear the
> memory to make sure that it won't get optimized away by the compiler.
> 
> The renaming is done by using the command sequence:
> 
>   git grep -w --name-only kzfree |\
>   xargs sed -i 's/\bkzfree\b/kfree_sensitive/'
> 
> followed by some editing of the kfree_sensitive() kerneldoc and the
> use of memzero_explicit() instead of memset().
> 
> Suggested-by: Joe Perches 
> Signed-off-by: Waiman Long 

Since this changes a lot of crypto stuff, does it make sense for it to go via
the crypto tree?

Acked-by: David Howells 
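
For reference, the renamed helper described above is essentially the
following (a sketch, not the verbatim patch):

	void kfree_sensitive(const void *p)
	{
		size_t ks;
		void *mem = (void *)p;

		if (unlikely(ZERO_OR_NULL_PTR(mem)))
			return;
		ks = ksize(mem);	/* full allocated size, not the requested one */
		memzero_explicit(mem, ks);	/* clear cannot be optimized away */
		kfree(mem);
	}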



Re: [PATCH v2 4/4] mm/vmalloc: Hugepage vmalloc mappings

2020-04-14 Thread Christoph Hellwig
On Tue, Apr 14, 2020 at 10:13:44PM +1000, Nicholas Piggin wrote:
> Which case? Usually the answer would be because you don't want to use
> contiguous physical memory and/or you don't want to use the linear 
> mapping.

But with huge pages you do by definition already use large contiguous
areas.  So you want allocations larger than "small" huge pages but not
using gigantic pages using vmalloc?


[PATCH 8/8] tracefs: switch to simplefs inode creation API

2020-04-14 Thread Emanuele Giuseppe Esposito
There is no semantic change intended; the code in the simplefs.c
functions was in fact derived from the debugfs and tracefs code.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 fs/tracefs/inode.c | 86 --
 1 file changed, 7 insertions(+), 79 deletions(-)

diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index a30837a8e1d4..69e2215c797b 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -298,57 +298,6 @@ static struct file_system_type trace_fs_type = {
 };
 MODULE_ALIAS_FS("tracefs");
 
-static struct dentry *start_creating(const char *name, struct dentry *parent)
-{
-   struct dentry *dentry;
-   int error;
-
-   pr_debug("tracefs: creating file '%s'\n",name);
-
-   error = simple_pin_fs(&tracefs, &trace_fs_type);
-   if (error)
-   return ERR_PTR(error);
-
-   /* If the parent is not specified, we create it in the root.
-* We need the root dentry to do this, which is in the super
-* block. A pointer to that is in the struct vfsmount that we
-* have around.
-*/
-   if (!parent)
-   parent = tracefs.mount->mnt_root;
-
-   inode_lock(parent->d_inode);
-   if (unlikely(IS_DEADDIR(parent->d_inode)))
-   dentry = ERR_PTR(-ENOENT);
-   else
-   dentry = lookup_one_len(name, parent, strlen(name));
-   if (!IS_ERR(dentry) && dentry->d_inode) {
-   dput(dentry);
-   dentry = ERR_PTR(-EEXIST);
-   }
-
-   if (IS_ERR(dentry)) {
-   inode_unlock(parent->d_inode);
-   simple_release_fs(&tracefs);
-   }
-
-   return dentry;
-}
-
-static struct dentry *failed_creating(struct dentry *dentry)
-{
-   inode_unlock(dentry->d_parent->d_inode);
-   dput(dentry);
-   simple_release_fs(&tracefs);
-   return NULL;
-}
-
-static struct dentry *end_creating(struct dentry *dentry)
-{
-   inode_unlock(dentry->d_parent->d_inode);
-   return dentry;
-}
-
 /**
  * tracefs_create_file - create a file in the tracefs filesystem
  * @name: a pointer to a string containing the name of the file to create.
@@ -385,49 +334,28 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode,
if (security_locked_down(LOCKDOWN_TRACEFS))
return NULL;
 
-   if (!(mode & S_IFMT))
-   mode |= S_IFREG;
-   BUG_ON(!S_ISREG(mode));
-   dentry = start_creating(name, parent);
-
+   dentry = simplefs_create_file(&tracefs, &trace_fs_type,
+ name, mode, parent, data, &inode);
if (IS_ERR(dentry))
return NULL;
 
-   inode = simple_new_inode(dentry->d_sb);
-   if (unlikely(!inode))
-   return failed_creating(dentry);
-
-   inode->i_mode = mode;
inode->i_fop = fops ? fops : &tracefs_file_operations;
-   inode->i_private = data;
-   d_instantiate(dentry, inode);
-   fsnotify_create(dentry->d_parent->d_inode, dentry);
-   return end_creating(dentry);
+   return simplefs_finish_dentry(dentry, inode);
 }
 
 static struct dentry *__create_dir(const char *name, struct dentry *parent,
   const struct inode_operations *ops)
 {
-   struct dentry *dentry = start_creating(name, parent);
+   struct dentry *dentry;
struct inode *inode;
 
+   dentry = simplefs_create_dir(&tracefs, &trace_fs_type,
+name, 0755, parent, &inode);
if (IS_ERR(dentry))
return NULL;
 
-   inode = simple_new_inode(dentry->d_sb);
-   if (unlikely(!inode))
-   return failed_creating(dentry);
-
-   inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
inode->i_op = ops;
-   inode->i_fop = &simple_dir_operations;
-
-   /* directory inodes start off with i_nlink == 2 (for "." entry) */
-   inc_nlink(inode);
-   d_instantiate(dentry, inode);
-   inc_nlink(dentry->d_parent->d_inode);
-   fsnotify_mkdir(dentry->d_parent->d_inode, dentry);
-   return end_creating(dentry);
+   return simplefs_finish_dentry(dentry, inode);
 }
 
 /**
-- 
2.25.2



[PATCH 7/8] debugfs: switch to simplefs inode creation API

2020-04-14 Thread Emanuele Giuseppe Esposito
The only difference compared to the pre-existing code is that symlink
creation now triggers fsnotify_create.  This was a bug in the debugfs
code, since vfs_symlink, for example, does call fsnotify_create.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 fs/debugfs/inode.c | 144 +
 1 file changed, 15 insertions(+), 129 deletions(-)

diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 834b5872ca0d..7a2369373b85 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -294,68 +294,6 @@ struct dentry *debugfs_lookup(const char *name, struct dentry *parent)
 }
 EXPORT_SYMBOL_GPL(debugfs_lookup);
 
-static struct dentry *start_creating(const char *name, struct dentry *parent)
-{
-   struct dentry *dentry;
-   int error;
-
-   pr_debug("creating file '%s'\n", name);
-
-   if (IS_ERR(parent))
-   return parent;
-
-   error = simple_pin_fs(&debugfs, &debug_fs_type);
-   if (error) {
-   pr_err("Unable to pin filesystem for file '%s'\n", name);
-   return ERR_PTR(error);
-   }
-
-   /* If the parent is not specified, we create it in the root.
-* We need the root dentry to do this, which is in the super
-* block. A pointer to that is in the struct vfsmount that we
-* have around.
-*/
-   if (!parent)
-   parent = debugfs.mount->mnt_root;
-
-   inode_lock(d_inode(parent));
-   if (unlikely(IS_DEADDIR(d_inode(parent))))
-   dentry = ERR_PTR(-ENOENT);
-   else
-   dentry = lookup_one_len(name, parent, strlen(name));
-   if (!IS_ERR(dentry) && d_really_is_positive(dentry)) {
-   if (d_is_dir(dentry))
-   pr_err("Directory '%s' with parent '%s' already 
present!\n",
-  name, parent->d_name.name);
-   else
-   pr_err("File '%s' in directory '%s' already present!\n",
-  name, parent->d_name.name);
-   dput(dentry);
-   dentry = ERR_PTR(-EEXIST);
-   }
-
-   if (IS_ERR(dentry)) {
-   inode_unlock(d_inode(parent));
-   simple_release_fs(&debugfs);
-   }
-
-   return dentry;
-}
-
-static struct dentry *failed_creating(struct dentry *dentry)
-{
-   inode_unlock(d_inode(dentry->d_parent));
-   dput(dentry);
-   simple_release_fs(&debugfs);
-   return ERR_PTR(-ENOMEM);
-}
-
-static struct dentry *end_creating(struct dentry *dentry)
-{
-   inode_unlock(d_inode(dentry->d_parent));
-   return dentry;
-}
-
 static struct dentry *__debugfs_create_file(const char *name, umode_t mode,
struct dentry *parent, void *data,
const struct file_operations *proxy_fops,
@@ -364,32 +302,17 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode,
struct dentry *dentry;
struct inode *inode;
 
-   if (!(mode & S_IFMT))
-   mode |= S_IFREG;
-   BUG_ON(!S_ISREG(mode));
-   dentry = start_creating(name, parent);
-
+   dentry = simplefs_create_file(&debugfs, &debug_fs_type,
+ name, mode, parent, data, &inode);
if (IS_ERR(dentry))
return dentry;
 
-   inode = simple_new_inode(dentry->d_sb);
-   if (unlikely(!inode)) {
-   pr_err("out of free dentries, can not create file '%s'\n",
-  name);
-   return failed_creating(dentry);
-   }
-
-   inode->i_mode = mode;
-   inode->i_private = data;
-
inode->i_op = &debugfs_file_inode_operations;
inode->i_fop = proxy_fops;
dentry->d_fsdata = (void *)((unsigned long)real_fops |
DEBUGFS_FSDATA_IS_REAL_FOPS_BIT);
 
-   d_instantiate(dentry, inode);
-   fsnotify_create(d_inode(dentry->d_parent), dentry);
-   return end_creating(dentry);
+   return simplefs_finish_dentry(dentry, inode);
 }
 
 /**
@@ -522,29 +445,16 @@ EXPORT_SYMBOL_GPL(debugfs_create_file_size);
  */
 struct dentry *debugfs_create_dir(const char *name, struct dentry *parent)
 {
-   struct dentry *dentry = start_creating(name, parent);
+   struct dentry *dentry;
struct inode *inode;
 
+   dentry = simplefs_create_dir(&debugfs, &debug_fs_type,
+name, 0755, parent, &inode);
if (IS_ERR(dentry))
return dentry;
 
-   inode = simple_new_inode(dentry->d_sb);
-   if (unlikely(!inode)) {
-   pr_err("out of free dentries, can not create directory '%s'\n",
-  name);
-   return failed_creating(dentry);
-   }
-
-   inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
inode->i_op = &debugfs_dir_inode_operations;
-   inode->i_fop = &simple_dir_operations;
-
-   /* directory inodes start off with i_nlink == 2 (for "." entry) */
-   inc_nlink(inode);
-   

[PATCH 6/8] simplefs: add file creation functions

2020-04-14 Thread Emanuele Giuseppe Esposito
A bunch of code is duplicated between debugfs and tracefs; unify it
into the simplefs library.

The code is very similar, except that dentry and inode creation are unified
into a single function (unlike start_creating in debugfs and tracefs, which
only takes care of dentries).  This adds an output parameter to the creation
functions, but pushes all error recovery into fs/simplefs.c.
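
To make the new flow concrete, here is a hypothetical caller (myfs and
myfs_type stand in for a filesystem's struct simple_fs and struct
file_system_type; they are not part of this patch), shaped like the
tracefs and debugfs conversions elsewhere in this series:

	struct dentry *myfs_create_file(const char *name, umode_t mode,
					struct dentry *parent, void *data,
					const struct file_operations *fops)
	{
		struct inode *inode;
		struct dentry *dentry;

		dentry = simplefs_create_file(&myfs, &myfs_type, name, mode,
					      parent, data, &inode);
		if (IS_ERR(dentry))
			return dentry;

		inode->i_fop = fops;	/* caller-specific setup */
		return simplefs_finish_dentry(dentry, inode);
	}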

Signed-off-by: Emanuele Giuseppe Esposito 
---
 fs/simplefs.c| 150 +++
 include/linux/simplefs.h |  19 +
 2 files changed, 169 insertions(+)

diff --git a/fs/simplefs.c b/fs/simplefs.c
index c59eb8d996be..3e48a288beb3 100644
--- a/fs/simplefs.c
+++ b/fs/simplefs.c
@@ -1,6 +1,8 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #include 
 #include 
+#include 
+#include 
 
 static DEFINE_SPINLOCK(pin_fs_lock);
 
@@ -42,3 +44,151 @@ struct inode *simple_alloc_anon_inode(struct simple_fs *fs)
return alloc_anon_inode(fs->mount->mnt_sb);
 }
 EXPORT_SYMBOL(simple_alloc_anon_inode);
+
+static struct dentry *failed_creating(struct simple_fs *fs, struct dentry *dentry)
+{
+   inode_unlock(d_inode(dentry->d_parent));
+   dput(dentry);
+   simple_release_fs(fs);
+   return ERR_PTR(-ENOMEM);
+}
+
+struct dentry *simplefs_create_dentry(struct simple_fs *fs, struct file_system_type *type,
+ const char *name, struct dentry *parent,
+ struct inode **inode)
+{
+   struct dentry *dentry;
+   int error;
+
+   pr_debug("creating file '%s'\n", name);
+
+   if (IS_ERR(parent))
+   return parent;
+
+   error = simple_pin_fs(fs, type);
+   if (error) {
+   pr_err("Unable to pin filesystem for file '%s'\n", name);
+   return ERR_PTR(error);
+   }
+
+   /* If the parent is not specified, we create it in the root.
+* We need the root dentry to do this, which is in the super
+* block. A pointer to that is in the struct vfsmount that we
+* have around.
+*/
+   if (!parent)
+   parent = fs->mount->mnt_root;
+
+   inode_lock(d_inode(parent));
+   dentry = lookup_one_len(name, parent, strlen(name));
+   if (!IS_ERR(dentry) && d_really_is_positive(dentry)) {
+   if (d_is_dir(dentry))
+   pr_err("Directory '%s' with parent '%s' already 
present!\n",
+  name, parent->d_name.name);
+   else
+   pr_err("File '%s' in directory '%s' already present!\n",
+  name, parent->d_name.name);
+   dput(dentry);
+   dentry = ERR_PTR(-EEXIST);
+   }
+
+   if (IS_ERR(dentry)) {
+   inode_unlock(d_inode(parent));
+   simple_release_fs(fs);
+   }
+
+
+   if (IS_ERR(dentry))
+   return dentry;
+
+   *inode = simple_new_inode(fs->mount->mnt_sb);
+   if (unlikely(!(*inode))) {
+   pr_err("out of free inodes, can not create file '%s'\n",
+  name);
+   return failed_creating(fs, dentry);
+   }
+
+   return dentry;
+}
+EXPORT_SYMBOL(simplefs_create_dentry);
+
+struct dentry *simplefs_create_file(struct simple_fs *fs, struct file_system_type *type,
+   const char *name, umode_t mode,
+   struct dentry *parent, void *data,
+   struct inode **inode)
+{
+   struct dentry *dentry;
+
+   WARN_ON((mode & S_IFMT) && !S_ISREG(mode));
+   mode |= S_IFREG;
+
+   dentry = simplefs_create_dentry(fs, type, name, parent, inode);
+
+   if (IS_ERR(dentry))
+   return dentry;
+
+   (*inode)->i_mode = mode;
+   (*inode)->i_private = data;
+
+   return dentry;
+}
+EXPORT_SYMBOL(simplefs_create_file);
+
+struct dentry *simplefs_finish_dentry(struct dentry *dentry, struct inode *inode)
+{
+   d_instantiate(dentry, inode);
+   if (S_ISDIR(inode->i_mode)) {
+   inc_nlink(d_inode(dentry->d_parent));
+   fsnotify_mkdir(d_inode(dentry->d_parent), dentry);
+   } else {
+   fsnotify_create(d_inode(dentry->d_parent), dentry);
+   }
+   inode_unlock(d_inode(dentry->d_parent));
+   return dentry;
+}
+EXPORT_SYMBOL(simplefs_finish_dentry);
+
+struct dentry *simplefs_create_dir(struct simple_fs *fs, struct file_system_type *type,
+  const char *name, umode_t mode, struct dentry *parent,
+  struct inode **inode)
+{
+   struct dentry *dentry;
+
+   WARN_ON((mode & S_IFMT) && !S_ISDIR(mode));
+   mode |= S_IFDIR;
+
+   dentry = simplefs_create_dentry(fs, type, name, parent, inode);
+   if (IS_ERR(dentry))
+   return dentry;
+
+   (*inode)->i_mode = mode;
+   (*inode)->i_op = &simple_dir_inode_operations;
+   

[PATCH 5/8] simplefs: add alloc_anon_inode wrapper

2020-04-14 Thread Emanuele Giuseppe Esposito
Start adding file creation wrappers; the simplest one returns an
anonymous inode.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 drivers/gpu/drm/drm_drv.c   | 2 +-
 drivers/misc/cxl/api.c  | 2 +-
 drivers/scsi/cxlflash/ocxl_hw.c | 2 +-
 fs/simplefs.c   | 6 ++
 include/linux/simplefs.h| 2 ++
 5 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c
index b4b357725be2..4e4ea1bf312c 100644
--- a/drivers/gpu/drm/drm_drv.c
+++ b/drivers/gpu/drm/drm_drv.c
@@ -539,7 +539,7 @@ static struct inode *drm_fs_inode_new(void)
return ERR_PTR(r);
}
 
-   inode = alloc_anon_inode(drm_fs.mount->mnt_sb);
+   inode = simple_alloc_anon_inode(&drm_fs);
if (IS_ERR(inode))
simple_release_fs(&drm_fs);
 
diff --git a/drivers/misc/cxl/api.c b/drivers/misc/cxl/api.c
index 6c6566d8bc17..a3d2682eb3a7 100644
--- a/drivers/misc/cxl/api.c
+++ b/drivers/misc/cxl/api.c
@@ -73,7 +73,7 @@ static struct file *cxl_getfile(const char *name,
goto err_module;
}
 
-   inode = alloc_anon_inode(cxl_fs.mount->mnt_sb);
+   inode = simple_alloc_anon_inode(&cxl_fs);
if (IS_ERR(inode)) {
file = ERR_CAST(inode);
goto err_fs;
diff --git a/drivers/scsi/cxlflash/ocxl_hw.c b/drivers/scsi/cxlflash/ocxl_hw.c
index 23afde0c6c0e..770fdf186028 100644
--- a/drivers/scsi/cxlflash/ocxl_hw.c
+++ b/drivers/scsi/cxlflash/ocxl_hw.c
@@ -86,7 +86,7 @@ static struct file *ocxlflash_getfile(struct device *dev, const char *name,
goto err2;
}
 
-   inode = alloc_anon_inode(ocxlflash_fs.mount->mnt_sb);
+   inode = simple_alloc_anon_inode(&ocxlflash_fs);
if (IS_ERR(inode)) {
rc = PTR_ERR(inode);
dev_err(dev, "%s: alloc_anon_inode failed rc=%d\n",
diff --git a/fs/simplefs.c b/fs/simplefs.c
index 790d8beb9cc3..c59eb8d996be 100644
--- a/fs/simplefs.c
+++ b/fs/simplefs.c
@@ -36,3 +36,9 @@ void simple_release_fs(struct simple_fs *fs)
mntput(mnt);
 }
 EXPORT_SYMBOL(simple_release_fs);
+
+struct inode *simple_alloc_anon_inode(struct simple_fs *fs)
+{
+   return alloc_anon_inode(fs->mount->mnt_sb);
+}
+EXPORT_SYMBOL(simple_alloc_anon_inode);
diff --git a/include/linux/simplefs.h b/include/linux/simplefs.h
index 18010414a16f..c62ab526414e 100644
--- a/include/linux/simplefs.h
+++ b/include/linux/simplefs.h
@@ -12,4 +12,6 @@ struct simple_fs {
 extern int simple_pin_fs(struct simple_fs *, struct file_system_type *);
 extern void simple_release_fs(struct simple_fs *);
 
+extern struct inode *simple_alloc_anon_inode(struct simple_fs *fs);
+
 #endif
-- 
2.25.2


