Re: [PATCH 2/2] tools: do not include scripts/Kbuild.include

2021-04-15 Thread Christian Borntraeger



On 15.04.21 10:06, Christian Borntraeger wrote:


On 15.04.21 09:27, Masahiro Yamada wrote:

Since commit d9f4ff50d2aa ("kbuild: spilt cc-option and friends to
scripts/Makefile.compiler"), some kselftests fail to build.

The tools/ directory opted out Kbuild, and went in a different
direction. They copy any kind of files to the tools/ directory
in order to do whatever they want to do in their world.

tools/build/Build.include mimics scripts/Kbuild.include, but some
tool Makefiles included the Kbuild one to import a feature that is
missing in tools/build/Build.include:

  - Commit ec04aa3ae87b ("tools/thermal: tmon: use "-fstack-protector"
    only if supported") included scripts/Kbuild.include from
    tools/thermal/tmon/Makefile to import the cc-option macro.

  - Commit c2390f16fc5b ("selftests: kvm: fix for compilers that do
    not support -no-pie") included scripts/Kbuild.include from
    tools/testing/selftests/kvm/Makefile to import the try-run macro.

  - Commit 9cae4ace80ef ("selftests/bpf: do not ignore clang
    failures") included scripts/Kbuild.include from
    tools/testing/selftests/bpf/Makefile to import the .DELETE_ON_ERROR
    target.

  - Commit 0695f8bca93e ("selftests/powerpc: Handle Makefile for
    unrecognized option") included scripts/Kbuild.include from
    tools/testing/selftests/powerpc/pmu/ebb/Makefile to import the
    try-run macro.

Copy what they want there, and stop including scripts/Kbuild.include
from the tool Makefiles.

Link: 
https://lore.kernel.org/lkml/86dadf33-70f7-a5ac-cb8c-64966d2f4...@linux.ibm.com/
Fixes: d9f4ff50d2aa ("kbuild: spilt cc-option and friends to 
scripts/Makefile.compiler")
Reported-by: Janosch Frank 
Reported-by: Christian Borntraeger 
Signed-off-by: Masahiro Yamada 


When applying this on top of d9f4ff50d2aa ("kbuild: spilt cc-option and friends to 
scripts/Makefile.compiler")

I still do get

#  Test Assertion Failure 
#   lib/kvm_util.c:142: vm->fd >= 0
#   pid=315635 tid=315635 - Invalid argument
#  1    0x01002f4b: vm_open at kvm_util.c:142
#  2 (inlined by) vm_create at kvm_util.c:258
#  3    0x010015ef: test_add_max_memory_regions at 
set_memory_region_test.c:351
#  4 (inlined by) main at set_memory_region_test.c:397
#  5    0x03ff971abb89: ?? ??:0
#  6    0x010017ad: .annobin_abi_note.c.hot at crt1.o:?
#   KVM_CREATE_VM ioctl failed, rc: -1 errno: 22
not ok 7 selftests: kvm: set_memory_region_test # exit=254

and the testcase compilation does not pickup the pgste option.



What does work is the following:
diff --git a/tools/testing/selftests/kvm/Makefile 
b/tools/testing/selftests/kvm/Makefile
index a6d61f451f88..d9c6d9c2069e 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
 include ../../../../scripts/Kbuild.include
+include ../../../../scripts/Makefile.compiler
 
 all:
 


as it does pickup the linker option handling.




Re: [PATCH v1 1/5] mm: pagewalk: Fix walk for hugepage tables

2021-04-15 Thread Christophe Leroy




Le 16/04/2021 à 00:43, Daniel Axtens a écrit :

Hi Christophe,


Pagewalk ignores hugepd entries and walk down the tables
as if it was traditionnal entries, leading to crazy result.

Add walk_hugepd_range() and use it to walk hugepage tables.

Signed-off-by: Christophe Leroy 
---
  mm/pagewalk.c | 54 +--
  1 file changed, 48 insertions(+), 6 deletions(-)

diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index e81640d9f177..410a9d8f7572 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -58,6 +58,32 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, 
unsigned long end,
return err;
  }
  
+static int walk_hugepd_range(hugepd_t *phpd, unsigned long addr,

+unsigned long end, struct mm_walk *walk, int 
pdshift)
+{
+   int err = 0;
+#ifdef CONFIG_ARCH_HAS_HUGEPD
+   const struct mm_walk_ops *ops = walk->ops;
+   int shift = hugepd_shift(*phpd);
+   int page_size = 1 << shift;
+
+   if (addr & (page_size - 1))
+   return 0;
+
+   for (;;) {
+   pte_t *pte = hugepte_offset(*phpd, addr, pdshift);
+
+   err = ops->pte_entry(pte, addr, addr + page_size, walk);
+   if (err)
+   break;
+   if (addr >= end - page_size)
+   break;
+   addr += page_size;
+   }


Initially I thought this was a somewhat unintuitive way to structure
this loop, but I see it parallels the structure of walk_pte_range_inner,
so I think the consistency is worth it.

I notice the pte walking code potentially takes some locks: does this
code need to do that?

arch/powerpc/mm/hugetlbpage.c says that hugepds are protected by the
mm->page_table_lock, but I don't think we're taking it in this code.


I'll add it, thanks.




+#endif
+   return err;
+}
+
  static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
  struct mm_walk *walk)
  {
@@ -108,7 +134,10 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, 
unsigned long end,
goto again;
}
  
-		err = walk_pte_range(pmd, addr, next, walk);

+   if (is_hugepd(__hugepd(pmd_val(*pmd
+   err = walk_hugepd_range((hugepd_t *)pmd, addr, next, 
walk, PMD_SHIFT);
+   else
+   err = walk_pte_range(pmd, addr, next, walk);
if (err)
break;
} while (pmd++, addr = next, addr != end);
@@ -157,7 +186,10 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, 
unsigned long end,
if (pud_none(*pud))
goto again;
  
-		err = walk_pmd_range(pud, addr, next, walk);

+   if (is_hugepd(__hugepd(pud_val(*pud
+   err = walk_hugepd_range((hugepd_t *)pud, addr, next, 
walk, PUD_SHIFT);
+   else
+   err = walk_pmd_range(pud, addr, next, walk);


I'm a bit worried you might end up calling into walk_hugepd_range with
ops->pte_entry == NULL, and then jumping to 0.


You are right, I missed it.
I'll bail out of walk_hugepd_range() when ops->pte_entry is NULL.




static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
  struct mm_walk *walk)
{
...
 pud = pud_offset(p4d, addr);
do {
 ...
 if ((!walk->vma && (pud_leaf(*pud) || !pud_present(*pud))) ||
walk->action == ACTION_CONTINUE ||
!(ops->pmd_entry || ops->pte_entry)) <<< THIS CHECK
continue;
 ...
if (is_hugepd(__hugepd(pud_val(*pud
err = walk_hugepd_range((hugepd_t *)pud, addr, next, 
walk, PUD_SHIFT);
else
err = walk_pmd_range(pud, addr, next, walk);
if (err)
break;
} while (pud++, addr = next, addr != end);

walk_pud_range will proceed if there is _either_ an ops->pmd_entry _or_
an ops->pte_entry, but walk_hugepd_range will call ops->pte_entry
unconditionally.

The same issue applies to walk_{p4d,pgd}_range...

Kind regards,
Daniel



Thanks
Christophe


Re: [PATCH 1/3] powerpc/smp: Reintroduce cpu_core_mask

2021-04-15 Thread Srikar Dronamraju
* David Gibson  [2021-04-16 13:21:34]:

Thanks for having a look at the patches.

> On Thu, Apr 15, 2021 at 05:39:32PM +0530, Srikar Dronamraju wrote:
> > Daniel reported that with Commit 4ca234a9cbd7 ("powerpc/smp: Stop
> > updating cpu_core_mask") QEMU was unable to set single NUMA node SMP
> > topologies such as:
> >  -smp 8,maxcpus=8,cores=2,threads=2,sockets=2
> >  i.e he expected 2 sockets in one NUMA node.
> 
> Well, strictly speaking, you can still set that toplogy in qemu but a
> PAPR guest with that commit will show as having 1 socket in lscpu and
> similar things.
> 

Right, I did mention the o/p of lscpu in QEMU with the said commit and
with the new patches in the cover letter. Somehow I goofed up the cc
list for the cover letter.

Reference for the cover letter:
https://lore.kernel.org/linuxppc-dev/20210415120934.232271-1-sri...@linux.vnet.ibm.com/t/#u

> Basically, this is because PAPR has no meaningful distinction between
> cores and sockets.  So it's kind of a cosmetic problem, but it is a
> user-unexpected behaviour that it would be nice to avoid if it's not
> excessively difficult.
> 
> > The above commit helped to reduce boot time on Large Systems for
> > example 4096 vCPU single socket QEMU instance. PAPR is silent on
> > having more than one socket within a NUMA node.
> > 
> > cpu_core_mask and cpu_cpu_mask for any CPU would be same unless the
> > number of sockets is different from the number of NUMA nodes.
> 
> Number of sockets being different from number of NUMA nodes is routine
> in qemu, and I don't think it's something we should enforce.
> 
> > One option is to reintroduce cpu_core_mask but use a slightly
> > different method to arrive at the cpu_core_mask. Previously each CPU's
> > chip-id would be compared with all other CPU's chip-id to verify if
> > both the CPUs were related at the chip level. Now if a CPU 'A' is
> > found related / (unrelated) to another CPU 'B', all the thread
> > siblings of 'A' and thread siblings of 'B' are automatically marked as
> > related / (unrelated).
> > 
> > Also if a platform doesn't support ibm,chip-id property, i.e its
> > cpu_to_chip_id returns -1, cpu_core_map holds a copy of
> > cpu_cpu_mask().
> 
> Yeah, the other weirdness here is that ibm,chip-id isn't a PAPR
> property at all - it was added for powernv.  We then added it to qemu
> for PAPR guests because that was the way at the time to get the guest
> to advertise the expected number of sockets.  It therefore basically
> *only* exists on PAPR/qemu for that purpose, so if it's not serving it
> we need to come up with something else.
> 

Do you have ideas on what that something could be like? So if that's
more beneficial then we could move over to that scheme. Also apart
from ibm,chip-id being not a PAPR property, do you have any other
concerns with it.


-- 
Thanks and Regards
Srikar Dronamraju


Re: [PATCH v1 4/5] mm: ptdump: Support hugepd table entries

2021-04-15 Thread Christophe Leroy

Hi Daniel,

Le 16/04/2021 à 01:29, Daniel Axtens a écrit :

Hi Christophe,


Which hugepd, page table entries can be at any level
and can be of any size.

Add support for them.

Signed-off-by: Christophe Leroy 
---
  mm/ptdump.c | 17 +++--
  1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/mm/ptdump.c b/mm/ptdump.c
index 61cd16afb1c8..6efdb8c15a7d 100644
--- a/mm/ptdump.c
+++ b/mm/ptdump.c
@@ -112,11 +112,24 @@ static int ptdump_pte_entry(pte_t *pte, unsigned long 
addr,
  {
struct ptdump_state *st = walk->private;
pte_t val = ptep_get(pte);
+   unsigned long page_size = next - addr;
+   int level;
+
+   if (page_size >= PGDIR_SIZE)
+   level = 0;
+   else if (page_size >= P4D_SIZE)
+   level = 1;
+   else if (page_size >= PUD_SIZE)
+   level = 2;
+   else if (page_size >= PMD_SIZE)
+   level = 3;
+   else
+   level = 4;
  
  	if (st->effective_prot)

-   st->effective_prot(st, 4, pte_val(val));
+   st->effective_prot(st, level, pte_val(val));
  
-	st->note_page(st, addr, 4, pte_val(val), PAGE_SIZE);

+   st->note_page(st, addr, level, pte_val(val), page_size);


It seems to me that passing both level and page_size is a bit redundant,
but I guess it does reduce the impact on each arch's code?


Exactly, as shown above, the level can be re-calculated based on the page size, but it would be a 
unnecessary impact on all architectures and would duplicate the re-calculation of the level whereas 
in most cases we get it for free from the caller.




Kind regards,
Daniel

  
  	return 0;

  }
--
2.25.0


Re: [PATCH v1 3/5] mm: ptdump: Provide page size to notepage()

2021-04-15 Thread Christophe Leroy




Le 16/04/2021 à 01:12, Daniel Axtens a écrit :

Hi Christophe,


  static void note_page(struct ptdump_state *pt_st, unsigned long addr, int 
level,
- u64 val)
+ u64 val, unsigned long page_size)


Compilers can warn about unused parameters at -Wextra level.  However,
reading scripts/Makefile.extrawarn it looks like the warning is
explicitly _disabled_ in the kernel at W=1 and not reenabled at W=2 or
W=3. So I guess this is fine...


There are a lot lot lot functions having unused parameters in the kernel , especially the ones that 
are re-implemented by each architecture.





@@ -126,7 +126,7 @@ static int ptdump_hole(unsigned long addr, unsigned long 
next,
  {
struct ptdump_state *st = walk->private;
  
-	st->note_page(st, addr, depth, 0);

+   st->note_page(st, addr, depth, 0, 0);


I know it doesn't matter at this point, but I'm not really thrilled by
the idea of passing 0 as the size here. Doesn't the hole have a known
page size?


The hole has a size for sure, I don't think we can call it a page size:

On powerpc 8xx, we have 4 page sizes: 8M, 512k, 16k and 4k.
A page table will cover 4M areas and will contain pages of size 512k, 16k and 
4k.
A PGD table contains either entries which points to a page table (covering 4M), or two identical 
consecutive entries pointing to the same hugepd which contains a single PTE for an 8M page.


So, if a PGD entry is empty, the hole is 4M, it corresponds to none of the page sizes the 
architecture supports.



But looking at what is done with that size, it can make sense to pass it to notepage() anyway. Let's 
do that.




  
  	return 0;

  }
@@ -153,5 +153,5 @@ void ptdump_walk_pgd(struct ptdump_state *st, struct 
mm_struct *mm, pgd_t *pgd)
mmap_read_unlock(mm);
  
  	/* Flush out the last page */

-   st->note_page(st, 0, -1, 0);
+   st->note_page(st, 0, -1, 0, 0);


I'm more OK with the idea of passing 0 as the size when the depth is -1
(don't know): if we don't know the depth we conceptually can't know the
page size.

Regards,
Daniel



Re: [PATCH v1 1/1] powerpc/papr_scm: Properly handle UUID types and API

2021-04-15 Thread Aneesh Kumar K.V

On 4/15/21 7:16 PM, Andy Shevchenko wrote:

Parse to and export from UUID own type, before dereferencing.
This also fixes wrong comment (Little Endian UUID is something else)
and should fix Sparse warnings about assigning strict types to POD.



I am wondering whether this will break older namespace created. IIRC 
that cpu_to_le64 was done to be backward compatible with namespaces 
created before 259a948c4ba1.


What we need to test is create a namespace in little endian kernel and 
read it back in via big endian and vice versa. Also we need to make sure 
we can read the already created namespace before this patch.




Fixes: 43001c52b603 ("powerpc/papr_scm: Use ibm,unit-guid as the iset cookie")
Fixes: 259a948c4ba1 ("powerpc/pseries/scm: Use a specific endian format for storing 
uuid from the device tree")
Cc: Oliver O'Halloran 
Cc: Aneesh Kumar K.V 
Signed-off-by: Andy Shevchenko 
---
Not tested
  arch/powerpc/platforms/pseries/papr_scm.c | 13 -
  1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/papr_scm.c 
b/arch/powerpc/platforms/pseries/papr_scm.c
index ae6f5d80d5ce..4366e1902890 100644
--- a/arch/powerpc/platforms/pseries/papr_scm.c
+++ b/arch/powerpc/platforms/pseries/papr_scm.c
@@ -1085,8 +1085,9 @@ static int papr_scm_probe(struct platform_device *pdev)
u32 drc_index, metadata_size;
u64 blocks, block_size;
struct papr_scm_priv *p;
+   u8 uuid_raw[UUID_SIZE];
const char *uuid_str;
-   u64 uuid[2];
+   uuid_t uuid;
int rc;
  
  	/* check we have all the required DT properties */

@@ -1129,16 +1130,18 @@ static int papr_scm_probe(struct platform_device *pdev)
p->hcall_flush_required = of_property_read_bool(dn, 
"ibm,hcall-flush-required");
  
  	/* We just need to ensure that set cookies are unique across */

-   uuid_parse(uuid_str, (uuid_t *) uuid);
+   uuid_parse(uuid_str, );
+
/*
 * cookie1 and cookie2 are not really little endian
-* we store a little endian representation of the
+* we store a raw buffer representation of the
 * uuid str so that we can compare this with the label
 * area cookie irrespective of the endian config with which
 * the kernel is built.
 */
-   p->nd_set.cookie1 = cpu_to_le64(uuid[0]);
-   p->nd_set.cookie2 = cpu_to_le64(uuid[1]);
+   export_uuid(uuid_raw, );
+   p->nd_set.cookie1 = get_unaligned_le64(_raw[0]);
+   p->nd_set.cookie2 = get_unaligned_le64(_raw[8]);
  
  	/* might be zero */

p->metadata_size = metadata_size;





Re: [PATCH v3] powerpc: fix EDEADLOCK redefinition error in uapi/asm/errno.h

2021-04-15 Thread Tony Ambardar
Hello Michael,

The latest version of this patch addressed all feedback I'm aware of
when submitted last September, and I've seen no further comments from
reviewers since then.

Could you please let me know where this stands and if anything further
is needed?

Kind regards,
Tony

On Thu, 17 Sept 2020 at 06:54, Tony Ambardar  wrote:
>
> A few archs like powerpc have different errno.h values for macros
> EDEADLOCK and EDEADLK. In code including both libc and linux versions of
> errno.h, this can result in multiple definitions of EDEADLOCK in the
> include chain. Definitions to the same value (e.g. seen with mips) do
> not raise warnings, but on powerpc there are redefinitions changing the
> value, which raise warnings and errors (if using "-Werror").
>
> Guard against these redefinitions to avoid build errors like the following,
> first seen cross-compiling libbpf v5.8.9 for powerpc using GCC 8.4.0 with
> musl 1.1.24:
>
>   In file included from ../../arch/powerpc/include/uapi/asm/errno.h:5,
>from ../../include/linux/err.h:8,
>from libbpf.c:29:
>   ../../include/uapi/asm-generic/errno.h:40: error: "EDEADLOCK" redefined 
> [-Werror]
>#define EDEADLOCK EDEADLK
>
>   In file included from 
> toolchain-powerpc_8540_gcc-8.4.0_musl/include/errno.h:10,
>from libbpf.c:26:
>   toolchain-powerpc_8540_gcc-8.4.0_musl/include/bits/errno.h:58: note: this 
> is the location of the previous definition
>#define EDEADLOCK   58
>
>   cc1: all warnings being treated as errors
>
> CC: Stable 
> Reported-by: Rosen Penev 
> Signed-off-by: Tony Ambardar 
> ---
> v1 -> v2:
>  * clean up commit description formatting
>
> v2 -> v3: (per Michael Ellerman)
>  * drop indeterminate 'Fixes' tags, request stable backports instead
> ---
>  arch/powerpc/include/uapi/asm/errno.h   | 1 +
>  tools/arch/powerpc/include/uapi/asm/errno.h | 1 +
>  2 files changed, 2 insertions(+)
>
> diff --git a/arch/powerpc/include/uapi/asm/errno.h 
> b/arch/powerpc/include/uapi/asm/errno.h
> index cc79856896a1..4ba87de32be0 100644
> --- a/arch/powerpc/include/uapi/asm/errno.h
> +++ b/arch/powerpc/include/uapi/asm/errno.h
> @@ -2,6 +2,7 @@
>  #ifndef _ASM_POWERPC_ERRNO_H
>  #define _ASM_POWERPC_ERRNO_H
>
> +#undef EDEADLOCK
>  #include 
>
>  #undef EDEADLOCK
> diff --git a/tools/arch/powerpc/include/uapi/asm/errno.h 
> b/tools/arch/powerpc/include/uapi/asm/errno.h
> index cc79856896a1..4ba87de32be0 100644
> --- a/tools/arch/powerpc/include/uapi/asm/errno.h
> +++ b/tools/arch/powerpc/include/uapi/asm/errno.h
> @@ -2,6 +2,7 @@
>  #ifndef _ASM_POWERPC_ERRNO_H
>  #define _ASM_POWERPC_ERRNO_H
>
> +#undef EDEADLOCK
>  #include 
>
>  #undef EDEADLOCK
> --
> 2.25.1
>


[PATCH 8/8] CMDLINE: arm64: convert to generic builtin command line

2021-04-15 Thread Daniel Walker
This removes arm64 from the device tree handling of the
command line arguments.

The boot_command_line variable is populated inside the earliest
user of the command line, which is in idreg-override.c.

The device tree should not be needed to do any further handling
of the boot command line options.

Cc: xe-linux-exter...@cisco.com
Signed-off-by: Daniel Walker 
---
 arch/arm64/Kconfig | 33 +-
 arch/arm64/include/asm/setup.h |  2 ++
 arch/arm64/kernel/idreg-override.c |  9 
 3 files changed, 8 insertions(+), 36 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index e4e1b6550115..9781ba3758b1 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -105,6 +105,7 @@ config ARM64
select GENERIC_ALLOCATOR
select GENERIC_ARCH_TOPOLOGY
select GENERIC_CLOCKEVENTS_BROADCAST
+   select GENERIC_CMDLINE
select GENERIC_CPU_AUTOPROBE
select GENERIC_CPU_VULNERABILITIES
select GENERIC_EARLY_IOREMAP
@@ -1841,38 +1842,6 @@ config ARM64_ACPI_PARKING_PROTOCOL
  protocol even if the corresponding data is present in the ACPI
  MADT table.
 
-config CMDLINE
-   string "Default kernel command string"
-   default ""
-   help
- Provide a set of default command-line options at build time by
- entering them here. As a minimum, you should specify the the
- root device (e.g. root=/dev/nfs).
-
-choice
-   prompt "Kernel command line type" if CMDLINE != ""
-   default CMDLINE_FROM_BOOTLOADER
-   help
- Choose how the kernel will handle the provided default kernel
- command line string.
-
-config CMDLINE_FROM_BOOTLOADER
-   bool "Use bootloader kernel arguments if available"
-   help
- Uses the command-line options passed by the boot loader. If
- the boot loader doesn't provide any, the default kernel command
- string provided in CMDLINE will be used.
-
-config CMDLINE_FORCE
-   bool "Always use the default kernel command string"
-   help
- Always use the default kernel command string, even if the boot
- loader passes other arguments to the kernel.
- This is useful if you cannot or don't want to change the
- command-line options your boot loader passes to the kernel.
-
-endchoice
-
 config EFI_STUB
bool
 
diff --git a/arch/arm64/include/asm/setup.h b/arch/arm64/include/asm/setup.h
index d3320618ed14..1f5b6d8f2433 100644
--- a/arch/arm64/include/asm/setup.h
+++ b/arch/arm64/include/asm/setup.h
@@ -5,7 +5,9 @@
 
 #include 
 
+#ifndef __ASSEMBLY__
 void *get_early_fdt_ptr(void);
 void early_fdt_map(u64 dt_phys);
+#endif /* __ASSEMBLY__ */
 
 #endif
diff --git a/arch/arm64/kernel/idreg-override.c 
b/arch/arm64/kernel/idreg-override.c
index 83f1c4b92095..0a3fcae13043 100644
--- a/arch/arm64/kernel/idreg-override.c
+++ b/arch/arm64/kernel/idreg-override.c
@@ -9,6 +9,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -188,11 +189,11 @@ static __init void parse_cmdline(void)
 {
const u8 *prop = get_bootargs_cmdline();
 
-   if (IS_ENABLED(CONFIG_CMDLINE_FORCE) || !prop)
-   __parse_cmdline(CONFIG_CMDLINE, true);
+   strscpy(boot_command_line, prop, COMMAND_LINE_SIZE);
+   cmdline_add_builtin(boot_command_line);
+
+   __parse_cmdline(boot_command_line, true);
 
-   if (!IS_ENABLED(CONFIG_CMDLINE_FORCE) && prop)
-   __parse_cmdline(prop, true);
 }
 
 /* Keep checkers quiet */
-- 
2.25.1



[PATCH 7/8] of: allow sending a NULL value to early_init_dt_scan_chosen

2021-04-15 Thread Daniel Walker
It's possible that an architecture may want to populate
boot_command_line before calling the device tree code.
Currently, early_init_dt_scan_chosen won't accept a NULL
in the data parameter and it returns immediately if you
send one.

I changed early_init_dt_scan_nodes() to send a NULL into
early_init_dt_scan_chosen() , then I made
early_init_dt_scan_chosen() to do the initrd checking, and
the rng-seed checking and skip all the command line related
code.

Given lots of changes to the command line, I think it makes sense
to allow the initrd code and rng-seed code to be run without
forcing the command line handling. I'm also submitting changes
to arm64 which populate boot_command_line much early and this
device tree code overwrites boot_command_line in that case.

This code depends on all architecture to have a NULL
boot_command_line at boot up when this function runs, unless
it's already populated.

This code was boot tested on powerpc 32bit, x86, and arm64.

Cc: xe-linux-exter...@cisco.com
Signed-off-by: Daniel Walker 
---
 drivers/of/fdt.c | 44 +---
 1 file changed, 25 insertions(+), 19 deletions(-)

diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c
index adb26aff481d..a1fda952ce60 100644
--- a/drivers/of/fdt.c
+++ b/drivers/of/fdt.c
@@ -1052,36 +1052,38 @@ int __init early_init_dt_scan_chosen(unsigned long 
node, const char *uname,
 
pr_debug("search \"chosen\", depth: %d, uname: %s\n", depth, uname);
 
-   if (depth != 1 || !data ||
-   (strcmp(uname, "chosen") != 0 && strcmp(uname, "chosen@0") != 0))
+   if (depth != 1 || (strcmp(uname, "chosen") != 0
+   && strcmp(uname, "chosen@0") != 0))
return 0;
 
early_init_dt_check_for_initrd(node);
 
-   /* Retrieve command line */
-   p = of_get_flat_dt_prop(node, "bootargs", );
-   if (p != NULL && l > 0)
-   strlcpy(data, p, min(l, COMMAND_LINE_SIZE));
+   if (data) {
+   /* Retrieve command line */
+   p = of_get_flat_dt_prop(node, "bootargs", );
+   if (p != NULL && l > 0)
+   strlcpy(data, p, min(l, COMMAND_LINE_SIZE));
 
-   /*
-* CONFIG_CMDLINE is meant to be a default in case nothing else
-* managed to set the command line, unless CONFIG_CMDLINE_FORCE
-* is set in which case we override whatever was found earlier.
-*/
+   /*
+* CONFIG_CMDLINE is meant to be a default in case nothing else
+* managed to set the command line, unless CONFIG_CMDLINE_FORCE
+* is set in which case we override whatever was found earlier.
+*/
 #ifdef CONFIG_CMDLINE
 #if defined(CONFIG_CMDLINE_EXTEND)
-   strlcat(data, " ", COMMAND_LINE_SIZE);
-   strlcat(data, CONFIG_CMDLINE, COMMAND_LINE_SIZE);
+   strlcat(data, " ", COMMAND_LINE_SIZE);
+   strlcat(data, CONFIG_CMDLINE, COMMAND_LINE_SIZE);
 #elif defined(CONFIG_CMDLINE_FORCE)
-   strlcpy(data, CONFIG_CMDLINE, COMMAND_LINE_SIZE);
-#else
-   /* No arguments from boot loader, use kernel's  cmdl*/
-   if (!((char *)data)[0])
strlcpy(data, CONFIG_CMDLINE, COMMAND_LINE_SIZE);
+#else
+   /* No arguments from boot loader, use kernel's  cmdl*/
+   if (!((char *)data)[0])
+   strlcpy(data, CONFIG_CMDLINE, COMMAND_LINE_SIZE);
 #endif
 #endif /* CONFIG_CMDLINE */
 
-   pr_debug("Command line is: %s\n", (char *)data);
+   pr_debug("Command line is: %s\n", (char *)data);
+   }
 
rng_seed = of_get_flat_dt_prop(node, "rng-seed", );
if (rng_seed && l > 0) {
@@ -1202,7 +1204,11 @@ void __init early_init_dt_scan_nodes(void)
int rc = 0;
 
/* Retrieve various information from the /chosen node */
-   rc = of_scan_flat_dt(early_init_dt_scan_chosen, boot_command_line);
+   if (boot_command_line[0])
+   rc = of_scan_flat_dt(early_init_dt_scan_chosen, NULL);
+   else
+   rc = of_scan_flat_dt(early_init_dt_scan_chosen,
+   boot_command_line);
if (!rc)
pr_warn("No chosen node found, continuing without\n");
 
-- 
2.25.1



[PATCH 5/8] drivers: firmware: efi: libstub: enable generic commandline

2021-04-15 Thread Daniel Walker
This adds code to handle the generic command line changes.
The efi code appears that it doesn't benefit as much from this design
as it could.

For example, if you had a prepend command line with "nokaslr" then
you might be helpful to re-enable it in the boot loader or dts,
but there appears to be no way to re-enable kaslr or some of the
other options.

The efi command line handling is incorrect. x86 and arm have an append
system however the efi code prepends the command line.

For example, you could have a non-upgradable bios which sends

efi=disable_early_pci_dma

This hypothetically could have been set because early pci dma caused
issues on early versions of the product.

Then later the early pci dma was made to work and the company desired
to start using it. To override the bios you could set the CONFIG_CMDLINE
to,

efi=no_disable_early_pci_dma

then parsing would normally start with the bios command line, then move
to the CONFIG_CMDLINE and you would end up with early pci dma turned on.

however, current efi code keeps early pci dma off because the bios
arguments always override the built in.

Per my reading this is different from the main body of x86, arm, and
arm64.

The generic command line provides both append and prepend, so it
alleviates this issue if it's used. However not all architectures use
it.

It would be desirable to allow the efi stub to have it's builtin command
line to be modified after compile, but I don't see a feasible way to do
that currently.

Cc: xe-linux-exter...@cisco.com
Signed-off-by: Daniel Walker 
---
 .../firmware/efi/libstub/efi-stub-helper.c| 29 +++
 drivers/firmware/efi/libstub/efi-stub.c   |  9 ++
 drivers/firmware/efi/libstub/efistub.h|  1 +
 drivers/firmware/efi/libstub/x86-stub.c   | 13 +++--
 4 files changed, 50 insertions(+), 2 deletions(-)

diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c 
b/drivers/firmware/efi/libstub/efi-stub-helper.c
index aa8da0a49829..16318f55f187 100644
--- a/drivers/firmware/efi/libstub/efi-stub-helper.c
+++ b/drivers/firmware/efi/libstub/efi-stub-helper.c
@@ -13,6 +13,7 @@
 #include 
 #include 
 #include  /* For CONSOLE_LOGLEVEL_* */
+#include 
 #include 
 #include 
 
@@ -172,6 +173,34 @@ int efi_printk(const char *fmt, ...)
return printed;
 }
 
+/**
+ * efi_handle_cmdline() - handle adding in building parts of the command line
+ * @cmdline:   kernel command line
+ *
+ * Add in the generic parts of the commandline and start the parsing of the
+ * command line.
+ *
+ * Return: status code
+ */
+efi_status_t efi_handle_cmdline(char const *cmdline)
+{
+   efi_status_t status = EFI_SUCCESS;
+
+   if (sizeof(CMDLINE_STATIC_PREPEND) > 1)
+   status |= efi_parse_options(CMDLINE_STATIC_PREPEND);
+
+   if (!IS_ENABLED(CONFIG_CMDLINE_OVERRIDE))
+   status |= efi_parse_options(cmdline);
+
+   if (sizeof(CMDLINE_STATIC_APPEND) > 1)
+   status |= efi_parse_options(CMDLINE_STATIC_APPEND);
+
+   if (status != EFI_SUCCESS)
+   efi_err("Failed to parse options\n");
+
+   return status;
+}
+
 /**
  * efi_parse_options() - Parse EFI command line options
  * @cmdline:   kernel command line
diff --git a/drivers/firmware/efi/libstub/efi-stub.c 
b/drivers/firmware/efi/libstub/efi-stub.c
index 26e69788f27a..baa69b24cfdd 100644
--- a/drivers/firmware/efi/libstub/efi-stub.c
+++ b/drivers/firmware/efi/libstub/efi-stub.c
@@ -172,6 +172,14 @@ efi_status_t __efiapi efi_pe_entry(efi_handle_t handle,
goto fail;
}
 
+#ifdef CONFIG_GENERIC_CMDLINE
+   status = efi_handle_cmdline(cmdline_ptr);
+   if (status != EFI_SUCCESS) {
+   goto fail_free_cmdline;
+   }
+#endif
+
+#ifdef CONFIG_CMDLINE
if (IS_ENABLED(CONFIG_CMDLINE_EXTEND) ||
IS_ENABLED(CONFIG_CMDLINE_FORCE) ||
cmdline_size == 0) {
@@ -189,6 +197,7 @@ efi_status_t __efiapi efi_pe_entry(efi_handle_t handle,
goto fail_free_cmdline;
}
}
+#endif
 
efi_info("Booting Linux Kernel...\n");
 
diff --git a/drivers/firmware/efi/libstub/efistub.h 
b/drivers/firmware/efi/libstub/efistub.h
index cde0a2ef507d..07c7f9fdfffc 100644
--- a/drivers/firmware/efi/libstub/efistub.h
+++ b/drivers/firmware/efi/libstub/efistub.h
@@ -800,6 +800,7 @@ efi_status_t efi_relocate_kernel(unsigned long *image_addr,
 unsigned long alignment,
 unsigned long min_addr);
 
+efi_status_t efi_handle_cmdline(char const *cmdline);
 efi_status_t efi_parse_options(char const *cmdline);
 
 void efi_parse_option_graphics(char *option);
diff --git a/drivers/firmware/efi/libstub/x86-stub.c 
b/drivers/firmware/efi/libstub/x86-stub.c
index f14c4ff5839f..30ad8fb7122d 100644
--- a/drivers/firmware/efi/libstub/x86-stub.c
+++ b/drivers/firmware/efi/libstub/x86-stub.c
@@ -673,6 +673,8 @@ unsigned long efi_main(efi_handle_t handle,
   

[PATCH 6/8] CMDLINE: x86: convert to generic builtin command line

2021-04-15 Thread Daniel Walker
This updates the x86 code to use the CONFIG_GENERIC_CMDLINE
option.

Cc: xe-linux-exter...@cisco.com
Signed-off-by: Ruslan Ruslichenko 
Signed-off-by: Ruslan Bilovol 
Signed-off-by: Daniel Walker 
---
 arch/x86/Kconfig| 44 +
 arch/x86/kernel/setup.c | 18 ++---
 2 files changed, 3 insertions(+), 59 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2792879d398e..73ea9589e50d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -118,6 +118,7 @@ config X86
select EDAC_SUPPORT
select GENERIC_CLOCKEVENTS_BROADCASTif X86_64 || (X86_32 && 
X86_LOCAL_APIC)
select GENERIC_CLOCKEVENTS_MIN_ADJUST
+   select GENERIC_CMDLINE
select GENERIC_CMOS_UPDATE
select GENERIC_CPU_AUTOPROBE
select GENERIC_CPU_VULNERABILITIES
@@ -2358,49 +2359,6 @@ choice
 
 endchoice
 
-config CMDLINE_BOOL
-   bool "Built-in kernel command line"
-   help
- Allow for specifying boot arguments to the kernel at
- build time.  On some systems (e.g. embedded ones), it is
- necessary or convenient to provide some or all of the
- kernel boot arguments with the kernel itself (that is,
- to not rely on the boot loader to provide them.)
-
- To compile command line arguments into the kernel,
- set this option to 'Y', then fill in the
- boot arguments in CONFIG_CMDLINE.
-
- Systems with fully functional boot loaders (i.e. non-embedded)
- should leave this option set to 'N'.
-
-config CMDLINE
-   string "Built-in kernel command string"
-   depends on CMDLINE_BOOL
-   default ""
-   help
- Enter arguments here that should be compiled into the kernel
- image and used at boot time.  If the boot loader provides a
- command line at boot time, it is appended to this string to
- form the full kernel command line, when the system boots.
-
- However, you can use the CONFIG_CMDLINE_OVERRIDE option to
- change this behavior.
-
- In most cases, the command line (whether built-in or provided
- by the boot loader) should specify the device for the root
- file system.
-
-config CMDLINE_OVERRIDE
-   bool "Built-in command line overrides boot loader arguments"
-   depends on CMDLINE_BOOL && CMDLINE != ""
-   help
- Set this option to 'Y' to have the kernel ignore the boot loader
- command line, and use ONLY the built-in command line.
-
- This is used to work around broken boot loaders.  This should
- be set to 'N' under normal conditions.
-
 config MODIFY_LDT_SYSCALL
bool "Enable the LDT (local descriptor table)" if EXPERT
default y
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 5ecd69a48393..cd2aa33c44d7 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -47,6 +47,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /*
  * max_low_pfn_mapped: highest directly mapped pfn < 4 GB
@@ -161,9 +162,6 @@ unsigned long saved_video_mode;
 #define RAMDISK_LOAD_FLAG  0x4000
 
 static char __initdata command_line[COMMAND_LINE_SIZE];
-#ifdef CONFIG_CMDLINE_BOOL
-static char __initdata builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE;
-#endif
 
 #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
 struct edd edd;
@@ -883,19 +881,7 @@ void __init setup_arch(char **cmdline_p)
bss_resource.start = __pa_symbol(__bss_start);
bss_resource.end = __pa_symbol(__bss_stop)-1;
 
-#ifdef CONFIG_CMDLINE_BOOL
-#ifdef CONFIG_CMDLINE_OVERRIDE
-   strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
-#else
-   if (builtin_cmdline[0]) {
-   /* append boot loader cmdline to builtin */
-   strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE);
-   strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE);
-   strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
-   }
-#endif
-#endif
-
+   cmdline_add_builtin(boot_command_line);
strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
*cmdline_p = command_line;
 
-- 
2.25.1



[PATCH 4/8] CMDLINE: mips: convert to generic builtin command line

2021-04-15 Thread Daniel Walker
This updates the mips code to use the CONFIG_GENERIC_CMDLINE
option.

This deletes the option for MIPS_CMDLINE_BUILTIN_EXTEND
and replaces the functionality with generic code.

Of note, the pic32 has some strange handling of the current built
in command line. It was converted to use the static variant which
can't be updated after compilation. It should eventually be updated
to use to append and prepend symbols.

This includes a scripted mass convert of the config files to use
the new generic cmdline. There is a bit of a trim effect here.
It would seems that some of the config haven't been trimmed in
a while.

The script used is as follows,

if [[ -z "$1" || -z "$2" ]]; then
echo "Two arguments are needed."
exit 1
fi
mkdir $1
cp $2 $1/.config
sed -i 's/CONFIG_CMDLINE=/CONFIG_CMDLINE_BOOL=y\nCONFIG_CMDLINE_PREPEND=/g' 
$1/.config
make ARCH=$1 O=$1 olddefconfig
make ARCH=$1 O=$1 savedefconfig
cp $1/defconfig $2
rm -Rf $1

Cc: xe-linux-exter...@cisco.com
Signed-off-by: Ruslan Ruslichenko 
Signed-off-by: Ruslan Bilovol 
Signed-off-by: Daniel Walker 
---
 arch/mips/Kconfig |  4 +--
 arch/mips/Kconfig.debug   | 44 ---
 arch/mips/configs/ar7_defconfig   |  9 ++---
 arch/mips/configs/bcm47xx_defconfig   |  8 ++---
 arch/mips/configs/bcm63xx_defconfig   | 15 +++-
 arch/mips/configs/bmips_be_defconfig  | 11 +++---
 arch/mips/configs/bmips_stb_defconfig | 11 +++---
 arch/mips/configs/capcella_defconfig  | 11 ++
 arch/mips/configs/ci20_defconfig  | 10 +++---
 arch/mips/configs/cu1000-neo_defconfig| 10 +++---
 arch/mips/configs/cu1830-neo_defconfig| 10 +++---
 arch/mips/configs/e55_defconfig   |  4 +--
 arch/mips/configs/generic_defconfig   |  6 ++--
 arch/mips/configs/gpr_defconfig   | 18 ++
 arch/mips/configs/loongson3_defconfig | 13 ++-
 arch/mips/configs/mpc30x_defconfig|  7 ++--
 arch/mips/configs/tb0219_defconfig|  7 ++--
 arch/mips/configs/tb0226_defconfig|  7 ++--
 arch/mips/configs/tb0287_defconfig|  7 ++--
 arch/mips/configs/workpad_defconfig   | 11 +++---
 arch/mips/include/asm/setup.h |  2 ++
 arch/mips/kernel/relocate.c   | 17 +++--
 arch/mips/kernel/setup.c  | 36 +++
 arch/mips/pic32/pic32mzda/early_console.c |  2 +-
 arch/mips/pic32/pic32mzda/init.c  |  3 +-
 25 files changed, 78 insertions(+), 205 deletions(-)

diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index d89efba3d8a4..0e753894d28d 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -24,6 +24,7 @@ config MIPS
select CPU_NO_EFFICIENT_FFS if (TARGET_ISA_REV < 1)
select CPU_PM if CPU_IDLE
select GENERIC_ATOMIC64 if !64BIT
+   select GENERIC_CMDLINE
select GENERIC_CMOS_UPDATE
select GENERIC_CPU_AUTOPROBE
select GENERIC_GETTIMEOFDAY
@@ -3212,9 +3213,6 @@ choice
config MIPS_CMDLINE_FROM_BOOTLOADER
bool "Bootloader kernel arguments if available"
 
-   config MIPS_CMDLINE_BUILTIN_EXTEND
-   depends on CMDLINE_BOOL
-   bool "Extend builtin kernel arguments with bootloader arguments"
 endchoice
 
 endmenu
diff --git a/arch/mips/Kconfig.debug b/arch/mips/Kconfig.debug
index 7a8d94cdd493..b5a099c74eb6 100644
--- a/arch/mips/Kconfig.debug
+++ b/arch/mips/Kconfig.debug
@@ -30,50 +30,6 @@ config EARLY_PRINTK_8250
 config USE_GENERIC_EARLY_PRINTK_8250
bool
 
-config CMDLINE_BOOL
-   bool "Built-in kernel command line"
-   help
- For most systems, it is firmware or second stage bootloader that
- by default specifies the kernel command line options.  However,
- it might be necessary or advantageous to either override the
- default kernel command line or add a few extra options to it.
- For such cases, this option allows you to hardcode your own
- command line options directly into the kernel.  For that, you
- should choose 'Y' here, and fill in the extra boot arguments
- in CONFIG_CMDLINE.
-
- The built-in options will be concatenated to the default command
- line if CMDLINE_OVERRIDE is set to 'N'. Otherwise, the default
- command line will be ignored and replaced by the built-in string.
-
- Most MIPS systems will normally expect 'N' here and rely upon
- the command line from the firmware or the second-stage bootloader.
-
-config CMDLINE
-   string "Default kernel command string"
-   depends on CMDLINE_BOOL
-   help
- On some platforms, there is currently no way for the boot loader to
- pass arguments to the kernel.  For these platforms, and for the cases
- when you want to add some extra options to the command line or ignore
- the default command line, you can supply some command-line options at
- build time by 

[PATCH 3/8] scripts: insert-sys-cert: change name to insert-symbol

2021-04-15 Thread Daniel Walker
Since the tool is used to update the command line and/or
to update the certificates, I think it makes sense to
changes the name of this tool.

Update the name of the tool to better reflect it's new use.

Cc: xe-linux-exter...@cisco.com
Signed-off-by: Daniel Walker 
---
 scripts/Makefile   | 2 +-
 scripts/{insert-sys-cert.c => insert-symbol.c} | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)
 rename scripts/{insert-sys-cert.c => insert-symbol.c} (99%)

diff --git a/scripts/Makefile b/scripts/Makefile
index c36106bce80e..ed6b9f8f91fa 100644
--- a/scripts/Makefile
+++ b/scripts/Makefile
@@ -13,7 +13,7 @@ hostprogs-always-$(CONFIG_BUILDTIME_TABLE_SORT)   
+= sorttable
 hostprogs-always-$(CONFIG_ASN1)+= asn1_compiler
 hostprogs-always-$(CONFIG_MODULE_SIG_FORMAT)   += sign-file
 hostprogs-always-$(CONFIG_SYSTEM_TRUSTED_KEYRING)  += extract-cert
-hostprogs-always-$(CONFIG_SYSTEM_EXTRA_CERTIFICATE)+= insert-sys-cert
+hostprogs-always-$(CONFIG_SYSTEM_EXTRA_CERTIFICATE)+= insert-symbol
 
 HOSTCFLAGS_sorttable.o = -I$(srctree)/tools/include
 HOSTCFLAGS_asn1_compiler.o = -I$(srctree)/include
diff --git a/scripts/insert-sys-cert.c b/scripts/insert-symbol.c
similarity index 99%
rename from scripts/insert-sys-cert.c
rename to scripts/insert-symbol.c
index 77d3306cfbfb..6866e3a84974 100644
--- a/scripts/insert-sys-cert.c
+++ b/scripts/insert-symbol.c
@@ -7,7 +7,7 @@
  * This software may be used and distributed according to the terms
  * of the GNU General Public License, incorporated herein by reference.
  *
- * Usage: insert-sys-cert [-s  -b  -c 
+ * Usage: insert-symbol [-s  -b  -c 
  */
 
 #define _GNU_SOURCE
-- 
2.25.1



[PATCH 2/8] scripts: insert-sys-cert: add command line insert capability

2021-04-15 Thread Daniel Walker
This adds changes to the insert-sys-cert tool to allow updating
the cmdline_prepend and cmdline_append symbols in addition to
adding certificates.

Updating the cmdline symbols was tested on a PVH virtual machine
with a vmlinux, and with a bzImage which was repackaged on x86.

This commit intentionally keeps the tool filename the same to allow
the changes to be seen more easily. The next commit will change
the name of the tool.

Cc: xe-linux-exter...@cisco.com
Signed-off-by: Daniel Walker 
---
 scripts/insert-sys-cert.c | 241 +++---
 1 file changed, 170 insertions(+), 71 deletions(-)

diff --git a/scripts/insert-sys-cert.c b/scripts/insert-sys-cert.c
index 8902836c2342..77d3306cfbfb 100644
--- a/scripts/insert-sys-cert.c
+++ b/scripts/insert-sys-cert.c
@@ -30,6 +30,9 @@
 #define USED_SYM  "system_extra_cert_used"
 #define LSIZE_SYM "system_certificate_list_size"
 
+#define CMDLINE_APPEND "cmdline_append"
+#define CMDLINE_PREPEND "cmdline_prepend"
+
 #define info(format, args...) fprintf(stderr, "INFO:" format, ## args)
 #define warn(format, args...) fprintf(stdout, "WARNING: " format, ## args)
 #define  err(format, args...) fprintf(stderr, "ERROR:   " format, ## args)
@@ -267,95 +270,46 @@ static void print_sym(Elf_Ehdr *hdr, struct sym *s)
 
 static void print_usage(char *e)
 {
-   printf("Usage %s [-s ] -b  -c \n", e);
+   printf("Usage %s [-s ] -b  [ -c  | -p 
 | -a  ]-\n", e);
 }
 
-int main(int argc, char **argv)
+static char *cmdline_prepend, *cmdline_append;
+static char *system_map_file;
+static char *cert_file;
+static char *cli_name;
+
+static int insert_certificate(Elf_Ehdr *hdr)
 {
-   char *system_map_file = NULL;
-   char *vmlinux_file = NULL;
-   char *cert_file = NULL;
-   int vmlinux_size;
+   struct sym cert_sym, lsize_sym, used_sym;
+   Elf_Shdr *symtab = NULL;
+   unsigned long *lsize;
+   FILE *system_map;
int cert_size;
-   Elf_Ehdr *hdr;
char *cert;
-   FILE *system_map;
-   unsigned long *lsize;
int *used;
-   int opt;
-   Elf_Shdr *symtab = NULL;
-   struct sym cert_sym, lsize_sym, used_sym;
-
-   while ((opt = getopt(argc, argv, "b:c:s:")) != -1) {
-   switch (opt) {
-   case 's':
-   system_map_file = optarg;
-   break;
-   case 'b':
-   vmlinux_file = optarg;
-   break;
-   case 'c':
-   cert_file = optarg;
-   break;
-   default:
-   break;
-   }
-   }
 
-   if (!vmlinux_file || !cert_file) {
-   print_usage(argv[0]);
-   exit(EXIT_FAILURE);
+   if (!cert_file) {
+   print_usage(cli_name);
+   return EXIT_FAILURE;
}
 
cert = read_file(cert_file, _size);
if (!cert)
-   exit(EXIT_FAILURE);
-
-   hdr = map_file(vmlinux_file, _size);
-   if (!hdr)
-   exit(EXIT_FAILURE);
-
-   if (vmlinux_size < sizeof(*hdr)) {
-   err("Invalid ELF file.\n");
-   exit(EXIT_FAILURE);
-   }
-
-   if ((hdr->e_ident[EI_MAG0] != ELFMAG0) ||
-   (hdr->e_ident[EI_MAG1] != ELFMAG1) ||
-   (hdr->e_ident[EI_MAG2] != ELFMAG2) ||
-   (hdr->e_ident[EI_MAG3] != ELFMAG3)) {
-   err("Invalid ELF magic.\n");
-   exit(EXIT_FAILURE);
-   }
-
-   if (hdr->e_ident[EI_CLASS] != CURRENT_ELFCLASS) {
-   err("ELF class mismatch.\n");
-   exit(EXIT_FAILURE);
-   }
-
-   if (hdr->e_ident[EI_DATA] != endianness()) {
-   err("ELF endian mismatch.\n");
-   exit(EXIT_FAILURE);
-   }
-
-   if (hdr->e_shoff > vmlinux_size) {
-   err("Could not find section header.\n");
-   exit(EXIT_FAILURE);
-   }
+   return EXIT_FAILURE;
 
symtab = get_symbol_table(hdr);
if (!symtab) {
warn("Could not find the symbol table.\n");
if (!system_map_file) {
err("Please provide a System.map file.\n");
-   print_usage(argv[0]);
-   exit(EXIT_FAILURE);
+   print_usage(cli_name);
+   return EXIT_FAILURE;
}
 
system_map = fopen(system_map_file, "r");
if (!system_map) {
perror(system_map_file);
-   exit(EXIT_FAILURE);
+   return EXIT_FAILURE;
}
get_symbol_from_map(hdr, system_map, CERT_SYM, _sym);
get_symbol_from_map(hdr, system_map, USED_SYM, _sym);
@@ -371,7 +325,7 @@ int main(int argc, char **argv)
}
 
if (!cert_sym.offset || !lsize_sym.offset || !used_sym.offset)
-   

[PATCH 1/8] CMDLINE: add generic builtin command line

2021-04-15 Thread Daniel Walker
This code allows architectures to use a generic builtin command line.
The state of the builtin command line options across architecture is
diverse. MIPS and X86 once has similar systems, then mips added some
options to allow extending the command line. Powerpc did something
simiar in adding the ability to extend. Even with mips and powerpc
enhancement the needs of Cisco are not met on these platforms.

The code in this commit unifies the code into a generic
header file under the CONFIG_GENERIC_CMDLINE option. When this
option is enabled the architecture can call the cmdline_add_builtin()
to add the builtin command line. The generic code provides both
append and/or prepend options and provides a way to redefine these
option after the kernel is compiled.

This code also includes test's which are meant to confirm
functionality.

This unified implementation offers the same functionality needed by
Cisco on all platform which we enable it on.

Cc: xe-linux-exter...@cisco.com
Signed-off-by: Ruslan Bilovol 
Signed-off-by: Daniel Walker 
---
 include/linux/cmdline.h | 103 +
 init/Kconfig|  78 ++
 lib/Kconfig |   4 ++
 lib/Makefile|   3 +
 lib/generic_cmdline.S   |  53 +++
 lib/test_cmdline1.c | 139 
 6 files changed, 380 insertions(+)
 create mode 100644 include/linux/cmdline.h
 create mode 100644 lib/generic_cmdline.S
 create mode 100644 lib/test_cmdline1.c

diff --git a/include/linux/cmdline.h b/include/linux/cmdline.h
new file mode 100644
index ..34d9d8d14672
--- /dev/null
+++ b/include/linux/cmdline.h
@@ -0,0 +1,103 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_CMDLINE_H
+#define _LINUX_CMDLINE_H
+/*
+ *
+ * Copyright (C) 2006,2021. Cisco Systems, Inc.
+ *
+ * Generic Append/Prepend cmdline support.
+ */
+
+
+#include 
+#include 
+
+#ifdef CONFIG_CMDLINE_BOOL
+extern char cmdline_prepend[];
+extern char cmdline_append[];
+extern char cmdline_tmp[];
+#define CMDLINE_PREPEND cmdline_prepend
+#define CMDLINE_APPEND cmdline_append
+#define CMDLINE_TMP cmdline_tmp
+#define CMDLINE_STATIC_PREPEND CONFIG_CMDLINE_PREPEND
+#define CMDLINE_STATIC_APPEND CONFIG_CMDLINE_APPEND
+#else
+#define CMDLINE_PREPEND ""
+#define CMDLINE_APPEND ""
+#define CMDLINE_TMP ""
+#define CMDLINE_STATIC_PREPEND ""
+#define CMDLINE_STATIC_APPEND ""
+#endif
+
+#ifndef CMDLINE_STRLCAT
+#define CMDLINE_STRLCAT strlcat
+#endif
+
+#ifndef CMDLINE_STRLEN
+#define CMDLINE_STRLEN strlen
+#endif
+
+/*
+ * This function will append or prepend a builtin command line to the command
+ * line provided by the bootloader. Kconfig options can be used to alter
+ * the behavior of this builtin command line.
+ * @dest: The destination of the final appended/prepended string
+ * @tmp: temporary space used for prepending
+ * @prepend: string to prepend to @dest
+ * @append: string to append to @dest
+ * @length: the maximum length of the strings above.
+ * @cmdline_strlen: point to a compatible strlen
+ * @cmdline_strlcat: point to a compatible strlcat
+ * This function returns true when the builtin command line was copied 
successfully
+ * and false when there was not enough room to copy all parts of the command 
line.
+ */
+static inline bool
+__cmdline_add_builtin(
+   char *dest,
+   char *tmp,
+   char *prepend,
+   char *append,
+   unsigned long length,
+   size_t (*cmdline_strlen)(const char *s),
+   size_t (*cmdline_strlcat)(char *dest, const char *src, size_t 
count))
+{
+   size_t total_length = 0, tmp_length;
+
+   if (!IS_ENABLED(CONFIG_GENERIC_CMDLINE))
+   return true;
+
+   if (!IS_ENABLED(CONFIG_CMDLINE_BOOL))
+   return true;
+
+   if (IS_ENABLED(CONFIG_CMDLINE_OVERRIDE))
+   dest[0] = '\0';
+   else
+   total_length += cmdline_strlen(dest);
+
+   tmp_length = cmdline_strlen(append);
+   if (tmp_length > 0) {
+   cmdline_strlcat(dest, append, length);
+   total_length += tmp_length;
+   }
+
+   tmp_length = cmdline_strlen(prepend);
+   if (tmp_length > 0) {
+   cmdline_strlcat(tmp, prepend, length);
+   cmdline_strlcat(tmp, dest, length);
+   dest[0] = '\0';
+   cmdline_strlcat(dest, tmp, length);
+   total_length += tmp_length;
+   }
+
+   tmp[0] = '\0';
+
+   if (total_length > length)
+   return false;
+
+   return true;
+}
+
+#define cmdline_add_builtin(dest) \
+   __cmdline_add_builtin(dest, CMDLINE_TMP, CMDLINE_PREPEND, 
CMDLINE_APPEND, COMMAND_LINE_SIZE, CMDLINE_STRLEN, CMDLINE_STRLCAT)
+
+#endif
diff --git a/init/Kconfig b/init/Kconfig
index 5f5c776ef192..d72eb5a804c6 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -2034,6 +2034,84 @@ config PROFILING
 config TRACEPOINTS
bool
 
+config 

[PATCH 0/8] generic command line v4

2021-04-15 Thread Daniel Walker


v4 release changes

* Updated insert-sys-cert tool to change command line symbols after
  compilation.

This tool is used to release binary kernels internally to companies
and then later insert certificates for each product by consumers of
the binary kernel. Cisco uses this tool for this purpose.

Cisco has a similar need for the command line to be modified on a
binary released kernels similar to how certificates are setup.

* Added global symbols to hold append and prepend values.

These changes follow the system certificate code to allow the
insert-sys-cert tool to be used.

* Added a test case to confirm functionality.

Seemed sensible to add this to make sure everything is working.

* Dropped powerpc changes

Christophe Leroy has reservations about the features for powerpc. I
don't think his reservations are founded, and these changes should
fully work on powerpc. However, I dropped these changes so Christophe
can have more time to get comfortable with the changes.


Enjoy!


Daniel Walker (8):
  CMDLINE: add generic builtin command line
  scripts: insert-sys-cert: add command line insert capability
  scripts: insert-sys-cert: change name to insert-symbol
  CMDLINE: mips: convert to generic builtin command line
  drivers: firmware: efi: libstub: enable generic commandline
  CMDLINE: x86: convert to generic builtin command line
  of: allow sending a NULL value to early_init_dt_scan_chosen
  CMDLINE: arm64: convert to generic builtin command line

 arch/arm64/Kconfig|  33 +--
 arch/arm64/include/asm/setup.h|   2 +
 arch/arm64/kernel/idreg-override.c|   9 +-
 arch/mips/Kconfig |   4 +-
 arch/mips/Kconfig.debug   |  44 
 arch/mips/configs/ar7_defconfig   |   9 +-
 arch/mips/configs/bcm47xx_defconfig   |   8 +-
 arch/mips/configs/bcm63xx_defconfig   |  15 +-
 arch/mips/configs/bmips_be_defconfig  |  11 +-
 arch/mips/configs/bmips_stb_defconfig |  11 +-
 arch/mips/configs/capcella_defconfig  |  11 +-
 arch/mips/configs/ci20_defconfig  |  10 +-
 arch/mips/configs/cu1000-neo_defconfig|  10 +-
 arch/mips/configs/cu1830-neo_defconfig|  10 +-
 arch/mips/configs/e55_defconfig   |   4 +-
 arch/mips/configs/generic_defconfig   |   6 +-
 arch/mips/configs/gpr_defconfig   |  18 +-
 arch/mips/configs/loongson3_defconfig |  13 +-
 arch/mips/configs/mpc30x_defconfig|   7 +-
 arch/mips/configs/tb0219_defconfig|   7 +-
 arch/mips/configs/tb0226_defconfig|   7 +-
 arch/mips/configs/tb0287_defconfig|   7 +-
 arch/mips/configs/workpad_defconfig   |  11 +-
 arch/mips/include/asm/setup.h |   2 +
 arch/mips/kernel/relocate.c   |  17 +-
 arch/mips/kernel/setup.c  |  36 +--
 arch/mips/pic32/pic32mzda/early_console.c |   2 +-
 arch/mips/pic32/pic32mzda/init.c  |   3 +-
 arch/x86/Kconfig  |  44 +---
 arch/x86/kernel/setup.c   |  18 +-
 .../firmware/efi/libstub/efi-stub-helper.c|  29 +++
 drivers/firmware/efi/libstub/efi-stub.c   |   9 +
 drivers/firmware/efi/libstub/efistub.h|   1 +
 drivers/firmware/efi/libstub/x86-stub.c   |  13 +-
 drivers/of/fdt.c  |  44 ++--
 include/linux/cmdline.h   | 103 
 init/Kconfig  |  78 ++
 lib/Kconfig   |   4 +
 lib/Makefile  |   3 +
 lib/generic_cmdline.S |  53 
 lib/test_cmdline1.c   | 139 ++
 scripts/Makefile  |   2 +-
 .../{insert-sys-cert.c => insert-symbol.c}| 243 --
 43 files changed, 716 insertions(+), 394 deletions(-)
 create mode 100644 include/linux/cmdline.h
 create mode 100644 lib/generic_cmdline.S
 create mode 100644 lib/test_cmdline1.c
 rename scripts/{insert-sys-cert.c => insert-symbol.c} (72%)

-- 
2.25.1



Re: [PATCH 1/3] powerpc/smp: Reintroduce cpu_core_mask

2021-04-15 Thread David Gibson
On Thu, Apr 15, 2021 at 05:39:32PM +0530, Srikar Dronamraju wrote:
> Daniel reported that with Commit 4ca234a9cbd7 ("powerpc/smp: Stop
> updating cpu_core_mask") QEMU was unable to set single NUMA node SMP
> topologies such as:
>  -smp 8,maxcpus=8,cores=2,threads=2,sockets=2
>  i.e he expected 2 sockets in one NUMA node.

Well, strictly speaking, you can still set that toplogy in qemu but a
PAPR guest with that commit will show as having 1 socket in lscpu and
similar things.

Basically, this is because PAPR has no meaningful distinction between
cores and sockets.  So it's kind of a cosmetic problem, but it is a
user-unexpected behaviour that it would be nice to avoid if it's not
excessively difficult.

> The above commit helped to reduce boot time on Large Systems for
> example 4096 vCPU single socket QEMU instance. PAPR is silent on
> having more than one socket within a NUMA node.
> 
> cpu_core_mask and cpu_cpu_mask for any CPU would be same unless the
> number of sockets is different from the number of NUMA nodes.

Number of sockets being different from number of NUMA nodes is routine
in qemu, and I don't think it's something we should enforce.

> One option is to reintroduce cpu_core_mask but use a slightly
> different method to arrive at the cpu_core_mask. Previously each CPU's
> chip-id would be compared with all other CPU's chip-id to verify if
> both the CPUs were related at the chip level. Now if a CPU 'A' is
> found related / (unrelated) to another CPU 'B', all the thread
> siblings of 'A' and thread siblings of 'B' are automatically marked as
> related / (unrelated).
> 
> Also if a platform doesn't support ibm,chip-id property, i.e its
> cpu_to_chip_id returns -1, cpu_core_map holds a copy of
> cpu_cpu_mask().

Yeah, the other weirdness here is that ibm,chip-id isn't a PAPR
property at all - it was added for powernv.  We then added it to qemu
for PAPR guests because that was the way at the time to get the guest
to advertise the expected number of sockets.  It therefore basically
*only* exists on PAPR/qemu for that purpose, so if it's not serving it
we need to come up with something else.

> 
> Fixes: 4ca234a9cbd7 ("powerpc/smp: Stop updating cpu_core_mask")
> Cc: linuxppc-dev@lists.ozlabs.org
> Cc: qemu-...@nongnu.org
> Cc: Cedric Le Goater 
> Cc: David Gibson 
> Cc: Nathan Lynch 
> Cc: Michael Ellerman 
> Cc: Ingo Molnar 
> Cc: Peter Zijlstra 
> Cc: Valentin Schneider 
> Cc: Gautham R Shenoy 
> Reported-by: Daniel Henrique Barboza 
> Signed-off-by: Srikar Dronamraju 
> ---
>  arch/powerpc/include/asm/smp.h |  5 +
>  arch/powerpc/kernel/smp.c  | 39 --
>  2 files changed, 37 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
> index 7a13bc20f0a0..47081a9e13ca 100644
> --- a/arch/powerpc/include/asm/smp.h
> +++ b/arch/powerpc/include/asm/smp.h
> @@ -121,6 +121,11 @@ static inline struct cpumask *cpu_sibling_mask(int cpu)
>   return per_cpu(cpu_sibling_map, cpu);
>  }
>  
> +static inline struct cpumask *cpu_core_mask(int cpu)
> +{
> + return per_cpu(cpu_core_map, cpu);
> +}
> +
>  static inline struct cpumask *cpu_l2_cache_mask(int cpu)
>  {
>   return per_cpu(cpu_l2_cache_map, cpu);
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index 5a4d59a1070d..5c7ce1d50631 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -1057,17 +1057,12 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
>   local_memory_node(numa_cpu_lookup_table[cpu]));
>   }
>  #endif
> - /*
> -  * cpu_core_map is now more updated and exists only since
> -  * its been exported for long. It only will have a snapshot
> -  * of cpu_cpu_mask.
> -  */
> - cpumask_copy(per_cpu(cpu_core_map, cpu), cpu_cpu_mask(cpu));
>   }
>  
>   /* Init the cpumasks so the boot CPU is related to itself */
>   cpumask_set_cpu(boot_cpuid, cpu_sibling_mask(boot_cpuid));
>   cpumask_set_cpu(boot_cpuid, cpu_l2_cache_mask(boot_cpuid));
> + cpumask_set_cpu(boot_cpuid, cpu_core_mask(boot_cpuid));
>  
>   if (has_coregroup_support())
>   cpumask_set_cpu(boot_cpuid, cpu_coregroup_mask(boot_cpuid));
> @@ -1408,6 +1403,9 @@ static void remove_cpu_from_masks(int cpu)
>   set_cpus_unrelated(cpu, i, cpu_smallcore_mask);
>   }
>  
> + for_each_cpu(i, cpu_core_mask(cpu))
> + set_cpus_unrelated(cpu, i, cpu_core_mask);
> +
>   if (has_coregroup_support()) {
>   for_each_cpu(i, cpu_coregroup_mask(cpu))
>   set_cpus_unrelated(cpu, i, cpu_coregroup_mask);
> @@ -1468,8 +1466,11 @@ static void update_coregroup_mask(int cpu, 
> cpumask_var_t *mask)
>  
>  static void add_cpu_to_masks(int cpu)
>  {
> + struct cpumask *(*submask_fn)(int) = cpu_sibling_mask;
>   int first_thread = 

Re: [PATCH net-next v4 2/2] of: net: fix of_get_mac_addr_nvmem() for non-platform devices

2021-04-15 Thread Benjamin Herrenschmidt
On Mon, 2021-04-12 at 19:47 +0200, Michael Walle wrote:
> 
>  /**
>   * of_get_phy_mode - Get phy mode for given device_node
> @@ -59,15 +60,39 @@ static int of_get_mac_addr(struct device_node *np, const 
> char *name, u8 *addr)
>  static int of_get_mac_addr_nvmem(struct device_node *np, u8 *addr)
>  {
> struct platform_device *pdev = of_find_device_by_node(np);
> +   struct nvmem_cell *cell;
> +   const void *mac;
> +   size_t len;
> int ret;
>  
> -   if (!pdev)
> -   return -ENODEV;
> +   /* Try lookup by device first, there might be a nvmem_cell_lookup
> +* associated with a given device.
> +*/
> +   if (pdev) {
> +   ret = nvmem_get_mac_address(>dev, addr);
> +   put_device(>dev);
> +   return ret;
> +   }
> +

This smells like the wrong band aid :)

Any struct device can contain an OF node pointer these days.

This seems all backwards. I think we are dealing with bad evolution.

We need to do a lookup for the device because we get passed an of_node.
We should just get passed a device here... or rather stop calling
of_get_mac_addr() from all those drivers and instead call
eth_platform_get_mac_address() which in turns calls of_get_mac_addr().

Then the nvmem stuff gets put in eth_platform_get_mac_address().

of_get_mac_addr() becomes a low-level thingy that most drivers don't
care about.

Cheers,
Ben.




Re: [PATCH] ibmvfc: Fix invalid state machine BUG_ON

2021-04-15 Thread Martin K. Petersen
On Mon, 12 Apr 2021 18:10:09 -0600, Tyrel Datwyler wrote:

> This fixes an issue hitting the BUG_ON in ibmvfc_do_work. When
> going through a host action of IBMVFC_HOST_ACTION_RESET,
> we change the action to IBMVFC_HOST_ACTION_TGT_DEL,
> then drop the host lock, and reset the CRQ, which changes
> the host state to IBMVFC_NO_CRQ. If, prior to setting the
> host state to IBMVFC_NO_CRQ, ibmvfc_init_host is called,
> it can then end up changing the host action to IBMVFC_HOST_ACTION_INIT.
> If we then change the host state to IBMVFC_NO_CRQ, we will then
> hit the BUG_ON. This patch makes a couple of changes to avoid this.
> It leaves the host action to be IBMVFC_HOST_ACTION_RESET
> or IBMVFC_HOST_ACTION_REENABLE until after we drop the host
> lock and reset or reenable the CRQ. It also hardens the
> host state machine to ensure we cannot leave the reset / reenable
> state until we've finished processing the reset or reenable.

Applied to 5.13/scsi-queue, thanks!

[1/1] ibmvfc: Fix invalid state machine BUG_ON
  https://git.kernel.org/mkp/scsi/c/15cfef8623a4

-- 
Martin K. Petersen  Oracle Linux Engineering


Re: [PATCH bpf-next 1/2] bpf: Remove bpf_jit_enable=2 debugging mode

2021-04-15 Thread Alexei Starovoitov
On Thu, Apr 15, 2021 at 8:41 AM Quentin Monnet  wrote:
>
> 2021-04-15 16:37 UTC+0200 ~ Daniel Borkmann 
> > On 4/15/21 11:32 AM, Jianlin Lv wrote:
> >> For debugging JITs, dumping the JITed image to kernel log is discouraged,
> >> "bpftool prog dump jited" is much better way to examine JITed dumps.
> >> This patch get rid of the code related to bpf_jit_enable=2 mode and
> >> update the proc handler of bpf_jit_enable, also added auxiliary
> >> information to explain how to use bpf_jit_disasm tool after this change.
> >>
> >> Signed-off-by: Jianlin Lv 
>
> Hello,
>
> For what it's worth, I have already seen people dump the JIT image in
> kernel logs in Qemu VMs running with just a busybox, not for kernel
> development, but in a context where buiding/using bpftool was not
> possible.

If building/using bpftool is not possible then majority of selftests won't
be exercised. I don't think such environment is suitable for any kind
of bpf development. Much so for JIT debugging.
While bpf_jit_enable=2 is nothing but the debugging tool for JIT developers.
I'd rather nuke that code instead of carrying it from kernel to kernel.


Re: [PATCH v1 4/5] mm: ptdump: Support hugepd table entries

2021-04-15 Thread Daniel Axtens
Hi Christophe,

> Which hugepd, page table entries can be at any level
> and can be of any size.
>
> Add support for them.
>
> Signed-off-by: Christophe Leroy 
> ---
>  mm/ptdump.c | 17 +++--
>  1 file changed, 15 insertions(+), 2 deletions(-)
>
> diff --git a/mm/ptdump.c b/mm/ptdump.c
> index 61cd16afb1c8..6efdb8c15a7d 100644
> --- a/mm/ptdump.c
> +++ b/mm/ptdump.c
> @@ -112,11 +112,24 @@ static int ptdump_pte_entry(pte_t *pte, unsigned long 
> addr,
>  {
>   struct ptdump_state *st = walk->private;
>   pte_t val = ptep_get(pte);
> + unsigned long page_size = next - addr;
> + int level;
> +
> + if (page_size >= PGDIR_SIZE)
> + level = 0;
> + else if (page_size >= P4D_SIZE)
> + level = 1;
> + else if (page_size >= PUD_SIZE)
> + level = 2;
> + else if (page_size >= PMD_SIZE)
> + level = 3;
> + else
> + level = 4;
>  
>   if (st->effective_prot)
> - st->effective_prot(st, 4, pte_val(val));
> + st->effective_prot(st, level, pte_val(val));
>  
> - st->note_page(st, addr, 4, pte_val(val), PAGE_SIZE);
> + st->note_page(st, addr, level, pte_val(val), page_size);

It seems to me that passing both level and page_size is a bit redundant,
but I guess it does reduce the impact on each arch's code?

Kind regards,
Daniel

>  
>   return 0;
>  }
> -- 
> 2.25.0


Re: [PATCH v1 3/5] mm: ptdump: Provide page size to notepage()

2021-04-15 Thread Daniel Axtens
Hi Christophe,

>  static void note_page(struct ptdump_state *pt_st, unsigned long addr, int 
> level,
> -   u64 val)
> +   u64 val, unsigned long page_size)

Compilers can warn about unused parameters at -Wextra level.  However,
reading scripts/Makefile.extrawarn it looks like the warning is
explicitly _disabled_ in the kernel at W=1 and not reenabled at W=2 or
W=3. So I guess this is fine...

> @@ -126,7 +126,7 @@ static int ptdump_hole(unsigned long addr, unsigned long 
> next,
>  {
>   struct ptdump_state *st = walk->private;
>  
> - st->note_page(st, addr, depth, 0);
> + st->note_page(st, addr, depth, 0, 0);

I know it doesn't matter at this point, but I'm not really thrilled by
the idea of passing 0 as the size here. Doesn't the hole have a known
page size?

>  
>   return 0;
>  }
> @@ -153,5 +153,5 @@ void ptdump_walk_pgd(struct ptdump_state *st, struct 
> mm_struct *mm, pgd_t *pgd)
>   mmap_read_unlock(mm);
>  
>   /* Flush out the last page */
> - st->note_page(st, 0, -1, 0);
> + st->note_page(st, 0, -1, 0, 0);

I'm more OK with the idea of passing 0 as the size when the depth is -1
(don't know): if we don't know the depth we conceptually can't know the
page size.

Regards,
Daniel



[PATCH v3 2/2] KVM: PPC: Book3S HV: Stop forwarding all HFSCR cause bits to L1

2021-04-15 Thread Fabiano Rosas
Since commit 73937deb4b2d ("KVM: PPC: Book3S HV: Sanitise hv_regs on
nested guest entry") we have been disabling for the nested guest the
hypervisor facility bits that its nested hypervisor don't have access
to.

If the nested guest tries to use one of those facilities, the hardware
will cause a Hypervisor Facility Unavailable interrupt. The HFSCR
register is modified by the hardware to contain information about the
cause of the interrupt.

We have been returning the cause bits to the nested hypervisor but
since commit 549e29b458c5 ("KVM: PPC: Book3S HV: Sanitise vcpu
registers in nested path") we are reducing the amount of information
exposed to L1, so it seems like a good idea to restrict some of the
cause bits as well.

With this patch the L1 guest will be allowed to handle only the
interrupts caused by facilities it has disabled for L2. The interrupts
caused by facilities that L0 denied will cause a Program Interrupt in
L1.

Signed-off-by: Fabiano Rosas 
---
 arch/powerpc/kvm/book3s_hv_nested.c | 17 +
 1 file changed, 17 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_hv_nested.c 
b/arch/powerpc/kvm/book3s_hv_nested.c
index 270552dd42c5..912a2bcdf7b0 100644
--- a/arch/powerpc/kvm/book3s_hv_nested.c
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -138,6 +138,23 @@ static void save_hv_return_state(struct kvm_vcpu *vcpu, 
int trap,
case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
hr->heir = vcpu->arch.emul_inst;
break;
+   case BOOK3S_INTERRUPT_H_FAC_UNAVAIL:
+   {
+   u8 cause = vcpu->arch.hfscr >> 56;
+
+   WARN_ON_ONCE(cause >= BITS_PER_LONG);
+
+   if (hr->hfscr & (1UL << cause)) {
+   hr->hfscr &= ~HFSCR_INTR_CAUSE;
+   /*
+* We have not restored L1 state yet, so queue
+* this interrupt instead of delivering it
+* immediately.
+*/
+   kvmppc_book3s_queue_irqprio(vcpu, 
BOOK3S_INTERRUPT_PROGRAM);
+   }
+   break;
+   }
}
 }
 
-- 
2.29.2



[PATCH v3 1/2] KVM: PPC: Book3S HV: Sanitise vcpu registers in nested path

2021-04-15 Thread Fabiano Rosas
As one of the arguments of the H_ENTER_NESTED hypercall, the nested
hypervisor (L1) prepares a structure containing the values of various
hypervisor-privileged registers with which it wants the nested guest
(L2) to run. Since the nested HV runs in supervisor mode it needs the
host to write to these registers.

To stop a nested HV manipulating this mechanism and using a nested
guest as a proxy to access a facility that has been made unavailable
to it, we have a routine that sanitises the values of the HV registers
before copying them into the nested guest's vcpu struct.

However, when coming out of the guest the values are copied as they
were back into L1 memory, which means that any sanitisation we did
during guest entry will be exposed to L1 after H_ENTER_NESTED returns.

This patch alters this sanitisation to have effect on the vcpu->arch
registers directly before entering and after exiting the guest,
leaving the structure that is copied back into L1 unchanged (except
when we really want L1 to access the value, e.g the Cause bits of
HFSCR).

Signed-off-by: Fabiano Rosas 
---
 arch/powerpc/kvm/book3s_hv_nested.c | 55 ++---
 1 file changed, 34 insertions(+), 21 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_nested.c 
b/arch/powerpc/kvm/book3s_hv_nested.c
index 0cd0e7aad588..270552dd42c5 100644
--- a/arch/powerpc/kvm/book3s_hv_nested.c
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -102,8 +102,17 @@ static void save_hv_return_state(struct kvm_vcpu *vcpu, 
int trap,
 {
struct kvmppc_vcore *vc = vcpu->arch.vcore;
 
+   /*
+* When loading the hypervisor-privileged registers to run L2,
+* we might have used bits from L1 state to restrict what the
+* L2 state is allowed to be. Since L1 is not allowed to read
+* the HV registers, do not include these modifications in the
+* return state.
+*/
+   hr->hfscr = ((~HFSCR_INTR_CAUSE & hr->hfscr) |
+(HFSCR_INTR_CAUSE & vcpu->arch.hfscr));
+
hr->dpdes = vc->dpdes;
-   hr->hfscr = vcpu->arch.hfscr;
hr->purr = vcpu->arch.purr;
hr->spurr = vcpu->arch.spurr;
hr->ic = vcpu->arch.ic;
@@ -132,24 +141,7 @@ static void save_hv_return_state(struct kvm_vcpu *vcpu, 
int trap,
}
 }
 
-static void sanitise_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
-{
-   /*
-* Don't let L1 enable features for L2 which we've disabled for L1,
-* but preserve the interrupt cause field.
-*/
-   hr->hfscr &= (HFSCR_INTR_CAUSE | vcpu->arch.hfscr);
-
-   /* Don't let data address watchpoint match in hypervisor state */
-   hr->dawrx0 &= ~DAWRX_HYP;
-   hr->dawrx1 &= ~DAWRX_HYP;
-
-   /* Don't let completed instruction address breakpt match in HV state */
-   if ((hr->ciabr & CIABR_PRIV) == CIABR_PRIV_HYPER)
-   hr->ciabr &= ~CIABR_PRIV;
-}
-
-static void restore_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
+static void restore_hv_regs(struct kvm_vcpu *vcpu, const struct hv_guest_state 
*hr)
 {
struct kvmppc_vcore *vc = vcpu->arch.vcore;
 
@@ -261,6 +253,27 @@ static int kvmhv_write_guest_state_and_regs(struct 
kvm_vcpu *vcpu,
 sizeof(struct pt_regs));
 }
 
+static void load_l2_hv_regs(struct kvm_vcpu *vcpu,
+   const struct hv_guest_state *l2_hv,
+   const struct hv_guest_state *l1_hv)
+{
+   restore_hv_regs(vcpu, l2_hv);
+
+   /*
+* Don't let L1 enable features for L2 which we've disabled for L1,
+* but preserve the interrupt cause field.
+*/
+   vcpu->arch.hfscr = l2_hv->hfscr & (HFSCR_INTR_CAUSE | l1_hv->hfscr);
+
+   /* Don't let data address watchpoint match in hypervisor state */
+   vcpu->arch.dawrx0 = l2_hv->dawrx0 & ~DAWRX_HYP;
+   vcpu->arch.dawrx1 = l2_hv->dawrx1 & ~DAWRX_HYP;
+
+   /* Don't let completed instruction address breakpt match in HV state */
+   if ((l2_hv->ciabr & CIABR_PRIV) == CIABR_PRIV_HYPER)
+   vcpu->arch.ciabr = l2_hv->ciabr & ~CIABR_PRIV;
+}
+
 long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
 {
long int err, r;
@@ -324,8 +337,8 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
mask = LPCR_DPFD | LPCR_ILE | LPCR_TC | LPCR_AIL | LPCR_LD |
LPCR_LPES | LPCR_MER;
lpcr = (vc->lpcr & ~mask) | (l2_hv.lpcr & mask);
-   sanitise_hv_regs(vcpu, _hv);
-   restore_hv_regs(vcpu, _hv);
+
+   load_l2_hv_regs(vcpu, _hv, _l1_hv);
 
vcpu->arch.ret = RESUME_GUEST;
vcpu->arch.trap = 0;
-- 
2.29.2



[PATCH v3 0/2] KVM: PPC: Book3S HV: Nested guest HFSCR changes

2021-04-15 Thread Fabiano Rosas
Applied Nick's suggestions and added a new patch for the Cause bits
issue.

I'm thinking maybe the approach of crashing L1 when L2 tries to access
a facility that L0 has denied is too heavy-handed. But on the other
hand, if L1 were to access the facility itself, the same thing would
happen and L2 runs "inside of L1" in a sense.

Currently, both L0 and L1s handle only msgsndp. All other HV Facility
Unavailable causes are already met with a Program interrupt.

Changes since v2:

- removed the sanitise functions
- moved the entry code into a new load_l2_hv_regs and the exit code
  into the existing save_hv_return_state
- new patch: removes the cause bits when L0 has disabled the
  corresponding facility

v2:

- made the change more generic, not only applies to hfscr anymore;
- sanitisation is now done directly on the vcpu struct, l2_hv is left unchanged;

https://lkml.kernel.org/r/20210406214645.3315819-1-faro...@linux.ibm.com

v1:
https://lkml.kernel.org/r/20210305231055.2913892-1-faro...@linux.ibm.com

Fabiano Rosas (2):
  KVM: PPC: Book3S HV: Sanitise vcpu registers in nested path
  KVM: PPC: Book3S HV: Stop forwarding all HFSCR cause bits to L1

 arch/powerpc/kvm/book3s_hv_nested.c | 72 -
 1 file changed, 51 insertions(+), 21 deletions(-)

--
2.29.2


Re: [PATCH v13 14/14] powerpc/64s/radix: Enable huge vmalloc mappings

2021-04-15 Thread Stephen Rothwell
Hi all,

On Thu, 15 Apr 2021 11:55:29 -0700 Andrew Morton  
wrote:
>
> On Thu, 15 Apr 2021 12:23:55 +0200 Christophe Leroy 
>  wrote:
> > > +  * is done. STRICT_MODULE_RWX may require extra work to support this
> > > +  * too.
> > > +  */
> > >   
> > > - return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, 
> > > GFP_KERNEL,
> > > - PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, 
> > > NUMA_NO_NODE,  
> > 
> > 
> > I think you should add the following in 
> > 
> > #ifndef MODULES_VADDR
> > #define MODULES_VADDR VMALLOC_START
> > #define MODULES_END VMALLOC_END
> > #endif
> > 
> > And leave module_alloc() as is (just removing the enclosing #ifdef 
> > MODULES_VADDR and adding the 
> > VM_NO_HUGE_VMAP  flag)
> > 
> > This would minimise the conflits with the changes I did in powerpc/next 
> > reported by Stephen R.
> >   
> 
> I'll drop powerpc-64s-radix-enable-huge-vmalloc-mappings.patch for now,
> make life simpler.

I have dropped that patch from linux-next.
-- 
Cheers,
Stephen Rothwell


pgpvI5_dKXrq0.pgp
Description: OpenPGP digital signature


Re: [PATCH v1 1/5] mm: pagewalk: Fix walk for hugepage tables

2021-04-15 Thread Daniel Axtens
Hi Christophe,

> Pagewalk ignores hugepd entries and walk down the tables
> as if it was traditionnal entries, leading to crazy result.
>
> Add walk_hugepd_range() and use it to walk hugepage tables.
>
> Signed-off-by: Christophe Leroy 
> ---
>  mm/pagewalk.c | 54 +--
>  1 file changed, 48 insertions(+), 6 deletions(-)
>
> diff --git a/mm/pagewalk.c b/mm/pagewalk.c
> index e81640d9f177..410a9d8f7572 100644
> --- a/mm/pagewalk.c
> +++ b/mm/pagewalk.c
> @@ -58,6 +58,32 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, 
> unsigned long end,
>   return err;
>  }
>  
> +static int walk_hugepd_range(hugepd_t *phpd, unsigned long addr,
> +  unsigned long end, struct mm_walk *walk, int 
> pdshift)
> +{
> + int err = 0;
> +#ifdef CONFIG_ARCH_HAS_HUGEPD
> + const struct mm_walk_ops *ops = walk->ops;
> + int shift = hugepd_shift(*phpd);
> + int page_size = 1 << shift;
> +
> + if (addr & (page_size - 1))
> + return 0;
> +
> + for (;;) {
> + pte_t *pte = hugepte_offset(*phpd, addr, pdshift);
> +
> + err = ops->pte_entry(pte, addr, addr + page_size, walk);
> + if (err)
> + break;
> + if (addr >= end - page_size)
> + break;
> + addr += page_size;
> + }

Initially I thought this was a somewhat unintuitive way to structure
this loop, but I see it parallels the structure of walk_pte_range_inner,
so I think the consistency is worth it.

I notice the pte walking code potentially takes some locks: does this
code need to do that?

arch/powerpc/mm/hugetlbpage.c says that hugepds are protected by the
mm->page_table_lock, but I don't think we're taking it in this code.

> +#endif
> + return err;
> +}
> +
>  static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
> struct mm_walk *walk)
>  {
> @@ -108,7 +134,10 @@ static int walk_pmd_range(pud_t *pud, unsigned long 
> addr, unsigned long end,
>   goto again;
>   }
>  
> - err = walk_pte_range(pmd, addr, next, walk);
> + if (is_hugepd(__hugepd(pmd_val(*pmd
> + err = walk_hugepd_range((hugepd_t *)pmd, addr, next, 
> walk, PMD_SHIFT);
> + else
> + err = walk_pte_range(pmd, addr, next, walk);
>   if (err)
>   break;
>   } while (pmd++, addr = next, addr != end);
> @@ -157,7 +186,10 @@ static int walk_pud_range(p4d_t *p4d, unsigned long 
> addr, unsigned long end,
>   if (pud_none(*pud))
>   goto again;
>  
> - err = walk_pmd_range(pud, addr, next, walk);
> + if (is_hugepd(__hugepd(pud_val(*pud
> + err = walk_hugepd_range((hugepd_t *)pud, addr, next, 
> walk, PUD_SHIFT);
> + else
> + err = walk_pmd_range(pud, addr, next, walk);

I'm a bit worried you might end up calling into walk_hugepd_range with
ops->pte_entry == NULL, and then jumping to 0.

static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
  struct mm_walk *walk)
{
...
pud = pud_offset(p4d, addr);
do {
...
if ((!walk->vma && (pud_leaf(*pud) || !pud_present(*pud))) ||
walk->action == ACTION_CONTINUE ||
!(ops->pmd_entry || ops->pte_entry)) <<< THIS CHECK
continue;
...
if (is_hugepd(__hugepd(pud_val(*pud
err = walk_hugepd_range((hugepd_t *)pud, addr, next, 
walk, PUD_SHIFT);
else
err = walk_pmd_range(pud, addr, next, walk);
if (err)
break;
} while (pud++, addr = next, addr != end);

walk_pud_range will proceed if there is _either_ an ops->pmd_entry _or_
an ops->pte_entry, but walk_hugepd_range will call ops->pte_entry
unconditionally.

The same issue applies to walk_{p4d,pgd}_range...

Kind regards,
Daniel


Re: [PATCH 1/1] mm: Fix struct page layout on 32-bit systems

2021-04-15 Thread Matthew Wilcox
On Thu, Apr 15, 2021 at 09:11:56PM +, David Laight wrote:
> Isn't it possible to move the field down one long?
> This might require an explicit zero - but this is not a common
> code path - the extra write will be noise.

Then it overlaps page->mapping.  See emails passim.


Re: [PATCH bpf-next 1/2] bpf: Remove bpf_jit_enable=2 debugging mode

2021-04-15 Thread Quentin Monnet
2021-04-15 16:37 UTC+0200 ~ Daniel Borkmann 
> On 4/15/21 11:32 AM, Jianlin Lv wrote:
>> For debugging JITs, dumping the JITed image to kernel log is discouraged,
>> "bpftool prog dump jited" is much better way to examine JITed dumps.
>> This patch get rid of the code related to bpf_jit_enable=2 mode and
>> update the proc handler of bpf_jit_enable, also added auxiliary
>> information to explain how to use bpf_jit_disasm tool after this change.
>>
>> Signed-off-by: Jianlin Lv 

Hello,

For what it's worth, I have already seen people dump the JIT image in
kernel logs in Qemu VMs running with just a busybox, not for kernel
development, but in a context where buiding/using bpftool was not
possible. Maybe not a common case, but still, removing the debugging
mode will make that impossible. Is there a particular incentive to
remove the feature?

Best regards,
Quentin


Re: [PATCH bpf-next 1/2] bpf: Remove bpf_jit_enable=2 debugging mode

2021-04-15 Thread Daniel Borkmann

On 4/15/21 11:32 AM, Jianlin Lv wrote:

For debugging JITs, dumping the JITed image to kernel log is discouraged,
"bpftool prog dump jited" is much better way to examine JITed dumps.
This patch get rid of the code related to bpf_jit_enable=2 mode and
update the proc handler of bpf_jit_enable, also added auxiliary
information to explain how to use bpf_jit_disasm tool after this change.

Signed-off-by: Jianlin Lv 

[...]

diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c
index 0a7a2870f111..8d36b4658076 100644
--- a/arch/x86/net/bpf_jit_comp32.c
+++ b/arch/x86/net/bpf_jit_comp32.c
@@ -2566,9 +2566,6 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog 
*prog)
cond_resched();
}
  
-	if (bpf_jit_enable > 1)

-   bpf_jit_dump(prog->len, proglen, pass + 1, image);
-
if (image) {
bpf_jit_binary_lock_ro(header);
prog->bpf_func = (void *)image;
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index c8496c1142c9..990b1720c7a4 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -273,16 +273,8 @@ static int proc_dointvec_minmax_bpf_enable(struct 
ctl_table *table, int write,
  
  	tmp.data = _enable;

ret = proc_dointvec_minmax(, write, buffer, lenp, ppos);
-   if (write && !ret) {
-   if (jit_enable < 2 ||
-   (jit_enable == 2 && bpf_dump_raw_ok(current_cred( {
-   *(int *)table->data = jit_enable;
-   if (jit_enable == 2)
-   pr_warn("bpf_jit_enable = 2 was set! NEVER use this 
in production, only for JIT debugging!\n");
-   } else {
-   ret = -EPERM;
-   }
-   }
+   if (write && !ret)
+   *(int *)table->data = jit_enable;
return ret;
  }
  
@@ -389,7 +381,7 @@ static struct ctl_table net_core_table[] = {

.extra2 = SYSCTL_ONE,
  # else
.extra1 = SYSCTL_ZERO,
-   .extra2 = ,
+   .extra2 = SYSCTL_ONE,
  # endif
},
  # ifdef CONFIG_HAVE_EBPF_JIT
diff --git a/tools/bpf/bpf_jit_disasm.c b/tools/bpf/bpf_jit_disasm.c
index c8ae95804728..efa4b17ae016 100644
--- a/tools/bpf/bpf_jit_disasm.c
+++ b/tools/bpf/bpf_jit_disasm.c
@@ -7,7 +7,7 @@
   *
   * To get the disassembly of the JIT code, do the following:
   *
- *  1) `echo 2 > /proc/sys/net/core/bpf_jit_enable`
+ *  1) Insert bpf_jit_dump() and recompile the kernel to output JITed image 
into log


Hmm, if we remove bpf_jit_dump(), the next drive-by cleanup patch will be thrown
at bpf@vger stating that bpf_jit_dump() has no in-tree users and should be 
removed.
Maybe we should be removing bpf_jit_disasm.c along with it as well as 
bpf_jit_dump()
itself ... I guess if it's ever needed in those rare occasions for JIT 
debugging we
can resurrect it from old kernels just locally. But yeah, bpftool's jit dump 
should
suffice for vast majority of use cases.

There was a recent set for ppc32 jit which was merged into ppc tree which will 
create
a merge conflict with this one [0]. So we would need a rebase and take it maybe 
during
merge win once the ppc32 landed..

  [0] 
https://lore.kernel.org/bpf/cover.1616430991.git.christophe.le...@csgroup.eu/


   *  2) Load a BPF filter (e.g. `tcpdump -p -n -s 0 -i eth1 host 
192.168.20.0/24`)
   *  3) Run e.g. `bpf_jit_disasm -o` to read out the last JIT code
   *
diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c
index 40a88df275f9..98c7eec2923f 100644
--- a/tools/bpf/bpftool/feature.c
+++ b/tools/bpf/bpftool/feature.c
@@ -203,9 +203,6 @@ static void probe_jit_enable(void)
case 1:
printf("JIT compiler is enabled\n");
break;
-   case 2:
-   printf("JIT compiler is enabled with debugging traces in 
kernel logs\n");
-   break;


This would still need to be there for older kernels ...


case -1:
printf("Unable to retrieve JIT-compiler status\n");
break;





RE: [PATCH 1/1] mm: Fix struct page layout on 32-bit systems

2021-04-15 Thread David Laight
From: Matthew Wilcox 
> Sent: 15 April 2021 19:22
> 
> On Thu, Apr 15, 2021 at 08:08:32PM +0200, Jesper Dangaard Brouer wrote:
> > +static inline
> > +dma_addr_t page_pool_dma_addr_read(dma_addr_t dma_addr)
> > +{
> > +   /* Workaround for storing 64-bit DMA-addr on 32-bit machines in struct
> > +* page.  The page->dma_addr share area with page->compound_head which
> > +* use bit zero to mark compound pages. This is okay, as DMA-addr are
> > +* aligned pointers which have bit zero cleared.
> > +*
> > +* In the 32-bit case, page->compound_head is 32-bit.  Thus, when
> > +* dma_addr_t is 64-bit it will be located in top 32-bit.  Solve by
> > +* swapping dma_addr 32-bit segments.
> > +*/
> > +#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
> 
> #if defined(CONFIG_ARCH_DMA_ADDR_T_64BIT) && defined(__BIG_ENDIAN)
> otherwise you'll create the problem on ARM that you're avoiding on PPC ...
> 
> I think you want to delete the word '_read' from this function name because
> you're using it for both read and write.

I think I'd use explicit dma_addr_hi and dma_addr_lo and
separate read/write functions just to make absolutely sure
nothing picks up the swapped value.

Isn't it possible to move the field down one long?
This might require an explicit zero - but this is not a common
code path - the extra write will be noise.

David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, 
UK
Registration No: 1397386 (Wales)



Re: [PATCH] powerpc: Initialize local variable fdt to NULL in elf64_load()

2021-04-15 Thread Lakshmi Ramasubramanian

On 4/15/21 12:14 PM, Lakshmi Ramasubramanian wrote:

Sorry - missed copying device-tree and powerpc mailing lists.


There are a few "goto out;" statements before the local variable "fdt"
is initialized through the call to of_kexec_alloc_and_setup_fdt() in
elf64_load(). This will result in an uninitialized "fdt" being passed
to kvfree() in this function if there is an error before the call to
of_kexec_alloc_and_setup_fdt().

Initialize the local variable "fdt" to NULL.

Signed-off-by: Lakshmi Ramasubramanian 
Reported-by: kernel test robot 
Reported-by: Dan Carpenter 
---
  arch/powerpc/kexec/elf_64.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kexec/elf_64.c b/arch/powerpc/kexec/elf_64.c
index 5a569bb51349..0051440c1f77 100644
--- a/arch/powerpc/kexec/elf_64.c
+++ b/arch/powerpc/kexec/elf_64.c
@@ -32,7 +32,7 @@ static void *elf64_load(struct kimage *image, char 
*kernel_buf,
int ret;
unsigned long kernel_load_addr;
unsigned long initrd_load_addr = 0, fdt_load_addr;
-   void *fdt;
+   void *fdt = NULL;
const void *slave_code;
struct elfhdr ehdr;
char *modified_cmdline = NULL;



thanks,
 -lakshmi


Re: [PATCH 1/1] of/pci: Add IORESOURCE_MEM_64 to resource flags for 64-bit memory addresses

2021-04-15 Thread Rob Herring
+PPC and PCI lists

On Thu, Apr 15, 2021 at 1:01 PM Leonardo Bras  wrote:
>
> Many other resource flag parsers already add this flag when the input
> has bits 24 & 25 set, so update this one to do the same.

Many others? Looks like sparc and powerpc to me. Those would be the
ones I worry about breaking. Sparc doesn't use of/address.c so it's
fine. Powerpc version of the flags code was only fixed in 2019, so I
don't think powerpc will care either.

I noticed both sparc and powerpc set PCI_BASE_ADDRESS_MEM_TYPE_64 in
the flags. AFAICT, that's not set anywhere outside of arch code. So
never for riscv, arm and arm64 at least. That leads me to
pci_std_update_resource() which is where the PCI code sets BARs and
just copies the flags in PCI_BASE_ADDRESS_MEM_MASK ignoring
IORESOURCE_* flags. So it seems like 64-bit is still not handled and
neither is prefetch.

> Some devices (like virtio-net) have more than one memory resource
> (like MMIO32 and MMIO64) and without this flag it would be needed to
> verify the address range to know which one is which.
>
> Signed-off-by: Leonardo Bras 
> ---
>  drivers/of/address.c | 5 -
>  1 file changed, 4 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/of/address.c b/drivers/of/address.c
> index 73ddf2540f3f..dc7147843783 100644
> --- a/drivers/of/address.c
> +++ b/drivers/of/address.c
> @@ -116,9 +116,12 @@ static unsigned int of_bus_pci_get_flags(const __be32 
> *addr)
> flags |= IORESOURCE_IO;
> break;
> case 0x02: /* 32 bits */
> -   case 0x03: /* 64 bits */
> flags |= IORESOURCE_MEM;
> break;
> +
> +   case 0x03: /* 64 bits */
> +   flags |= IORESOURCE_MEM | IORESOURCE_MEM_64;
> +   break;
> }
> if (w & 0x4000)
> flags |= IORESOURCE_PREFETCH;
> --
> 2.30.2
>


Re: [PATCH v13 14/14] powerpc/64s/radix: Enable huge vmalloc mappings

2021-04-15 Thread Andrew Morton
On Thu, 15 Apr 2021 12:23:55 +0200 Christophe Leroy 
 wrote:
> > +* is done. STRICT_MODULE_RWX may require extra work to support this
> > +* too.
> > +*/
> >   
> > -   return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, 
> > GFP_KERNEL,
> > -   PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, 
> > NUMA_NO_NODE,
> 
> 
> I think you should add the following in 
> 
> #ifndef MODULES_VADDR
> #define MODULES_VADDR VMALLOC_START
> #define MODULES_END VMALLOC_END
> #endif
> 
> And leave module_alloc() as is (just removing the enclosing #ifdef 
> MODULES_VADDR and adding the 
> VM_NO_HUGE_VMAP  flag)
> 
> This would minimise the conflits with the changes I did in powerpc/next 
> reported by Stephen R.
> 

I'll drop powerpc-64s-radix-enable-huge-vmalloc-mappings.patch for now,
make life simpler.

Nick, a redo on top of Christophe's changes in linux-next would be best
please.



Re: [PATCH 1/1] mm: Fix struct page layout on 32-bit systems

2021-04-15 Thread Matthew Wilcox
On Thu, Apr 15, 2021 at 08:08:32PM +0200, Jesper Dangaard Brouer wrote:
> +static inline
> +dma_addr_t page_pool_dma_addr_read(dma_addr_t dma_addr)
> +{
> + /* Workaround for storing 64-bit DMA-addr on 32-bit machines in struct
> +  * page.  The page->dma_addr share area with page->compound_head which
> +  * use bit zero to mark compound pages. This is okay, as DMA-addr are
> +  * aligned pointers which have bit zero cleared.
> +  *
> +  * In the 32-bit case, page->compound_head is 32-bit.  Thus, when
> +  * dma_addr_t is 64-bit it will be located in top 32-bit.  Solve by
> +  * swapping dma_addr 32-bit segments.
> +  */
> +#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT

#if defined(CONFIG_ARCH_DMA_ADDR_T_64BIT) && defined(__BIG_ENDIAN)
otherwise you'll create the problem on ARM that you're avoiding on PPC ...

I think you want to delete the word '_read' from this function name because
you're using it for both read and write.



Re: [PATCH 1/1] mm: Fix struct page layout on 32-bit systems

2021-04-15 Thread Jesper Dangaard Brouer
On Wed, 14 Apr 2021 21:56:39 +
David Laight  wrote:

> From: Matthew Wilcox
> > Sent: 14 April 2021 22:36
> > 
> > On Wed, Apr 14, 2021 at 09:13:22PM +0200, Jesper Dangaard Brouer wrote:  
> > > (If others want to reproduce).  First I could not reproduce on ARM32.
> > > Then I found out that enabling CONFIG_XEN on ARCH=arm was needed to
> > > cause the issue by enabling CONFIG_ARCH_DMA_ADDR_T_64BIT.  
> > 
> > hmmm ... you should be able to provoke it by enabling ARM_LPAE,
> > which selects PHYS_ADDR_T_64BIT, and
> > 
> > config ARCH_DMA_ADDR_T_64BIT
> > def_bool 64BIT || PHYS_ADDR_T_64BIT
> >   
> > >  struct page {
> > > long unsigned int  flags;/* 0 4 */
> > >
> > > /* XXX 4 bytes hole, try to pack */
> > >
> > > union {
> > > struct {
> > > struct list_head lru;/* 8 8 */
> > > struct address_space * mapping;  /*16 4 */
> > > long unsigned int index; /*20 4 */
> > > long unsigned int private;   /*24 4 */
> > > };   /* 820 */
> > > struct {
> > > dma_addr_t dma_addr  
> 
> Adding __packed here will remove the 4 byte hole before the union
> and the compiler seems clever enough to know that anything following
> a 'long' must also be 'long' aligned.

Played with __packed in below patch, and I can confirm it seems to work.

> So you don't get anything horrid like byte accesses.
> On 64bit dma_addr will remain 64bit aligned.
> On arm32 dma_addr will be 32bit aligned - but forcing two 32bit access
> won't make any difference.

See below patch.  Where I swap32 the dma address to satisfy
page->compound having bit zero cleared. (It is the simplest fix I could
come up with).


[PATCH] page_pool: handling 32-bit archs with 64-bit dma_addr_t

From: Jesper Dangaard Brouer 

Workaround for storing 64-bit DMA-addr on 32-bit machines in struct
page.  The page->dma_addr share area with page->compound_head which
use bit zero to mark compound pages. This is okay, as DMA-addr are
aligned pointers which have bit zero cleared.

In the 32-bit case, page->compound_head is 32-bit.  Thus, when
dma_addr_t is 64-bit it will be located in top 32-bit.  Solve by
swapping dma_addr 32-bit segments.

Signed-off-by: Jesper Dangaard Brouer 
---
 include/linux/mm_types.h |2 +-
 include/linux/types.h|1 +
 include/net/page_pool.h  |   21 -
 net/core/page_pool.c |8 +---
 4 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 6613b26a8894..27406e3b1e1b 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -100,7 +100,7 @@ struct page {
 * @dma_addr: might require a 64-bit value even on
 * 32-bit architectures.
 */
-   dma_addr_t dma_addr;
+   dma_addr_t dma_addr __packed;
};
struct {/* slab, slob and slub */
union {
diff --git a/include/linux/types.h b/include/linux/types.h
index ac825ad90e44..65fd5d630016 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -141,6 +141,7 @@ typedef u64 blkcnt_t;
  */
 #ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
 typedef u64 dma_addr_t;
+//typedef u64 __attribute__((aligned(sizeof(void * dma_addr_t;
 #else
 typedef u32 dma_addr_t;
 #endif
diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index b5b195305346..c2329088665c 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -196,9 +196,28 @@ static inline void page_pool_recycle_direct(struct 
page_pool *pool,
page_pool_put_full_page(pool, page, true);
 }
 
+static inline
+dma_addr_t page_pool_dma_addr_read(dma_addr_t dma_addr)
+{
+   /* Workaround for storing 64-bit DMA-addr on 32-bit machines in struct
+* page.  The page->dma_addr share area with page->compound_head which
+* use bit zero to mark compound pages. This is okay, as DMA-addr are
+* aligned pointers which have bit zero cleared.
+*
+* In the 32-bit case, page->compound_head is 32-bit.  Thus, when
+* dma_addr_t is 64-bit it will be located in top 32-bit.  Solve by
+* swapping dma_addr 32-bit segments.
+*/
+#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
+   if (sizeof(long unsigned int) == 4) /* 32-bit system */
+   dma_addr = (dma_addr << 32) | (dma_addr >> 32);
+#endif
+   return dma_addr;
+}
+
 static inline dma_addr_t page_pool_get_dma_addr(struct page *page)
 {
-   return page->dma_addr;
+   return page_pool_dma_addr_read(page->dma_addr);
 }
 
 static inline bool is_page_pool_compiled_in(void)
diff --git a/net/core/page_pool.c 

Re: [PATCH 3/3] powerpc/smp: Cache CPU to chip lookup

2021-04-15 Thread Srikar Dronamraju
* Gautham R Shenoy  [2021-04-15 22:49:21]:

> > 
> > +int *chip_id_lookup_table;
> > +
> >  #ifdef CONFIG_PPC64
> >  int __initdata iommu_is_off;
> >  int __initdata iommu_force_on;
> > @@ -914,13 +916,22 @@ EXPORT_SYMBOL(of_get_ibm_chip_id);
> >  int cpu_to_chip_id(int cpu)
> >  {
> > struct device_node *np;
> > +   int ret = -1, idx;
> > +
> > +   idx = cpu / threads_per_core;
> > +   if (chip_id_lookup_table && chip_id_lookup_table[idx] != -1)
> 

> The value -1 is ambiguous since we won't be able to determine if
> it is because we haven't yet made a of_get_ibm_chip_id() call
> or if of_get_ibm_chip_id() call was made and it returned a -1.
> 

We don't allocate chip_id_lookup_table unless cpu_to_chip_id() return
!-1 value for the boot-cpuid. So this ensures that we dont
unnecessarily allocate chip_id_lookup_table. Also I check for
chip_id_lookup_table before calling cpu_to_chip_id() for other CPUs.
So this avoids overhead of calling cpu_to_chip_id() for platforms that
dont support it.  Also its most likely that if the
chip_id_lookup_table is initialized then of_get_ibm_chip_id() call
would return a valid value.

+ Below we are only populating the lookup table, only when the
of_get_cpu_node is valid.

So I dont see any drawbacks of initializing it to -1. Do you see any?

> Thus, perhaps we can initialize chip_id_lookup_table[idx] with a
> different unique negative value. How about S32_MIN ? and check
> chip_id_lookup_table[idx] is different here ?
> 

I had initially initialized to -2, But then I thought we adding in
more confusion than necessary and it was not solving any issues.


-- 
Thanks and Regards
Srikar Dronamraju


Re: [PATCH 1/3] powerpc/smp: Reintroduce cpu_core_mask

2021-04-15 Thread Srikar Dronamraju
* Gautham R Shenoy  [2021-04-15 22:41:34]:

> Hi Srikar,
> 
> 

Thanks for taking a look.

> > @@ -1485,12 +1486,36 @@ static void add_cpu_to_masks(int cpu)
> > add_cpu_to_smallcore_masks(cpu);
> > 
> > /* In CPU-hotplug path, hence use GFP_ATOMIC */
> > -   alloc_cpumask_var_node(, GFP_ATOMIC, cpu_to_node(cpu));
> > +   ret = alloc_cpumask_var_node(, GFP_ATOMIC, cpu_to_node(cpu));
> > update_mask_by_l2(cpu, );
> > 
> > if (has_coregroup_support())
> > update_coregroup_mask(cpu, );
> > 
> > +   if (chip_id == -1 || !ret) {
> > +   cpumask_copy(per_cpu(cpu_core_map, cpu), cpu_cpu_mask(cpu));
> > +   goto out;
> > +   }
> > +
> > +   if (shared_caches)
> > +   submask_fn = cpu_l2_cache_mask;
> > +
> > +   /* Update core_mask with all the CPUs that are part of submask */
> > +   or_cpumasks_related(cpu, cpu, submask_fn, cpu_core_mask);
> >
> 
> If coregroups exist, we can add the cpus of the coregroup to the
> cpu_core_mask thereby reducing the scope of the for_each_cpu() search
> below. This will still cut down the time on Baremetal systems
> supporting coregroups.
> 

Yes, once we upstream coregroup support to Baremetal, we should look
at adding it. Also do note, number of CPUs we support for Baremetal is
comparatively lower than in PowerVM + QEMU. And more importantly the
number of cores per coregroup is also very low. So the optimization
may not yield too much of a benefit.

Its only in the QEMU case, where we end up having too many cores in
the same chip, where we see a drastic increase in the boot-up time.

-- 
Thanks and Regards
Srikar Dronamraju


Re: [PATCH 3/3] powerpc/smp: Cache CPU to chip lookup

2021-04-15 Thread Gautham R Shenoy
On Thu, Apr 15, 2021 at 05:39:34PM +0530, Srikar Dronamraju wrote:
> On systems with large CPUs per node, even with the filtered matching of
> related CPUs, there can be large number of calls to cpu_to_chip_id for
> the same CPU. For example with 4096 vCPU, 1 node QEMU configuration,
> with 4 threads per core, system could be see upto 1024 calls to
> cpu_to_chip_id() for the same CPU. On a given system, cpu_to_chip_id()
> for a given CPU would always return the same. Hence cache the result in
> a lookup table for use in subsequent calls.
> 
> Since all CPUs sharing the same core will belong to the same chip, the
> lookup_table has an entry for one CPU per core.  chip_id_lookup_table is
> not being freed and would be used on subsequent CPU online post CPU
> offline.
> 
> Suggested-by: Michael Ellerman 
> Cc: linuxppc-dev@lists.ozlabs.org
> Cc: qemu-...@nongnu.org
> Cc: Cedric Le Goater 
> Cc: David Gibson 
> Cc: Nathan Lynch 
> Cc: Michael Ellerman 
> Cc: Ingo Molnar 
> Cc: Peter Zijlstra 
> Cc: Valentin Schneider 
> Cc: Gautham R Shenoy 
> Reported-by: Daniel Henrique Barboza 
> Signed-off-by: Srikar Dronamraju 
> ---
>  arch/powerpc/include/asm/smp.h |  1 +
>  arch/powerpc/kernel/prom.c | 19 +++
>  arch/powerpc/kernel/smp.c  | 21 +++--
>  3 files changed, 35 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
> index 47081a9e13ca..03b3d010cbab 100644
> --- a/arch/powerpc/include/asm/smp.h
> +++ b/arch/powerpc/include/asm/smp.h
> @@ -31,6 +31,7 @@ extern u32 *cpu_to_phys_id;
>  extern bool coregroup_enabled;
> 
>  extern int cpu_to_chip_id(int cpu);
> +extern int *chip_id_lookup_table;
> 
>  #ifdef CONFIG_SMP
> 
> diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
> index 9a4797d1d40d..6d2e4a5bc471 100644
> --- a/arch/powerpc/kernel/prom.c
> +++ b/arch/powerpc/kernel/prom.c
> @@ -65,6 +65,8 @@
>  #define DBG(fmt...)
>  #endif
> 
> +int *chip_id_lookup_table;
> +
>  #ifdef CONFIG_PPC64
>  int __initdata iommu_is_off;
>  int __initdata iommu_force_on;
> @@ -914,13 +916,22 @@ EXPORT_SYMBOL(of_get_ibm_chip_id);
>  int cpu_to_chip_id(int cpu)
>  {
>   struct device_node *np;
> + int ret = -1, idx;
> +
> + idx = cpu / threads_per_core;
> + if (chip_id_lookup_table && chip_id_lookup_table[idx] != -1)

The value -1 is ambiguous since we won't be able to determine if
it is because we haven't yet made a of_get_ibm_chip_id() call
or if of_get_ibm_chip_id() call was made and it returned a -1.

Thus, perhaps we can initialize chip_id_lookup_table[idx] with a
different unique negative value. How about S32_MIN ? and check
chip_id_lookup_table[idx] is different here ?


> + return chip_id_lookup_table[idx];
> 
>   np = of_get_cpu_node(cpu, NULL);
> - if (!np)
> - return -1;
> + if (np) {
> + ret = of_get_ibm_chip_id(np);
> + of_node_put(np);
> +
> + if (chip_id_lookup_table)
> + chip_id_lookup_table[idx] = ret;
> + }
> 
> - of_node_put(np);
> - return of_get_ibm_chip_id(np);
> + return ret;
>  }
>  EXPORT_SYMBOL(cpu_to_chip_id);
> 
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index 5c7ce1d50631..50520fbea424 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -1073,6 +1073,20 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
>   cpu_smallcore_mask(boot_cpuid));
>   }
> 
> + if (cpu_to_chip_id(boot_cpuid) != -1) {
> + int idx = num_possible_cpus() / threads_per_core;
> +
> + /*
> +  * All threads of a core will all belong to the same core,
> +  * chip_id_lookup_table will have one entry per core.
> +  * Assumption: if boot_cpuid doesn't have a chip-id, then no
> +  * other CPUs, will also not have chip-id.
> +  */
> + chip_id_lookup_table = kcalloc(idx, sizeof(int), GFP_KERNEL);
> + if (chip_id_lookup_table)
> + memset(chip_id_lookup_table, -1, sizeof(int) * idx);
> + }
> +
>   if (smp_ops && smp_ops->probe)
>   smp_ops->probe();
>  }
> @@ -1468,8 +1482,8 @@ static void add_cpu_to_masks(int cpu)
>  {
>   struct cpumask *(*submask_fn)(int) = cpu_sibling_mask;
>   int first_thread = cpu_first_thread_sibling(cpu);
> - int chip_id = cpu_to_chip_id(cpu);
>   cpumask_var_t mask;
> + int chip_id = -1;
>   bool ret;
>   int i;
> 
> @@ -1492,7 +1506,10 @@ static void add_cpu_to_masks(int cpu)
>   if (has_coregroup_support())
>   update_coregroup_mask(cpu, );
> 
> - if (chip_id == -1 || !ret) {
> + if (chip_id_lookup_table && ret)
> + chip_id = cpu_to_chip_id(cpu);
> +
> + if (chip_id == -1) {
>   cpumask_copy(per_cpu(cpu_core_map, cpu), cpu_cpu_mask(cpu));
>   goto 

Re: [PATCH v1 1/1] powerpc/papr_scm: Properly handle UUID types and API

2021-04-15 Thread Andy Shevchenko
On Thu, Apr 15, 2021 at 8:10 PM Vaibhav Jain  wrote:
>
>
> Thanks for the patch Andy,
>
> Unfortunately ran into a compilation issue due to missing "#include
> " that provides definition for
> get_unaligned_le64(). Gcc reported following error:
>
> error: implicit declaration of function ‘get_unaligned_le64’

Right, I have not tested it (as mentioned in the comments to the patch)

> After including the necessary header file, kernel compiled fine and I
> was able to test & verify the patch.

Thank you very much for the testing.

I'm not sure what the coverage of your test is. That's why I have an
additional question below. Is the byte ordering kept the same in BE
(32- and 64-bit) cases? Because I'm worrying that I might have missed
something.


-- 
With Best Regards,
Andy Shevchenko


[PATCH v1 5/5] powerpc/mm: Convert powerpc to GENERIC_PTDUMP

2021-04-15 Thread Christophe Leroy
This patch converts powerpc to the generic PTDUMP implementation.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/Kconfig  |   2 +
 arch/powerpc/Kconfig.debug|  30 --
 arch/powerpc/mm/Makefile  |   2 +-
 arch/powerpc/mm/mmu_decl.h|   2 +-
 arch/powerpc/mm/ptdump/8xx.c  |   6 +-
 arch/powerpc/mm/ptdump/Makefile   |   9 +-
 arch/powerpc/mm/ptdump/book3s64.c |   6 +-
 arch/powerpc/mm/ptdump/ptdump.c   | 161 +-
 arch/powerpc/mm/ptdump/shared.c   |   6 +-
 9 files changed, 68 insertions(+), 156 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 475d77a6ebbe..40259437a28f 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -120,6 +120,7 @@ config PPC
select ARCH_32BIT_OFF_T if PPC32
select ARCH_HAS_DEBUG_VIRTUAL
select ARCH_HAS_DEBUG_VM_PGTABLE
+   select ARCH_HAS_DEBUG_WXif STRICT_KERNEL_RWX
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_FORTIFY_SOURCE
@@ -177,6 +178,7 @@ config PPC
select GENERIC_IRQ_SHOW
select GENERIC_IRQ_SHOW_LEVEL
select GENERIC_PCI_IOMAPif PCI
+   select GENERIC_PTDUMP
select GENERIC_SMP_IDLE_THREAD
select GENERIC_STRNCPY_FROM_USER
select GENERIC_STRNLEN_USER
diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug
index 6342f9da4545..05b1180ea502 100644
--- a/arch/powerpc/Kconfig.debug
+++ b/arch/powerpc/Kconfig.debug
@@ -360,36 +360,6 @@ config FAIL_IOMMU
 
  If you are unsure, say N.
 
-config PPC_PTDUMP
-   bool "Export kernel pagetable layout to userspace via debugfs"
-   depends on DEBUG_KERNEL && DEBUG_FS
-   help
- This option exports the state of the kernel pagetables to a
- debugfs file. This is only useful for kernel developers who are
- working in architecture specific areas of the kernel - probably
- not a good idea to enable this feature in a production kernel.
-
- If you are unsure, say N.
-
-config PPC_DEBUG_WX
-   bool "Warn on W+X mappings at boot"
-   depends on PPC_PTDUMP && STRICT_KERNEL_RWX
-   help
- Generate a warning if any W+X mappings are found at boot.
-
- This is useful for discovering cases where the kernel is leaving
- W+X mappings after applying NX, as such mappings are a security risk.
-
- Note that even if the check fails, your kernel is possibly
- still fine, as W+X mappings are not a security hole in
- themselves, what they do is that they make the exploitation
- of other unfixed kernel bugs easier.
-
- There is no runtime or memory usage effect of this option
- once the kernel has booted up - it's a one time check.
-
- If in doubt, say "Y".
-
 config PPC_FAST_ENDIAN_SWITCH
bool "Deprecated fast endian-switch syscall"
depends on DEBUG_KERNEL && PPC_BOOK3S_64
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index c3df3a8501d4..c90d58aaebe2 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -18,5 +18,5 @@ obj-$(CONFIG_PPC_MM_SLICES)   += slice.o
 obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
 obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
 obj-$(CONFIG_PPC_COPRO_BASE)   += copro_fault.o
-obj-$(CONFIG_PPC_PTDUMP)   += ptdump/
+obj-$(CONFIG_PTDUMP_CORE)  += ptdump/
 obj-$(CONFIG_KASAN)+= kasan/
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index 7dac910c0b21..dd1cabc2ea0f 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -180,7 +180,7 @@ static inline void mmu_mark_rodata_ro(void) { }
 void __init mmu_mapin_immr(void);
 #endif
 
-#ifdef CONFIG_PPC_DEBUG_WX
+#ifdef CONFIG_DEBUG_WX
 void ptdump_check_wx(void);
 #else
 static inline void ptdump_check_wx(void) { }
diff --git a/arch/powerpc/mm/ptdump/8xx.c b/arch/powerpc/mm/ptdump/8xx.c
index 86da2a669680..fac932eb8f9a 100644
--- a/arch/powerpc/mm/ptdump/8xx.c
+++ b/arch/powerpc/mm/ptdump/8xx.c
@@ -75,8 +75,10 @@ static const struct flag_info flag_array[] = {
 };
 
 struct pgtable_level pg_level[5] = {
-   {
-   }, { /* pgd */
+   { /* pgd */
+   .flag   = flag_array,
+   .num= ARRAY_SIZE(flag_array),
+   }, { /* p4d */
.flag   = flag_array,
.num= ARRAY_SIZE(flag_array),
}, { /* pud */
diff --git a/arch/powerpc/mm/ptdump/Makefile b/arch/powerpc/mm/ptdump/Makefile
index 712762be3cb1..4050cbb55acf 100644
--- a/arch/powerpc/mm/ptdump/Makefile
+++ b/arch/powerpc/mm/ptdump/Makefile
@@ -5,5 +5,10 @@ obj-y  += ptdump.o
 obj-$(CONFIG_4xx)  += shared.o
 obj-$(CONFIG_PPC_8xx)  += 8xx.o
 obj-$(CONFIG_PPC_BOOK3E_MMU)   += shared.o
-obj-$(CONFIG_PPC_BOOK3S_32)+= shared.o bats.o segment_regs.o
-obj-$(CONFIG_PPC_BOOK3S_64)+= book3s64.o 

[PATCH v1 2/5] mm: ptdump: Fix build failure

2021-04-15 Thread Christophe Leroy
  CC  mm/ptdump.o
In file included from :
mm/ptdump.c: In function 'ptdump_pte_entry':
././include/linux/compiler_types.h:320:38: error: call to 
'__compiletime_assert_207' declared with attribute error: Unsupported access 
size for {READ,WRITE}_ONCE().
  320 |  _compiletime_assert(condition, msg, __compiletime_assert_, 
__COUNTER__)
  |  ^
././include/linux/compiler_types.h:301:4: note: in definition of macro 
'__compiletime_assert'
  301 |prefix ## suffix();\
  |^~
././include/linux/compiler_types.h:320:2: note: in expansion of macro 
'_compiletime_assert'
  320 |  _compiletime_assert(condition, msg, __compiletime_assert_, 
__COUNTER__)
  |  ^~~
./include/asm-generic/rwonce.h:36:2: note: in expansion of macro 
'compiletime_assert'
   36 |  compiletime_assert(__native_word(t) || sizeof(t) == 
sizeof(long long), \
  |  ^~
./include/asm-generic/rwonce.h:49:2: note: in expansion of macro 
'compiletime_assert_rwonce_type'
   49 |  compiletime_assert_rwonce_type(x);\
  |  ^~
mm/ptdump.c:114:14: note: in expansion of macro 'READ_ONCE'
  114 |  pte_t val = READ_ONCE(*pte);
  |  ^
make[2]: *** [mm/ptdump.o] Error 1

READ_ONCE() cannot be used for reading PTEs. Use ptep_get()
instead. See commit 481e980a7c19 ("mm: Allow arches to provide ptep_get()")
and commit c0e1c8c22beb ("powerpc/8xx: Provide ptep_get() with 16k pages")
for details.

Fixes: 30d621f6723b ("mm: add generic ptdump")
Cc: Steven Price 
Signed-off-by: Christophe Leroy 
---
 mm/ptdump.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/ptdump.c b/mm/ptdump.c
index 4354c1422d57..da751448d0e4 100644
--- a/mm/ptdump.c
+++ b/mm/ptdump.c
@@ -111,7 +111,7 @@ static int ptdump_pte_entry(pte_t *pte, unsigned long addr,
unsigned long next, struct mm_walk *walk)
 {
struct ptdump_state *st = walk->private;
-   pte_t val = READ_ONCE(*pte);
+   pte_t val = ptep_get(pte);
 
if (st->effective_prot)
st->effective_prot(st, 4, pte_val(val));
-- 
2.25.0



[PATCH v1 3/5] mm: ptdump: Provide page size to notepage()

2021-04-15 Thread Christophe Leroy
In order to support large pages on powerpc, notepage()
needs to know the page size of the page.

Add a page_size argument to notepage().

Signed-off-by: Christophe Leroy 
---
 arch/arm64/mm/ptdump.c |  2 +-
 arch/riscv/mm/ptdump.c |  2 +-
 arch/s390/mm/dump_pagetables.c |  3 ++-
 arch/x86/mm/dump_pagetables.c  |  2 +-
 include/linux/ptdump.h |  2 +-
 mm/ptdump.c| 16 
 6 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c
index 0e050d76b83a..ea1a1c3a3ea0 100644
--- a/arch/arm64/mm/ptdump.c
+++ b/arch/arm64/mm/ptdump.c
@@ -257,7 +257,7 @@ static void note_prot_wx(struct pg_state *st, unsigned long 
addr)
 }
 
 static void note_page(struct ptdump_state *pt_st, unsigned long addr, int 
level,
- u64 val)
+ u64 val, unsigned long page_size)
 {
struct pg_state *st = container_of(pt_st, struct pg_state, ptdump);
static const char units[] = "KMGTPE";
diff --git a/arch/riscv/mm/ptdump.c b/arch/riscv/mm/ptdump.c
index ace74dec7492..0a7f276ba799 100644
--- a/arch/riscv/mm/ptdump.c
+++ b/arch/riscv/mm/ptdump.c
@@ -235,7 +235,7 @@ static void note_prot_wx(struct pg_state *st, unsigned long 
addr)
 }
 
 static void note_page(struct ptdump_state *pt_st, unsigned long addr,
- int level, u64 val)
+ int level, u64 val, unsigned long page_size)
 {
struct pg_state *st = container_of(pt_st, struct pg_state, ptdump);
u64 pa = PFN_PHYS(pte_pfn(__pte(val)));
diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c
index e40a30647d99..29673c38e773 100644
--- a/arch/s390/mm/dump_pagetables.c
+++ b/arch/s390/mm/dump_pagetables.c
@@ -116,7 +116,8 @@ static void note_prot_wx(struct pg_state *st, unsigned long 
addr)
 #endif /* CONFIG_DEBUG_WX */
 }
 
-static void note_page(struct ptdump_state *pt_st, unsigned long addr, int 
level, u64 val)
+static void note_page(struct ptdump_state *pt_st, unsigned long addr, int 
level,
+ u64 val, unsigned long page_size)
 {
int width = sizeof(unsigned long) * 2;
static const char units[] = "KMGTPE";
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index e1b599ecbbc2..2ec76737c1f1 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -272,7 +272,7 @@ static void effective_prot(struct ptdump_state *pt_st, int 
level, u64 val)
  * print what we collected so far.
  */
 static void note_page(struct ptdump_state *pt_st, unsigned long addr, int 
level,
- u64 val)
+ u64 val, unsigned long page_size)
 {
struct pg_state *st = container_of(pt_st, struct pg_state, ptdump);
pgprotval_t new_prot, new_eff;
diff --git a/include/linux/ptdump.h b/include/linux/ptdump.h
index 2a3a95586425..3a971fadc95e 100644
--- a/include/linux/ptdump.h
+++ b/include/linux/ptdump.h
@@ -13,7 +13,7 @@ struct ptdump_range {
 struct ptdump_state {
/* level is 0:PGD to 4:PTE, or -1 if unknown */
void (*note_page)(struct ptdump_state *st, unsigned long addr,
- int level, u64 val);
+ int level, u64 val, unsigned long page_size);
void (*effective_prot)(struct ptdump_state *st, int level, u64 val);
const struct ptdump_range *range;
 };
diff --git a/mm/ptdump.c b/mm/ptdump.c
index da751448d0e4..61cd16afb1c8 100644
--- a/mm/ptdump.c
+++ b/mm/ptdump.c
@@ -17,7 +17,7 @@ static inline int note_kasan_page_table(struct mm_walk *walk,
 {
struct ptdump_state *st = walk->private;
 
-   st->note_page(st, addr, 4, pte_val(kasan_early_shadow_pte[0]));
+   st->note_page(st, addr, 4, pte_val(kasan_early_shadow_pte[0]), 
PAGE_SIZE);
 
walk->action = ACTION_CONTINUE;
 
@@ -41,7 +41,7 @@ static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr,
st->effective_prot(st, 0, pgd_val(val));
 
if (pgd_leaf(val))
-   st->note_page(st, addr, 0, pgd_val(val));
+   st->note_page(st, addr, 0, pgd_val(val), PGDIR_SIZE);
 
return 0;
 }
@@ -62,7 +62,7 @@ static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr,
st->effective_prot(st, 1, p4d_val(val));
 
if (p4d_leaf(val))
-   st->note_page(st, addr, 1, p4d_val(val));
+   st->note_page(st, addr, 1, p4d_val(val), P4D_SIZE);
 
return 0;
 }
@@ -83,7 +83,7 @@ static int ptdump_pud_entry(pud_t *pud, unsigned long addr,
st->effective_prot(st, 2, pud_val(val));
 
if (pud_leaf(val))
-   st->note_page(st, addr, 2, pud_val(val));
+   st->note_page(st, addr, 2, pud_val(val), PUD_SIZE);
 
return 0;
 }
@@ -102,7 +102,7 @@ static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr,
if (st->effective_prot)
st->effective_prot(st, 3, pmd_val(val));

[PATCH v1 1/5] mm: pagewalk: Fix walk for hugepage tables

2021-04-15 Thread Christophe Leroy
Pagewalk ignores hugepd entries and walk down the tables
as if it was traditionnal entries, leading to crazy result.

Add walk_hugepd_range() and use it to walk hugepage tables.

Signed-off-by: Christophe Leroy 
---
 mm/pagewalk.c | 54 +--
 1 file changed, 48 insertions(+), 6 deletions(-)

diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index e81640d9f177..410a9d8f7572 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -58,6 +58,32 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, 
unsigned long end,
return err;
 }
 
+static int walk_hugepd_range(hugepd_t *phpd, unsigned long addr,
+unsigned long end, struct mm_walk *walk, int 
pdshift)
+{
+   int err = 0;
+#ifdef CONFIG_ARCH_HAS_HUGEPD
+   const struct mm_walk_ops *ops = walk->ops;
+   int shift = hugepd_shift(*phpd);
+   int page_size = 1 << shift;
+
+   if (addr & (page_size - 1))
+   return 0;
+
+   for (;;) {
+   pte_t *pte = hugepte_offset(*phpd, addr, pdshift);
+
+   err = ops->pte_entry(pte, addr, addr + page_size, walk);
+   if (err)
+   break;
+   if (addr >= end - page_size)
+   break;
+   addr += page_size;
+   }
+#endif
+   return err;
+}
+
 static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
  struct mm_walk *walk)
 {
@@ -108,7 +134,10 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, 
unsigned long end,
goto again;
}
 
-   err = walk_pte_range(pmd, addr, next, walk);
+   if (is_hugepd(__hugepd(pmd_val(*pmd
+   err = walk_hugepd_range((hugepd_t *)pmd, addr, next, 
walk, PMD_SHIFT);
+   else
+   err = walk_pte_range(pmd, addr, next, walk);
if (err)
break;
} while (pmd++, addr = next, addr != end);
@@ -157,7 +186,10 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, 
unsigned long end,
if (pud_none(*pud))
goto again;
 
-   err = walk_pmd_range(pud, addr, next, walk);
+   if (is_hugepd(__hugepd(pud_val(*pud
+   err = walk_hugepd_range((hugepd_t *)pud, addr, next, 
walk, PUD_SHIFT);
+   else
+   err = walk_pmd_range(pud, addr, next, walk);
if (err)
break;
} while (pud++, addr = next, addr != end);
@@ -189,8 +221,13 @@ static int walk_p4d_range(pgd_t *pgd, unsigned long addr, 
unsigned long end,
if (err)
break;
}
-   if (ops->pud_entry || ops->pmd_entry || ops->pte_entry)
-   err = walk_pud_range(p4d, addr, next, walk);
+   if (ops->pud_entry || ops->pmd_entry || ops->pte_entry) {
+   if (is_hugepd(__hugepd(p4d_val(*p4d
+   err = walk_hugepd_range((hugepd_t *)p4d, addr, 
next, walk,
+   P4D_SHIFT);
+   else
+   err = walk_pud_range(p4d, addr, next, walk);
+   }
if (err)
break;
} while (p4d++, addr = next, addr != end);
@@ -225,8 +262,13 @@ static int walk_pgd_range(unsigned long addr, unsigned 
long end,
break;
}
if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry ||
-   ops->pte_entry)
-   err = walk_p4d_range(pgd, addr, next, walk);
+   ops->pte_entry) {
+   if (is_hugepd(__hugepd(pgd_val(*pgd
+   err = walk_hugepd_range((hugepd_t *)pgd, addr, 
next, walk,
+   PGDIR_SHIFT);
+   else
+   err = walk_p4d_range(pgd, addr, next, walk);
+   }
if (err)
break;
} while (pgd++, addr = next, addr != end);
-- 
2.25.0



[PATCH v1 0/5] Convert powerpc to GENERIC_PTDUMP

2021-04-15 Thread Christophe Leroy
This series converts powerpc to generic PTDUMP.

For that, we first need to add missing hugepd support
to pagewalk and ptdump.

Christophe Leroy (5):
  mm: pagewalk: Fix walk for hugepage tables
  mm: ptdump: Fix build failure
  mm: ptdump: Provide page size to notepage()
  mm: ptdump: Support hugepd table entries
  powerpc/mm: Convert powerpc to GENERIC_PTDUMP

 arch/arm64/mm/ptdump.c|   2 +-
 arch/powerpc/Kconfig  |   2 +
 arch/powerpc/Kconfig.debug|  30 --
 arch/powerpc/mm/Makefile  |   2 +-
 arch/powerpc/mm/mmu_decl.h|   2 +-
 arch/powerpc/mm/ptdump/8xx.c  |   6 +-
 arch/powerpc/mm/ptdump/Makefile   |   9 +-
 arch/powerpc/mm/ptdump/book3s64.c |   6 +-
 arch/powerpc/mm/ptdump/ptdump.c   | 161 +-
 arch/powerpc/mm/ptdump/shared.c   |   6 +-
 arch/riscv/mm/ptdump.c|   2 +-
 arch/s390/mm/dump_pagetables.c|   3 +-
 arch/x86/mm/dump_pagetables.c |   2 +-
 include/linux/ptdump.h|   2 +-
 mm/pagewalk.c |  54 --
 mm/ptdump.c   |  33 --
 16 files changed, 145 insertions(+), 177 deletions(-)

-- 
2.25.0



[PATCH v1 4/5] mm: ptdump: Support hugepd table entries

2021-04-15 Thread Christophe Leroy
Which hugepd, page table entries can be at any level
and can be of any size.

Add support for them.

Signed-off-by: Christophe Leroy 
---
 mm/ptdump.c | 17 +++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/mm/ptdump.c b/mm/ptdump.c
index 61cd16afb1c8..6efdb8c15a7d 100644
--- a/mm/ptdump.c
+++ b/mm/ptdump.c
@@ -112,11 +112,24 @@ static int ptdump_pte_entry(pte_t *pte, unsigned long 
addr,
 {
struct ptdump_state *st = walk->private;
pte_t val = ptep_get(pte);
+   unsigned long page_size = next - addr;
+   int level;
+
+   if (page_size >= PGDIR_SIZE)
+   level = 0;
+   else if (page_size >= P4D_SIZE)
+   level = 1;
+   else if (page_size >= PUD_SIZE)
+   level = 2;
+   else if (page_size >= PMD_SIZE)
+   level = 3;
+   else
+   level = 4;
 
if (st->effective_prot)
-   st->effective_prot(st, 4, pte_val(val));
+   st->effective_prot(st, level, pte_val(val));
 
-   st->note_page(st, addr, 4, pte_val(val), PAGE_SIZE);
+   st->note_page(st, addr, level, pte_val(val), page_size);
 
return 0;
 }
-- 
2.25.0



Re: [PATCH 1/3] powerpc/smp: Reintroduce cpu_core_mask

2021-04-15 Thread Gautham R Shenoy
Hi Srikar,

On Thu, Apr 15, 2021 at 05:39:32PM +0530, Srikar Dronamraju wrote:
 [..snip..]



> @@ -1485,12 +1486,36 @@ static void add_cpu_to_masks(int cpu)
>   add_cpu_to_smallcore_masks(cpu);
> 
>   /* In CPU-hotplug path, hence use GFP_ATOMIC */
> - alloc_cpumask_var_node(, GFP_ATOMIC, cpu_to_node(cpu));
> + ret = alloc_cpumask_var_node(, GFP_ATOMIC, cpu_to_node(cpu));
>   update_mask_by_l2(cpu, );
> 
>   if (has_coregroup_support())
>   update_coregroup_mask(cpu, );
> 
> + if (chip_id == -1 || !ret) {
> + cpumask_copy(per_cpu(cpu_core_map, cpu), cpu_cpu_mask(cpu));
> + goto out;
> + }
> +
> + if (shared_caches)
> + submask_fn = cpu_l2_cache_mask;
> +
> + /* Update core_mask with all the CPUs that are part of submask */
> + or_cpumasks_related(cpu, cpu, submask_fn, cpu_core_mask);
>

If coregroups exist, we can add the cpus of the coregroup to the
cpu_core_mask thereby reducing the scope of the for_each_cpu() search
below. This will still cut down the time on Baremetal systems
supporting coregroups.


> + /* Skip all CPUs already part of current CPU core mask */
> + cpumask_andnot(mask, cpu_online_mask, cpu_core_mask(cpu));
> +
> + for_each_cpu(i, mask) {
> + if (chip_id == cpu_to_chip_id(i)) {
> + or_cpumasks_related(cpu, i, submask_fn, cpu_core_mask);
> + cpumask_andnot(mask, mask, submask_fn(i));
> + } else {
> + cpumask_andnot(mask, mask, cpu_core_mask(i));
> + }
> + }
> +
> +out:
>   free_cpumask_var(mask);
>  }
> 
> -- 
> 2.25.1
> 


Re: [PATCH v1 1/1] powerpc/papr_scm: Properly handle UUID types and API

2021-04-15 Thread Vaibhav Jain


Thanks for the patch Andy,

Unfortunately ran into a compilation issue due to missing "#include
" that provides definition for
get_unaligned_le64(). Gcc reported following error:
 
error: implicit declaration of function ‘get_unaligned_le64’

After including the necessary header file, kernel compiled fine and I
was able to test & verify the patch.

-- 
Cheers
~ Vaibhav

Andy Shevchenko  writes:

> Parse to and export from UUID own type, before dereferencing.
> This also fixes wrong comment (Little Endian UUID is something else)
> and should fix Sparse warnings about assigning strict types to POD.
>
> Fixes: 43001c52b603 ("powerpc/papr_scm: Use ibm,unit-guid as the iset cookie")
> Fixes: 259a948c4ba1 ("powerpc/pseries/scm: Use a specific endian format for 
> storing uuid from the device tree")
> Cc: Oliver O'Halloran 
> Cc: Aneesh Kumar K.V 
> Signed-off-by: Andy Shevchenko 
> ---
> Not tested
>  arch/powerpc/platforms/pseries/papr_scm.c | 13 -
>  1 file changed, 8 insertions(+), 5 deletions(-)
>
> diff --git a/arch/powerpc/platforms/pseries/papr_scm.c 
> b/arch/powerpc/platforms/pseries/papr_scm.c
> index ae6f5d80d5ce..4366e1902890 100644
> --- a/arch/powerpc/platforms/pseries/papr_scm.c
> +++ b/arch/powerpc/platforms/pseries/papr_scm.c
> @@ -1085,8 +1085,9 @@ static int papr_scm_probe(struct platform_device *pdev)
>   u32 drc_index, metadata_size;
>   u64 blocks, block_size;
>   struct papr_scm_priv *p;
> + u8 uuid_raw[UUID_SIZE];
>   const char *uuid_str;
> - u64 uuid[2];
> + uuid_t uuid;
>   int rc;
>  
>   /* check we have all the required DT properties */
> @@ -1129,16 +1130,18 @@ static int papr_scm_probe(struct platform_device 
> *pdev)
>   p->hcall_flush_required = of_property_read_bool(dn, 
> "ibm,hcall-flush-required");
>  
>   /* We just need to ensure that set cookies are unique across */
> - uuid_parse(uuid_str, (uuid_t *) uuid);
> + uuid_parse(uuid_str, );
> +
>   /*
>* cookie1 and cookie2 are not really little endian
> -  * we store a little endian representation of the
> +  * we store a raw buffer representation of the
>* uuid str so that we can compare this with the label
>* area cookie irrespective of the endian config with which
>* the kernel is built.
>*/
> - p->nd_set.cookie1 = cpu_to_le64(uuid[0]);
> - p->nd_set.cookie2 = cpu_to_le64(uuid[1]);
> + export_uuid(uuid_raw, );
> + p->nd_set.cookie1 = get_unaligned_le64(_raw[0]);
> + p->nd_set.cookie2 = get_unaligned_le64(_raw[8]);
>  
>   /* might be zero */
>   p->metadata_size = metadata_size;
> -- 
> 2.30.2
>



Re: [PATCH] powerpc/papr_scm: Reduce error severity if nvdimm stats inaccessible

2021-04-15 Thread Dan Williams
On Thu, Apr 15, 2021 at 4:44 AM Vaibhav Jain  wrote:
>
> Thanks for looking into this Dan,
>
> Dan Williams  writes:
>
> > On Wed, Apr 14, 2021 at 5:40 AM Vaibhav Jain  wrote:
> >>
> >> Currently drc_pmem_qeury_stats() generates a dev_err in case
> >> "Enable Performance Information Collection" feature is disabled from
> >> HMC. The error is of the form below:
> >>
> >> papr_scm ibm,persistent-memory:ibm,pmemory@44104001: Failed to query
> >>  performance stats, Err:-10
> >>
> >> This error message confuses users as it implies a possible problem
> >> with the nvdimm even though its due to a disabled feature.
> >>
> >> So we fix this by explicitly handling the H_AUTHORITY error from the
> >> H_SCM_PERFORMANCE_STATS hcall and generating a warning instead of an
> >> error, saying that "Performance stats in-accessible".
> >>
> >> Fixes: 2d02bf835e57('powerpc/papr_scm: Fetch nvdimm performance stats from 
> >> PHYP')
> >> Signed-off-by: Vaibhav Jain 
> >> ---
> >>  arch/powerpc/platforms/pseries/papr_scm.c | 3 +++
> >>  1 file changed, 3 insertions(+)
> >>
> >> diff --git a/arch/powerpc/platforms/pseries/papr_scm.c 
> >> b/arch/powerpc/platforms/pseries/papr_scm.c
> >> index 835163f54244..9216424f8be3 100644
> >> --- a/arch/powerpc/platforms/pseries/papr_scm.c
> >> +++ b/arch/powerpc/platforms/pseries/papr_scm.c
> >> @@ -277,6 +277,9 @@ static ssize_t drc_pmem_query_stats(struct 
> >> papr_scm_priv *p,
> >> dev_err(>pdev->dev,
> >> "Unknown performance stats, Err:0x%016lX\n", 
> >> ret[0]);
> >> return -ENOENT;
> >> +   } else if (rc == H_AUTHORITY) {
> >> +   dev_warn(>pdev->dev, "Performance stats in-accessible");
> >> +   return -EPERM;
> >
> > So userspace can spam the kernel log? Why is kernel log message needed
> > at all? EPERM told the caller what happened.
> Currently this error message is only reported during probe of the
> nvdimm. So userspace cannot directly spam kernel log.

Oh, ok, I saw things like papr_pdsm_fuel_gauge() in the call stack and
thought this was reachable through an ioctl. Sorry for the noise.


[PATCH v1 1/1] powerpc/papr_scm: Properly handle UUID types and API

2021-04-15 Thread Andy Shevchenko
Parse to and export from UUID own type, before dereferencing.
This also fixes wrong comment (Little Endian UUID is something else)
and should fix Sparse warnings about assigning strict types to POD.

Fixes: 43001c52b603 ("powerpc/papr_scm: Use ibm,unit-guid as the iset cookie")
Fixes: 259a948c4ba1 ("powerpc/pseries/scm: Use a specific endian format for 
storing uuid from the device tree")
Cc: Oliver O'Halloran 
Cc: Aneesh Kumar K.V 
Signed-off-by: Andy Shevchenko 
---
Not tested
 arch/powerpc/platforms/pseries/papr_scm.c | 13 -
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/papr_scm.c 
b/arch/powerpc/platforms/pseries/papr_scm.c
index ae6f5d80d5ce..4366e1902890 100644
--- a/arch/powerpc/platforms/pseries/papr_scm.c
+++ b/arch/powerpc/platforms/pseries/papr_scm.c
@@ -1085,8 +1085,9 @@ static int papr_scm_probe(struct platform_device *pdev)
u32 drc_index, metadata_size;
u64 blocks, block_size;
struct papr_scm_priv *p;
+   u8 uuid_raw[UUID_SIZE];
const char *uuid_str;
-   u64 uuid[2];
+   uuid_t uuid;
int rc;
 
/* check we have all the required DT properties */
@@ -1129,16 +1130,18 @@ static int papr_scm_probe(struct platform_device *pdev)
p->hcall_flush_required = of_property_read_bool(dn, 
"ibm,hcall-flush-required");
 
/* We just need to ensure that set cookies are unique across */
-   uuid_parse(uuid_str, (uuid_t *) uuid);
+   uuid_parse(uuid_str, );
+
/*
 * cookie1 and cookie2 are not really little endian
-* we store a little endian representation of the
+* we store a raw buffer representation of the
 * uuid str so that we can compare this with the label
 * area cookie irrespective of the endian config with which
 * the kernel is built.
 */
-   p->nd_set.cookie1 = cpu_to_le64(uuid[0]);
-   p->nd_set.cookie2 = cpu_to_le64(uuid[1]);
+   export_uuid(uuid_raw, );
+   p->nd_set.cookie1 = get_unaligned_le64(_raw[0]);
+   p->nd_set.cookie2 = get_unaligned_le64(_raw[8]);
 
/* might be zero */
p->metadata_size = metadata_size;
-- 
2.30.2



Re: consolidate the flock uapi definitions

2021-04-15 Thread Heiko Carstens
On Mon, Apr 12, 2021 at 10:55:40AM +0200, Christoph Hellwig wrote:
> Hi all,
> 
> currently we deal with the slight differents in the various architecture
> variants of the flock and flock64 stuctures in a very cruft way.  This
> series switches to just use small arch hooks and define the rest in
> asm-generic and linux/compat.h instead.
> 
> Diffstat:
>  arch/arm64/include/asm/compat.h|   20 
>  arch/mips/include/asm/compat.h |   23 ++-
>  arch/mips/include/uapi/asm/fcntl.h |   28 +++-
>  arch/parisc/include/asm/compat.h   |   16 
>  arch/powerpc/include/asm/compat.h  |   20 
>  arch/s390/include/asm/compat.h |   20 
>  arch/sparc/include/asm/compat.h|   22 +-
>  arch/x86/include/asm/compat.h  |   24 +++-
>  include/linux/compat.h |   31 +++
>  include/uapi/asm-generic/fcntl.h   |   21 +++--
>  tools/include/uapi/asm-generic/fcntl.h |   21 +++--
>  11 files changed, 54 insertions(+), 192 deletions(-)

for the s390 bits:
Acked-by: Heiko Carstens 


Re: [PATCH 0/3] Reintroduce cpu_core_mask

2021-04-15 Thread Daniel Henrique Barboza

Hi,


Using a QEMU pseries guest with this follwing SMP topology, with a
single NUMA node:


(...) -smp 32,threads=4,cores=4,sockets=2, (...)

This is the output of lscpu with a guest running v5.12-rc5:

[root@localhost ~]# lscpu
Architecture:ppc64le
Byte Order:  Little Endian
CPU(s):  32
On-line CPU(s) list: 0-31
Thread(s) per core:  4
Core(s) per socket:  8
Socket(s):   1
NUMA node(s):1
Model:   2.2 (pvr 004e 1202)
Model name:  POWER9 (architected), altivec supported
Hypervisor vendor:   KVM
Virtualization type: para
L1d cache:   32K
L1i cache:   32K
NUMA node0 CPU(s):   0-31
[root@localhost ~]#


The changes with cpu_core_mask made the topology sockets matching NUMA nodes.
In this case, given that we have a single NUMA node, the SMP topology got
adjusted to have 8 cores instead of 4 so we can have a single socket as well.

Although sockets equal to NUMA nodes is true for Power hardware, QEMU doesn't
have this constraint and users expect sockets and NUMA nodes to be kind of
independent, regardless of how unpractical that would be with real hardware.


The same guest running a kernel with this series applied:


[root@localhost ~]# lscpu
Architecture:ppc64le
Byte Order:  Little Endian
CPU(s):  32
On-line CPU(s) list: 0-31
Thread(s) per core:  4
Core(s) per socket:  4
Socket(s):   2
NUMA node(s):1
Model:   2.2 (pvr 004e 1202)
Model name:  POWER9 (architected), altivec supported
Hypervisor vendor:   KVM
Virtualization type: para
L1d cache:   32K
L1i cache:   32K
NUMA node0 CPU(s):   0-31


The sockets and NUMA nodes are being represented separately, as intended via
the QEMU command line.


Thanks for the looking this up, Srikar. For all patches:


Tested-by: Daniel Henrique Barboza 



On 4/15/21 9:09 AM, Srikar Dronamraju wrote:

Daniel had reported that
  QEMU is now unable to see requested topologies in a multi socket single
  NUMA node configurations.
  -smp 8,maxcpus=8,cores=2,threads=2,sockets=2

This patchset reintroduces cpu_core_mask so that users can see requested
topologies while still maintaining the boot time of very large system
configurations.

It includes caching the chip_id as suggested by Michael Ellermann

4 Threads/Core; 4 cores/Socket; 4 Sockets/Node, 2 Nodes in System
   -numa node,nodeid=0,memdev=m0 \
   -numa node,nodeid=1,memdev=m1 \
   -smp 128,sockets=8,threads=4,maxcpus=128  \

5.12.0-rc5 (or any kernel with commit 4ca234a9cbd7)
---
srikar@cloudy:~$ lscpu
Architecture:ppc64le
Byte Order:  Little Endian
CPU(s):  128
On-line CPU(s) list: 0-127
Thread(s) per core:  4
Core(s) per socket:  16
Socket(s):   2 <-
NUMA node(s):2
Model:   2.3 (pvr 004e 1203)
Model name:  POWER9 (architected), altivec supported
Hypervisor vendor:   KVM
Virtualization type: para
L1d cache:   1 MiB
L1i cache:   1 MiB
NUMA node0 CPU(s):   0-15,32-47,64-79,96-111
NUMA node1 CPU(s):   16-31,48-63,80-95,112-127
--
srikar@cloudy:~$ dmesg |grep smp
[0.010658] smp: Bringing up secondary CPUs ...
[0.424681] smp: Brought up 2 nodes, 128 CPUs
--

5.12.0-rc5 + 3 patches
--
srikar@cloudy:~$ lscpu
Architecture:ppc64le
Byte Order:  Little Endian
CPU(s):  128
On-line CPU(s) list: 0-127
Thread(s) per core:  4
Core(s) per socket:  4
Socket(s):   8-
NUMA node(s):2
Model:   2.3 (pvr 004e 1203)
Model name:  POWER9 (architected), altivec supported
Hypervisor vendor:   KVM
Virtualization type: para
L1d cache:   1 MiB
L1i cache:   1 MiB
NUMA node0 CPU(s):   0-15,32-47,64-79,96-111
NUMA node1 CPU(s):   16-31,48-63,80-95,112-127
--
srikar@cloudy:~$ dmesg |grep smp
[0.010372] smp: Bringing up secondary CPUs ...
[0.417892] smp: Brought up 2 nodes, 128 CPUs

5.12.0-rc5
--
srikar@cloudy:~$  lscpu
Architecture:ppc64le
Byte Order:  Little Endian
CPU(s):  1024
On-line CPU(s) list: 0-1023
Thread(s) per core:  8
Core(s) per socket:  128
Socket(s):   1
NUMA node(s):1
Model:   2.3 (pvr 004e 1203)
Model name:  POWER9 (architected), altivec supported
Hypervisor vendor:   KVM
Virtualization type: para
L1d cache:

[PATCH 1/3] powerpc/smp: Reintroduce cpu_core_mask

2021-04-15 Thread Srikar Dronamraju
Daniel reported that with Commit 4ca234a9cbd7 ("powerpc/smp: Stop
updating cpu_core_mask") QEMU was unable to set single NUMA node SMP
topologies such as:
 -smp 8,maxcpus=8,cores=2,threads=2,sockets=2
 i.e he expected 2 sockets in one NUMA node.

The above commit helped to reduce boot time on Large Systems for
example 4096 vCPU single socket QEMU instance. PAPR is silent on
having more than one socket within a NUMA node.

cpu_core_mask and cpu_cpu_mask for any CPU would be same unless the
number of sockets is different from the number of NUMA nodes.

One option is to reintroduce cpu_core_mask but use a slightly
different method to arrive at the cpu_core_mask. Previously each CPU's
chip-id would be compared with all other CPU's chip-id to verify if
both the CPUs were related at the chip level. Now if a CPU 'A' is
found related / (unrelated) to another CPU 'B', all the thread
siblings of 'A' and thread siblings of 'B' are automatically marked as
related / (unrelated).

Also if a platform doesn't support ibm,chip-id property, i.e its
cpu_to_chip_id returns -1, cpu_core_map holds a copy of
cpu_cpu_mask().

Fixes: 4ca234a9cbd7 ("powerpc/smp: Stop updating cpu_core_mask")
Cc: linuxppc-dev@lists.ozlabs.org
Cc: qemu-...@nongnu.org
Cc: Cedric Le Goater 
Cc: David Gibson 
Cc: Nathan Lynch 
Cc: Michael Ellerman 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Gautham R Shenoy 
Reported-by: Daniel Henrique Barboza 
Signed-off-by: Srikar Dronamraju 
---
 arch/powerpc/include/asm/smp.h |  5 +
 arch/powerpc/kernel/smp.c  | 39 --
 2 files changed, 37 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index 7a13bc20f0a0..47081a9e13ca 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -121,6 +121,11 @@ static inline struct cpumask *cpu_sibling_mask(int cpu)
return per_cpu(cpu_sibling_map, cpu);
 }
 
+static inline struct cpumask *cpu_core_mask(int cpu)
+{
+   return per_cpu(cpu_core_map, cpu);
+}
+
 static inline struct cpumask *cpu_l2_cache_mask(int cpu)
 {
return per_cpu(cpu_l2_cache_map, cpu);
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 5a4d59a1070d..5c7ce1d50631 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1057,17 +1057,12 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
local_memory_node(numa_cpu_lookup_table[cpu]));
}
 #endif
-   /*
-* cpu_core_map is now more updated and exists only since
-* its been exported for long. It only will have a snapshot
-* of cpu_cpu_mask.
-*/
-   cpumask_copy(per_cpu(cpu_core_map, cpu), cpu_cpu_mask(cpu));
}
 
/* Init the cpumasks so the boot CPU is related to itself */
cpumask_set_cpu(boot_cpuid, cpu_sibling_mask(boot_cpuid));
cpumask_set_cpu(boot_cpuid, cpu_l2_cache_mask(boot_cpuid));
+   cpumask_set_cpu(boot_cpuid, cpu_core_mask(boot_cpuid));
 
if (has_coregroup_support())
cpumask_set_cpu(boot_cpuid, cpu_coregroup_mask(boot_cpuid));
@@ -1408,6 +1403,9 @@ static void remove_cpu_from_masks(int cpu)
set_cpus_unrelated(cpu, i, cpu_smallcore_mask);
}
 
+   for_each_cpu(i, cpu_core_mask(cpu))
+   set_cpus_unrelated(cpu, i, cpu_core_mask);
+
if (has_coregroup_support()) {
for_each_cpu(i, cpu_coregroup_mask(cpu))
set_cpus_unrelated(cpu, i, cpu_coregroup_mask);
@@ -1468,8 +1466,11 @@ static void update_coregroup_mask(int cpu, cpumask_var_t 
*mask)
 
 static void add_cpu_to_masks(int cpu)
 {
+   struct cpumask *(*submask_fn)(int) = cpu_sibling_mask;
int first_thread = cpu_first_thread_sibling(cpu);
+   int chip_id = cpu_to_chip_id(cpu);
cpumask_var_t mask;
+   bool ret;
int i;
 
/*
@@ -1485,12 +1486,36 @@ static void add_cpu_to_masks(int cpu)
add_cpu_to_smallcore_masks(cpu);
 
/* In CPU-hotplug path, hence use GFP_ATOMIC */
-   alloc_cpumask_var_node(, GFP_ATOMIC, cpu_to_node(cpu));
+   ret = alloc_cpumask_var_node(, GFP_ATOMIC, cpu_to_node(cpu));
update_mask_by_l2(cpu, );
 
if (has_coregroup_support())
update_coregroup_mask(cpu, );
 
+   if (chip_id == -1 || !ret) {
+   cpumask_copy(per_cpu(cpu_core_map, cpu), cpu_cpu_mask(cpu));
+   goto out;
+   }
+
+   if (shared_caches)
+   submask_fn = cpu_l2_cache_mask;
+
+   /* Update core_mask with all the CPUs that are part of submask */
+   or_cpumasks_related(cpu, cpu, submask_fn, cpu_core_mask);
+
+   /* Skip all CPUs already part of current CPU core mask */
+   cpumask_andnot(mask, cpu_online_mask, cpu_core_mask(cpu));
+
+   for_each_cpu(i, mask) {
+   if 

[PATCH 3/3] powerpc/smp: Cache CPU to chip lookup

2021-04-15 Thread Srikar Dronamraju
On systems with large CPUs per node, even with the filtered matching of
related CPUs, there can be large number of calls to cpu_to_chip_id for
the same CPU. For example with 4096 vCPU, 1 node QEMU configuration,
with 4 threads per core, system could be see upto 1024 calls to
cpu_to_chip_id() for the same CPU. On a given system, cpu_to_chip_id()
for a given CPU would always return the same. Hence cache the result in
a lookup table for use in subsequent calls.

Since all CPUs sharing the same core will belong to the same chip, the
lookup_table has an entry for one CPU per core.  chip_id_lookup_table is
not being freed and would be used on subsequent CPU online post CPU
offline.

Suggested-by: Michael Ellerman 
Cc: linuxppc-dev@lists.ozlabs.org
Cc: qemu-...@nongnu.org
Cc: Cedric Le Goater 
Cc: David Gibson 
Cc: Nathan Lynch 
Cc: Michael Ellerman 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Gautham R Shenoy 
Reported-by: Daniel Henrique Barboza 
Signed-off-by: Srikar Dronamraju 
---
 arch/powerpc/include/asm/smp.h |  1 +
 arch/powerpc/kernel/prom.c | 19 +++
 arch/powerpc/kernel/smp.c  | 21 +++--
 3 files changed, 35 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index 47081a9e13ca..03b3d010cbab 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -31,6 +31,7 @@ extern u32 *cpu_to_phys_id;
 extern bool coregroup_enabled;
 
 extern int cpu_to_chip_id(int cpu);
+extern int *chip_id_lookup_table;
 
 #ifdef CONFIG_SMP
 
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index 9a4797d1d40d..6d2e4a5bc471 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -65,6 +65,8 @@
 #define DBG(fmt...)
 #endif
 
+int *chip_id_lookup_table;
+
 #ifdef CONFIG_PPC64
 int __initdata iommu_is_off;
 int __initdata iommu_force_on;
@@ -914,13 +916,22 @@ EXPORT_SYMBOL(of_get_ibm_chip_id);
 int cpu_to_chip_id(int cpu)
 {
struct device_node *np;
+   int ret = -1, idx;
+
+   idx = cpu / threads_per_core;
+   if (chip_id_lookup_table && chip_id_lookup_table[idx] != -1)
+   return chip_id_lookup_table[idx];
 
np = of_get_cpu_node(cpu, NULL);
-   if (!np)
-   return -1;
+   if (np) {
+   ret = of_get_ibm_chip_id(np);
+   of_node_put(np);
+
+   if (chip_id_lookup_table)
+   chip_id_lookup_table[idx] = ret;
+   }
 
-   of_node_put(np);
-   return of_get_ibm_chip_id(np);
+   return ret;
 }
 EXPORT_SYMBOL(cpu_to_chip_id);
 
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 5c7ce1d50631..50520fbea424 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1073,6 +1073,20 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
cpu_smallcore_mask(boot_cpuid));
}
 
+   if (cpu_to_chip_id(boot_cpuid) != -1) {
+   int idx = num_possible_cpus() / threads_per_core;
+
+   /*
+* All threads of a core will all belong to the same core,
+* chip_id_lookup_table will have one entry per core.
+* Assumption: if boot_cpuid doesn't have a chip-id, then no
+* other CPUs, will also not have chip-id.
+*/
+   chip_id_lookup_table = kcalloc(idx, sizeof(int), GFP_KERNEL);
+   if (chip_id_lookup_table)
+   memset(chip_id_lookup_table, -1, sizeof(int) * idx);
+   }
+
if (smp_ops && smp_ops->probe)
smp_ops->probe();
 }
@@ -1468,8 +1482,8 @@ static void add_cpu_to_masks(int cpu)
 {
struct cpumask *(*submask_fn)(int) = cpu_sibling_mask;
int first_thread = cpu_first_thread_sibling(cpu);
-   int chip_id = cpu_to_chip_id(cpu);
cpumask_var_t mask;
+   int chip_id = -1;
bool ret;
int i;
 
@@ -1492,7 +1506,10 @@ static void add_cpu_to_masks(int cpu)
if (has_coregroup_support())
update_coregroup_mask(cpu, );
 
-   if (chip_id == -1 || !ret) {
+   if (chip_id_lookup_table && ret)
+   chip_id = cpu_to_chip_id(cpu);
+
+   if (chip_id == -1) {
cpumask_copy(per_cpu(cpu_core_map, cpu), cpu_cpu_mask(cpu));
goto out;
}
-- 
2.25.1



[PATCH 2/3] Revert "powerpc/topology: Update topology_core_cpumask"

2021-04-15 Thread Srikar Dronamraju
Now that cpu_core_mask has been reintroduced, lets revert
commit 4bce545903fa ("powerpc/topology: Update topology_core_cpumask")

Post this commit, lscpu should reflect topologies as requested by a user
when a QEMU instance is launched with NUMA spanning multiple sockets.

Cc: linuxppc-dev@lists.ozlabs.org
Cc: qemu-...@nongnu.org
Cc: Cedric Le Goater 
Cc: David Gibson 
Cc: Nathan Lynch 
Cc: Michael Ellerman 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Gautham R Shenoy 
Reported-by: Daniel Henrique Barboza 
Signed-off-by: Srikar Dronamraju 
---
 arch/powerpc/include/asm/topology.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/topology.h 
b/arch/powerpc/include/asm/topology.h
index 3beeb030cd78..e4db64c0e184 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -126,7 +126,7 @@ static inline int cpu_to_coregroup_id(int cpu)
 #define topology_physical_package_id(cpu)  (cpu_to_chip_id(cpu))
 
 #define topology_sibling_cpumask(cpu)  (per_cpu(cpu_sibling_map, cpu))
-#define topology_core_cpumask(cpu) (cpu_cpu_mask(cpu))
+#define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu))
 #define topology_core_id(cpu)  (cpu_to_core_id(cpu))
 
 #endif
-- 
2.25.1



[PATCH 0/3] Reintroduce cpu_core_mask

2021-04-15 Thread Srikar Dronamraju
Daniel had reported that
 QEMU is now unable to see requested topologies in a multi socket single
 NUMA node configurations.
 -smp 8,maxcpus=8,cores=2,threads=2,sockets=2

This patchset reintroduces cpu_core_mask so that users can see requested
topologies while still maintaining the boot time of very large system
configurations.

It includes caching the chip_id as suggested by Michael Ellermann

4 Threads/Core; 4 cores/Socket; 4 Sockets/Node, 2 Nodes in System
  -numa node,nodeid=0,memdev=m0 \
  -numa node,nodeid=1,memdev=m1 \
  -smp 128,sockets=8,threads=4,maxcpus=128  \

5.12.0-rc5 (or any kernel with commit 4ca234a9cbd7)
---
srikar@cloudy:~$ lscpu
Architecture:ppc64le
Byte Order:  Little Endian
CPU(s):  128
On-line CPU(s) list: 0-127
Thread(s) per core:  4
Core(s) per socket:  16
Socket(s):   2 <-
NUMA node(s):2
Model:   2.3 (pvr 004e 1203)
Model name:  POWER9 (architected), altivec supported
Hypervisor vendor:   KVM
Virtualization type: para
L1d cache:   1 MiB
L1i cache:   1 MiB
NUMA node0 CPU(s):   0-15,32-47,64-79,96-111
NUMA node1 CPU(s):   16-31,48-63,80-95,112-127
--
srikar@cloudy:~$ dmesg |grep smp
[0.010658] smp: Bringing up secondary CPUs ...
[0.424681] smp: Brought up 2 nodes, 128 CPUs
--

5.12.0-rc5 + 3 patches
--
srikar@cloudy:~$ lscpu
Architecture:ppc64le
Byte Order:  Little Endian
CPU(s):  128
On-line CPU(s) list: 0-127
Thread(s) per core:  4
Core(s) per socket:  4
Socket(s):   8-
NUMA node(s):2
Model:   2.3 (pvr 004e 1203)
Model name:  POWER9 (architected), altivec supported
Hypervisor vendor:   KVM
Virtualization type: para
L1d cache:   1 MiB
L1i cache:   1 MiB
NUMA node0 CPU(s):   0-15,32-47,64-79,96-111
NUMA node1 CPU(s):   16-31,48-63,80-95,112-127
--
srikar@cloudy:~$ dmesg |grep smp
[0.010372] smp: Bringing up secondary CPUs ...
[0.417892] smp: Brought up 2 nodes, 128 CPUs

5.12.0-rc5
--
srikar@cloudy:~$  lscpu
Architecture:ppc64le
Byte Order:  Little Endian
CPU(s):  1024
On-line CPU(s) list: 0-1023
Thread(s) per core:  8
Core(s) per socket:  128
Socket(s):   1
NUMA node(s):1
Model:   2.3 (pvr 004e 1203)
Model name:  POWER9 (architected), altivec supported
Hypervisor vendor:   KVM
Virtualization type: para
L1d cache:   4 MiB
L1i cache:   4 MiB
NUMA node0 CPU(s):   0-1023
srikar@cloudy:~$ dmesg | grep smp
[0.027753 ] smp: Bringing up secondary CPUs ...
[2.315193 ] smp: Brought up 1 node, 1024 CPUs

5.12.0-rc5 + 3 patches
--
srikar@cloudy:~$ dmesg | grep smp
[0.027659 ] smp: Bringing up secondary CPUs ...
[2.532739 ] smp: Brought up 1 node, 1024 CPUs

I also have booted and tested the kernels on PowerVM and PowerNV and
even there I see a very negligible increase in the bringing up time of
secondary CPUs

Srikar Dronamraju (3):
  powerpc/smp: Reintroduce cpu_core_mask
  Revert "powerpc/topology: Update topology_core_cpumask"
  powerpc/smp: Cache CPU to chip lookup

 arch/powerpc/include/asm/smp.h  |  6 
 arch/powerpc/include/asm/topology.h |  2 +-
 arch/powerpc/kernel/prom.c  | 19 +++---
 arch/powerpc/kernel/smp.c   | 56 +
 4 files changed, 71 insertions(+), 12 deletions(-)

-- 
2.25.1



Re: [PATCHv5 2/2] powerpc/pseries: update device tree before ejecting hotplug uevents

2021-04-15 Thread Michal Suchánek
Hello,

On Wed, Apr 14, 2021 at 11:08:19AM +0800, Pingfan Liu wrote:
> On Sat, Apr 10, 2021 at 12:33 AM Michal Suchánek  wrote:
> >
> > Hello,
> >
> > On Fri, Aug 28, 2020 at 04:10:09PM +0800, Pingfan Liu wrote:
> > > On Thu, Aug 27, 2020 at 3:53 PM Laurent Dufour  
> > > wrote:
> > > >
> > > > Le 10/08/2020 à 10:52, Pingfan Liu a écrit :
> > > > > A bug is observed on pseries by taking the following steps on rhel:
> > > > > -1. drmgr -c mem -r -q 5
> > > > > -2. echo c > /proc/sysrq-trigger
> > > > >
> > > > > And then, the failure looks like:
> > > > > kdump: saving to /sysroot//var/crash/127.0.0.1-2020-01-16-02:06:14/
> > > > > kdump: saving vmcore-dmesg.txt
> > > > > kdump: saving vmcore-dmesg.txt complete
> > > > > kdump: saving vmcore
> > > > >   Checking for memory holes : [  0.0 %] / 
> > > > >   Checking for memory holes : 
> > > > > [100.0 %] |   Excluding unnecessary pages 
> > > > >   : [100.0 %] \   Copying data
> > > > >   : [  0.3 %] -  eta: 38s[   44.337636] 
> > > > > hash-mmu: mm: Hashing failure ! EA=0x7fffba40 
> > > > > access=0x8004 current=makedumpfile
> > > > > [   44.337663] hash-mmu: trap=0x300 vsid=0x13a109c ssize=1 base 
> > > > > psize=2 psize 2 pte=0xc0005504
> > > > > [   44.337677] hash-mmu: mm: Hashing failure ! EA=0x7fffba40 
> > > > > access=0x8004 current=makedumpfile
> > > > > [   44.337692] hash-mmu: trap=0x300 vsid=0x13a109c ssize=1 base 
> > > > > psize=2 psize 2 pte=0xc0005504
> > > > > [   44.337708] makedumpfile[469]: unhandled signal 7 at 
> > > > > 7fffba40 nip 7fffbbc4d7fc lr 00011356ca3c code 2
> > > > > [   44.338548] Core dump to |/bin/false pipe failed
> > > > > /lib/kdump-lib-initramfs.sh: line 98:   469 Bus error   
> > > > > $CORE_COLLECTOR /proc/vmcore 
> > > > > $_mp/$KDUMP_PATH/$HOST_IP-$DATEDIR/vmcore-incomplete
> > > > > kdump: saving vmcore failed
> > > > >
> > > > > * Root cause *
> > > > >After analyzing, it turns out that in the current implementation,
> > > > > when hot-removing lmb, the KOBJ_REMOVE event ejects before the dt 
> > > > > updating as
> > > > > the code __remove_memory() comes before drmem_update_dt().
> > > > > So in kdump kernel, when read_from_oldmem() resorts to
> > > > > pSeries_lpar_hpte_insert() to install hpte, but fails with -2 due to
> > > > > non-exist pfn. And finally, low_hash_fault() raise SIGBUS to process, 
> > > > > as it
> > > > > can be observed "Bus error"
> > > > >
> > > > >  From a viewpoint of listener and publisher, the publisher notifies 
> > > > > the
> > > > > listener before data is ready.  This introduces a problem where udev
> > > > > launches kexec-tools (due to KOBJ_REMOVE) and loads a stale dt before
> > > > > updating. And in capture kernel, makedumpfile will access the memory 
> > > > > based
> > > > > on the stale dt info, and hit a SIGBUS error due to an un-existed lmb.
> > > > >
> > > > > * Fix *
> > > > > This bug is introduced by commit 063b8b1251fd
> > > > > ("powerpc/pseries/memory-hotplug: Only update DT once per memory DLPAR
> > > > > request"), which tried to combine all the dt updating into one.
> > > > >
> > > > > To fix this issue, meanwhile not to introduce a quadratic runtime
> > > > > complexity by the model:
> > > > >dlpar_memory_add_by_count
> > > > >  for_each_drmem_lmb <--
> > > > >dlpar_add_lmb
> > > > >  drmem_update_dt(_v1|_v2)
> > > > >for_each_drmem_lmb   <--
> > > > > The dt should still be only updated once, and just before the last 
> > > > > memory
> > > > > online/offline event is ejected to user space. Achieve this by 
> > > > > tracing the
> > > > > num of lmb added or removed.
> > > > >
> > > > > Signed-off-by: Pingfan Liu 
> > > > > Cc: Michael Ellerman 
> > > > > Cc: Hari Bathini 
> > > > > Cc: Nathan Lynch 
> > > > > Cc: Nathan Fontenot 
> > > > > Cc: Laurent Dufour 
> > > > > To: linuxppc-dev@lists.ozlabs.org
> > > > > Cc: ke...@lists.infradead.org
> > > > > ---
> > > > > v4 -> v5: change dlpar_add_lmb()/dlpar_remove_lmb() prototype to 
> > > > > report
> > > > >whether dt is updated successfully.
> > > > >Fix a condition boundary check bug
> > > > > v3 -> v4: resolve a quadratic runtime complexity issue.
> > > > >This series is applied on next-test branch
> > > > >   arch/powerpc/platforms/pseries/hotplug-memory.c | 102 
> > > > > +++-
> > > > >   1 file changed, 80 insertions(+), 22 deletions(-)
> > > > >
> > > > > diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
> > > > > b/arch/powerpc/platforms/pseries/hotplug-memory.c
> > > > > index 46cbcd1..1567d9f 100644
> > > > > --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
> > > > > +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
> > > > 

Re: [PATCH] powerpc/papr_scm: Reduce error severity if nvdimm stats inaccessible

2021-04-15 Thread Vaibhav Jain
Ira Weiny  writes:

> On Wed, Apr 14, 2021 at 09:51:40PM +0530, Vaibhav Jain wrote:
>> Thanks for looking into this patch Ira,
>> 
>> Ira Weiny  writes:
>> 
>> > On Wed, Apr 14, 2021 at 06:10:26PM +0530, Vaibhav Jain wrote:
>> >> Currently drc_pmem_qeury_stats() generates a dev_err in case
>> >> "Enable Performance Information Collection" feature is disabled from
>> >> HMC. The error is of the form below:
>> >> 
>> >> papr_scm ibm,persistent-memory:ibm,pmemory@44104001: Failed to query
>> >>performance stats, Err:-10
>> >> 
>> >> This error message confuses users as it implies a possible problem
>> >> with the nvdimm even though its due to a disabled feature.
>> >> 
>> >> So we fix this by explicitly handling the H_AUTHORITY error from the
>> >> H_SCM_PERFORMANCE_STATS hcall and generating a warning instead of an
>> >> error, saying that "Performance stats in-accessible".
>> >> 
>> >> Fixes: 2d02bf835e57('powerpc/papr_scm: Fetch nvdimm performance stats 
>> >> from PHYP')
>> >> Signed-off-by: Vaibhav Jain 
>> >> ---
>> >>  arch/powerpc/platforms/pseries/papr_scm.c | 3 +++
>> >>  1 file changed, 3 insertions(+)
>> >> 
>> >> diff --git a/arch/powerpc/platforms/pseries/papr_scm.c 
>> >> b/arch/powerpc/platforms/pseries/papr_scm.c
>> >> index 835163f54244..9216424f8be3 100644
>> >> --- a/arch/powerpc/platforms/pseries/papr_scm.c
>> >> +++ b/arch/powerpc/platforms/pseries/papr_scm.c
>> >> @@ -277,6 +277,9 @@ static ssize_t drc_pmem_query_stats(struct 
>> >> papr_scm_priv *p,
>> >>   dev_err(>pdev->dev,
>> >>   "Unknown performance stats, Err:0x%016lX\n", ret[0]);
>> >>   return -ENOENT;
>> >> + } else if (rc == H_AUTHORITY) {
>> >> + dev_warn(>pdev->dev, "Performance stats in-accessible");
>> >> + return -EPERM;
>> >
>> > Is this because of a disabled feature or because of permissions?
>> 
>> Its because of a disabled feature that revokes permission for a guest to
>> retrieve performance statistics.
>> 
>> The feature is called "Enable Performance Information Collection" and
>> once disabled the hcall H_SCM_PERFORMANCE_STATS returns an error
>> H_AUTHORITY indicating that the guest doesn't have permission to retrieve
>> performance statistics.
>
> In that case would it be appropriate to have the error message indicate a
> permission issue?
>
> Something like 'permission denied'?

Yes, Something like "Permission denied while accessing performance
stats" might be more clear and intuitive.

Will update the warn message in v2.

>
> Ira
>

-- 
Cheers
~ Vaibhav


Re: [PATCH] powerpc/papr_scm: Reduce error severity if nvdimm stats inaccessible

2021-04-15 Thread Vaibhav Jain
Thanks for looking into this Dan,

Dan Williams  writes:

> On Wed, Apr 14, 2021 at 5:40 AM Vaibhav Jain  wrote:
>>
>> Currently drc_pmem_qeury_stats() generates a dev_err in case
>> "Enable Performance Information Collection" feature is disabled from
>> HMC. The error is of the form below:
>>
>> papr_scm ibm,persistent-memory:ibm,pmemory@44104001: Failed to query
>>  performance stats, Err:-10
>>
>> This error message confuses users as it implies a possible problem
>> with the nvdimm even though its due to a disabled feature.
>>
>> So we fix this by explicitly handling the H_AUTHORITY error from the
>> H_SCM_PERFORMANCE_STATS hcall and generating a warning instead of an
>> error, saying that "Performance stats in-accessible".
>>
>> Fixes: 2d02bf835e57('powerpc/papr_scm: Fetch nvdimm performance stats from 
>> PHYP')
>> Signed-off-by: Vaibhav Jain 
>> ---
>>  arch/powerpc/platforms/pseries/papr_scm.c | 3 +++
>>  1 file changed, 3 insertions(+)
>>
>> diff --git a/arch/powerpc/platforms/pseries/papr_scm.c 
>> b/arch/powerpc/platforms/pseries/papr_scm.c
>> index 835163f54244..9216424f8be3 100644
>> --- a/arch/powerpc/platforms/pseries/papr_scm.c
>> +++ b/arch/powerpc/platforms/pseries/papr_scm.c
>> @@ -277,6 +277,9 @@ static ssize_t drc_pmem_query_stats(struct papr_scm_priv 
>> *p,
>> dev_err(>pdev->dev,
>> "Unknown performance stats, Err:0x%016lX\n", ret[0]);
>> return -ENOENT;
>> +   } else if (rc == H_AUTHORITY) {
>> +   dev_warn(>pdev->dev, "Performance stats in-accessible");
>> +   return -EPERM;
>
> So userspace can spam the kernel log? Why is kernel log message needed
> at all? EPERM told the caller what happened.
Currently this error message is only reported during probe of the
nvdimm. So userspace cannot directly spam kernel log.

The callsite for this function in papr_scm_nvdimm_init() doesnt handle
specific error codes. Instead in case of an error it only reports that
"Dimm performance stats are unavailable". The log message just
preceeding that mentions the real cause of failure. Thats why just
returning -EPERM wont be usefui.

Alternatively I can update papr_scm_nvdimm_init() to report the error
code returned from drc_pmem_query_stats().

-- 
Cheers
~ Vaibhav


Re: [PATCH v13 14/14] powerpc/64s/radix: Enable huge vmalloc mappings

2021-04-15 Thread Christophe Leroy

Hi Nick,

Le 17/03/2021 à 07:24, Nicholas Piggin a écrit :

This reduces TLB misses by nearly 30x on a `git diff` workload on a
2-node POWER9 (59,800 -> 2,100) and reduces CPU cycles by 0.54%, due
to vfs hashes being allocated with 2MB pages.

Cc: linuxppc-dev@lists.ozlabs.org
Acked-by: Michael Ellerman 
Signed-off-by: Nicholas Piggin 
---
  .../admin-guide/kernel-parameters.txt |  2 ++
  arch/powerpc/Kconfig  |  1 +
  arch/powerpc/kernel/module.c  | 22 +++
  3 files changed, 21 insertions(+), 4 deletions(-)

--- a/arch/powerpc/kernel/module.c
+++ b/arch/powerpc/kernel/module.c
@@ -8,6 +8,7 @@
  #include 
  #include 
  #include 
+#include 
  #include 
  #include 
  #include 
@@ -87,13 +88,26 @@ int module_finalize(const Elf_Ehdr *hdr,
return 0;
  }
  
-#ifdef MODULES_VADDR

  void *module_alloc(unsigned long size)
  {
+   unsigned long start = VMALLOC_START;
+   unsigned long end = VMALLOC_END;
+
+#ifdef MODULES_VADDR
BUILD_BUG_ON(TASK_SIZE > MODULES_VADDR);
+   start = MODULES_VADDR;
+   end = MODULES_END;
+#endif
+
+   /*
+* Don't do huge page allocations for modules yet until more testing
+* is done. STRICT_MODULE_RWX may require extra work to support this
+* too.
+*/
  
-	return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, GFP_KERNEL,

-   PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, 
NUMA_NO_NODE,



I think you should add the following in 

#ifndef MODULES_VADDR
#define MODULES_VADDR VMALLOC_START
#define MODULES_END VMALLOC_END
#endif

And leave module_alloc() as is (just removing the enclosing #ifdef MODULES_VADDR and adding the 
VM_NO_HUGE_VMAP  flag)


This would minimise the conflits with the changes I did in powerpc/next 
reported by Stephen R.


+   return __vmalloc_node_range(size, 1, start, end, GFP_KERNEL,
+   PAGE_KERNEL_EXEC,
+   VM_NO_HUGE_VMAP | VM_FLUSH_RESET_PERMS,
+   NUMA_NO_NODE,
__builtin_return_address(0));
  }
-#endif



Re: linux-next: manual merge of the akpm-current tree with the powerpc tree

2021-04-15 Thread Christophe Leroy




Le 15/04/2021 à 12:08, Christophe Leroy a écrit :



Le 15/04/2021 à 12:07, Christophe Leroy a écrit :



Le 15/04/2021 à 11:58, Stephen Rothwell a écrit :

Hi all,

On Thu, 15 Apr 2021 19:44:17 +1000 Stephen Rothwell  
wrote:


Today's linux-next merge of the akpm-current tree got a conflict in:

   arch/powerpc/kernel/module.c

between commit:

   2ec13df16704 ("powerpc/modules: Load modules closer to kernel text")

from the powerpc tree and commit:

   4930ba789f8d ("powerpc/64s/radix: enable huge vmalloc mappings")

from the akpm-current tree.

I fixed it up (I think - see below) and can carry the fix as
necessary. This is now fixed as far as linux-next is concerned, but any
non trivial conflicts should be mentioned to your upstream maintainer
when your tree is submitted for merging.  You may also want to consider
cooperating with the maintainer of the conflicting tree to minimise any
particularly complex conflicts.

--
Cheers,
Stephen Rothwell

diff --cc arch/powerpc/kernel/module.c
index fab84024650c,cdb2d88c54e7..
--- a/arch/powerpc/kernel/module.c
+++ b/arch/powerpc/kernel/module.c
@@@ -88,29 -88,26 +89,42 @@@ int module_finalize(const Elf_Ehdr *hdr
   return 0;
   }
- #ifdef MODULES_VADDR
  -void *module_alloc(unsigned long size)
  +static __always_inline void *
  +__module_alloc(unsigned long size, unsigned long start, unsigned long end)
   {
  -    unsigned long start = VMALLOC_START;
  -    unsigned long end = VMALLOC_END;
  -
  -#ifdef MODULES_VADDR
  -    BUILD_BUG_ON(TASK_SIZE > MODULES_VADDR);
  -    start = MODULES_VADDR;
  -    end = MODULES_END;
  -#endif
  -
+ /*
+  * Don't do huge page allocations for modules yet until more testing
+  * is done. STRICT_MODULE_RWX may require extra work to support this
+  * too.
+  */
+
   return __vmalloc_node_range(size, 1, start, end, GFP_KERNEL,
- PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, NUMA_NO_NODE,
+ PAGE_KERNEL_EXEC,
+ VM_NO_HUGE_VMAP | VM_FLUSH_RESET_PERMS,
+ NUMA_NO_NODE,
   __builtin_return_address(0));
   }
  +
++
  +void *module_alloc(unsigned long size)
  +{
++    unsigned long start = VMALLOC_START;
++    unsigned long end = VMALLOC_END;
  +    unsigned long limit = (unsigned long)_etext - SZ_32M;
  +    void *ptr = NULL;
  +
++#ifdef MODULES_VADDR
  +    BUILD_BUG_ON(TASK_SIZE > MODULES_VADDR);
++    start = MODULES_VADDR;
++    end = MODULES_END;


The #endif should be here.



  +
  +    /* First try within 32M limit from _etext to avoid branch trampolines */
  +    if (MODULES_VADDR < PAGE_OFFSET && MODULES_END > limit)


Should also use start and end here instead of MODULES_VADDR  and MODULES_END



The cleanest however should be to define MODULES_VADDR and MODULES_END all the time with a fallback 
to VMALLOC_START/VMALLOC_END, to avoid the #ifdef.


The #ifdef was OK when we wanted to define modules_alloc() only when module area was different from 
vmalloc area, but now that we want modules_alloc() at all time, MODULES_VADDR and MODULES_END should 
be defined all the time.






- ptr = __module_alloc(size, limit, MODULES_END);
++    ptr = __module_alloc(size, limit, end);
  +
  +    if (!ptr)
- ptr = __module_alloc(size, MODULES_VADDR, MODULES_END);
++#endif
++    ptr = __module_alloc(size, start, end);
  +
  +    return ptr;
  +}
- #endif


Unfortunately, it also needs this:


Before the #endif is too far.



From: Stephen Rothwell 
Date: Thu, 15 Apr 2021 19:53:58 +1000
Subject: [PATCH] merge fix up for powerpc merge fix

Signed-off-by: Stephen Rothwell 
---
  arch/powerpc/kernel/module.c | 2 ++
  1 file changed, 2 insertions(+)

diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c
index d8ab1ad2eb05..c060f99afd4d 100644
--- a/arch/powerpc/kernel/module.c
+++ b/arch/powerpc/kernel/module.c
@@ -110,7 +110,9 @@ void *module_alloc(unsigned long size)
  {
  unsigned long start = VMALLOC_START;
  unsigned long end = VMALLOC_END;
+#ifdef MODULES_VADDR
  unsigned long limit = (unsigned long)_etext - SZ_32M;
+#endif
  void *ptr = NULL;
  #ifdef MODULES_VADDR



Re: linux-next: manual merge of the akpm-current tree with the powerpc tree

2021-04-15 Thread Christophe Leroy




Le 15/04/2021 à 12:07, Christophe Leroy a écrit :



Le 15/04/2021 à 11:58, Stephen Rothwell a écrit :

Hi all,

On Thu, 15 Apr 2021 19:44:17 +1000 Stephen Rothwell  
wrote:


Today's linux-next merge of the akpm-current tree got a conflict in:

   arch/powerpc/kernel/module.c

between commit:

   2ec13df16704 ("powerpc/modules: Load modules closer to kernel text")

from the powerpc tree and commit:

   4930ba789f8d ("powerpc/64s/radix: enable huge vmalloc mappings")

from the akpm-current tree.

I fixed it up (I think - see below) and can carry the fix as
necessary. This is now fixed as far as linux-next is concerned, but any
non trivial conflicts should be mentioned to your upstream maintainer
when your tree is submitted for merging.  You may also want to consider
cooperating with the maintainer of the conflicting tree to minimise any
particularly complex conflicts.

--
Cheers,
Stephen Rothwell

diff --cc arch/powerpc/kernel/module.c
index fab84024650c,cdb2d88c54e7..
--- a/arch/powerpc/kernel/module.c
+++ b/arch/powerpc/kernel/module.c
@@@ -88,29 -88,26 +89,42 @@@ int module_finalize(const Elf_Ehdr *hdr
   return 0;
   }
- #ifdef MODULES_VADDR
  -void *module_alloc(unsigned long size)
  +static __always_inline void *
  +__module_alloc(unsigned long size, unsigned long start, unsigned long end)
   {
  -    unsigned long start = VMALLOC_START;
  -    unsigned long end = VMALLOC_END;
  -
  -#ifdef MODULES_VADDR
  -    BUILD_BUG_ON(TASK_SIZE > MODULES_VADDR);
  -    start = MODULES_VADDR;
  -    end = MODULES_END;
  -#endif
  -
+ /*
+  * Don't do huge page allocations for modules yet until more testing
+  * is done. STRICT_MODULE_RWX may require extra work to support this
+  * too.
+  */
+
   return __vmalloc_node_range(size, 1, start, end, GFP_KERNEL,
- PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, NUMA_NO_NODE,
+ PAGE_KERNEL_EXEC,
+ VM_NO_HUGE_VMAP | VM_FLUSH_RESET_PERMS,
+ NUMA_NO_NODE,
   __builtin_return_address(0));
   }
  +
++
  +void *module_alloc(unsigned long size)
  +{
++    unsigned long start = VMALLOC_START;
++    unsigned long end = VMALLOC_END;
  +    unsigned long limit = (unsigned long)_etext - SZ_32M;
  +    void *ptr = NULL;
  +
++#ifdef MODULES_VADDR
  +    BUILD_BUG_ON(TASK_SIZE > MODULES_VADDR);
++    start = MODULES_VADDR;
++    end = MODULES_END;


The #endif should be here.



  +
  +    /* First try within 32M limit from _etext to avoid branch trampolines */
  +    if (MODULES_VADDR < PAGE_OFFSET && MODULES_END > limit)


Should also use start and end here instead of MODULES_VADDR  and MODULES_END


- ptr = __module_alloc(size, limit, MODULES_END);
++    ptr = __module_alloc(size, limit, end);
  +
  +    if (!ptr)
- ptr = __module_alloc(size, MODULES_VADDR, MODULES_END);
++#endif
++    ptr = __module_alloc(size, start, end);
  +
  +    return ptr;
  +}
- #endif


Unfortunately, it also needs this:


Before the #endif is too far.



From: Stephen Rothwell 
Date: Thu, 15 Apr 2021 19:53:58 +1000
Subject: [PATCH] merge fix up for powerpc merge fix

Signed-off-by: Stephen Rothwell 
---
  arch/powerpc/kernel/module.c | 2 ++
  1 file changed, 2 insertions(+)

diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c
index d8ab1ad2eb05..c060f99afd4d 100644
--- a/arch/powerpc/kernel/module.c
+++ b/arch/powerpc/kernel/module.c
@@ -110,7 +110,9 @@ void *module_alloc(unsigned long size)
  {
  unsigned long start = VMALLOC_START;
  unsigned long end = VMALLOC_END;
+#ifdef MODULES_VADDR
  unsigned long limit = (unsigned long)_etext - SZ_32M;
+#endif
  void *ptr = NULL;
  #ifdef MODULES_VADDR



Re: linux-next: manual merge of the akpm-current tree with the powerpc tree

2021-04-15 Thread Christophe Leroy




Le 15/04/2021 à 11:58, Stephen Rothwell a écrit :

Hi all,

On Thu, 15 Apr 2021 19:44:17 +1000 Stephen Rothwell  
wrote:


Today's linux-next merge of the akpm-current tree got a conflict in:

   arch/powerpc/kernel/module.c

between commit:

   2ec13df16704 ("powerpc/modules: Load modules closer to kernel text")

from the powerpc tree and commit:

   4930ba789f8d ("powerpc/64s/radix: enable huge vmalloc mappings")

from the akpm-current tree.

I fixed it up (I think - see below) and can carry the fix as
necessary. This is now fixed as far as linux-next is concerned, but any
non trivial conflicts should be mentioned to your upstream maintainer
when your tree is submitted for merging.  You may also want to consider
cooperating with the maintainer of the conflicting tree to minimise any
particularly complex conflicts.

--
Cheers,
Stephen Rothwell

diff --cc arch/powerpc/kernel/module.c
index fab84024650c,cdb2d88c54e7..
--- a/arch/powerpc/kernel/module.c
+++ b/arch/powerpc/kernel/module.c
@@@ -88,29 -88,26 +89,42 @@@ int module_finalize(const Elf_Ehdr *hdr
return 0;
   }
   
- #ifdef MODULES_VADDR

  -void *module_alloc(unsigned long size)
  +static __always_inline void *
  +__module_alloc(unsigned long size, unsigned long start, unsigned long end)
   {
  - unsigned long start = VMALLOC_START;
  - unsigned long end = VMALLOC_END;
  -
  -#ifdef MODULES_VADDR
  - BUILD_BUG_ON(TASK_SIZE > MODULES_VADDR);
  - start = MODULES_VADDR;
  - end = MODULES_END;
  -#endif
  -
+   /*
+* Don't do huge page allocations for modules yet until more testing
+* is done. STRICT_MODULE_RWX may require extra work to support this
+* too.
+*/
+
return __vmalloc_node_range(size, 1, start, end, GFP_KERNEL,
-   PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, 
NUMA_NO_NODE,
+   PAGE_KERNEL_EXEC,
+   VM_NO_HUGE_VMAP | VM_FLUSH_RESET_PERMS,
+   NUMA_NO_NODE,
__builtin_return_address(0));
   }
  +
++
  +void *module_alloc(unsigned long size)
  +{
++  unsigned long start = VMALLOC_START;
++  unsigned long end = VMALLOC_END;
  + unsigned long limit = (unsigned long)_etext - SZ_32M;
  + void *ptr = NULL;
  +
++#ifdef MODULES_VADDR
  + BUILD_BUG_ON(TASK_SIZE > MODULES_VADDR);
++  start = MODULES_VADDR;
++  end = MODULES_END;


The #endif should be here.



  +
  + /* First try within 32M limit from _etext to avoid branch trampolines */
  + if (MODULES_VADDR < PAGE_OFFSET && MODULES_END > limit)
-   ptr = __module_alloc(size, limit, MODULES_END);
++  ptr = __module_alloc(size, limit, end);
  +
  + if (!ptr)
-   ptr = __module_alloc(size, MODULES_VADDR, MODULES_END);
++#endif
++  ptr = __module_alloc(size, start, end);
  +
  + return ptr;
  +}
- #endif


Unfortunately, it also needs this:


Before the #endif is too far.



From: Stephen Rothwell 
Date: Thu, 15 Apr 2021 19:53:58 +1000
Subject: [PATCH] merge fix up for powerpc merge fix

Signed-off-by: Stephen Rothwell 
---
  arch/powerpc/kernel/module.c | 2 ++
  1 file changed, 2 insertions(+)

diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c
index d8ab1ad2eb05..c060f99afd4d 100644
--- a/arch/powerpc/kernel/module.c
+++ b/arch/powerpc/kernel/module.c
@@ -110,7 +110,9 @@ void *module_alloc(unsigned long size)
  {
unsigned long start = VMALLOC_START;
unsigned long end = VMALLOC_END;
+#ifdef MODULES_VADDR
unsigned long limit = (unsigned long)_etext - SZ_32M;
+#endif
void *ptr = NULL;
  
  #ifdef MODULES_VADDR




[PATCH bpf-next 2/2] docs: bpf: bpf_jit_enable mode changed

2021-04-15 Thread Jianlin Lv
Remove information about bpf_jit_enable=2 mode and added description for
how to use the bpf_jit_disasm tool after get rid of =2 mode.

Signed-off-by: Jianlin Lv 
---
 Documentation/admin-guide/sysctl/net.rst |  1 -
 Documentation/networking/filter.rst  | 25 ++--
 2 files changed, 6 insertions(+), 20 deletions(-)

diff --git a/Documentation/admin-guide/sysctl/net.rst 
b/Documentation/admin-guide/sysctl/net.rst
index c941b214e0b7..a39f99deac38 100644
--- a/Documentation/admin-guide/sysctl/net.rst
+++ b/Documentation/admin-guide/sysctl/net.rst
@@ -86,7 +86,6 @@ Values:
 
- 0 - disable the JIT (default value)
- 1 - enable the JIT
-   - 2 - enable the JIT and ask the compiler to emit traces on kernel log.
 
 bpf_jit_harden
 --
diff --git a/Documentation/networking/filter.rst 
b/Documentation/networking/filter.rst
index 251c6bd73d15..86954f922168 100644
--- a/Documentation/networking/filter.rst
+++ b/Documentation/networking/filter.rst
@@ -504,25 +504,12 @@ been previously enabled by root::
 
   echo 1 > /proc/sys/net/core/bpf_jit_enable
 
-For JIT developers, doing audits etc, each compile run can output the generated
-opcode image into the kernel log via::
-
-  echo 2 > /proc/sys/net/core/bpf_jit_enable
-
-Example output from dmesg::
-
-[ 3389.935842] flen=6 proglen=70 pass=3 image=a0069c8f
-[ 3389.935847] JIT code: : 55 48 89 e5 48 83 ec 60 48 89 5d f8 44 
8b 4f 68
-[ 3389.935849] JIT code: 0010: 44 2b 4f 6c 4c 8b 87 d8 00 00 00 be 0c 
00 00 00
-[ 3389.935850] JIT code: 0020: e8 1d 94 ff e0 3d 00 08 00 00 75 16 be 
17 00 00
-[ 3389.935851] JIT code: 0030: 00 e8 28 94 ff e0 83 f8 01 75 07 b8 ff 
ff 00 00
-[ 3389.935852] JIT code: 0040: eb 02 31 c0 c9 c3
-
-When CONFIG_BPF_JIT_ALWAYS_ON is enabled, bpf_jit_enable is permanently set to 
1 and
-setting any other value than that will return in failure. This is even the 
case for
-setting bpf_jit_enable to 2, since dumping the final JIT image into the kernel 
log
-is discouraged and introspection through bpftool (under tools/bpf/bpftool/) is 
the
-generally recommended approach instead.
+When CONFIG_BPF_JIT_ALWAYS_ON is enabled, bpf_jit_enable is permanently set
+to 1 and setting any other value than that will return in failure.
+For debugging JITs, the introspection through bpftool (tools/bpf/bpftool/)
+is the generally recommended approach instead. For JIT developers, doing
+audits etc, you can insert bpf_jit_dump() and recompile the kernel to
+output the generated opcode image into the kernel log.
 
 In the kernel source tree under tools/bpf/, there's bpf_jit_disasm for
 generating disassembly out of the kernel log's hexdump::
-- 
2.25.1



[PATCH bpf-next 1/2] bpf: Remove bpf_jit_enable=2 debugging mode

2021-04-15 Thread Jianlin Lv
For debugging JITs, dumping the JITed image to kernel log is discouraged,
"bpftool prog dump jited" is much better way to examine JITed dumps.
This patch get rid of the code related to bpf_jit_enable=2 mode and
update the proc handler of bpf_jit_enable, also added auxiliary
information to explain how to use bpf_jit_disasm tool after this change.

Signed-off-by: Jianlin Lv 
---
 arch/arm/net/bpf_jit_32.c |  4 
 arch/arm64/net/bpf_jit_comp.c |  4 
 arch/mips/net/bpf_jit.c   |  4 
 arch/mips/net/ebpf_jit.c  |  4 
 arch/powerpc/net/bpf_jit_comp.c   | 10 --
 arch/powerpc/net/bpf_jit_comp64.c | 11 ---
 arch/riscv/net/bpf_jit_core.c |  3 ---
 arch/s390/net/bpf_jit_comp.c  |  4 
 arch/sparc/net/bpf_jit_comp_32.c  |  3 ---
 arch/sparc/net/bpf_jit_comp_64.c  | 13 -
 arch/x86/net/bpf_jit_comp.c   |  3 ---
 arch/x86/net/bpf_jit_comp32.c |  3 ---
 net/core/sysctl_net_core.c| 14 +++---
 tools/bpf/bpf_jit_disasm.c|  2 +-
 tools/bpf/bpftool/feature.c   |  3 ---
 15 files changed, 4 insertions(+), 81 deletions(-)

diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c
index 897634d0a67c..92d669c0b2d3 100644
--- a/arch/arm/net/bpf_jit_32.c
+++ b/arch/arm/net/bpf_jit_32.c
@@ -1997,10 +1997,6 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog 
*prog)
}
flush_icache_range((u32)header, (u32)(ctx.target + ctx.idx));
 
-   if (bpf_jit_enable > 1)
-   /* there are 2 passes here */
-   bpf_jit_dump(prog->len, image_size, 2, ctx.target);
-
bpf_jit_binary_lock_ro(header);
prog->bpf_func = (void *)ctx.target;
prog->jited = 1;
diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index f7b194878a99..a13b83ac4ca8 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -1090,10 +1090,6 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog 
*prog)
goto out_off;
}
 
-   /* And we're done. */
-   if (bpf_jit_enable > 1)
-   bpf_jit_dump(prog->len, prog_size, 2, ctx.image);
-
bpf_flush_icache(header, ctx.image + ctx.idx);
 
if (!prog->is_func || extra_pass) {
diff --git a/arch/mips/net/bpf_jit.c b/arch/mips/net/bpf_jit.c
index 0af88622c619..b5221282dd88 100644
--- a/arch/mips/net/bpf_jit.c
+++ b/arch/mips/net/bpf_jit.c
@@ -1250,10 +1250,6 @@ void bpf_jit_compile(struct bpf_prog *fp)
/* Update the icache */
flush_icache_range((ptr)ctx.target, (ptr)(ctx.target + ctx.idx));
 
-   if (bpf_jit_enable > 1)
-   /* Dump JIT code */
-   bpf_jit_dump(fp->len, alloc_size, 2, ctx.target);
-
fp->bpf_func = (void *)ctx.target;
fp->jited = 1;
 
diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c
index 939dd06764bc..dac5a1fc2462 100644
--- a/arch/mips/net/ebpf_jit.c
+++ b/arch/mips/net/ebpf_jit.c
@@ -1910,10 +1910,6 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog 
*prog)
flush_icache_range((unsigned long)ctx.target,
   (unsigned long)[ctx.idx]);
 
-   if (bpf_jit_enable > 1)
-   /* Dump JIT code */
-   bpf_jit_dump(prog->len, image_size, 2, ctx.target);
-
bpf_jit_binary_lock_ro(header);
prog->bpf_func = (void *)ctx.target;
prog->jited = 1;
diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
index e809cb5a1631..ebca629de2d1 100644
--- a/arch/powerpc/net/bpf_jit_comp.c
+++ b/arch/powerpc/net/bpf_jit_comp.c
@@ -646,18 +646,8 @@ void bpf_jit_compile(struct bpf_prog *fp)
bpf_jit_build_prologue(fp, code_base, );
bpf_jit_build_body(fp, code_base, , addrs);
bpf_jit_build_epilogue(code_base, );
-
-   if (bpf_jit_enable > 1)
-   pr_info("Pass %d: shrink = %d, seen = 0x%x\n", pass,
-   proglen - (cgctx.idx * 4), cgctx.seen);
}
 
-   if (bpf_jit_enable > 1)
-   /* Note that we output the base address of the code_base
-* rather than image, since opcodes are in code_base.
-*/
-   bpf_jit_dump(flen, proglen, pass, code_base);
-
bpf_flush_icache(code_base, code_base + (proglen/4));
 
 #ifdef CONFIG_PPC64
diff --git a/arch/powerpc/net/bpf_jit_comp64.c 
b/arch/powerpc/net/bpf_jit_comp64.c
index aaf1a887f653..26243399ef2e 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -1215,20 +1215,9 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
bpf_jit_build_prologue(code_base, );
bpf_jit_build_body(fp, code_base, , addrs, extra_pass);
bpf_jit_build_epilogue(code_base, );
-
-   if (bpf_jit_enable > 1)
-   pr_info("Pass %d: shrink = %d, seen = 0x%x\n", pass,
-

Re: linux-next: manual merge of the akpm-current tree with the powerpc tree

2021-04-15 Thread Stephen Rothwell
Hi all,

On Thu, 15 Apr 2021 19:44:17 +1000 Stephen Rothwell  
wrote:
> 
> Today's linux-next merge of the akpm-current tree got a conflict in:
> 
>   arch/powerpc/kernel/module.c
> 
> between commit:
> 
>   2ec13df16704 ("powerpc/modules: Load modules closer to kernel text")
> 
> from the powerpc tree and commit:
> 
>   4930ba789f8d ("powerpc/64s/radix: enable huge vmalloc mappings")
> 
> from the akpm-current tree.
> 
> I fixed it up (I think - see below) and can carry the fix as
> necessary. This is now fixed as far as linux-next is concerned, but any
> non trivial conflicts should be mentioned to your upstream maintainer
> when your tree is submitted for merging.  You may also want to consider
> cooperating with the maintainer of the conflicting tree to minimise any
> particularly complex conflicts.
> 
> -- 
> Cheers,
> Stephen Rothwell
> 
> diff --cc arch/powerpc/kernel/module.c
> index fab84024650c,cdb2d88c54e7..
> --- a/arch/powerpc/kernel/module.c
> +++ b/arch/powerpc/kernel/module.c
> @@@ -88,29 -88,26 +89,42 @@@ int module_finalize(const Elf_Ehdr *hdr
>   return 0;
>   }
>   
> - #ifdef MODULES_VADDR
>  -void *module_alloc(unsigned long size)
>  +static __always_inline void *
>  +__module_alloc(unsigned long size, unsigned long start, unsigned long end)
>   {
>  -unsigned long start = VMALLOC_START;
>  -unsigned long end = VMALLOC_END;
>  -
>  -#ifdef MODULES_VADDR
>  -BUILD_BUG_ON(TASK_SIZE > MODULES_VADDR);
>  -start = MODULES_VADDR;
>  -end = MODULES_END;
>  -#endif
>  -
> + /*
> +  * Don't do huge page allocations for modules yet until more testing
> +  * is done. STRICT_MODULE_RWX may require extra work to support this
> +  * too.
> +  */
> + 
>   return __vmalloc_node_range(size, 1, start, end, GFP_KERNEL,
> - PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, 
> NUMA_NO_NODE,
> + PAGE_KERNEL_EXEC,
> + VM_NO_HUGE_VMAP | VM_FLUSH_RESET_PERMS,
> + NUMA_NO_NODE,
>   __builtin_return_address(0));
>   }
>  +
> ++
>  +void *module_alloc(unsigned long size)
>  +{
> ++unsigned long start = VMALLOC_START;
> ++unsigned long end = VMALLOC_END;
>  +unsigned long limit = (unsigned long)_etext - SZ_32M;
>  +void *ptr = NULL;
>  +
> ++#ifdef MODULES_VADDR
>  +BUILD_BUG_ON(TASK_SIZE > MODULES_VADDR);
> ++start = MODULES_VADDR;
> ++end = MODULES_END;
>  +
>  +/* First try within 32M limit from _etext to avoid branch trampolines */
>  +if (MODULES_VADDR < PAGE_OFFSET && MODULES_END > limit)
> - ptr = __module_alloc(size, limit, MODULES_END);
> ++ptr = __module_alloc(size, limit, end);
>  +
>  +if (!ptr)
> - ptr = __module_alloc(size, MODULES_VADDR, MODULES_END);
> ++#endif
> ++ptr = __module_alloc(size, start, end);
>  +
>  +return ptr;
>  +}
> - #endif

Unfortunately, it also needs this:

From: Stephen Rothwell 
Date: Thu, 15 Apr 2021 19:53:58 +1000
Subject: [PATCH] merge fix up for powerpc merge fix

Signed-off-by: Stephen Rothwell 
---
 arch/powerpc/kernel/module.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c
index d8ab1ad2eb05..c060f99afd4d 100644
--- a/arch/powerpc/kernel/module.c
+++ b/arch/powerpc/kernel/module.c
@@ -110,7 +110,9 @@ void *module_alloc(unsigned long size)
 {
unsigned long start = VMALLOC_START;
unsigned long end = VMALLOC_END;
+#ifdef MODULES_VADDR
unsigned long limit = (unsigned long)_etext - SZ_32M;
+#endif
void *ptr = NULL;
 
 #ifdef MODULES_VADDR
-- 
2.30.2

-- 
Cheers,
Stephen Rothwell


pgpTMl0HKat6g.pgp
Description: OpenPGP digital signature


linux-next: manual merge of the akpm-current tree with the powerpc tree

2021-04-15 Thread Stephen Rothwell
Hi all,

Today's linux-next merge of the akpm-current tree got a conflict in:

  arch/powerpc/kernel/module.c

between commit:

  2ec13df16704 ("powerpc/modules: Load modules closer to kernel text")

from the powerpc tree and commit:

  4930ba789f8d ("powerpc/64s/radix: enable huge vmalloc mappings")

from the akpm-current tree.

I fixed it up (I think - see below) and can carry the fix as
necessary. This is now fixed as far as linux-next is concerned, but any
non trivial conflicts should be mentioned to your upstream maintainer
when your tree is submitted for merging.  You may also want to consider
cooperating with the maintainer of the conflicting tree to minimise any
particularly complex conflicts.

-- 
Cheers,
Stephen Rothwell

diff --cc arch/powerpc/kernel/module.c
index fab84024650c,cdb2d88c54e7..
--- a/arch/powerpc/kernel/module.c
+++ b/arch/powerpc/kernel/module.c
@@@ -88,29 -88,26 +89,42 @@@ int module_finalize(const Elf_Ehdr *hdr
return 0;
  }
  
- #ifdef MODULES_VADDR
 -void *module_alloc(unsigned long size)
 +static __always_inline void *
 +__module_alloc(unsigned long size, unsigned long start, unsigned long end)
  {
 -  unsigned long start = VMALLOC_START;
 -  unsigned long end = VMALLOC_END;
 -
 -#ifdef MODULES_VADDR
 -  BUILD_BUG_ON(TASK_SIZE > MODULES_VADDR);
 -  start = MODULES_VADDR;
 -  end = MODULES_END;
 -#endif
 -
+   /*
+* Don't do huge page allocations for modules yet until more testing
+* is done. STRICT_MODULE_RWX may require extra work to support this
+* too.
+*/
+ 
return __vmalloc_node_range(size, 1, start, end, GFP_KERNEL,
-   PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, 
NUMA_NO_NODE,
+   PAGE_KERNEL_EXEC,
+   VM_NO_HUGE_VMAP | VM_FLUSH_RESET_PERMS,
+   NUMA_NO_NODE,
__builtin_return_address(0));
  }
 +
++
 +void *module_alloc(unsigned long size)
 +{
++  unsigned long start = VMALLOC_START;
++  unsigned long end = VMALLOC_END;
 +  unsigned long limit = (unsigned long)_etext - SZ_32M;
 +  void *ptr = NULL;
 +
++#ifdef MODULES_VADDR
 +  BUILD_BUG_ON(TASK_SIZE > MODULES_VADDR);
++  start = MODULES_VADDR;
++  end = MODULES_END;
 +
 +  /* First try within 32M limit from _etext to avoid branch trampolines */
 +  if (MODULES_VADDR < PAGE_OFFSET && MODULES_END > limit)
-   ptr = __module_alloc(size, limit, MODULES_END);
++  ptr = __module_alloc(size, limit, end);
 +
 +  if (!ptr)
-   ptr = __module_alloc(size, MODULES_VADDR, MODULES_END);
++#endif
++  ptr = __module_alloc(size, start, end);
 +
 +  return ptr;
 +}
- #endif


pgpGDnZbT4tL5.pgp
Description: OpenPGP digital signature


[PATCH] mm: ptdump: Fix build failure

2021-04-15 Thread Christophe Leroy
  CC  mm/ptdump.o
In file included from :
mm/ptdump.c: In function 'ptdump_pte_entry':
././include/linux/compiler_types.h:320:38: error: call to 
'__compiletime_assert_207' declared with attribute error: Unsupported access 
size for {READ,WRITE}_ONCE().
  320 |  _compiletime_assert(condition, msg, __compiletime_assert_, 
__COUNTER__)
  |  ^
././include/linux/compiler_types.h:301:4: note: in definition of macro 
'__compiletime_assert'
  301 |prefix ## suffix();\
  |^~
././include/linux/compiler_types.h:320:2: note: in expansion of macro 
'_compiletime_assert'
  320 |  _compiletime_assert(condition, msg, __compiletime_assert_, 
__COUNTER__)
  |  ^~~
./include/asm-generic/rwonce.h:36:2: note: in expansion of macro 
'compiletime_assert'
   36 |  compiletime_assert(__native_word(t) || sizeof(t) == 
sizeof(long long), \
  |  ^~
./include/asm-generic/rwonce.h:49:2: note: in expansion of macro 
'compiletime_assert_rwonce_type'
   49 |  compiletime_assert_rwonce_type(x);\
  |  ^~
mm/ptdump.c:114:14: note: in expansion of macro 'READ_ONCE'
  114 |  pte_t val = READ_ONCE(*pte);
  |  ^
make[2]: *** [mm/ptdump.o] Error 1

READ_ONCE() cannot be used for reading PTEs. Use ptep_get()
instead. See commit 481e980a7c19 ("mm: Allow arches to provide ptep_get()")
and commit c0e1c8c22beb ("powerpc/8xx: Provide ptep_get() with 16k pages")
for details.

Fixes: 30d621f6723b ("mm: add generic ptdump")
Cc: Steven Price 
Signed-off-by: Christophe Leroy 
---
 mm/ptdump.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/ptdump.c b/mm/ptdump.c
index 4354c1422d57..da751448d0e4 100644
--- a/mm/ptdump.c
+++ b/mm/ptdump.c
@@ -111,7 +111,7 @@ static int ptdump_pte_entry(pte_t *pte, unsigned long addr,
unsigned long next, struct mm_walk *walk)
 {
struct ptdump_state *st = walk->private;
-   pte_t val = READ_ONCE(*pte);
+   pte_t val = ptep_get(pte);
 
if (st->effective_prot)
st->effective_prot(st, 4, pte_val(val));
-- 
2.25.0



linux-next: build warning after merge of the powerpc tree

2021-04-15 Thread Stephen Rothwell
Hi all,

After merging the powerpc tree, today's linux-next build (powerpc
allyesconfig) produced this warning:

In file included from include/linux/device.h:15,
 from arch/powerpc/include/asm/io.h:27,
 from include/linux/io.h:13,
 from include/linux/irq.h:20,
 from arch/powerpc/include/asm/hardirq.h:6,
 from include/linux/hardirq.h:11,
 from include/linux/highmem.h:10,
 from include/linux/bio.h:8,
 from include/linux/libnvdimm.h:14,
 from arch/powerpc/platforms/pseries/papr_scm.c:12:
arch/powerpc/platforms/pseries/papr_scm.c: In function 'papr_scm_pmem_flush':
arch/powerpc/platforms/pseries/papr_scm.c:144:26: warning: format '%lld' 
expects argument of type 'long long int', but argument 3 has type 'long int' 
[-Wformat=]
  144 |   dev_err(>pdev->dev, "flush error: %lld", rc);
  |  ^~~
include/linux/dev_printk.h:19:22: note: in definition of macro 'dev_fmt'
   19 | #define dev_fmt(fmt) fmt
  |  ^~~
arch/powerpc/platforms/pseries/papr_scm.c:144:3: note: in expansion of macro 
'dev_err'
  144 |   dev_err(>pdev->dev, "flush error: %lld", rc);
  |   ^~~
arch/powerpc/platforms/pseries/papr_scm.c:144:43: note: format string is 
defined here
  144 |   dev_err(>pdev->dev, "flush error: %lld", rc);
  |~~~^
  |   |
  |   long long int
  |%ld

Introduced by commit

  75b7c05ebf90 ("powerpc/papr_scm: Implement support for H_SCM_FLUSH hcall")

-- 
Cheers,
Stephen Rothwell


pgpo_5E8HvGg4.pgp
Description: OpenPGP digital signature


[PATCH] soc: fsl: qe: remove unused function

2021-04-15 Thread Jiapeng Chong
Fix the following clang warning:

drivers/soc/fsl/qe/qe_ic.c:234:29: warning: unused function
'qe_ic_from_irq' [-Wunused-function].

Reported-by: Abaci Robot 
Signed-off-by: Jiapeng Chong 
---
 drivers/soc/fsl/qe/qe_ic.c | 5 -
 1 file changed, 5 deletions(-)

diff --git a/drivers/soc/fsl/qe/qe_ic.c b/drivers/soc/fsl/qe/qe_ic.c
index 0390af9..b573712 100644
--- a/drivers/soc/fsl/qe/qe_ic.c
+++ b/drivers/soc/fsl/qe/qe_ic.c
@@ -231,11 +231,6 @@ static inline void qe_ic_write(__be32  __iomem *base, 
unsigned int reg,
qe_iowrite32be(value, base + (reg >> 2));
 }
 
-static inline struct qe_ic *qe_ic_from_irq(unsigned int virq)
-{
-   return irq_get_chip_data(virq);
-}
-
 static inline struct qe_ic *qe_ic_from_irq_data(struct irq_data *d)
 {
return irq_data_get_irq_chip_data(d);
-- 
1.8.3.1



Re: [PATCH 2/2] tools: do not include scripts/Kbuild.include

2021-04-15 Thread Paolo Bonzini

On 15/04/21 10:04, Masahiro Yamada wrote:

On Thu, Apr 15, 2021 at 4:40 PM Paolo Bonzini  wrote:

I think it would make sense to add try-run, cc-option and
.DELETE_ON_ERROR to tools/build/Build.include?


To be safe, I just copy-pasted what the makefiles need.
If someone wants to refactor the tool build system, that is fine,
but, to me, I do not see consistent rules or policy under tools/.


"Please put this in a common file instead of introducing duplication" is 
not asking for wholesale refactoring.


Paolo



Re: [PATCH 2/2] tools: do not include scripts/Kbuild.include

2021-04-15 Thread Christian Borntraeger



On 15.04.21 09:27, Masahiro Yamada wrote:

Since commit d9f4ff50d2aa ("kbuild: spilt cc-option and friends to
scripts/Makefile.compiler"), some kselftests fail to build.

The tools/ directory opted out Kbuild, and went in a different
direction. They copy any kind of files to the tools/ directory
in order to do whatever they want to do in their world.

tools/build/Build.include mimics scripts/Kbuild.include, but some
tool Makefiles included the Kbuild one to import a feature that is
missing in tools/build/Build.include:

  - Commit ec04aa3ae87b ("tools/thermal: tmon: use "-fstack-protector"
only if supported") included scripts/Kbuild.include from
tools/thermal/tmon/Makefile to import the cc-option macro.

  - Commit c2390f16fc5b ("selftests: kvm: fix for compilers that do
not support -no-pie") included scripts/Kbuild.include from
tools/testing/selftests/kvm/Makefile to import the try-run macro.

  - Commit 9cae4ace80ef ("selftests/bpf: do not ignore clang
failures") included scripts/Kbuild.include from
tools/testing/selftests/bpf/Makefile to import the .DELETE_ON_ERROR
target.

  - Commit 0695f8bca93e ("selftests/powerpc: Handle Makefile for
unrecognized option") included scripts/Kbuild.include from
tools/testing/selftests/powerpc/pmu/ebb/Makefile to import the
try-run macro.

Copy what they want there, and stop including scripts/Kbuild.include
from the tool Makefiles.

Link: 
https://lore.kernel.org/lkml/86dadf33-70f7-a5ac-cb8c-64966d2f4...@linux.ibm.com/
Fixes: d9f4ff50d2aa ("kbuild: spilt cc-option and friends to 
scripts/Makefile.compiler")
Reported-by: Janosch Frank 
Reported-by: Christian Borntraeger 
Signed-off-by: Masahiro Yamada 


When applying this on top of d9f4ff50d2aa ("kbuild: spilt cc-option and friends to 
scripts/Makefile.compiler")

I still do get

#  Test Assertion Failure 
#   lib/kvm_util.c:142: vm->fd >= 0
#   pid=315635 tid=315635 - Invalid argument
#  10x01002f4b: vm_open at kvm_util.c:142
#  2 (inlined by) vm_create at kvm_util.c:258
#  30x010015ef: test_add_max_memory_regions at 
set_memory_region_test.c:351
#  4 (inlined by) main at set_memory_region_test.c:397
#  50x03ff971abb89: ?? ??:0
#  60x010017ad: .annobin_abi_note.c.hot at crt1.o:?
#   KVM_CREATE_VM ioctl failed, rc: -1 errno: 22
not ok 7 selftests: kvm: set_memory_region_test # exit=254

and the testcase compilation does not pickup the pgste option.


Re: [PATCH 2/2] tools: do not include scripts/Kbuild.include

2021-04-15 Thread Masahiro Yamada
On Thu, Apr 15, 2021 at 4:40 PM Paolo Bonzini  wrote:
>
> On 15/04/21 09:27, Masahiro Yamada wrote:
> > Since commit d9f4ff50d2aa ("kbuild: spilt cc-option and friends to
> > scripts/Makefile.compiler"), some kselftests fail to build.
> >
> > The tools/ directory opted out Kbuild, and went in a different
> > direction. They copy any kind of files to the tools/ directory
> > in order to do whatever they want to do in their world.
> >
> > tools/build/Build.include mimics scripts/Kbuild.include, but some
> > tool Makefiles included the Kbuild one to import a feature that is
> > missing in tools/build/Build.include:
> >
> >   - Commit ec04aa3ae87b ("tools/thermal: tmon: use "-fstack-protector"
> > only if supported") included scripts/Kbuild.include from
> > tools/thermal/tmon/Makefile to import the cc-option macro.
> >
> >   - Commit c2390f16fc5b ("selftests: kvm: fix for compilers that do
> > not support -no-pie") included scripts/Kbuild.include from
> > tools/testing/selftests/kvm/Makefile to import the try-run macro.
> >
> >   - Commit 9cae4ace80ef ("selftests/bpf: do not ignore clang
> > failures") included scripts/Kbuild.include from
> > tools/testing/selftests/bpf/Makefile to import the .DELETE_ON_ERROR
> > target.
> >
> >   - Commit 0695f8bca93e ("selftests/powerpc: Handle Makefile for
> > unrecognized option") included scripts/Kbuild.include from
> > tools/testing/selftests/powerpc/pmu/ebb/Makefile to import the
> > try-run macro.
> >
> > Copy what they want there, and stop including scripts/Kbuild.include
> > from the tool Makefiles.
>
> I think it would make sense to add try-run, cc-option and
> .DELETE_ON_ERROR to tools/build/Build.include?


To be safe, I just copy-pasted what the makefiles need.
If someone wants to refactor the tool build system, that is fine,
but, to me, I do not see consistent rules or policy under tools/.

-- 
Best Regards
Masahiro Yamada


Re: [PATCH 2/2] tools: do not include scripts/Kbuild.include

2021-04-15 Thread Paolo Bonzini

On 15/04/21 09:27, Masahiro Yamada wrote:

Since commit d9f4ff50d2aa ("kbuild: spilt cc-option and friends to
scripts/Makefile.compiler"), some kselftests fail to build.

The tools/ directory opted out Kbuild, and went in a different
direction. They copy any kind of files to the tools/ directory
in order to do whatever they want to do in their world.

tools/build/Build.include mimics scripts/Kbuild.include, but some
tool Makefiles included the Kbuild one to import a feature that is
missing in tools/build/Build.include:

  - Commit ec04aa3ae87b ("tools/thermal: tmon: use "-fstack-protector"
only if supported") included scripts/Kbuild.include from
tools/thermal/tmon/Makefile to import the cc-option macro.

  - Commit c2390f16fc5b ("selftests: kvm: fix for compilers that do
not support -no-pie") included scripts/Kbuild.include from
tools/testing/selftests/kvm/Makefile to import the try-run macro.

  - Commit 9cae4ace80ef ("selftests/bpf: do not ignore clang
failures") included scripts/Kbuild.include from
tools/testing/selftests/bpf/Makefile to import the .DELETE_ON_ERROR
target.

  - Commit 0695f8bca93e ("selftests/powerpc: Handle Makefile for
unrecognized option") included scripts/Kbuild.include from
tools/testing/selftests/powerpc/pmu/ebb/Makefile to import the
try-run macro.

Copy what they want there, and stop including scripts/Kbuild.include
from the tool Makefiles.


I think it would make sense to add try-run, cc-option and 
.DELETE_ON_ERROR to tools/build/Build.include?


Paolo


Link: 
https://lore.kernel.org/lkml/86dadf33-70f7-a5ac-cb8c-64966d2f4...@linux.ibm.com/
Fixes: d9f4ff50d2aa ("kbuild: spilt cc-option and friends to 
scripts/Makefile.compiler")
Reported-by: Janosch Frank 
Reported-by: Christian Borntraeger 
Signed-off-by: Masahiro Yamada 
---

  tools/testing/selftests/bpf/Makefile  |  3 ++-
  tools/testing/selftests/kvm/Makefile  | 12 +++-
  .../selftests/powerpc/pmu/ebb/Makefile| 11 ++-
  tools/thermal/tmon/Makefile   | 19 +--
  4 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/tools/testing/selftests/bpf/Makefile 
b/tools/testing/selftests/bpf/Makefile
index 044bfdcf5b74..d872b9f41543 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -1,5 +1,4 @@
  # SPDX-License-Identifier: GPL-2.0
-include ../../../../scripts/Kbuild.include
  include ../../../scripts/Makefile.arch
  include ../../../scripts/Makefile.include
  
@@ -476,3 +475,5 @@ EXTRA_CLEAN := $(TEST_CUSTOM_PROGS) $(SCRATCH_DIR) $(HOST_SCRATCH_DIR)	\

prog_tests/tests.h map_tests/tests.h verifier/tests.h   \
feature \
$(addprefix $(OUTPUT)/,*.o *.skel.h no_alu32 bpf_gcc bpf_testmod.ko)
+
+.DELETE_ON_ERROR:
diff --git a/tools/testing/selftests/kvm/Makefile 
b/tools/testing/selftests/kvm/Makefile
index a6d61f451f88..8b45bc417d83 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -1,5 +1,15 @@
  # SPDX-License-Identifier: GPL-2.0-only
-include ../../../../scripts/Kbuild.include
+
+TMPOUT = .tmp_
+
+try-run = $(shell set -e;  \
+   TMP=$(TMPOUT)/tmp;  \
+   mkdir -p $(TMPOUT); \
+   trap "rm -rf $(TMPOUT)" EXIT; \
+   if ($(1)) >/dev/null 2>&1;\
+   then echo "$(2)"; \
+   else echo "$(3)"; \
+   fi)
  
  all:
  
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/Makefile b/tools/testing/selftests/powerpc/pmu/ebb/Makefile

index af3df79d8163..d5d3e869df93 100644
--- a/tools/testing/selftests/powerpc/pmu/ebb/Makefile
+++ b/tools/testing/selftests/powerpc/pmu/ebb/Makefile
@@ -1,5 +1,4 @@
  # SPDX-License-Identifier: GPL-2.0
-include ../../../../../../scripts/Kbuild.include
  
  noarg:

$(MAKE) -C ../../
@@ -8,6 +7,16 @@ noarg:
  CFLAGS += -m64
  
  TMPOUT = $(OUTPUT)/TMPDIR/

+
+try-run = $(shell set -e;  \
+   TMP=$(TMPOUT)/tmp;  \
+   mkdir -p $(TMPOUT); \
+   trap "rm -rf $(TMPOUT)" EXIT; \
+   if ($(1)) >/dev/null 2>&1;\
+   then echo "$(2)"; \
+   else echo "$(3)"; \
+   fi)
+
  # Toolchains may build PIE by default which breaks the assembly
  no-pie-option := $(call try-run, echo 'int main() { return 0; }' | \
  $(CC) -Werror $(KBUILD_CPPFLAGS) $(CC_OPTION_CFLAGS) -no-pie -x c - -o 
"$$TMP", -no-pie)
diff --git a/tools/thermal/tmon/Makefile b/tools/thermal/tmon/Makefile
index 59e417ec3e13..92a683e4866c 100644
--- a/tools/thermal/tmon/Makefile
+++ b/tools/thermal/tmon/Makefile
@@ -1,6 +1,21 @@
  # SPDX-License-Identifier: GPL-2.0
-# We need this for the "cc-option" macro.
-include ../../../scripts/Kbuild.include
+
+TMPOUT = .tmp_
+
+try-run = $(shell set -e;  \
+   TMP=$(TMPOUT)/tmp;  \
+   mkdir -p 

[PATCH 2/2] tools: do not include scripts/Kbuild.include

2021-04-15 Thread Masahiro Yamada
Since commit d9f4ff50d2aa ("kbuild: spilt cc-option and friends to
scripts/Makefile.compiler"), some kselftests fail to build.

The tools/ directory opted out Kbuild, and went in a different
direction. They copy any kind of files to the tools/ directory
in order to do whatever they want to do in their world.

tools/build/Build.include mimics scripts/Kbuild.include, but some
tool Makefiles included the Kbuild one to import a feature that is
missing in tools/build/Build.include:

 - Commit ec04aa3ae87b ("tools/thermal: tmon: use "-fstack-protector"
   only if supported") included scripts/Kbuild.include from
   tools/thermal/tmon/Makefile to import the cc-option macro.

 - Commit c2390f16fc5b ("selftests: kvm: fix for compilers that do
   not support -no-pie") included scripts/Kbuild.include from
   tools/testing/selftests/kvm/Makefile to import the try-run macro.

 - Commit 9cae4ace80ef ("selftests/bpf: do not ignore clang
   failures") included scripts/Kbuild.include from
   tools/testing/selftests/bpf/Makefile to import the .DELETE_ON_ERROR
   target.

 - Commit 0695f8bca93e ("selftests/powerpc: Handle Makefile for
   unrecognized option") included scripts/Kbuild.include from
   tools/testing/selftests/powerpc/pmu/ebb/Makefile to import the
   try-run macro.

Copy what they want there, and stop including scripts/Kbuild.include
from the tool Makefiles.

Link: 
https://lore.kernel.org/lkml/86dadf33-70f7-a5ac-cb8c-64966d2f4...@linux.ibm.com/
Fixes: d9f4ff50d2aa ("kbuild: spilt cc-option and friends to 
scripts/Makefile.compiler")
Reported-by: Janosch Frank 
Reported-by: Christian Borntraeger 
Signed-off-by: Masahiro Yamada 
---

 tools/testing/selftests/bpf/Makefile  |  3 ++-
 tools/testing/selftests/kvm/Makefile  | 12 +++-
 .../selftests/powerpc/pmu/ebb/Makefile| 11 ++-
 tools/thermal/tmon/Makefile   | 19 +--
 4 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/tools/testing/selftests/bpf/Makefile 
b/tools/testing/selftests/bpf/Makefile
index 044bfdcf5b74..d872b9f41543 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0
-include ../../../../scripts/Kbuild.include
 include ../../../scripts/Makefile.arch
 include ../../../scripts/Makefile.include
 
@@ -476,3 +475,5 @@ EXTRA_CLEAN := $(TEST_CUSTOM_PROGS) $(SCRATCH_DIR) 
$(HOST_SCRATCH_DIR)  \
prog_tests/tests.h map_tests/tests.h verifier/tests.h   \
feature \
$(addprefix $(OUTPUT)/,*.o *.skel.h no_alu32 bpf_gcc bpf_testmod.ko)
+
+.DELETE_ON_ERROR:
diff --git a/tools/testing/selftests/kvm/Makefile 
b/tools/testing/selftests/kvm/Makefile
index a6d61f451f88..8b45bc417d83 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -1,5 +1,15 @@
 # SPDX-License-Identifier: GPL-2.0-only
-include ../../../../scripts/Kbuild.include
+
+TMPOUT = .tmp_
+
+try-run = $(shell set -e;  \
+   TMP=$(TMPOUT)/tmp;  \
+   mkdir -p $(TMPOUT); \
+   trap "rm -rf $(TMPOUT)" EXIT;   \
+   if ($(1)) >/dev/null 2>&1;  \
+   then echo "$(2)";   \
+   else echo "$(3)";   \
+   fi)
 
 all:
 
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/Makefile 
b/tools/testing/selftests/powerpc/pmu/ebb/Makefile
index af3df79d8163..d5d3e869df93 100644
--- a/tools/testing/selftests/powerpc/pmu/ebb/Makefile
+++ b/tools/testing/selftests/powerpc/pmu/ebb/Makefile
@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0
-include ../../../../../../scripts/Kbuild.include
 
 noarg:
$(MAKE) -C ../../
@@ -8,6 +7,16 @@ noarg:
 CFLAGS += -m64
 
 TMPOUT = $(OUTPUT)/TMPDIR/
+
+try-run = $(shell set -e;  \
+   TMP=$(TMPOUT)/tmp;  \
+   mkdir -p $(TMPOUT); \
+   trap "rm -rf $(TMPOUT)" EXIT;   \
+   if ($(1)) >/dev/null 2>&1;  \
+   then echo "$(2)";   \
+   else echo "$(3)";   \
+   fi)
+
 # Toolchains may build PIE by default which breaks the assembly
 no-pie-option := $(call try-run, echo 'int main() { return 0; }' | \
 $(CC) -Werror $(KBUILD_CPPFLAGS) $(CC_OPTION_CFLAGS) -no-pie -x c - -o 
"$$TMP", -no-pie)
diff --git a/tools/thermal/tmon/Makefile b/tools/thermal/tmon/Makefile
index 59e417ec3e13..92a683e4866c 100644
--- a/tools/thermal/tmon/Makefile
+++ b/tools/thermal/tmon/Makefile
@@ -1,6 +1,21 @@
 # SPDX-License-Identifier: GPL-2.0
-# We need this for the "cc-option" macro.
-include ../../../scripts/Kbuild.include
+
+TMPOUT = .tmp_
+
+try-run = $(shell set -e;  \
+   TMP=$(TMPOUT)/tmp;  \
+   mkdir -p $(TMPOUT); \
+   trap "rm -rf $(TMPOUT)" EXIT;   \
+   if ($(1)) >/dev/null 2>&1;  \
+   then echo "$(2)";   \
+   else echo "$(3)";