date:20200417

[PATCH v3 3/4] hugetlbfs: remove hugetlb_add_hstate() warning for existing hstate

2020-04-17 Thread Mike Kravetz

The routine hugetlb_add_hstate prints a warning if the hstate already
exists.  This was originally done as part of kernel command line
parsing.  If 'hugepagesz=' was specified more than once, the warning
pr_warn("hugepagesz= specified twice, ignoring\n");
would be printed.

Some architectures want to enable all huge page sizes.  They would
call hugetlb_add_hstate for all supported sizes.  However, this was
done after command line processing and as a result hstates could have
already been created for some sizes.  To make sure no warning were
printed, there would often be code like:
if (!size_to_hstate(size)
hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT)

The only time we want to print the warning is as the result of command
line processing.  So, remove the warning from hugetlb_add_hstate and
add it to the single arch independent routine processing "hugepagesz=".
After this, calls to size_to_hstate() in arch specific code can be
removed and hugetlb_add_hstate can be called without worrying about
warning messages.

Signed-off-by: Mike Kravetz 
Acked-by: Mina Almasry 
---
 arch/arm64/mm/hugetlbpage.c   | 16 
 arch/powerpc/mm/hugetlbpage.c |  3 +--
 arch/riscv/mm/hugetlbpage.c   |  2 +-
 arch/sparc/mm/init_64.c   | 19 ---
 arch/x86/mm/hugetlbpage.c |  2 +-
 mm/hugetlb.c  |  9 ++---
 6 files changed, 17 insertions(+), 34 deletions(-)

diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index f706b821aba6..21fa98b51e00 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -441,22 +441,14 @@ void huge_ptep_clear_flush(struct vm_area_struct *vma,
clear_flush(vma->vm_mm, addr, ptep, pgsize, ncontig);
 }
 
-static void __init add_huge_page_size(unsigned long size)
-{
-   if (size_to_hstate(size))
-   return;
-
-   hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
-}
-
 static int __init hugetlbpage_init(void)
 {
 #ifdef CONFIG_ARM64_4K_PAGES
-   add_huge_page_size(PUD_SIZE);
+   hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
 #endif
-   add_huge_page_size(CONT_PMD_SIZE);
-   add_huge_page_size(PMD_SIZE);
-   add_huge_page_size(CONT_PTE_SIZE);
+   hugetlb_add_hstate(CONT_PMD_SHIFT - PAGE_SHIFT);
+   hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
+   hugetlb_add_hstate(CONT_PTE_SHIFT - PAGE_SHIFT);
 
return 0;
 }
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 2c3fa0a7787b..4d5ed1093615 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -584,8 +584,7 @@ static int __init add_huge_page_size(unsigned long long 
size)
if (!arch_hugetlb_valid_size((unsigned long)size))
return -EINVAL;
 
-   if (!size_to_hstate(size))
-   hugetlb_add_hstate(shift - PAGE_SHIFT);
+   hugetlb_add_hstate(shift - PAGE_SHIFT);
return 0;
 }
 
diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
index 4e5d7e9f0eef..932dadfdca54 100644
--- a/arch/riscv/mm/hugetlbpage.c
+++ b/arch/riscv/mm/hugetlbpage.c
@@ -26,7 +26,7 @@ bool __init arch_hugetlb_valid_size(unsigned long size)
 static __init int gigantic_pages_init(void)
 {
/* With CONTIG_ALLOC, we can allocate gigantic pages at runtime */
-   if (IS_ENABLED(CONFIG_64BIT) && !size_to_hstate(1UL << PUD_SHIFT))
+   if (IS_ENABLED(CONFIG_64BIT))
hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
return 0;
 }
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 4618f96fd30f..ae819a16d07a 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -325,23 +325,12 @@ static void __update_mmu_tsb_insert(struct mm_struct *mm, 
unsigned long tsb_inde
 }
 
 #ifdef CONFIG_HUGETLB_PAGE
-static void __init add_huge_page_size(unsigned long size)
-{
-   unsigned int order;
-
-   if (size_to_hstate(size))
-   return;
-
-   order = ilog2(size) - PAGE_SHIFT;
-   hugetlb_add_hstate(order);
-}
-
 static int __init hugetlbpage_init(void)
 {
-   add_huge_page_size(1UL << HPAGE_64K_SHIFT);
-   add_huge_page_size(1UL << HPAGE_SHIFT);
-   add_huge_page_size(1UL << HPAGE_256MB_SHIFT);
-   add_huge_page_size(1UL << HPAGE_2GB_SHIFT);
+   hugetlb_add_hstate(HPAGE_64K_SHIFT - PAGE_SHIFT);
+   hugetlb_add_hstate(HPAGE_SHIFT - PAGE_SHIFT);
+   hugetlb_add_hstate(HPAGE_256MB_SHIFT - PAGE_SHIFT);
+   hugetlb_add_hstate(HPAGE_2GB_SHIFT - PAGE_SHIFT);
 
return 0;
 }
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 937d640a89e3..cf5781142716 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -195,7 +195,7 @@ bool __init arch_hugetlb_valid_size(unsigned long size)
 static __init int gigantic_pages_init(void)
 {
/* With compaction or CMA we can allocate gigantic pages at runtime */
-   if (boot_cpu_has(X86_FEATURE_GBPAGES) &&

[PATCH v3 2/4] hugetlbfs: move hugepagesz= parsing to arch independent code

2020-04-17 Thread Mike Kravetz

Now that architectures provide arch_hugetlb_valid_size(), parsing
of "hugepagesz=" can be done in architecture independent code.
Create a single routine to handle hugepagesz= parsing and remove
all arch specific routines.  We can also remove the interface
hugetlb_bad_size() as this is no longer used outside arch independent
code.

This also provides consistent behavior of hugetlbfs command line
options.  The hugepagesz= option should only be specified once for
a specific size, but some architectures allow multiple instances.
This appears to be more of an oversight when code was added by some
architectures to set up ALL huge pages sizes.

Signed-off-by: Mike Kravetz 
Acked-by: Mina Almasry 
Reviewed-by: Peter Xu 
---
 arch/arm64/mm/hugetlbpage.c   | 15 ---
 arch/powerpc/mm/hugetlbpage.c | 15 ---
 arch/riscv/mm/hugetlbpage.c   | 16 
 arch/s390/mm/hugetlbpage.c| 18 --
 arch/sparc/mm/init_64.c   | 22 --
 arch/x86/mm/hugetlbpage.c | 16 
 include/linux/hugetlb.h   |  1 -
 mm/hugetlb.c  | 23 +--
 8 files changed, 17 insertions(+), 109 deletions(-)

diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 069b96ee2aec..f706b821aba6 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -476,18 +476,3 @@ bool __init arch_hugetlb_valid_size(unsigned long size)
 
return false;
 }
-
-static __init int setup_hugepagesz(char *opt)
-{
-   unsigned long ps = memparse(opt, );
-
-   if (arch_hugetlb_valid_size(ps)) {
-   add_huge_page_size(ps);
-   return 1;
-   }
-
-   hugetlb_bad_size();
-   pr_err("hugepagesz: Unsupported page size %lu K\n", ps >> 10);
-   return 0;
-}
-__setup("hugepagesz=", setup_hugepagesz);
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index de54d2a37830..2c3fa0a7787b 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -589,21 +589,6 @@ static int __init add_huge_page_size(unsigned long long 
size)
return 0;
 }
 
-static int __init hugepage_setup_sz(char *str)
-{
-   unsigned long long size;
-
-   size = memparse(str, );
-
-   if (add_huge_page_size(size) != 0) {
-   hugetlb_bad_size();
-   pr_err("Invalid huge page size specified(%llu)\n", size);
-   }
-
-   return 1;
-}
-__setup("hugepagesz=", hugepage_setup_sz);
-
 static int __init hugetlbpage_init(void)
 {
bool configured = false;
diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
index da1f516bc451..4e5d7e9f0eef 100644
--- a/arch/riscv/mm/hugetlbpage.c
+++ b/arch/riscv/mm/hugetlbpage.c
@@ -22,22 +22,6 @@ bool __init arch_hugetlb_valid_size(unsigned long size)
return false;
 }
 
-static __init int setup_hugepagesz(char *opt)
-{
-   unsigned long ps = memparse(opt, );
-
-   if (arch_hugetlb_valid_size(ps)) {
-   hugetlb_add_hstate(ilog2(ps) - PAGE_SHIFT);
-   return 1;
-   }
-
-   hugetlb_bad_size();
-   pr_err("hugepagesz: Unsupported page size %lu M\n", ps >> 20);
-   return 0;
-
-}
-__setup("hugepagesz=", setup_hugepagesz);
-
 #ifdef CONFIG_CONTIG_ALLOC
 static __init int gigantic_pages_init(void)
 {
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index ac25b207624c..242dfc0d462d 100644
--- a/arch/s390/mm/hugetlbpage.c
+++ b/arch/s390/mm/hugetlbpage.c
@@ -261,24 +261,6 @@ bool __init arch_hugetlb_valid_size(unsigned long size)
return false;
 }
 
-static __init int setup_hugepagesz(char *opt)
-{
-   unsigned long size;
-   char *string = opt;
-
-   size = memparse(opt, );
-   if (arch_hugetlb_valid_size(size)) {
-   hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
-   } else {
-   hugetlb_bad_size();
-   pr_err("hugepagesz= specifies an unsupported page size %s\n",
-   string);
-   return 0;
-   }
-   return 1;
-}
-__setup("hugepagesz=", setup_hugepagesz);
-
 static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags)
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 2bfe8e22b706..4618f96fd30f 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -397,28 +397,6 @@ bool __init arch_hugetlb_valid_size(unsigned long size)
 
return true;
 }
-
-static int __init setup_hugepagesz(char *string)
-{
-   unsigned long long hugepage_size;
-   int rc = 0;
-
-   hugepage_size = memparse(string, );
-
-   if (!arch_hugetlb_valid_size((unsigned long)hugepage_size)) {
-   hugetlb_bad_size();
-   pr_err("hugepagesz=%llu not supported by MMU.\n",
-   hugepage_size);
-

Re: [PATCH] drivers: uio: new driver uio_fsl_85xx_cache_sram

2020-04-17 Thread Randy Dunlap

On 4/17/20 10:21 AM, Wang Wenhu wrote:
> diff --git a/drivers/uio/Kconfig b/drivers/uio/Kconfig
> index 202ee81cfc2b..f6e6ec0089c0 100644
> --- a/drivers/uio/Kconfig
> +++ b/drivers/uio/Kconfig
> @@ -105,6 +105,14 @@ config UIO_NETX
> To compile this driver as a module, choose M here; the module
> will be called uio_netx.
>  
> +config UIO_FSL_85XX_CACHE_SRAM
> + tristate "Freescale MPC85xx Cache-Sram driver"
> + depends on FSL_SOC_BOOKE && PPC32 && !FSL_85XX_CACHE_SRAM
> + help
> +   Generic driver for accessing the Cache-Sram form user level. This

  from

and SRAM would be better than Sram IMO. (2 places)

> +   is extremely helpful for some user-space applications that require
> +   high performance memory accesses.
> +
>  config UIO_FSL_ELBC_GPCM
>   tristate "eLBC/GPCM driver"
>   depends on FSL_LBC

thanks.
-- 
~Randy

Re: [PATCH] soc: fsl: dpio: avoid stack usage warning

2020-04-17 Thread Arnd Bergmann

On Wed, Apr 8, 2020 at 10:24 PM Russell King - ARM Linux admin
 wrote:
> On Wed, Apr 08, 2020 at 08:58:16PM +0200, Arnd Bergmann wrote:
> > - int i;
> > - struct qbman_eq_desc ed[32];
> > + struct qbman_eq_desc *ed = kcalloc(sizeof(struct qbman_eq_desc), 32, 
> > GFP_KERNEL);
>
> I think you need to rearrange this to be more compliant with the coding
> style.

Ok, I've updated this now to move the allocation into a new line
and applied this and the other fsl patch into the arm/fixes
branch for v5.7.

   Arnd

[PATCH v4 2/2] powerpc/uaccess: Implement unsafe_copy_to_user() as a simple loop

2020-04-17 Thread Christophe Leroy

At the time being, unsafe_copy_to_user() is based on
raw_copy_to_user() which calls __copy_tofrom_user().

__copy_tofrom_user() is a big optimised function to copy big amount
of data. It aligns destinations to cache line in order to use
dcbz instruction.

Today unsafe_copy_to_user() is called only from filldir().
It is used to mainly copy small amount of data like filenames,
so __copy_tofrom_user() is not fit.

Also, unsafe_copy_to_user() is used within user_access_begin/end
sections. In those section, it is preferable to not call functions.

Rewrite unsafe_copy_to_user() as a macro that uses __put_user_goto().
We first perform a loop of long, then we finish with necessary
complements.

unsafe_copy_to_user() might be used in the near future to copy
fixed-size data, like pt_regs structs during signal processing.
Having it as a macro allows GCC to optimise it for instead when
it knows the size in advance, it can unloop loops, drop complements
when the size is a multiple of longs, etc ...

Signed-off-by: Christophe Leroy 
---
v4: new
---
 arch/powerpc/include/asm/uaccess.h | 21 -
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/uaccess.h 
b/arch/powerpc/include/asm/uaccess.h
index 9365b59495a2..b24fbe75f9ce 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -514,7 +514,26 @@ static __must_check inline bool user_access_begin(const 
void __user *ptr, size_t
 #define unsafe_op_wrap(op, err) do { if (unlikely(op)) goto err; } while (0)
 #define unsafe_get_user(x, p, e) unsafe_op_wrap(__get_user_allowed(x, p), e)
 #define unsafe_put_user(x, p, e) __put_user_goto(x, p, e)
+
 #define unsafe_copy_to_user(d, s, l, e) \
-   unsafe_op_wrap(raw_copy_to_user_allowed(d, s, l), e)
+do {   \
+   u8 __user *_dst = (u8 __user *)(d); \
+   const u8 *_src = (const u8 *)(s);   \
+   size_t _len = (l);  \
+   int _i; \
+   \
+   for (_i = 0; _i < (_len & ~(sizeof(long) - 1)); _i += sizeof(long)) 
\
+   __put_user_goto(*(long*)(_src + _i), (long __user *)(_dst + 
_i), e);\
+   if (IS_ENABLED(CONFIG_PPC64) && (_len & 4)) {   \
+   __put_user_goto(*(u32*)(_src + _i), (u32 __user *)(_dst + _i), 
e);  \
+   _i += 4;\
+   }   \
+   if (_len & 2) { \
+   __put_user_goto(*(u16*)(_src + _i), (u16 __user *)(_dst + _i), 
e);  \
+   _i += 2;\
+   }   \
+   if (_len & 1) \
+   __put_user_goto(*(u8*)(_src + _i), (u8 __user *)(_dst + _i), 
e);\
+} while (0)
 
 #endif /* _ARCH_POWERPC_UACCESS_H */
-- 
2.25.0

Re: [PATCH 4/8] binfmt_elf: open code copy_siginfo_to_user to kernelspace buffer

2020-04-17 Thread Eric W. Biederman

Christoph Hellwig  writes:

> On Wed, Apr 15, 2020 at 10:20:11AM +0200, Arnd Bergmann wrote:
>> > I'd rather keep it out of this series and to
>> > an interested party.  Then again x32 doesn't seem to have a whole lot
>> > of interested parties..
>> 
>> Fine with me. It's on my mental list of things that we want to kill off
>> eventually as soon as the remaining users stop replying to questions
>> about it.
>> 
>> In fact I should really turn that into a properly maintained list in
>> Documentation/... that contains any options that someone has
>> asked about removing in the past, along with the reasons for keeping
>> it around and a time at which we should ask about it again.
>
> To the newly added x86 maintainers:  Arnd brought up the point that
> elf_core_dump writes the ABI siginfo format into the core dump. That
> format differs for i386 vs x32.  Is there any good way to find out
> which is the right format when are not in a syscall?
>
> As far a I can tell x32 vs i386 just seems to be based around what
> syscall table was used for the current syscall, but core dumps aren't
> always in syscall context.

I don't think this matters.  The i386 and x32 signal structures
only differ for SIGCHLD.  The SIGCHLD signal does cause coredumps.
So as long as we get the 32bit vs 64bit distinct correct all should be
well.

Eric

[PATCH v3 1/4] hugetlbfs: add arch_hugetlb_valid_size

2020-04-17 Thread Mike Kravetz

The architecture independent routine hugetlb_default_setup sets up
the default huge pages size.  It has no way to verify if the passed
value is valid, so it accepts it and attempts to validate at a later
time.  This requires undocumented cooperation between the arch specific
and arch independent code.

For architectures that support more than one huge page size, provide
a routine arch_hugetlb_valid_size to validate a huge page size.
hugetlb_default_setup can use this to validate passed values.

arch_hugetlb_valid_size will also be used in a subsequent patch to
move processing of the "hugepagesz=" in arch specific code to a common
routine in arch independent code.

Signed-off-by: Mike Kravetz 
---
 arch/arm64/mm/hugetlbpage.c   | 17 +
 arch/powerpc/mm/hugetlbpage.c | 20 +---
 arch/riscv/mm/hugetlbpage.c   | 26 +-
 arch/s390/mm/hugetlbpage.c| 16 
 arch/sparc/mm/init_64.c   | 24 
 arch/x86/mm/hugetlbpage.c | 17 +
 include/linux/hugetlb.h   |  1 +
 mm/hugetlb.c  | 21 ++---
 8 files changed, 103 insertions(+), 39 deletions(-)

diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index bbeb6a5a6ba6..069b96ee2aec 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -462,17 +462,26 @@ static int __init hugetlbpage_init(void)
 }
 arch_initcall(hugetlbpage_init);
 
-static __init int setup_hugepagesz(char *opt)
+bool __init arch_hugetlb_valid_size(unsigned long size)
 {
-   unsigned long ps = memparse(opt, );
-
-   switch (ps) {
+   switch (size) {
 #ifdef CONFIG_ARM64_4K_PAGES
case PUD_SIZE:
 #endif
case CONT_PMD_SIZE:
case PMD_SIZE:
case CONT_PTE_SIZE:
+   return true;
+   }
+
+   return false;
+}
+
+static __init int setup_hugepagesz(char *opt)
+{
+   unsigned long ps = memparse(opt, );
+
+   if (arch_hugetlb_valid_size(ps)) {
add_huge_page_size(ps);
return 1;
}
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 33b3461d91e8..de54d2a37830 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -558,7 +558,7 @@ unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
return vma_kernel_pagesize(vma);
 }
 
-static int __init add_huge_page_size(unsigned long long size)
+bool __init arch_hugetlb_valid_size(unsigned long size)
 {
int shift = __ffs(size);
int mmu_psize;
@@ -566,20 +566,26 @@ static int __init add_huge_page_size(unsigned long long 
size)
/* Check that it is a page size supported by the hardware and
 * that it fits within pagetable and slice limits. */
if (size <= PAGE_SIZE || !is_power_of_2(size))
-   return -EINVAL;
+   return false;
 
mmu_psize = check_and_get_huge_psize(shift);
if (mmu_psize < 0)
-   return -EINVAL;
+   return false;
 
BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);
 
-   /* Return if huge page size has already been setup */
-   if (size_to_hstate(size))
-   return 0;
+   return true;
+}
 
-   hugetlb_add_hstate(shift - PAGE_SHIFT);
+static int __init add_huge_page_size(unsigned long long size)
+{
+   int shift = __ffs(size);
+
+   if (!arch_hugetlb_valid_size((unsigned long)size))
+   return -EINVAL;
 
+   if (!size_to_hstate(size))
+   hugetlb_add_hstate(shift - PAGE_SHIFT);
return 0;
 }
 
diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
index a6189ed36c5f..da1f516bc451 100644
--- a/arch/riscv/mm/hugetlbpage.c
+++ b/arch/riscv/mm/hugetlbpage.c
@@ -12,21 +12,29 @@ int pmd_huge(pmd_t pmd)
return pmd_leaf(pmd);
 }
 
+bool __init arch_hugetlb_valid_size(unsigned long size)
+{
+   if (size == HPAGE_SIZE)
+   return true;
+   else if (IS_ENABLED(CONFIG_64BIT) && size == PUD_SIZE)
+   return true;
+   else
+   return false;
+}
+
 static __init int setup_hugepagesz(char *opt)
 {
unsigned long ps = memparse(opt, );
 
-   if (ps == HPAGE_SIZE) {
-   hugetlb_add_hstate(HPAGE_SHIFT - PAGE_SHIFT);
-   } else if (IS_ENABLED(CONFIG_64BIT) && ps == PUD_SIZE) {
-   hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
-   } else {
-   hugetlb_bad_size();
-   pr_err("hugepagesz: Unsupported page size %lu M\n", ps >> 20);
-   return 0;
+   if (arch_hugetlb_valid_size(ps)) {
+   hugetlb_add_hstate(ilog2(ps) - PAGE_SHIFT);
+   return 1;
}
 
-   return 1;
+   hugetlb_bad_size();
+   pr_err("hugepagesz: Unsupported page size %lu M\n", ps >> 20);
+   return 0;
+
 }
 __setup("hugepagesz=", setup_hugepagesz);
 
diff --git a/arch/s390/mm/hugetlbpage.c

[Bug 206203] kmemleak reports various leaks in drivers/of/unittest.c

2020-04-17 Thread bugzilla-daemon

https://bugzilla.kernel.org/show_bug.cgi?id=206203

--- Comment #11 from Erhard F. (erhar...@mailbox.org) ---
Created attachment 288565
  --> https://bugzilla.kernel.org/attachment.cgi?id=288565=edit
kmemleak output (kernel 5.7-rc1, PowerMac G5 11,2)

Applied Frank's patch set on top of 5.7-rc1. Though there are certainly less
memory leaks now, there are still some OF ones reported.

Frank Rowand (5):
  of: unittest: kmemleak on changeset destroy
  of: unittest: kmemleak in of_unittest_platform_populate()
  of: unittest: kmemleak in of_unittest_overlay_high_level()
  of: overlay: kmemleak in dup_and_fixup_symbol_prop()
  of: unittest: kmemleak in duplicate property update

-- 
You are receiving this mail because:
You are watching the assignee of the bug.

[PATCH] drivers: uio: new driver uio_fsl_85xx_cache_sram

2020-04-17 Thread Wang Wenhu

Implements a new uio driver for freescale 85xx platforms to access
the Cache-Sram form user level. It is extremely helpful for the user
space applications that require high performance memory accesses.

Cc: Greg Kroah-Hartman 
Cc: Christophe Leroy 
Cc: Scott Wood 
Cc: Michael Ellerman 
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Wang Wenhu 
---
 drivers/uio/Kconfig   |   8 +
 drivers/uio/Makefile  |   1 +
 drivers/uio/uio_fsl_85xx_cache_sram.c | 407 ++
 3 files changed, 416 insertions(+)
 create mode 100644 drivers/uio/uio_fsl_85xx_cache_sram.c

diff --git a/drivers/uio/Kconfig b/drivers/uio/Kconfig
index 202ee81cfc2b..f6e6ec0089c0 100644
--- a/drivers/uio/Kconfig
+++ b/drivers/uio/Kconfig
@@ -105,6 +105,14 @@ config UIO_NETX
  To compile this driver as a module, choose M here; the module
  will be called uio_netx.
 
+config UIO_FSL_85XX_CACHE_SRAM
+   tristate "Freescale MPC85xx Cache-Sram driver"
+   depends on FSL_SOC_BOOKE && PPC32 && !FSL_85XX_CACHE_SRAM
+   help
+ Generic driver for accessing the Cache-Sram form user level. This
+ is extremely helpful for some user-space applications that require
+ high performance memory accesses.
+
 config UIO_FSL_ELBC_GPCM
tristate "eLBC/GPCM driver"
depends on FSL_LBC
diff --git a/drivers/uio/Makefile b/drivers/uio/Makefile
index c285dd2a4539..be2056cffc21 100644
--- a/drivers/uio/Makefile
+++ b/drivers/uio/Makefile
@@ -10,4 +10,5 @@ obj-$(CONFIG_UIO_NETX)+= uio_netx.o
 obj-$(CONFIG_UIO_PRUSS) += uio_pruss.o
 obj-$(CONFIG_UIO_MF624) += uio_mf624.o
 obj-$(CONFIG_UIO_FSL_ELBC_GPCM)+= uio_fsl_elbc_gpcm.o
+obj-$(CONFIG_UIO_FSL_85XX_CACHE_SRAM)  += uio_fsl_85xx_cache_sram.o
 obj-$(CONFIG_UIO_HV_GENERIC)   += uio_hv_generic.o
diff --git a/drivers/uio/uio_fsl_85xx_cache_sram.c 
b/drivers/uio/uio_fsl_85xx_cache_sram.c
new file mode 100644
index ..312ca38e93e1
--- /dev/null
+++ b/drivers/uio/uio_fsl_85xx_cache_sram.c
@@ -0,0 +1,407 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2020 Vivo Communication Technology Co. Ltd.
+ * Copyright (C) 2020 Wang Wenhu 
+ * All rights reserved.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define DRIVER_NAME"uio_fsl_85xx_cache_sram"
+#define UIO_INFO_VER   "devicetree,pseudo"
+
+#define MAX_SRAM_UIO_INFOS 5
+
+#define L2CR_L2FI  0x4000  /* L2 flash invalidate */
+#define L2CR_SRAM_FULL 0x0001  /* L2SRAM full size */
+#define L2CR_SRAM_HALF 0x0002  /* L2SRAM half size */
+#define L2CR_SRAM_QUART0x0004  /* L2SRAM one quarter 
size */
+#define L2CR_SRAM_EIGHTH   0x0006  /* L2SRAM one eighth size */
+
+#define L2SRAM_BAR_MSK_LO180xC000  /* Lower 18 bits */
+#define L2SRAM_BARE_MSK_HI40x000F  /* Upper 4 bits */
+
+enum cache_sram_lock_ways {
+   LOCK_WAYS_ZERO,
+   LOCK_WAYS_EIGHTH,
+   LOCK_WAYS_TWO_EIGHTH,
+   LOCK_WAYS_HALF = 4,
+   LOCK_WAYS_FULL = 8,
+};
+
+struct mpc85xx_l2ctlr {
+   u32 ctl;/* 0x000 - L2 control */
+   u8  res1[0xC];
+   u32 ewar0;  /* 0x010 - External write address 0 */
+   u32 ewarea0;/* 0x014 - External write address extended 0 */
+   u32 ewcr0;  /* 0x018 - External write ctrl */
+   u8  res2[4];
+   u32 ewar1;  /* 0x020 - External write address 1 */
+   u32 ewarea1;/* 0x024 - External write address extended 1 */
+   u32 ewcr1;  /* 0x028 - External write ctrl 1 */
+   u8  res3[4];
+   u32 ewar2;  /* 0x030 - External write address 2 */
+   u32 ewarea2;/* 0x034 - External write address extended 2 */
+   u32 ewcr2;  /* 0x038 - External write ctrl 2 */
+   u8  res4[4];
+   u32 ewar3;  /* 0x040 - External write address 3 */
+   u32 ewarea3;/* 0x044 - External write address extended 3 */
+   u32 ewcr3;  /* 0x048 - External write ctrl 3 */
+   u8  res5[0xB4];
+   u32 srbar0; /* 0x100 - SRAM base address 0 */
+   u32 srbarea0;   /* 0x104 - SRAM base addr reg ext address 0 */
+   u32 srbar1; /* 0x108 - SRAM base address 1 */
+   u32 srbarea1;   /* 0x10C - SRAM base addr reg ext address 1 */
+   u8  res6[0xCF0];
+   u32 errinjhi;   /* 0xE00 - Error injection mask high */
+   u32 errinjlo;   /* 0xE04 - Error injection mask low */
+   u32 errinjctl;  /* 0xE08 - Error injection tag/ecc control */
+   u8  res7[0x14];
+   u32 captdatahi; /* 0xE20 - Error data high capture */
+   u32 captdatalo; /* 0xE24 - Error data low capture */
+   u32 captecc;/* 0xE28 - Error syndrome */
+   u8

Re: ppc64 early slub caches have zero random value

2020-04-17 Thread Vlastimil Babka

On 4/17/20 6:53 PM, Michal Suchánek wrote:
> Hello,

Hi, thanks for reproducing on latest upstream!

> instrumenting the kernel with the following patch
> 
> ---
>  mm/slub.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/mm/slub.c b/mm/slub.c
> index d6787bbe0248..d40995d5f8ff 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -3633,6 +3633,7 @@ static int kmem_cache_open(struct kmem_cache *s, 
> slab_flags_t flags)
>   s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor);
>  #ifdef CONFIG_SLAB_FREELIST_HARDENED
>   s->random = get_random_long();
> + pr_notice("Creating cache %s with s->random=%ld\n", s->name, s->random);
>  #endif
>  
>   if (!calculate_sizes(s, -1))
> 
> I get:
> 
> [0.00] random: get_random_u64 called from kmem_cache_open+0x3c/0x5b0
with crng_init=0
> [0.00] Creating cache kmem_cache_node with s->random=0
> [0.00] Creating cache kmem_cache with s->random=0
> [0.00] Creating cache kmalloc-8 with s->random=0
> [0.00] Creating cache kmalloc-16 with s->random=0
> [0.00] Creating cache kmalloc-32 with s->random=0
> [0.00] Creating cache kmalloc-64 with s->random=0
> [0.00] Creating cache kmalloc-96 with s->random=0
> [0.00] Creating cache kmalloc-128 with s->random=0
> [0.00] Creating cache kmalloc-192 with s->random=-682532147323126958
> 
> The earliest caches created invariably end up with s->random of zero.

It seems that reliably it's the first 8 calls get_random_u64(), which sounds
more like some off-by-X bug than a genuine lack entropy that would become fixed
in the meanwhile?

> This is a problem for crash which does not recognize these as randomized
> and fails to read them. While this can be addressed in crash is it
> intended to create caches with zero random value in the kernel?

Definitely not. The question is more likely what guarantees we have with
crng_init=0. Probably we can't expect cryptographically strong randomness, but
zeroes still do look like a bug to me?

> This is broken at least in the 5.4~5.7 range but it is not clear if this
> ever worked. All examples of earlier kernels I have at hand use slab mm.
> 
> Thanks
> 
> Michal
>

Re: POWER9 crash due to STRICT_KERNEL_RWX (WAS: Re: Linux-next POWER9 NULL pointer NIP...)

2020-04-17 Thread Naveen N. Rao


Qian Cai wrote:




On Apr 17, 2020, at 3:01 AM, Naveen N. Rao  wrote:

Hi Qian,

Qian Cai wrote:

OK, reverted the commit,
c55d7b5e6426 (“powerpc: Remove STRICT_KERNEL_RWX incompatibility with 
RELOCATABLE”)
or set STRICT_KERNEL_RWX=n fixed the crash below and also mentioned in this 
thread,
https://lore.kernel.org/lkml/15ac5b0e-a221-4b8c-9039-fa96b8ef7...@lca.pw/


Do you see any errors logged in dmesg when you see the crash?  
STRICT_KERNEL_RWX changes how patch_instruction() works, so it would be 
interesting to see if there are any ftrace-related errors thrown before the 
crash.


Yes, looks like there is a warning right after,

echo function > /sys/kernel/debug/tracing/current_tracer
echo nop > /sys/kernel/debug/tracing/current_tracer

and just before the crash,

[ T3454] ftrace-powerpc: Unexpected call sequence at de85f044: 48003d1d 
7c0802a6
[   56.870472][ T3454] [ cut here ]
[   56.870500][ T3454] WARNING: CPU: 52 PID: 3454 at kernel/trace/ftrace.c:2026 
ftrace_bug+0x104/0x310
[   56.870527][ T3454] Modules linked in: kvm_hv kvm ses enclosure 
scsi_transport_sas ip_tables x_tables xfs sd_mod i40e firmware_class aacraid 
dm_mirror dm_region_hash dm_log dm_mod
[   56.870592][ T3454] CPU: 52 PID: 3454 Comm: nip.sh Not tainted 
5.7.0-rc1-next-20200416 #4
[   56.870627][ T3454] NIP:  c02a3ae4 LR: c02a47fc CTR: 
c02436f0
[   56.870661][ T3454] REGS: c0069a9ef710 TRAP: 0700   Not tainted  
(5.7.0-rc1-next-20200416)
[   56.870697][ T3454] MSR:  9282b033 
  CR: 28228222  XER: 
[   56.870748][ T3454] CFAR: c02a3a2c IRQMASK: 0 
[   56.870748][ T3454] GPR00: c02a47fc c0069a9ef9a0 c12f9000 ffea 
[   56.870748][ T3454] GPR04: c0002004e2160438 c007fedf0ad8 614ca19d 0007 
[   56.870748][ T3454] GPR08: 0003   0002 
[   56.870748][ T3454] GPR12: 4000 c007fffd5600 4000 000139ae9798 
[   56.870748][ T3454] GPR16: 000139ae9724 000139a86968 000139a1f230 000139aed568 
[   56.870748][ T3454] GPR20: 0001402af8b0 0009 000139a996e8 7fffc9186d94 
[   56.870748][ T3454] GPR24:  c0069a9efc00 c132cd00 c0069a9efc40 
[   56.870748][ T3454] GPR28: c11c29e8 0001 c0002004e2160438 c00809321a64 
[   56.870969][ T3454] NIP [c02a3ae4] ftrace_bug+0x104/0x310

ftrace_bug at kernel/trace/ftrace.c:2026
[   56.870995][ T3454] LR [c02a47fc] ftrace_modify_all_code+0x16c/0x210
ftrace_modify_all_code at kernel/trace/ftrace.c:2672
[   56.871034][ T3454] Call Trace:
[   56.871057][ T3454] [c0069a9ef9a0] [4b899a9efa00] 0x4b899a9efa00 
(unreliable)
[   56.871086][ T3454] [c0069a9efa20] [c02a47fc] 
ftrace_modify_all_code+0x16c/0x210
[   56.871125][ T3454] [c0069a9efa50] [c0061b68] 
arch_ftrace_update_code+0x18/0x30
[   56.871162][ T3454] [c0069a9efa70] [c02a49c4] 
ftrace_run_update_code+0x44/0xc0
[   56.871199][ T3454] [c0069a9efaa0] [c02aa3c8] 
ftrace_startup+0xe8/0x1b0
[   56.871236][ T3454] [c0069a9efae0] [c02aa4e0] 
register_ftrace_function+0x50/0xc0
[   56.871275][ T3454] [c0069a9efb10] [c02d0468] 
function_trace_init+0x98/0xd0
[   56.871312][ T3454] [c0069a9efb40] [c02c75c0] 
tracing_set_tracer+0x350/0x640
[   56.871349][ T3454] [c0069a9efbe0] [c02c7a90] 
tracing_set_trace_write+0x1e0/0x370
[   56.871388][ T3454] [c0069a9efd00] [c052094c] 
__vfs_write+0x3c/0x70
[   56.871424][ T3454] [c0069a9efd20] [c0523d4c] 
vfs_write+0xcc/0x200
[   56.871461][ T3454] [c0069a9efd70] [c05240ec] 
ksys_write+0x7c/0x140
[   56.871498][ T3454] [c0069a9efdc0] [c0038a94] 
system_call_exception+0x114/0x1e0
[   56.871535][ T3454] [c0069a9efe20] [c000c870] 
system_call_common+0xf0/0x278
[   56.871570][ T3454] Instruction dump:
[   56.871592][ T3454] 7d908120 4e800020 6000 2b890001 409effd4 3c62ff8b 38631958 4bf4491d 
[   56.871639][ T3454] 6000 4bc0 6000 fba10068 <0fe0> 3901 3ce20003 3d22fed7 
[   56.871685][ T3454] irq event stamp: 95388

[   56.871708][ T3454] hardirqs last  enabled at (95387): [] 
console_unlock+0x6a4/0x950
[   56.871746][ T3454] hardirqs last disabled at (95388): [] 
program_check_common_virt+0x2bc/0x310
[   56.871785][ T3454] softirqs last  enabled at (91222): [] 
__do_softirq+0x658/0x8d8
[   56.871823][ T3454] softirqs last disabled at (91215): [] 
irq_exit+0x16c/0x1d0
[   56.871859][ T3454] ---[ end trace 48f8445450a4e206 ]---
[   56.871907][ T3454] ftrace failed to modify 
[   56.871913][ T3454] [] show_sas_rphy_phy_identifier+0xc/0x60 [scsi_transport_sas]

show_sas_rphy_phy_identifier at drivers/scsi/scsi_transport_sas.c:1221
[   56.871969][ T3454]  actual:   1d:3d:00:48
[   56.871996][ T3454] Setting ftrace call site

[PATCH v4 1/2] powerpc/uaccess: Implement unsafe_put_user() using 'asm goto'

2020-04-17 Thread Christophe Leroy

unsafe_put_user() is designed to take benefit of 'asm goto'.

Instead of using the standard __put_user() approach and branch
based on the returned error, use 'asm goto' and make the
exception code branch directly to the error label. There is
no code anymore in the fixup section.

This change significantly simplifies functions using
unsafe_put_user()

Small exemple of the benefit with the following code:

struct test {
u32 item1;
u16 item2;
u8 item3;
u64 item4;
};

int set_test_to_user(struct test __user *test, u32 item1, u16 item2, u8 item3, 
u64 item4)
{
unsafe_put_user(item1, >item1, failed);
unsafe_put_user(item2, >item2, failed);
unsafe_put_user(item3, >item3, failed);
unsafe_put_user(item4, >item4, failed);
return 0;
failed:
return -EFAULT;
}

Before the patch:

0be8 :
 be8:   39 20 00 00 li  r9,0
 bec:   90 83 00 00 stw r4,0(r3)
 bf0:   2f 89 00 00 cmpwi   cr7,r9,0
 bf4:   40 9e 00 38 bne cr7,c2c 
 bf8:   b0 a3 00 04 sth r5,4(r3)
 bfc:   2f 89 00 00 cmpwi   cr7,r9,0
 c00:   40 9e 00 2c bne cr7,c2c 
 c04:   98 c3 00 06 stb r6,6(r3)
 c08:   2f 89 00 00 cmpwi   cr7,r9,0
 c0c:   40 9e 00 20 bne cr7,c2c 
 c10:   90 e3 00 08 stw r7,8(r3)
 c14:   91 03 00 0c stw r8,12(r3)
 c18:   21 29 00 00 subfic  r9,r9,0
 c1c:   7d 29 49 10 subfe   r9,r9,r9
 c20:   38 60 ff f2 li  r3,-14
 c24:   7d 23 18 38 and r3,r9,r3
 c28:   4e 80 00 20 blr
 c2c:   38 60 ff f2 li  r3,-14
 c30:   4e 80 00 20 blr

 <.fixup>:
...
  b8:   39 20 ff f2 li  r9,-14
  bc:   48 00 00 00 b   bc <.fixup+0xbc>
bc: R_PPC_REL24 .text+0xbf0
  c0:   39 20 ff f2 li  r9,-14
  c4:   48 00 00 00 b   c4 <.fixup+0xc4>
c4: R_PPC_REL24 .text+0xbfc
  c8:   39 20 ff f2 li  r9,-14
  cc:   48 00 00 00 b   cc <.fixup+0xcc>
cc: R_PPC_REL24 .text+0xc08
  d0:   39 20 ff f2 li  r9,-14
  d4:   48 00 00 00 b   d4 <.fixup+0xd4>
d4: R_PPC_REL24 .text+0xc18

 <__ex_table>:
...
a0: R_PPC_REL32 .text+0xbec
a4: R_PPC_REL32 .fixup+0xb8
a8: R_PPC_REL32 .text+0xbf8
ac: R_PPC_REL32 .fixup+0xc0
b0: R_PPC_REL32 .text+0xc04
b4: R_PPC_REL32 .fixup+0xc8
b8: R_PPC_REL32 .text+0xc10
bc: R_PPC_REL32 .fixup+0xd0
c0: R_PPC_REL32 .text+0xc14
c4: R_PPC_REL32 .fixup+0xd0

After the patch:

0be8 :
 be8:   90 83 00 00 stw r4,0(r3)
 bec:   b0 a3 00 04 sth r5,4(r3)
 bf0:   98 c3 00 06 stb r6,6(r3)
 bf4:   90 e3 00 08 stw r7,8(r3)
 bf8:   91 03 00 0c stw r8,12(r3)
 bfc:   38 60 00 00 li  r3,0
 c00:   4e 80 00 20 blr
 c04:   38 60 ff f2 li  r3,-14
 c08:   4e 80 00 20 blr

 <__ex_table>:
...
a0: R_PPC_REL32 .text+0xbe8
a4: R_PPC_REL32 .text+0xc04
a8: R_PPC_REL32 .text+0xbec
ac: R_PPC_REL32 .text+0xc04
b0: R_PPC_REL32 .text+0xbf0
b4: R_PPC_REL32 .text+0xc04
b8: R_PPC_REL32 .text+0xbf4
bc: R_PPC_REL32 .text+0xc04
c0: R_PPC_REL32 .text+0xbf8
c4: R_PPC_REL32 .text+0xc04

Signed-off-by: Christophe Leroy 
Reviewed-by: Segher Boessenkool 
---
v4:
- no change

v3:
- Added <> modifier to __put_user_asm_goto()
- Removed %U1 modifier to __put_user_asm2_goto()

v2:
- Grouped most __goto() macros together
- Removed stuff in .fixup section, referencing the error label
directly from the extable
- Using more flexible addressing in asm.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/uaccess.h | 61 +-
 1 file changed, 52 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/uaccess.h 
b/arch/powerpc/include/asm/uaccess.h
index 9cc9c106ae2a..9365b59495a2 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -93,12 +93,12 @@ static inline int __access_ok(unsigned long addr, unsigned 
long size,
 #define __get_user(x, ptr) \
__get_user_nocheck((x), (ptr), sizeof(*(ptr)), true)
 #define __put_user(x, ptr) \
-   __put_user_nocheck((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)), true)
+   __put_user_nocheck((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)))
+#define __put_user_goto(x, ptr, label) \
+   __put_user_nocheck_goto((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)), 
label)
 
 #define __get_user_allowed(x, ptr) \
__get_user_nocheck((x), (ptr),

[Bug 206203] kmemleak reports various leaks in drivers/of/unittest.c

2020-04-17 Thread bugzilla-daemon

https://bugzilla.kernel.org/show_bug.cgi?id=206203

--- Comment #12 from Erhard F. (erhar...@mailbox.org) ---
Created attachment 288567
  --> https://bugzilla.kernel.org/attachment.cgi?id=288567=edit
dmesg (kernel 5.7-rc1, PowerMac G5 11,2)

-- 
You are receiving this mail because:
You are watching the assignee of the bug.

[PATCH v3 4/4] hugetlbfs: clean up command line processing

2020-04-17 Thread Mike Kravetz

With all hugetlb page processing done in a single file clean up code.
- Make code match desired semantics
  - Update documentation with semantics
- Make all warnings and errors messages start with 'HugeTLB:'.
- Consistently name command line parsing routines.
- Check for hugepages_supported() before processing parameters.
- Add comments to code
  - Describe some of the subtle interactions
  - Describe semantics of command line arguments

This patch also fixes issues with implicitly setting the number of
gigantic huge pages to preallocate.  Previously on X86 command line,
hugepages=2 default_hugepagesz=1G
would result in zero 1G pages being preallocated and,
# grep HugePages_Total /proc/meminfo
HugePages_Total:   0
# sysctl -a | grep nr_hugepages
vm.nr_hugepages = 2
vm.nr_hugepages_mempolicy = 2
# cat /proc/sys/vm/nr_hugepages
2
After this patch 2 gigantic pages will be preallocated and all the
proc, sysfs, sysctl and meminfo files will accurately reflect this.

To address the issue with gigantic pages, a small change in behavior
was made to command line processing.  Previously the command line,
hugepages=128 default_hugepagesz=2M hugepagesz=2M hugepages=256
would result in the allocation of 256 2M huge pages.  The value 128
would be ignored without any warning.  After this patch, 128 2M pages
will be allocated and a warning message will be displayed indicating
the value of 256 is ignored.  This change in behavior is required
because allocation of implicitly specified gigantic pages must be done
when the default_hugepagesz= is encountered for gigantic pages.
Previously the code waited until later in the boot process (hugetlb_init),
to allocate pages of default size.  However the bootmem allocator required
for gigantic allocations is not available at this time.

Signed-off-by: Mike Kravetz 
---
 .../admin-guide/kernel-parameters.txt |  40 +++--
 Documentation/admin-guide/mm/hugetlbpage.rst  |  35 
 mm/hugetlb.c  | 159 ++
 3 files changed, 190 insertions(+), 44 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index f2a93c8679e8..8cd78cc87a1c 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -834,12 +834,15 @@
See also Documentation/networking/decnet.txt.
 
default_hugepagesz=
-   [same as hugepagesz=] The size of the default
-   HugeTLB page size. This is the size represented by
-   the legacy /proc/ hugepages APIs, used for SHM, and
-   default size when mounting hugetlbfs filesystems.
-   Defaults to the default architecture's huge page size
-   if not specified.
+   [HW] The size of the default HugeTLB page. This is
+   the size represented by the legacy /proc/ hugepages
+   APIs.  In addition, this is the default hugetlb size
+   used for shmget(), mmap() and mounting hugetlbfs
+   filesystems.  If not specified, defaults to the
+   architecture's default huge page size.  Huge page
+   sizes are architecture dependent.  See also
+   Documentation/admin-guide/mm/hugetlbpage.rst.
+   Format: size[KMG]
 
deferred_probe_timeout=
[KNL] Debugging option to set a timeout in seconds for
@@ -1479,13 +1482,24 @@
hugepages using the cma allocator. If enabled, the
boot-time allocation of gigantic hugepages is skipped.
 
-   hugepages=  [HW,X86-32,IA-64] HugeTLB pages to allocate at boot.
-   hugepagesz= [HW,IA-64,PPC,X86-64] The size of the HugeTLB pages.
-   On x86-64 and powerpc, this option can be specified
-   multiple times interleaved with hugepages= to reserve
-   huge pages of different sizes. Valid pages sizes on
-   x86-64 are 2M (when the CPU supports "pse") and 1G
-   (when the CPU supports the "pdpe1gb" cpuinfo flag).
+   hugepages=  [HW] Number of HugeTLB pages to allocate at boot.
+   If this follows hugepagesz (below), it specifies
+   the number of pages of hugepagesz to be allocated.
+   If this is the first HugeTLB parameter on the command
+   line, it specifies the number of pages to allocate for
+   the default huge page size.  See also
+   Documentation/admin-guide/mm/hugetlbpage.rst.
+   Format: 
+
+   hugepagesz=
+   [HW] The size

[PATCH 1/2] signal: Factor copy_siginfo_to_external32 from copy_siginfo_to_user32

2020-04-17 Thread Eric W. Biederman



To remove the use of set_fs in the coredump code there needs to be a
way to convert a kernel siginfo to a userspace compat siginfo.

Call that function copy_siginfo_to_compat and factor it out of
copy_siginfo_to_user32.

The existence of x32 complicates this code.  On x32 SIGCHLD uses 64bit
times for utime and stime.  As only SIGCHLD is affected and SIGCHLD
never causes a coredump I have avoided handling that case.

Signed-off-by: "Eric W. Biederman" 
---
 include/linux/compat.h |   1 +
 kernel/signal.c| 108 +++--
 2 files changed, 63 insertions(+), 46 deletions(-)

diff --git a/include/linux/compat.h b/include/linux/compat.h
index 0480ba4db592..4962b254e550 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -402,6 +402,7 @@ long compat_get_bitmap(unsigned long *mask, const 
compat_ulong_t __user *umask,
   unsigned long bitmap_size);
 long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask,
   unsigned long bitmap_size);
+void copy_siginfo_to_external32(struct compat_siginfo *to, const struct 
kernel_siginfo *from);
 int copy_siginfo_from_user32(kernel_siginfo_t *to, const struct compat_siginfo 
__user *from);
 int copy_siginfo_to_user32(struct compat_siginfo __user *to, const 
kernel_siginfo_t *from);
 int get_compat_sigevent(struct sigevent *event,
diff --git a/kernel/signal.c b/kernel/signal.c
index e58a6c619824..578f196898cb 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3235,90 +3235,106 @@ int copy_siginfo_from_user(kernel_siginfo_t *to, const 
siginfo_t __user *from)
 }
 
 #ifdef CONFIG_COMPAT
-int copy_siginfo_to_user32(struct compat_siginfo __user *to,
-  const struct kernel_siginfo *from)
-#if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION)
+void copy_siginfo_to_external32(struct compat_siginfo *to,
+   const struct kernel_siginfo *from)
 {
-   return __copy_siginfo_to_user32(to, from, in_x32_syscall());
-}
-int __copy_siginfo_to_user32(struct compat_siginfo __user *to,
-const struct kernel_siginfo *from, bool x32_ABI)
-#endif
-{
-   struct compat_siginfo new;
-   memset(, 0, sizeof(new));
+   /*
+* This function does not work properly for SIGCHLD on x32,
+* but it does not need to as SIGCHLD never causes a coredump.
+*/
+   memset(to, 0, sizeof(*to));
 
-   new.si_signo = from->si_signo;
-   new.si_errno = from->si_errno;
-   new.si_code  = from->si_code;
+   to->si_signo = from->si_signo;
+   to->si_errno = from->si_errno;
+   to->si_code  = from->si_code;
switch(siginfo_layout(from->si_signo, from->si_code)) {
case SIL_KILL:
-   new.si_pid = from->si_pid;
-   new.si_uid = from->si_uid;
+   to->si_pid = from->si_pid;
+   to->si_uid = from->si_uid;
break;
case SIL_TIMER:
-   new.si_tid = from->si_tid;
-   new.si_overrun = from->si_overrun;
-   new.si_int = from->si_int;
+   to->si_tid = from->si_tid;
+   to->si_overrun = from->si_overrun;
+   to->si_int = from->si_int;
break;
case SIL_POLL:
-   new.si_band = from->si_band;
-   new.si_fd   = from->si_fd;
+   to->si_band = from->si_band;
+   to->si_fd   = from->si_fd;
break;
case SIL_FAULT:
-   new.si_addr = ptr_to_compat(from->si_addr);
+   to->si_addr = ptr_to_compat(from->si_addr);
 #ifdef __ARCH_SI_TRAPNO
-   new.si_trapno = from->si_trapno;
+   to->si_trapno = from->si_trapno;
 #endif
break;
case SIL_FAULT_MCEERR:
-   new.si_addr = ptr_to_compat(from->si_addr);
+   to->si_addr = ptr_to_compat(from->si_addr);
 #ifdef __ARCH_SI_TRAPNO
-   new.si_trapno = from->si_trapno;
+   to->si_trapno = from->si_trapno;
 #endif
-   new.si_addr_lsb = from->si_addr_lsb;
+   to->si_addr_lsb = from->si_addr_lsb;
break;
case SIL_FAULT_BNDERR:
-   new.si_addr = ptr_to_compat(from->si_addr);
+   to->si_addr = ptr_to_compat(from->si_addr);
 #ifdef __ARCH_SI_TRAPNO
-   new.si_trapno = from->si_trapno;
+   to->si_trapno = from->si_trapno;
 #endif
-   new.si_lower = ptr_to_compat(from->si_lower);
-   new.si_upper = ptr_to_compat(from->si_upper);
+   to->si_lower = ptr_to_compat(from->si_lower);
+   to->si_upper = ptr_to_compat(from->si_upper);
break;
case SIL_FAULT_PKUERR:
-   new.si_addr = ptr_to_compat(from->si_addr);
+   to->si_addr = ptr_to_compat(from->si_addr);
 #ifdef __ARCH_SI_TRAPNO
-

Re: [PATCH v4,4/4] drivers: uio: new driver for fsl_85xx_cache_sram

2020-04-17 Thread Scott Wood

On Fri, 2020-04-17 at 22:16 +0800, 王文虎 wrote:
> > On Fri, 2020-04-17 at 09:42 +0200, Greg KH wrote:>> On Thu, Apr 16, 2020
> > at 11:58:29PM -0500, Scott Wood wrote:
> > > > On Fri, 2020-04-17 at 10:31 +0800, 王文虎 wrote:
> > > > > Sounds it is. And does the modification below fit well?
> > > > > ---
> > > > > -static const struct of_device_id uio_mpc85xx_l2ctlr_of_match[] = {
> > > > > -   {   .compatible = "uio,mpc85xx-cache-sram", },
> > > > > -   {},
> > > > > +#ifdef CONFIG_OF
> > > > > +static struct of_device_id uio_fsl_85xx_cache_sram_of_match[] = {
> > > > > +   { /* This is filled with module_parm */ },
> > > > > +   { /* Sentinel */ },
> > > > >  };
> > > > > +MODULE_DEVICE_TABLE(of, uio_fsl_85xx_cache_sram_of_match);
> > > > > +module_param_string(of_id,
> > > > > uio_fsl_85xx_cache_sram_of_match[0].compatible,
> > > > > +   sizeof(uio_fsl_85xx_cache_sram_of_match[
> > > > > 0].c
> > > > > ompa
> > > > > tible), 0);
> > > > > +MODULE_PARM_DESC(of_id, "platform device id to be handled by cache-
> > > > > sram-
> > > > > uio");
> > > > > +#endif
> > > > 
> > > > No.  The point is that you wouldn't be configuring this with the
> > > > device
> > > > tree
> > > > at all.
> > > 
> > > Wait, why not?  Don't force people to use module parameters, that is
> > > crazy.  DT describes the hardware involved, if someone wants to bind to
> > > a specific range of memory, as described by DT, why can't they do so?
> > 
> > Yes, DT describes the hardware, and as I've said a couple times already,
> > this
> > isn't hardware description.
> > 
> > I'm not forcing people to use module parameters.  That was a least-effort
> > suggestion to avoid abusing the DT.  I later said I'd try to come up with
> > a
> > patch that allocates regions dynamically (and most likely doesn't use UIO
> > at
> > all).
> > 
> > > I can understand not liking the name "uio" in a dt tree, but there's no
> > > reason that DT can not describe what a driver binds to here.
> > 
> > The DT already describes this hardware, and there is already code that
> > binds
> > to it.  This patch is trying to add a second node for it with
> > configuration.
> > 
> 
> Hi, Scott, Greg,
> Seems like no balance here. How about I implement a driver of uio including
> the l2ctrl and cache_sram related implementations?
> And this way, the driver would be a hardware level driver and targeted for
> uio.

No, duplicating the code makes no sense whatsoever.  Please just wait a bit
and I'll send a patch to have the existing driver expose a dynamic allocation
interface to userspace.

-Scott

Re: [PATCH 2/8] signal: clean up __copy_siginfo_to_user32

2020-04-17 Thread Eric W. Biederman

Christoph Hellwig  writes:

> Instead of an architecture specific calling convention in common code
> just pass a flags argument with architecture specific values.

This bothers me because it makes all architectures pay for the sins of
x32.  Further it starts burying the details of the what is happening in
architecture specific helpers.  Hiding the fact that there is only
one niche architecture that does anything weird.

I am very sensitive to hiding away signal handling details right now
because way to much of the signal handling code got hidden in
architecture specific files and was quite buggy because as a result.

My general sense is putting all of the weird details up front and center
in kernel/signal.c is the only way for this code will be looked at
and successfully maintained.

How about these patches to solve set_fs with binfmt_elf instead:

Eric W. Biederman (2):
  signal: Factor copy_siginfo_to_external32 from copy_siginfo_to_user32
  signal: Remove the set_fs in binfmt_elf.c:fill_siginfo_note

 fs/binfmt_elf.c|   5 +
 fs/compat_binfmt_elf.c |   2 +-
 include/linux/compat.h |   1 +
 include/linux/signal.h |   7 +++
 kernel/signal.c| 108 
++--

Eric

Re: [PATCH] drivers: uio: new driver uio_fsl_85xx_cache_sram

2020-04-17 Thread Scott Wood

On Fri, 2020-04-17 at 10:21 -0700, Wang Wenhu wrote:
> Implements a new uio driver for freescale 85xx platforms to access
> the Cache-Sram form user level. It is extremely helpful for the user
> space applications that require high performance memory accesses.
> 
> Cc: Greg Kroah-Hartman 
> Cc: Christophe Leroy 
> Cc: Scott Wood 
> Cc: Michael Ellerman 
> Cc: linuxppc-dev@lists.ozlabs.org
> Signed-off-by: Wang Wenhu 
> ---
>  drivers/uio/Kconfig   |   8 +
>  drivers/uio/Makefile  |   1 +
>  drivers/uio/uio_fsl_85xx_cache_sram.c | 407 ++
>  3 files changed, 416 insertions(+)
>  create mode 100644 drivers/uio/uio_fsl_85xx_cache_sram.c

NACK, we don't need two copies of this code in the kernel.  Please just wait a
bit and I'll send a patch to have the existing driver expose a dynamic
allocation interface to userspace.

-Scott

[PATCH v3 0/4] Clean up hugetlb boot command line processing

2020-04-17 Thread Mike Kravetz

v3 -
   Used weak attribute method of defining arch_hugetlb_valid_size.
 This eliminates changes to arch specific hugetlb.h files (Peter)
   Updated documentation (Peter, Randy)
   Fixed handling of implicitly specified gigantic page preallocation
 in existing code and removed documentation of such.  There is now
 no difference between handling of gigantic and non-gigantic pages.
 (Peter, Nitesh).
 This requires the most review as there is a small change to
 undocumented behavior.  See patch 4 commit message for details.
   Added Acks and Reviews (Mina, Peter)

v2 -
   Fix build errors with patch 1 (Will)
   Change arch_hugetlb_valid_size arg to unsigned long and remove
 irrelevant 'extern' keyword (Christophe)
   Documentation and other misc changes (Randy, Christophe, Mina)
   Do not process command line options if !hugepages_supported()
 (Dave, but it sounds like we may want to additional changes to
  hugepages_supported() for x86?  If that is needed I would prefer
  a separate patch.)

Longpeng(Mike) reported a weird message from hugetlb command line processing
and proposed a solution [1].  While the proposed patch does address the
specific issue, there are other related issues in command line processing.
As hugetlbfs evolved, updates to command line processing have been made to
meet immediate needs and not necessarily in a coordinated manner.  The result
is that some processing is done in arch specific code, some is done in arch
independent code and coordination is problematic.  Semantics can vary between
architectures.

The patch series does the following:
- Define arch specific arch_hugetlb_valid_size routine used to validate
  passed huge page sizes.
- Move hugepagesz= command line parsing out of arch specific code and into
  an arch independent routine.
- Clean up command line processing to follow desired semantics and
  document those semantics.

[1] https://lore.kernel.org/linux-mm/20200305033014.1152-1-longpe...@huawei.com

Mike Kravetz (4):
  hugetlbfs: add arch_hugetlb_valid_size
  hugetlbfs: move hugepagesz= parsing to arch independent code
  hugetlbfs: remove hugetlb_add_hstate() warning for existing hstate
  hugetlbfs: clean up command line processing

 .../admin-guide/kernel-parameters.txt |  40 ++--
 Documentation/admin-guide/mm/hugetlbpage.rst  |  35 
 arch/arm64/mm/hugetlbpage.c   |  30 +--
 arch/powerpc/mm/hugetlbpage.c |  30 +--
 arch/riscv/mm/hugetlbpage.c   |  24 +--
 arch/s390/mm/hugetlbpage.c|  24 +--
 arch/sparc/mm/init_64.c   |  43 +---
 arch/x86/mm/hugetlbpage.c |  23 +--
 include/linux/hugetlb.h   |   2 +-
 mm/hugetlb.c  | 190 +++---
 10 files changed, 271 insertions(+), 170 deletions(-)

-- 
2.25.2

Re: remove set_fs calls from the exec and coredump code v2

2020-04-17 Thread Eric W. Biederman

Christoph Hellwig  writes:

> Hi all,
>
> this series gets rid of playing with the address limit in the exec and
> coredump code.  Most of this was fairly trivial, the biggest changes are
> those to the spufs coredump code.
>
> Changes since v1:
>  - properly spell NUL
>  - properly handle the compat siginfo case in ELF coredumps

Quick question is exec from a kernel thread within the scope of what you
are looking at?

There is a set_fs(USER_DS) in flush_old_exec whose sole purpose appears
to be to allow exec from kernel threads.  Where the kernel threads
run with set_fs(KERNEL_DS) until they call exec.

Eric

[PATCH 2/2] signal: Remove the set_fs in binfmt_elf.c:fill_siginfo_note

2020-04-17 Thread Eric W. Biederman



The code in binfmt_elf.c is differnt from the rest of the code that
processes siginfo, as it sends siginfo from a kernel buffer to a file
rather than from kernel memory to userspace buffers.  To remove it's
use of set_fs the code needs some different siginfo helpers.

Add the helper copy_siginfo_to_external to copy from the kernel's
internal siginfo layout to a buffer in the siginfo layout that
userspace expects.

Modify fill_siginfo_note to use copy_siginfo_to_external instead of
set_fs and copy_siginfo_to_user.

Update compat_binfmt_elf.c to use the previously added
copy_siginfo_to_external32 to handle the compat case.

Signed-off-by: "Eric W. Biederman" 
---
 fs/binfmt_elf.c| 5 +
 fs/compat_binfmt_elf.c | 2 +-
 include/linux/signal.h | 7 +++
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 13f25e241ac4..a1f57e20c3cf 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1556,10 +1556,7 @@ static void fill_auxv_note(struct memelfnote *note, 
struct mm_struct *mm)
 static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t 
*csigdata,
const kernel_siginfo_t *siginfo)
 {
-   mm_segment_t old_fs = get_fs();
-   set_fs(KERNEL_DS);
-   copy_siginfo_to_user((user_siginfo_t __user *) csigdata, siginfo);
-   set_fs(old_fs);
+   copy_siginfo_to_external(csigdata, siginfo);
fill_note(note, "CORE", NT_SIGINFO, sizeof(*csigdata), csigdata);
 }
 
diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c
index aaad4ca1217e..fa0e24e1b726 100644
--- a/fs/compat_binfmt_elf.c
+++ b/fs/compat_binfmt_elf.c
@@ -39,7 +39,7 @@
  */
 #define user_long_tcompat_long_t
 #define user_siginfo_t compat_siginfo_t
-#define copy_siginfo_to_user   copy_siginfo_to_user32
+#define copy_siginfo_to_external   copy_siginfo_to_external32
 
 /*
  * The machine-dependent core note format types are defined in 
elfcore-compat.h,
diff --git a/include/linux/signal.h b/include/linux/signal.h
index 05bacd2ab135..c1796321cadb 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -24,6 +24,13 @@ static inline void clear_siginfo(kernel_siginfo_t *info)
 
 #define SI_EXPANSION_SIZE (sizeof(struct siginfo) - sizeof(struct 
kernel_siginfo))
 
+static inline void copy_siginfo_to_external(siginfo_t *to,
+   const kernel_siginfo_t *from)
+{
+   memcpy(to, from, sizeof(*from));
+   memset(((char *)to) + sizeof(struct kernel_siginfo), 0, 
SI_EXPANSION_SIZE);
+}
+
 int copy_siginfo_to_user(siginfo_t __user *to, const kernel_siginfo_t *from);
 int copy_siginfo_from_user(kernel_siginfo_t *to, const siginfo_t __user *from);
 
-- 
2.25.0

Re: [PATCH 4/8] binfmt_elf: open code copy_siginfo_to_user to kernelspace buffer

2020-04-17 Thread Arnd Bergmann

On Fri, Apr 17, 2020 at 8:13 PM Eric W. Biederman  wrote:
>
> Christoph Hellwig  writes:
>
> > On Wed, Apr 15, 2020 at 10:20:11AM +0200, Arnd Bergmann wrote:
> >> > I'd rather keep it out of this series and to
> >> > an interested party.  Then again x32 doesn't seem to have a whole lot
> >> > of interested parties..
> >>
> >> Fine with me. It's on my mental list of things that we want to kill off
> >> eventually as soon as the remaining users stop replying to questions
> >> about it.
> >>
> >> In fact I should really turn that into a properly maintained list in
> >> Documentation/... that contains any options that someone has
> >> asked about removing in the past, along with the reasons for keeping
> >> it around and a time at which we should ask about it again.
> >
> > To the newly added x86 maintainers:  Arnd brought up the point that
> > elf_core_dump writes the ABI siginfo format into the core dump. That
> > format differs for i386 vs x32.  Is there any good way to find out
> > which is the right format when are not in a syscall?
> >
> > As far a I can tell x32 vs i386 just seems to be based around what
> > syscall table was used for the current syscall, but core dumps aren't
> > always in syscall context.
>
> I don't think this matters.  The i386 and x32 signal structures
> only differ for SIGCHLD.  The SIGCHLD signal does cause coredumps.
> So as long as we get the 32bit vs 64bit distinct correct all should be
> well.

Ok, makes sense. Thanks for taking a look into this!

  Arnd

Re: [RFC PATCH] powerpc/64/signal: balance return predictor stack in signal trampoline

2020-04-17 Thread Alan Modra

On Fri, Apr 17, 2020 at 07:17:47PM +1000, Nicholas Piggin wrote:
> I don't know much about dwarf, gdb still seems to recognize the signal
> frame and unwind properly if I break inside a signal handler.

Yes, the dwarf unwind info still looks good.  The commented out dwarf
near the end of sigtramp.S should probably go.  At least if you really
can't take an async signal in the trampoline (a kernel question, not
anything to do with gcc support of async signals as the comment
wrongly says).  If you *can* take an async signal at some point past
the trampoline addi, then delete the comment and uncomment the code.
Note that the advance_loc there bitrotted ever since the nop was added
before the trampoline, so you'd need to change that to an advance_loc
that moves from .Lsigrt_start to immediately after the addi, ie. 0x42.

-- 
Alan Modra
Australia Development Lab, IBM

Re: [PATCH] powerpc/pseries: Make vio and ibmebus initcalls pseries specific

2020-04-17 Thread Tyrel Datwyler

On 4/16/20 9:07 PM, Oliver O'Halloran wrote:
> The vio and ibmebus buses are used for pseries specific paravirtualised
> devices and currently they're initialised by the generic initcall types.
> This is mostly fine, but it can result in some nuisance errors in dmesg
> when booting on PowerNV on some OSes, e.g.
> 
> [2.984439] synth uevent: /devices/vio: failed to send uevent
> [2.984442] vio vio: uevent: failed to send synthetic uevent
> [   17.968551] synth uevent: /devices/vio: failed to send uevent
> [   17.968554] vio vio: uevent: failed to send synthetic uevent
> 
> We don't see anything similar for the ibmebus because that depends on
> !CONFIG_LITTLE_ENDIAN.
> 
> This patch squashes those by switching to using machine_*_initcall() so the 
> bus
> type is only registered when the kernel is running on a pseries machine.
> 
> Signed-off-by: Oliver O'Halloran 
> ---

Reviewed-by: Tyrel Datwyler

Re: [PATCH V2] vhost: do not enable VHOST_MENU by default

2020-04-17 Thread Michael S. Tsirkin

On Fri, Apr 17, 2020 at 04:51:19PM +0800, Jason Wang wrote:
> 
> On 2020/4/17 下午4:46, Michael S. Tsirkin wrote:
> > On Fri, Apr 17, 2020 at 04:39:49PM +0800, Jason Wang wrote:
> > > On 2020/4/17 下午4:29, Michael S. Tsirkin wrote:
> > > > On Fri, Apr 17, 2020 at 03:36:52PM +0800, Jason Wang wrote:
> > > > > On 2020/4/17 下午2:33, Michael S. Tsirkin wrote:
> > > > > > On Fri, Apr 17, 2020 at 11:12:14AM +0800, Jason Wang wrote:
> > > > > > > On 2020/4/17 上午6:55, Michael S. Tsirkin wrote:
> > > > > > > > On Wed, Apr 15, 2020 at 10:43:56AM +0800, Jason Wang wrote:
> > > > > > > > > We try to keep the defconfig untouched after decoupling 
> > > > > > > > > CONFIG_VHOST
> > > > > > > > > out of CONFIG_VIRTUALIZATION in commit 20c384f1ea1a
> > > > > > > > > ("vhost: refine vhost and vringh kconfig") by enabling 
> > > > > > > > > VHOST_MENU by
> > > > > > > > > default. Then the defconfigs can keep enabling 
> > > > > > > > > CONFIG_VHOST_NET
> > > > > > > > > without the caring of CONFIG_VHOST.
> > > > > > > > > 
> > > > > > > > > But this will leave a "CONFIG_VHOST_MENU=y" in all defconfigs 
> > > > > > > > > and even
> > > > > > > > > for the ones that doesn't want vhost. So it actually shifts 
> > > > > > > > > the
> > > > > > > > > burdens to the maintainers of all other to add 
> > > > > > > > > "CONFIG_VHOST_MENU is
> > > > > > > > > not set". So this patch tries to enable CONFIG_VHOST 
> > > > > > > > > explicitly in
> > > > > > > > > defconfigs that enables CONFIG_VHOST_NET and 
> > > > > > > > > CONFIG_VHOST_VSOCK.
> > > > > > > > > 
> > > > > > > > > Acked-by: Christian Borntraeger   
> > > > > > > > > (s390)
> > > > > > > > > Acked-by: Michael Ellerman   (powerpc)
> > > > > > > > > Cc: Thomas Bogendoerfer
> > > > > > > > > Cc: Benjamin Herrenschmidt
> > > > > > > > > Cc: Paul Mackerras
> > > > > > > > > Cc: Michael Ellerman
> > > > > > > > > Cc: Heiko Carstens
> > > > > > > > > Cc: Vasily Gorbik
> > > > > > > > > Cc: Christian Borntraeger
> > > > > > > > > Reported-by: Geert Uytterhoeven
> > > > > > > > > Signed-off-by: Jason Wang
> > > > > > > > I rebased this on top of OABI fix since that
> > > > > > > > seems more orgent to fix.
> > > > > > > > Pushed to my vhost branch pls take a look and
> > > > > > > > if possible test.
> > > > > > > > Thanks!
> > > > > > > I test this patch by generating the defconfigs that wants 
> > > > > > > vhost_net or
> > > > > > > vhost_vsock. All looks fine.
> > > > > > > 
> > > > > > > But having CONFIG_VHOST_DPN=y may end up with the similar 
> > > > > > > situation that
> > > > > > > this patch want to address.
> > > > > > > Maybe we can let CONFIG_VHOST depends on !ARM || AEABI then add 
> > > > > > > another
> > > > > > > menuconfig for VHOST_RING and do something similar?
> > > > > > > 
> > > > > > > Thanks
> > > > > > Sorry I don't understand. After this patch CONFIG_VHOST_DPN is just
> > > > > > an internal variable for the OABI fix. I kept it separate
> > > > > > so it's easy to revert for 5.8. Yes we could squash it into
> > > > > > VHOST directly but I don't see how that changes logic at all.
> > > > > Sorry for being unclear.
> > > > > 
> > > > > I meant since it was enabled by default, "CONFIG_VHOST_DPN=y" will be 
> > > > > left
> > > > > in the defconfigs.
> > > > But who cares?
> > > FYI, please seehttps://www.spinics.net/lists/kvm/msg212685.html
> > The complaint was not about the symbol IIUC.  It was that we caused
> > everyone to build vhost unless they manually disabled it.
> 
> 
> There could be some misunderstanding here. I thought it's somehow similar: a
> CONFIG_VHOST_MENU=y will be left in the defconfigs even if CONFIG_VHOST is
> not set.
> 
> Thanks
> 

BTW do entries with no prompt actually appear in defconfig?

> >

[PATCH v6 4/9] crypto/nx: Initialize coproc entry with kzalloc

2020-04-17 Thread Haren Myneni



coproc entry is initialized during NX probe on power9, but not on P8.
nx842_delete_coprocs() is used for both and frees receive window if it
is allocated. Getting crash for rmmod on P8 since coproc->vas.rxwin
is not initialized.

This patch replaces kmalloc with kzalloc in nx842_powernv_probe()

Signed-off-by: Haren Myneni 
Acked-by: Herbert Xu 
---
 drivers/crypto/nx/nx-842-powernv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/crypto/nx/nx-842-powernv.c 
b/drivers/crypto/nx/nx-842-powernv.c
index c037a24..8e63326 100644
--- a/drivers/crypto/nx/nx-842-powernv.c
+++ b/drivers/crypto/nx/nx-842-powernv.c
@@ -922,7 +922,7 @@ static int __init nx842_powernv_probe(struct device_node 
*dn)
return -EINVAL;
}
 
-   coproc = kmalloc(sizeof(*coproc), GFP_KERNEL);
+   coproc = kzalloc(sizeof(*coproc), GFP_KERNEL);
if (!coproc)
return -ENOMEM;
 
-- 
1.8.3.1

[PATCH v6 6/9] crypto/nx: Make enable code generic to add new GZIP compression type

2020-04-17 Thread Haren Myneni



Make setup and enable code generic to support new GZIP compression type.
Changed nx842 reference to nx and moved some code to new functions.
Functionality is not changed except sparse warning fix - setting NULL
instead of 0 for per_cpu send window in nx_delete_coprocs().

Signed-off-by: Haren Myneni 
Acked-by: Herbert Xu 
---
 drivers/crypto/nx/nx-common-powernv.c | 161 +-
 1 file changed, 101 insertions(+), 60 deletions(-)

diff --git a/drivers/crypto/nx/nx-common-powernv.c 
b/drivers/crypto/nx/nx-common-powernv.c
index f42881f..82dfa60 100644
--- a/drivers/crypto/nx/nx-common-powernv.c
+++ b/drivers/crypto/nx/nx-common-powernv.c
@@ -40,9 +40,9 @@ struct nx842_workmem {
char padding[WORKMEM_ALIGN]; /* unused, to allow alignment */
 } __packed __aligned(WORKMEM_ALIGN);
 
-struct nx842_coproc {
+struct nx_coproc {
unsigned int chip_id;
-   unsigned int ct;
+   unsigned int ct;/* Can be 842 or GZIP high/normal*/
unsigned int ci;/* Coprocessor instance, used with icswx */
struct {
struct vas_window *rxwin;
@@ -58,9 +58,15 @@ struct nx842_coproc {
 static DEFINE_PER_CPU(struct vas_window *, cpu_txwin);
 
 /* no cpu hotplug on powernv, so this list never changes after init */
-static LIST_HEAD(nx842_coprocs);
+static LIST_HEAD(nx_coprocs);
 static unsigned int nx842_ct;  /* used in icswx function */
 
+/*
+ * Using same values as in skiboot or coprocessor type representing
+ * in NX workbook.
+ */
+#define NX_CT_842  (3)
+
 static int (*nx842_powernv_exec)(const unsigned char *in,
unsigned int inlen, unsigned char *out,
unsigned int *outlenp, void *workmem, int fc);
@@ -666,15 +672,15 @@ static int nx842_powernv_decompress(const unsigned char 
*in, unsigned int inlen,
  wmem, CCW_FC_842_DECOMP_CRC);
 }
 
-static inline void nx842_add_coprocs_list(struct nx842_coproc *coproc,
+static inline void nx_add_coprocs_list(struct nx_coproc *coproc,
int chipid)
 {
coproc->chip_id = chipid;
INIT_LIST_HEAD(>list);
-   list_add(>list, _coprocs);
+   list_add(>list, _coprocs);
 }
 
-static struct vas_window *nx842_alloc_txwin(struct nx842_coproc *coproc)
+static struct vas_window *nx_alloc_txwin(struct nx_coproc *coproc)
 {
struct vas_window *txwin = NULL;
struct vas_tx_win_attr txattr;
@@ -704,9 +710,9 @@ static struct vas_window *nx842_alloc_txwin(struct 
nx842_coproc *coproc)
  * cpu_txwin is used in copy/paste operation for each compression /
  * decompression request.
  */
-static int nx842_open_percpu_txwins(void)
+static int nx_open_percpu_txwins(void)
 {
-   struct nx842_coproc *coproc, *n;
+   struct nx_coproc *coproc, *n;
unsigned int i, chip_id;
 
for_each_possible_cpu(i) {
@@ -714,17 +720,18 @@ static int nx842_open_percpu_txwins(void)
 
chip_id = cpu_to_chip_id(i);
 
-   list_for_each_entry_safe(coproc, n, _coprocs, list) {
+   list_for_each_entry_safe(coproc, n, _coprocs, list) {
/*
 * Kernel requests use only high priority FIFOs. So
 * open send windows for these FIFOs.
+* GZIP is not supported in kernel right now.
 */
 
if (coproc->ct != VAS_COP_TYPE_842_HIPRI)
continue;
 
if (coproc->chip_id == chip_id) {
-   txwin = nx842_alloc_txwin(coproc);
+   txwin = nx_alloc_txwin(coproc);
if (IS_ERR(txwin))
return PTR_ERR(txwin);
 
@@ -743,13 +750,28 @@ static int nx842_open_percpu_txwins(void)
return 0;
 }
 
+static int __init nx_set_ct(struct nx_coproc *coproc, const char *priority,
+   int high, int normal)
+{
+   if (!strcmp(priority, "High"))
+   coproc->ct = high;
+   else if (!strcmp(priority, "Normal"))
+   coproc->ct = normal;
+   else {
+   pr_err("Invalid RxFIFO priority value\n");
+   return -EINVAL;
+   }
+
+   return 0;
+}
+
 static int __init vas_cfg_coproc_info(struct device_node *dn, int chip_id,
-   int vasid, int *ct)
+   int vasid, int type, int *ct)
 {
struct vas_window *rxwin = NULL;
struct vas_rx_win_attr rxattr;
-   struct nx842_coproc *coproc;
u32 lpid, pid, tid, fifo_size;
+   struct nx_coproc *coproc;
u64 rx_fifo;
const char *priority;
int ret;
@@ -794,15 +816,12 @@ static int __init vas_cfg_coproc_info(struct device_node 
*dn, int chip_id,
if (!coproc)
return -ENOMEM;

[PATCH v6 9/9] Documentation/powerpc: VAS API

2020-04-17 Thread Haren Myneni



Power9 introduced Virtual Accelerator Switchboard (VAS) which allows
userspace to communicate with Nest Accelerator (NX) directly. But
kernel has to establish channel to NX for userspace. This document
describes user space API that application can use to establish
communication channel.

Signed-off-by: Sukadev Bhattiprolu 
Signed-off-by: Haren Myneni 
---
 Documentation/powerpc/index.rst   |   1 +
 Documentation/powerpc/vas-api.rst | 292 ++
 2 files changed, 293 insertions(+)
 create mode 100644 Documentation/powerpc/vas-api.rst

diff --git a/Documentation/powerpc/index.rst b/Documentation/powerpc/index.rst
index 0d45f0f..afe2d5e 100644
--- a/Documentation/powerpc/index.rst
+++ b/Documentation/powerpc/index.rst
@@ -30,6 +30,7 @@ powerpc
 syscall64-abi
 transactional_memory
 ultravisor
+vas-api
 
 .. only::  subproject and html
 
diff --git a/Documentation/powerpc/vas-api.rst 
b/Documentation/powerpc/vas-api.rst
new file mode 100644
index 000..1217c2f
--- /dev/null
+++ b/Documentation/powerpc/vas-api.rst
@@ -0,0 +1,292 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. _VAS-API:
+
+===
+Virtual Accelerator Switchboard (VAS) userspace API
+===
+
+Introduction
+
+
+Power9 processor introduced Virtual Accelerator Switchboard (VAS) which
+allows both userspace and kernel communicate to co-processor
+(hardware accelerator) referred to as the Nest Accelerator (NX). The NX
+unit comprises of one or more hardware engines or co-processor types
+such as 842 compression, GZIP compression and encryption. On power9,
+userspace applications will have access to only GZIP Compression engine
+which supports ZLIB and GZIP compression algorithms in the hardware.
+
+To communicate with NX, kernel has to establish a channel or window and
+then requests can be submitted directly without kernel involvement.
+Requests to the GZIP engine must be formatted as a co-processor Request
+Block (CRB) and these CRBs must be submitted to the NX using COPY/PASTE
+instructions to paste the CRB to hardware address that is associated with
+the engine's request queue.
+
+The GZIP engine provides two priority levels of requests: Normal and
+High. Only Normal requests are supported from userspace right now.
+
+This document explains userspace API that is used to interact with
+kernel to setup channel / window which can be used to send compression
+requests directly to NX accelerator.
+
+
+Overview
+
+
+Application access to the GZIP engine is provided through
+/dev/crypto/nx-gzip device node implemented by the VAS/NX device driver.
+An application must open the /dev/crypto/nx-gzip device to obtain a file
+descriptor (fd). Then should issue VAS_TX_WIN_OPEN ioctl with this fd to
+establish connection to the engine. It means send window is opened on GZIP
+engine for this process. Once a connection is established, the application
+should use the mmap() system call to map the hardware address of engine's
+request queue into the application's virtual address space.
+
+The application can then submit one or more requests to the the engine by
+using copy/paste instructions and pasting the CRBs to the virtual address
+(aka paste_address) returned by mmap(). User space can close the
+established connection or send window by closing the file descriptior
+(close(fd)) or upon the process exit.
+
+Note that applications can send several requests with the same window or
+can establish multiple windows, but one window for each file descriptor.
+
+Following sections provide additional details and references about the
+individual steps.
+
+NX-GZIP Device Node
+===
+
+There is one /dev/crypto/nx-gzip node in the system and it provides
+access to all GZIP engines in the system. The only valid operations on
+/dev/crypto/nx-gzip are:
+
+   * open() the device for read and write.
+   * issue VAS_TX_WIN_OPEN ioctl
+   * mmap() the engine's request queue into application's virtual
+ address space (i.e. get a paste_address for the co-processor
+ engine).
+   * close the device node.
+
+Other file operations on this device node are undefined.
+
+Note that the copy and paste operations go directly to the hardware and
+do not go through this device. Refer COPY/PASTE document for more
+details.
+
+Although a system may have several instances of the NX co-processor
+engines (typically, one per P9 chip) there is just one
+/dev/crypto/nx-gzip device node in the system. When the nx-gzip device
+node is opened, Kernel opens send window on a suitable instance of NX
+accelerator. It finds CPU on which the user process is executing and
+determine the NX instance for the corresponding chip on which this CPU
+belongs.
+
+Applications may chose a specific instance of the NX co-processor using
+the vas_id field in the VAS_TX_WIN_OPEN ioctl as detailed below.
+
+A userspace

[RFC PATCH] powerpc/64/signal: balance return predictor stack in signal trampoline

2020-04-17 Thread Nicholas Piggin

Returning from an interrupt or syscall to a signal handler currently begins
execution directly at the handler's entry point, with LR set to the address
of the sigreturn trampoline. When the signal handler function returns, it
runs the trampoline. It looks like this:

# interrupt at user address xyz
# kernel stuff... signal is raised
rfid
# void handler(int sig)
addis 2,12,.TOC.-.LCF0@ha
addi 2,2,.TOC.-.LCF0@l
mflr 0
std 0,16(1)
stdu 1,-96(1)
# handler stuff
ld 0,16(1)
mtlr 0
blr
# __kernel_sigtramp_rt64
addir1,r1,__SIGNAL_FRAMESIZE
li  r0,__NR_rt_sigreturn
sc
# kernel executes rt_sigreturn
rfid
# back to user address xyz

Note the blr with no matching bl. This can corrupt the return predictor.

Solve this by instead resuming execution at the signal trampoline
which then calls the signal handler.

I don't know much about dwarf, gdb still seems to recognize the signal
frame and unwind properly if I break inside a signal handler.

qtrace-tools link_stack checker confirms the entire user/kernel/vdso
cycle is balanced after this patch, whereas it's not upstream
Performance seems to be in the noise on my old POWER9, but I don't
quite know where it's at with spectre mitigations.
---
 arch/powerpc/include/asm/ppc-opcode.h |  1 +
 arch/powerpc/kernel/signal_64.c   | 20 +++-
 arch/powerpc/kernel/vdso64/sigtramp.S | 13 +
 3 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/include/asm/ppc-opcode.h 
b/arch/powerpc/include/asm/ppc-opcode.h
index c1df75edde44..747b37f1ce09 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -329,6 +329,7 @@
 #define PPC_INST_BLR   0x4e800020
 #define PPC_INST_BLRL  0x4e800021
 #define PPC_INST_BCTR  0x4e800420
+#define PPC_INST_BCTRL 0x4e800421
 #define PPC_INST_MULLD 0x7c0001d2
 #define PPC_INST_MULLW 0x7c0001d6
 #define PPC_INST_MULHWU0x7c16
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index adfde59cf4ba..6c17e2456ccc 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -41,7 +41,7 @@
 #define FP_REGS_SIZE   sizeof(elf_fpregset_t)
 
 #define TRAMP_TRACEBACK3
-#define TRAMP_SIZE 6
+#define TRAMP_SIZE 7
 
 /*
  * When we have signals to deliver, we set up on the user stack,
@@ -603,13 +603,15 @@ static long setup_trampoline(unsigned int syscall, 
unsigned int __user *tramp)
int i;
long err = 0;
 
+   /* bctrl # call the handler */
+   err |= __put_user(PPC_INST_BCTRL, [0]);
/* addi r1, r1, __SIGNAL_FRAMESIZE  # Pop the dummy stackframe */
err |= __put_user(PPC_INST_ADDI | __PPC_RT(R1) | __PPC_RA(R1) |
- (__SIGNAL_FRAMESIZE & 0x), [0]);
+ (__SIGNAL_FRAMESIZE & 0x), [1]);
/* li r0, __NR_[rt_]sigreturn| */
-   err |= __put_user(PPC_INST_ADDI | (syscall & 0x), [1]);
+   err |= __put_user(PPC_INST_ADDI | (syscall & 0x), [2]);
/* sc */
-   err |= __put_user(PPC_INST_SC, [2]);
+   err |= __put_user(PPC_INST_SC, [3]);
 
/* Minimal traceback info */
for (i=TRAMP_TRACEBACK; i < TRAMP_SIZE ;i++)
@@ -867,12 +869,12 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t 
*set,
 
/* Set up to return from userspace. */
if (vdso64_rt_sigtramp && tsk->mm->context.vdso_base) {
-   regs->link = tsk->mm->context.vdso_base + vdso64_rt_sigtramp;
+   regs->nip = tsk->mm->context.vdso_base + vdso64_rt_sigtramp;
} else {
err |= setup_trampoline(__NR_rt_sigreturn, >tramp[0]);
if (err)
goto badframe;
-   regs->link = (unsigned long) >tramp[0];
+   regs->nip = (unsigned long) >tramp[0];
}
 
/* Allocate a dummy caller frame for the signal handler. */
@@ -881,8 +883,8 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
 
/* Set up "regs" so we "return" to the signal handler. */
if (is_elf2_task()) {
-   regs->nip = (unsigned long) ksig->ka.sa.sa_handler;
-   regs->gpr[12] = regs->nip;
+   regs->ctr = (unsigned long) ksig->ka.sa.sa_handler;
+   regs->gpr[12] = regs->ctr;
} else {
/* Handler is *really* a pointer to the function descriptor for
 * the signal routine.  The first entry in the function
@@ -892,7 +894,7 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
func_descr_t __user *funct_desc_ptr =
(func_descr_t __user *) ksig->ka.sa.sa_handler;
 
-   err |= get_user(regs->nip, _desc_ptr->entry);
+   err |= get_user(regs->ctr,

Re: [PATCH V2] vhost: do not enable VHOST_MENU by default

2020-04-17 Thread Jason Wang




On 2020/4/17 下午5:25, Geert Uytterhoeven wrote:

Hi Michael,

On Fri, Apr 17, 2020 at 10:57 AM Michael S. Tsirkin  wrote:

On Fri, Apr 17, 2020 at 04:51:19PM +0800, Jason Wang wrote:

On 2020/4/17 下午4:46, Michael S. Tsirkin wrote:

On Fri, Apr 17, 2020 at 04:39:49PM +0800, Jason Wang wrote:

On 2020/4/17 下午4:29, Michael S. Tsirkin wrote:

On Fri, Apr 17, 2020 at 03:36:52PM +0800, Jason Wang wrote:

On 2020/4/17 下午2:33, Michael S. Tsirkin wrote:

On Fri, Apr 17, 2020 at 11:12:14AM +0800, Jason Wang wrote:

On 2020/4/17 上午6:55, Michael S. Tsirkin wrote:

On Wed, Apr 15, 2020 at 10:43:56AM +0800, Jason Wang wrote:

We try to keep the defconfig untouched after decoupling CONFIG_VHOST
out of CONFIG_VIRTUALIZATION in commit 20c384f1ea1a
("vhost: refine vhost and vringh kconfig") by enabling VHOST_MENU by
default. Then the defconfigs can keep enabling CONFIG_VHOST_NET
without the caring of CONFIG_VHOST.

But this will leave a "CONFIG_VHOST_MENU=y" in all defconfigs and even
for the ones that doesn't want vhost. So it actually shifts the
burdens to the maintainers of all other to add "CONFIG_VHOST_MENU is
not set". So this patch tries to enable CONFIG_VHOST explicitly in
defconfigs that enables CONFIG_VHOST_NET and CONFIG_VHOST_VSOCK.

Acked-by: Christian Borntraeger(s390)
Acked-by: Michael Ellerman(powerpc)
Cc: Thomas Bogendoerfer
Cc: Benjamin Herrenschmidt
Cc: Paul Mackerras
Cc: Michael Ellerman
Cc: Heiko Carstens
Cc: Vasily Gorbik
Cc: Christian Borntraeger
Reported-by: Geert Uytterhoeven
Signed-off-by: Jason Wang

I rebased this on top of OABI fix since that
seems more orgent to fix.
Pushed to my vhost branch pls take a look and
if possible test.
Thanks!

I test this patch by generating the defconfigs that wants vhost_net or
vhost_vsock. All looks fine.

But having CONFIG_VHOST_DPN=y may end up with the similar situation that
this patch want to address.
Maybe we can let CONFIG_VHOST depends on !ARM || AEABI then add another
menuconfig for VHOST_RING and do something similar?

Thanks

Sorry I don't understand. After this patch CONFIG_VHOST_DPN is just
an internal variable for the OABI fix. I kept it separate
so it's easy to revert for 5.8. Yes we could squash it into
VHOST directly but I don't see how that changes logic at all.

Sorry for being unclear.

I meant since it was enabled by default, "CONFIG_VHOST_DPN=y" will be left
in the defconfigs.

But who cares?

FYI, please seehttps://www.spinics.net/lists/kvm/msg212685.html

The complaint was not about the symbol IIUC.  It was that we caused
everyone to build vhost unless they manually disabled it.

There could be some misunderstanding here. I thought it's somehow similar: a
CONFIG_VHOST_MENU=y will be left in the defconfigs even if CONFIG_VHOST is
not set.

Thanks

Hmm. So looking at Documentation/kbuild/kconfig-language.rst :

 Things that merit "default y/m" include:

 a) A new Kconfig option for something that used to always be built
should be "default y".

 b) A new gatekeeping Kconfig option that hides/shows other Kconfig
options (but does not generate any code of its own), should be
"default y" so people will see those other options.

 c) Sub-driver behavior or similar options for a driver that is
"default n". This allows you to provide sane defaults.


So it looks like VHOST_MENU is actually matching rule b).
So what's the problem we are trying to solve with this patch, exactly?

Geert could you clarify pls?

I can confirm VHOST_MENU is matching rule b), so it is safe to always
enable it.

Gr{oetje,eeting}s,

 Geert



Right, so I think we can drop this patch.

Thanks

Re: [PATCH V2] vhost: do not enable VHOST_MENU by default

2020-04-17 Thread Michael S. Tsirkin

On Fri, Apr 17, 2020 at 05:33:56PM +0800, Jason Wang wrote:
> 
> On 2020/4/17 下午5:01, Michael S. Tsirkin wrote:
> > > There could be some misunderstanding here. I thought it's somehow 
> > > similar: a
> > > CONFIG_VHOST_MENU=y will be left in the defconfigs even if CONFIG_VHOST is
> > > not set.
> > > 
> > > Thanks
> > > 
> > BTW do entries with no prompt actually appear in defconfig?
> > 
> 
> Yes. I can see CONFIG_VHOST_DPN=y after make ARCH=m68k defconfig

You see it in .config right? So that's harmless right?

-- 
MST

Re: [PATCH v6 09/10] powerpc/64s: Implement KUAP for Radix MMU

2020-04-17 Thread Christophe Leroy





Le 18/04/2019 à 08:51, Michael Ellerman a écrit :

Kernel Userspace Access Prevention utilises a feature of the Radix MMU
which disallows read and write access to userspace addresses. By
utilising this, the kernel is prevented from accessing user data from
outside of trusted paths that perform proper safety checks, such as
copy_{to/from}_user() and friends.

Userspace access is disabled from early boot and is only enabled when
performing an operation like copy_{to/from}_user(). The register that
controls this (AMR) does not prevent userspace from accessing itself,
so there is no need to save and restore when entering and exiting
userspace.

When entering the kernel from the kernel we save AMR and if it is not
blocking user access (because eg. we faulted doing a user access) we
reblock user access for the duration of the exception (ie. the page
fault) and then restore the AMR when returning back to the kernel.

This feature can be tested by using the lkdtm driver (CONFIG_LKDTM=y)
and performing the following:

   # (echo ACCESS_USERSPACE) > [debugfs]/provoke-crash/DIRECT

If enabled, this should send SIGSEGV to the thread.

We also add paranoid checking of AMR in switch and syscall return
under CONFIG_PPC_KUAP_DEBUG.

Co-authored-by: Michael Ellerman 
Signed-off-by: Russell Currey 
Signed-off-by: Michael Ellerman 


[...]


diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 15c67d2c0534..7cc25389c6bd 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S


[...]


@@ -594,6 +606,8 @@ _GLOBAL(_switch)
std r23,_CCR(r1)
std r1,KSP(r3)  /* Set old stack pointer */
  
+	kuap_check_amr r9, r10

+
FLUSH_COUNT_CACHE
  
  	/*


I'm having a problem with this check. As you know I implemented the 
exact same check in _switch() in entry_32.S. After adding some printk 
inside an user_access_begin()/user_access_end() section, I started to 
get valid user accesses blocked by KUAP. Then I activated 
CONFIG_PPC_KUAP_DEBUG which led me to WARNINGs on this check.


This is due to schedule() being called by printk() inside the section 
where user access is unlocked. How is this supposed to work ? When 
scheduling via the decrementer interrupt, the value of the KUAP register 
is saved on stack at interrupt entry and the one from the new task is 
restored at interrupt exit, but I can't see where it is done when 
schedule() is called directly.


Did I miss something when implementing KUAP for PPC32 ?

Christophe

Re: [PATCH v6 09/10] powerpc/64s: Implement KUAP for Radix MMU

2020-04-17 Thread Nicholas Piggin

Excerpts from Christophe Leroy's message of April 17, 2020 8:10 pm:
> 
> 
> Le 18/04/2019 à 08:51, Michael Ellerman a écrit :
>> Kernel Userspace Access Prevention utilises a feature of the Radix MMU
>> which disallows read and write access to userspace addresses. By
>> utilising this, the kernel is prevented from accessing user data from
>> outside of trusted paths that perform proper safety checks, such as
>> copy_{to/from}_user() and friends.
>> 
>> Userspace access is disabled from early boot and is only enabled when
>> performing an operation like copy_{to/from}_user(). The register that
>> controls this (AMR) does not prevent userspace from accessing itself,
>> so there is no need to save and restore when entering and exiting
>> userspace.
>> 
>> When entering the kernel from the kernel we save AMR and if it is not
>> blocking user access (because eg. we faulted doing a user access) we
>> reblock user access for the duration of the exception (ie. the page
>> fault) and then restore the AMR when returning back to the kernel.
>> 
>> This feature can be tested by using the lkdtm driver (CONFIG_LKDTM=y)
>> and performing the following:
>> 
>># (echo ACCESS_USERSPACE) > [debugfs]/provoke-crash/DIRECT
>> 
>> If enabled, this should send SIGSEGV to the thread.
>> 
>> We also add paranoid checking of AMR in switch and syscall return
>> under CONFIG_PPC_KUAP_DEBUG.
>> 
>> Co-authored-by: Michael Ellerman 
>> Signed-off-by: Russell Currey 
>> Signed-off-by: Michael Ellerman 
> 
> [...]
> 
>> diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
>> index 15c67d2c0534..7cc25389c6bd 100644
>> --- a/arch/powerpc/kernel/entry_64.S
>> +++ b/arch/powerpc/kernel/entry_64.S
> 
> [...]
> 
>> @@ -594,6 +606,8 @@ _GLOBAL(_switch)
>>  std r23,_CCR(r1)
>>  std r1,KSP(r3)  /* Set old stack pointer */
>>   
>> +kuap_check_amr r9, r10
>> +
>>  FLUSH_COUNT_CACHE
>>   
>>  /*
> 
> I'm having a problem with this check. As you know I implemented the 
> exact same check in _switch() in entry_32.S. After adding some printk 
> inside an user_access_begin()/user_access_end() section, I started to 
> get valid user accesses blocked by KUAP. Then I activated 
> CONFIG_PPC_KUAP_DEBUG which led me to WARNINGs on this check.
> 
> This is due to schedule() being called by printk() inside the section 
> where user access is unlocked. How is this supposed to work ? When 

Unlocked user access sections are supposed to be very constrained,
I think x86's objtool has a checker to verify nothing much gets
called. Printk shouldn't be.

I was hitting the same assertion and it was because the uaccess
macros were pulling lots of things into the user access region.

https://patchwork.ozlabs.org/project/linuxppc-dev/patch/20200407041245.600651-1-npig...@gmail.com/

If that doesn't solve your problem... then now is printk being
called with user access enabled?

Thanks,
Nick

[PATCH v5, 2/4] powerpc: sysdev: fix compile error for fsl_85xx_cache_sram

2020-04-17 Thread Wang Wenhu

Include linux/io.h into fsl_85xx_cache_sram.c to fix the
implicit-declaration compile error when building Cache-Sram.

arch/powerpc/sysdev/fsl_85xx_cache_sram.c: In function ‘instantiate_cache_sram’:
arch/powerpc/sysdev/fsl_85xx_cache_sram.c:97:26: error: implicit declaration of 
function ‘ioremap_coherent’; did you mean ‘bitmap_complement’? 
[-Werror=implicit-function-declaration]
  cache_sram->base_virt = ioremap_coherent(cache_sram->base_phys,
  ^~~~
  bitmap_complement
arch/powerpc/sysdev/fsl_85xx_cache_sram.c:97:24: error: assignment makes 
pointer from integer without a cast [-Werror=int-conversion]
  cache_sram->base_virt = ioremap_coherent(cache_sram->base_phys,
^
arch/powerpc/sysdev/fsl_85xx_cache_sram.c:123:2: error: implicit declaration of 
function ‘iounmap’; did you mean ‘roundup’? 
[-Werror=implicit-function-declaration]
  iounmap(cache_sram->base_virt);
  ^~~
  roundup
cc1: all warnings being treated as errors

Cc: Greg Kroah-Hartman 
Cc: Christophe Leroy 
Cc: Scott Wood 
Cc: Michael Ellerman 
Cc: linuxppc-dev@lists.ozlabs.org
Fixes: 6db92cc9d07d ("powerpc/85xx: add cache-sram support")
Reviewed-by: Christophe Leroy 
Signed-off-by: Wang Wenhu 
---
Changes since v1:
 * None
Changes since v2:
 * None
Changes since v3:
 * None
Changes since v4:
 * None
---
 arch/powerpc/sysdev/fsl_85xx_cache_sram.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/sysdev/fsl_85xx_cache_sram.c 
b/arch/powerpc/sysdev/fsl_85xx_cache_sram.c
index f6c665dac725..be3aef4229d7 100644
--- a/arch/powerpc/sysdev/fsl_85xx_cache_sram.c
+++ b/arch/powerpc/sysdev/fsl_85xx_cache_sram.c
@@ -17,6 +17,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "fsl_85xx_cache_ctlr.h"
 
-- 
2.17.1

[PATCH v5, 3/4] powerpc: sysdev: fix compile warning for fsl_85xx_cache_sram

2020-04-17 Thread Wang Wenhu

Function instantiate_cache_sram should not be linked into the init
section for its caller mpc85xx_l2ctlr_of_probe is none-__init.

Cc: Greg Kroah-Hartman 
Cc: Christophe Leroy 
Cc: Scott Wood 
Cc: Michael Ellerman 
Cc: linuxppc-dev@lists.ozlabs.org
Fixes: 6db92cc9d07d ("powerpc/85xx: add cache-sram support")
Reviewed-by: Christophe Leroy 
Signed-off-by: Wang Wenhu 

Warning information:
  MODPOST vmlinux.o
WARNING: modpost: vmlinux.o(.text+0x1e540): Section mismatch in reference from 
the function mpc85xx_l2ctlr_of_probe() to the function 
.init.text:instantiate_cache_sram()
The function mpc85xx_l2ctlr_of_probe() references
the function __init instantiate_cache_sram().
This is often because mpc85xx_l2ctlr_of_probe lacks a __init
annotation or the annotation of instantiate_cache_sram is wrong.
---
Changes since v1:
 * None
Changes since v2:
 * None
Changes since v3:
 * None
Changes since v4:
 * None
---
 arch/powerpc/sysdev/fsl_85xx_cache_sram.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/sysdev/fsl_85xx_cache_sram.c 
b/arch/powerpc/sysdev/fsl_85xx_cache_sram.c
index be3aef4229d7..3de5ac8382c0 100644
--- a/arch/powerpc/sysdev/fsl_85xx_cache_sram.c
+++ b/arch/powerpc/sysdev/fsl_85xx_cache_sram.c
@@ -68,7 +68,7 @@ void mpc85xx_cache_sram_free(void *ptr)
 }
 EXPORT_SYMBOL(mpc85xx_cache_sram_free);
 
-int __init instantiate_cache_sram(struct platform_device *dev,
+int instantiate_cache_sram(struct platform_device *dev,
struct sram_parameters sram_params)
 {
int ret = 0;
-- 
2.17.1

[PATCH 4/4] powerpc/powernv/pci: Sprinkle around some WARN_ON()s

2020-04-17 Thread Oliver O'Halloran

pnv_pci_ioda_configure_bus() should now only ever be called when a device is
added to the bus so add a WARN_ON() to the empty bus check. Similarly,
pnv_pci_ioda_setup_bus_PE() should only ever be called for an unconfigured PE,
so add a WARN_ON() for that case too.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 05436a9..627420c 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1139,7 +1139,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct 
pci_bus *bus, bool all)
 * We should reuse it instead of allocating a new one.
 */
pe_num = phb->ioda.pe_rmap[bus->number << 8];
-   if (pe_num != IODA_INVALID_PE) {
+   if (WARN_ON(pe_num != IODA_INVALID_PE)) {
pe = >ioda.pe_array[pe_num];
return NULL;
}
@@ -3224,7 +3224,7 @@ static void pnv_pci_configure_bus(struct pci_bus *bus)
dev_info(>dev, "Configuring PE for bus\n");
 
/* Don't assign PE to PCI bus, which doesn't have subordinate devices */
-   if (list_empty(>devices))
+   if (WARN_ON(list_empty(>devices)))
return;
 
/* Reserve PEs according to used M64 resources */
-- 
2.9.5

Re: [musl] Powerpc Linux 'scv' system call ABI proposal take 2

2020-04-17 Thread Florian Weimer

* Segher Boessenkool:

> On Thu, Apr 16, 2020 at 08:34:42PM -0400, Rich Felker wrote:
>> On Thu, Apr 16, 2020 at 06:02:35PM -0500, Segher Boessenkool wrote:
>> > On Thu, Apr 16, 2020 at 08:12:19PM +0200, Florian Weimer wrote:
>> > > > I think my choice would be just making the inline syscall be a single
>> > > > call insn to an asm source file that out-of-lines the loading of TOC
>> > > > pointer and call through it or branch based on hwcap so that it's not
>> > > > repeated all over the place.
>> > > 
>> > > I don't know how problematic control flow out of an inline asm is on
>> > > POWER.  But this is basically the -moutline-atomics approach.
>> > 
>> > Control flow out of inline asm (other than with "asm goto") is not
>> > allowed at all, just like on any other target (and will not work in
>> > practice, either -- just like on any other target).  But the suggestion
>> > was to use actual assembler code, not inline asm?
>> 
>> Calling it control flow out of inline asm is something of a misnomer.
>> The enclosing state is not discarded or altered; the asm statement
>> exits normally, reaching the next instruction in the enclosing
>> block/function as soon as the call from the asm statement returns,
>> with all register/clobber constraints satisfied.
>
> Ah.  That should always Just Work, then -- our ABIs guarantee you can.

After thinking about it, I agree: GCC will handle spilling of the link
register.  Branch-and-link instructions do not clobber the protected
zone, so no stack adjustment is needed (which would be problematic to
reflect in the unwind information).

Of course, the target function has to be written in assembler because
it must not use a regular stack frame.

[PATCH v6 1/9] powerpc/vas: Initialize window attributes for GZIP coprocessor type

2020-04-17 Thread Haren Myneni



Initialize send and receive window attributes for GZIP high and
normal priority types.

Signed-off-by: Haren Myneni 
---
 arch/powerpc/platforms/powernv/vas-window.c | 17 -
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/vas-window.c 
b/arch/powerpc/platforms/powernv/vas-window.c
index d62787f..52844a1 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -817,7 +817,8 @@ void vas_init_rx_win_attr(struct vas_rx_win_attr *rxattr, 
enum vas_cop_type cop)
 {
memset(rxattr, 0, sizeof(*rxattr));
 
-   if (cop == VAS_COP_TYPE_842 || cop == VAS_COP_TYPE_842_HIPRI) {
+   if (cop == VAS_COP_TYPE_842 || cop == VAS_COP_TYPE_842_HIPRI ||
+   cop == VAS_COP_TYPE_GZIP || cop == VAS_COP_TYPE_GZIP_HIPRI) {
rxattr->pin_win = true;
rxattr->nx_win = true;
rxattr->fault_win = false;
@@ -892,7 +893,8 @@ void vas_init_tx_win_attr(struct vas_tx_win_attr *txattr, 
enum vas_cop_type cop)
 {
memset(txattr, 0, sizeof(*txattr));
 
-   if (cop == VAS_COP_TYPE_842 || cop == VAS_COP_TYPE_842_HIPRI) {
+   if (cop == VAS_COP_TYPE_842 || cop == VAS_COP_TYPE_842_HIPRI ||
+   cop == VAS_COP_TYPE_GZIP || cop == VAS_COP_TYPE_GZIP_HIPRI) {
txattr->rej_no_credit = false;
txattr->rx_wcred_mode = true;
txattr->tx_wcred_mode = true;
@@ -976,9 +978,14 @@ static bool tx_win_args_valid(enum vas_cop_type cop,
if (attr->wcreds_max > VAS_TX_WCREDS_MAX)
return false;
 
-   if (attr->user_win &&
-   (cop != VAS_COP_TYPE_FTW || attr->rsvd_txbuf_count))
-   return false;
+   if (attr->user_win) {
+   if (attr->rsvd_txbuf_count)
+   return false;
+
+   if (cop != VAS_COP_TYPE_FTW && cop != VAS_COP_TYPE_GZIP &&
+   cop != VAS_COP_TYPE_GZIP_HIPRI)
+   return false;
+   }
 
return true;
 }
-- 
1.8.3.1

[PATCH v6 7/9] crypto/nx: Enable and setup GZIP compression type

2020-04-17 Thread Haren Myneni



Changes to probe GZIP device-tree nodes, open RX windows and setup
GZIP compression type. No plans to provide GZIP usage in kernel right
now, but this patch enables GZIP for user space usage.

Signed-off-by: Haren Myneni 
Acked-by: Herbert Xu 
---
 drivers/crypto/nx/nx-common-powernv.c | 46 ++-
 1 file changed, 40 insertions(+), 6 deletions(-)

diff --git a/drivers/crypto/nx/nx-common-powernv.c 
b/drivers/crypto/nx/nx-common-powernv.c
index 82dfa60..651d286 100644
--- a/drivers/crypto/nx/nx-common-powernv.c
+++ b/drivers/crypto/nx/nx-common-powernv.c
@@ -65,6 +65,7 @@ struct nx_coproc {
  * Using same values as in skiboot or coprocessor type representing
  * in NX workbook.
  */
+#define NX_CT_GZIP (2) /* on P9 and later */
 #define NX_CT_842  (3)
 
 static int (*nx842_powernv_exec)(const unsigned char *in,
@@ -819,6 +820,9 @@ static int __init vas_cfg_coproc_info(struct device_node 
*dn, int chip_id,
if (type == NX_CT_842)
ret = nx_set_ct(coproc, priority, VAS_COP_TYPE_842_HIPRI,
VAS_COP_TYPE_842);
+   else if (type == NX_CT_GZIP)
+   ret = nx_set_ct(coproc, priority, VAS_COP_TYPE_GZIP_HIPRI,
+   VAS_COP_TYPE_GZIP);
 
if (ret)
goto err_out;
@@ -867,12 +871,16 @@ static int __init vas_cfg_coproc_info(struct device_node 
*dn, int chip_id,
return ret;
 }
 
-static int __init nx_coproc_init(int chip_id, int ct_842)
+static int __init nx_coproc_init(int chip_id, int ct_842, int ct_gzip)
 {
int ret = 0;
 
if (opal_check_token(OPAL_NX_COPROC_INIT)) {
ret = opal_nx_coproc_init(chip_id, ct_842);
+
+   if (!ret)
+   ret = opal_nx_coproc_init(chip_id, ct_gzip);
+
if (ret) {
ret = opal_error_code(ret);
pr_err("Failed to initialize NX for chip(%d): %d\n",
@@ -902,8 +910,8 @@ static int __init find_nx_device_tree(struct device_node 
*dn, int chip_id,
 static int __init nx_powernv_probe_vas(struct device_node *pn)
 {
int chip_id, vasid, ret = 0;
+   int ct_842 = 0, ct_gzip = 0;
struct device_node *dn;
-   int ct_842 = 0;
 
chip_id = of_get_ibm_chip_id(pn);
if (chip_id < 0) {
@@ -920,19 +928,24 @@ static int __init nx_powernv_probe_vas(struct device_node 
*pn)
for_each_child_of_node(pn, dn) {
ret = find_nx_device_tree(dn, chip_id, vasid, NX_CT_842,
"ibm,p9-nx-842", _842);
+
+   if (!ret)
+   ret = find_nx_device_tree(dn, chip_id, vasid,
+   NX_CT_GZIP, "ibm,p9-nx-gzip", _gzip);
+
if (ret)
return ret;
}
 
-   if (!ct_842) {
-   pr_err("NX842 FIFO nodes are missing\n");
+   if (!ct_842 || !ct_gzip) {
+   pr_err("NX FIFO nodes are missing\n");
return -EINVAL;
}
 
/*
 * Initialize NX instance for both high and normal priority FIFOs.
 */
-   ret = nx_coproc_init(chip_id, ct_842);
+   ret = nx_coproc_init(chip_id, ct_842, ct_gzip);
 
return ret;
 }
@@ -1072,10 +1085,22 @@ static __init int nx_compress_powernv_init(void)
nx842_powernv_exec = nx842_exec_icswx;
} else {
/*
+* Register VAS user space API for NX GZIP so
+* that user space can use GZIP engine.
+* Using high FIFO priority for kernel requests and
+* normal FIFO priority is assigned for userspace.
+* 842 compression is supported only in kernel.
+*/
+   ret = vas_register_coproc_api(THIS_MODULE, VAS_COP_TYPE_GZIP,
+   "nx-gzip");
+
+   /*
 * GZIP is not supported in kernel right now.
 * So open tx windows only for 842.
 */
-   ret = nx_open_percpu_txwins();
+   if (!ret)
+   ret = nx_open_percpu_txwins();
+
if (ret) {
nx_delete_coprocs();
return ret;
@@ -1096,6 +1121,15 @@ static __init int nx_compress_powernv_init(void)
 
 static void __exit nx_compress_powernv_exit(void)
 {
+   /*
+* GZIP engine is supported only in power9 or later and nx842_ct
+* is used on power8 (icswx).
+* VAS API for NX GZIP is registered during init for user space
+* use. So delete this API use for GZIP engine.
+*/
+   if (!nx842_ct)
+   vas_unregister_coproc_api();
+
crypto_unregister_alg(_powernv_alg);
 
nx_delete_coprocs();
-- 
1.8.3.1

[PATCH v6 8/9] crypto/nx: Remove 'pid' in vas_tx_win_attr struct

2020-04-17 Thread Haren Myneni



When window is opened, pid reference is taken for user space
windows. Not needed for kernel windows. So remove 'pid' in
vas_tx_win_attr struct.

Signed-off-by: Haren Myneni 
Acked-by: Herbert Xu 
---
 arch/powerpc/include/asm/vas.h| 1 -
 drivers/crypto/nx/nx-common-powernv.c | 1 -
 2 files changed, 2 deletions(-)

diff --git a/arch/powerpc/include/asm/vas.h b/arch/powerpc/include/asm/vas.h
index 6e427bc..e33f80b 100644
--- a/arch/powerpc/include/asm/vas.h
+++ b/arch/powerpc/include/asm/vas.h
@@ -86,7 +86,6 @@ struct vas_tx_win_attr {
int wcreds_max;
int lpid;
int pidr;   /* hardware PID (from SPRN_PID) */
-   int pid;/* linux process id */
int pswid;
int rsvd_txbuf_count;
int tc_mode;
diff --git a/drivers/crypto/nx/nx-common-powernv.c 
b/drivers/crypto/nx/nx-common-powernv.c
index 651d286..13c65de 100644
--- a/drivers/crypto/nx/nx-common-powernv.c
+++ b/drivers/crypto/nx/nx-common-powernv.c
@@ -692,7 +692,6 @@ static struct vas_window *nx_alloc_txwin(struct nx_coproc 
*coproc)
 */
vas_init_tx_win_attr(, coproc->ct);
txattr.lpid = 0;/* lpid is 0 for kernel requests */
-   txattr.pid = 0; /* pid is 0 for kernel requests */
 
/*
 * Open a VAS send window which is used to send request to NX.
-- 
1.8.3.1

Re: [PATCH v4,4/4] drivers: uio: new driver for fsl_85xx_cache_sram

2020-04-17 Thread Scott Wood

On Fri, 2020-04-17 at 09:42 +0200, Greg KH wrote:
> On Thu, Apr 16, 2020 at 11:58:29PM -0500, Scott Wood wrote:
> > On Fri, 2020-04-17 at 10:31 +0800, 王文虎 wrote:
> > > Sounds it is. And does the modification below fit well?
> > > ---
> > > -static const struct of_device_id uio_mpc85xx_l2ctlr_of_match[] = {
> > > -   {   .compatible = "uio,mpc85xx-cache-sram", },
> > > -   {},
> > > +#ifdef CONFIG_OF
> > > +static struct of_device_id uio_fsl_85xx_cache_sram_of_match[] = {
> > > +   { /* This is filled with module_parm */ },
> > > +   { /* Sentinel */ },
> > >  };
> > > +MODULE_DEVICE_TABLE(of, uio_fsl_85xx_cache_sram_of_match);
> > > +module_param_string(of_id,
> > > uio_fsl_85xx_cache_sram_of_match[0].compatible,
> > > +   sizeof(uio_fsl_85xx_cache_sram_of_match[0].c
> > > ompa
> > > tible), 0);
> > > +MODULE_PARM_DESC(of_id, "platform device id to be handled by cache-
> > > sram-
> > > uio");
> > > +#endif
> > 
> > No.  The point is that you wouldn't be configuring this with the device
> > tree
> > at all.
> 
> Wait, why not?  Don't force people to use module parameters, that is
> crazy.  DT describes the hardware involved, if someone wants to bind to
> a specific range of memory, as described by DT, why can't they do so?

Yes, DT describes the hardware, and as I've said a couple times already, this
isn't hardware description.

I'm not forcing people to use module parameters.  That was a least-effort
suggestion to avoid abusing the DT.  I later said I'd try to come up with a
patch that allocates regions dynamically (and most likely doesn't use UIO at
all).

> I can understand not liking the name "uio" in a dt tree, but there's no
> reason that DT can not describe what a driver binds to here.

The DT already describes this hardware, and there is already code that binds
to it.  This patch is trying to add a second node for it with configuration.

-Scott

Re: [PATCH V2] vhost: do not enable VHOST_MENU by default

2020-04-17 Thread Jason Wang




On 2020/4/17 下午5:38, Michael S. Tsirkin wrote:

On Fri, Apr 17, 2020 at 05:33:56PM +0800, Jason Wang wrote:

On 2020/4/17 下午5:01, Michael S. Tsirkin wrote:

There could be some misunderstanding here. I thought it's somehow similar: a
CONFIG_VHOST_MENU=y will be left in the defconfigs even if CONFIG_VHOST is
not set.

Thanks


BTW do entries with no prompt actually appear in defconfig?


Yes. I can see CONFIG_VHOST_DPN=y after make ARCH=m68k defconfig

You see it in .config right? So that's harmless right?



Yes.

Thanks

[PATCH v6 3/9] powerpc/vas: Add VAS user space API

2020-04-17 Thread Haren Myneni



On power9, userspace can send GZIP compression requests directly to NX
once kernel establishes NX channel / window with VAS. This patch provides
user space API which allows user space to establish channel using open
VAS_TX_WIN_OPEN ioctl, mmap and close operations.

Each window corresponds to file descriptor and application can open
multiple windows. After the window is opened, VAS_TX_WIN_OPEN icoctl to
open a window on specific VAS instance, mmap() system call to map
the hardware address of engine's request queue into the application's
virtual address space.

Then the application can then submit one or more requests to the the
engine by using the copy/paste instructions and pasting the CRBs to
the virtual address (aka paste_address) returned by mmap().

Only NX GZIP coprocessor type is supported right now and allow GZIP
engine access via /dev/crypto/nx-gzip device node.

Thanks to Michael Ellerman for his changes and suggestions to make the
ioctl generic to support any coprocessor type.

Signed-off-by: Sukadev Bhattiprolu 
Signed-off-by: Haren Myneni 
---
 arch/powerpc/include/asm/vas.h  |  12 ++
 arch/powerpc/platforms/powernv/Makefile |   2 +-
 arch/powerpc/platforms/powernv/vas-api.c| 278 
 arch/powerpc/platforms/powernv/vas-window.c |   6 +-
 arch/powerpc/platforms/powernv/vas.h|   2 +
 5 files changed, 296 insertions(+), 4 deletions(-)
 create mode 100644 arch/powerpc/platforms/powernv/vas-api.c

diff --git a/arch/powerpc/include/asm/vas.h b/arch/powerpc/include/asm/vas.h
index f93e6b0..6e427bc 100644
--- a/arch/powerpc/include/asm/vas.h
+++ b/arch/powerpc/include/asm/vas.h
@@ -163,4 +163,16 @@ struct vas_window *vas_tx_win_open(int vasid, enum 
vas_cop_type cop,
  */
 int vas_paste_crb(struct vas_window *win, int offset, bool re);
 
+/*
+ * Register / unregister coprocessor type to VAS API which will be exported
+ * to user space. Applications can use this API to open / close window
+ * which can be used to send / receive requests directly to cooprcessor.
+ *
+ * Only NX GZIP coprocessor type is supported now, but this API can be
+ * used for others in future.
+ */
+int vas_register_coproc_api(struct module *mod, enum vas_cop_type cop_type,
+   const char *name);
+void vas_unregister_coproc_api(void);
+
 #endif /* __ASM_POWERPC_VAS_H */
diff --git a/arch/powerpc/platforms/powernv/Makefile 
b/arch/powerpc/platforms/powernv/Makefile
index 395789f..fe3f0fb 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -17,7 +17,7 @@ obj-$(CONFIG_MEMORY_FAILURE)  += opal-memory-errors.o
 obj-$(CONFIG_OPAL_PRD) += opal-prd.o
 obj-$(CONFIG_PERF_EVENTS) += opal-imc.o
 obj-$(CONFIG_PPC_MEMTRACE) += memtrace.o
-obj-$(CONFIG_PPC_VAS)  += vas.o vas-window.o vas-debug.o vas-fault.o
+obj-$(CONFIG_PPC_VAS)  += vas.o vas-window.o vas-debug.o vas-fault.o vas-api.o
 obj-$(CONFIG_OCXL_BASE)+= ocxl.o
 obj-$(CONFIG_SCOM_DEBUGFS) += opal-xscom.o
 obj-$(CONFIG_PPC_SECURE_BOOT) += opal-secvar.o
diff --git a/arch/powerpc/platforms/powernv/vas-api.c 
b/arch/powerpc/platforms/powernv/vas-api.c
new file mode 100644
index 000..98ed5d8
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/vas-api.c
@@ -0,0 +1,278 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * VAS user space API for its accelerators (Only NX-GZIP is supported now)
+ * Copyright (C) 2019 Haren Myneni, IBM Corp
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "vas.h"
+
+/*
+ * The driver creates the device node that can be used as follows:
+ * For NX-GZIP
+ *
+ * fd = open("/dev/crypto/nx-gzip", O_RDWR);
+ * rc = ioctl(fd, VAS_TX_WIN_OPEN, );
+ * paste_addr = mmap(NULL, PAGE_SIZE, prot, MAP_SHARED, fd, 0ULL).
+ * vas_copy(, 0, 1);
+ * vas_paste(paste_addr, 0, 1);
+ * close(fd) or exit process to close window.
+ *
+ * where "vas_copy" and "vas_paste" are defined in copy-paste.h.
+ * copy/paste returns to the user space directly. So refer NX hardware
+ * documententation for exact copy/paste usage and completion / error
+ * conditions.
+ */
+
+/*
+ * Wrapper object for the nx-gzip device - there is just one instance of
+ * this node for the whole system.
+ */
+static struct coproc_dev {
+   struct cdev cdev;
+   struct device *device;
+   char *name;
+   dev_t devt;
+   struct class *class;
+   enum vas_cop_type cop_type;
+} coproc_device;
+
+struct coproc_instance {
+   struct coproc_dev *coproc;
+   struct vas_window *txwin;
+};
+
+static char *coproc_devnode(struct device *dev, umode_t *mode)
+{
+   return kasprintf(GFP_KERNEL, "crypto/%s", dev_name(dev));
+}
+
+static int coproc_open(struct inode *inode, struct file *fp)
+{
+   struct coproc_instance *cp_inst;
+
+   cp_inst = kzalloc(sizeof(*cp_inst), GFP_KERNEL);
+   if (!cp_inst)
+   return -ENOMEM;
+
+   cp_inst->coproc =

[PATCH v6 2/9] powerpc/vas: Define VAS_TX_WIN_OPEN ioctl API

2020-04-17 Thread Haren Myneni



Define the VAS_TX_WIN_OPEN ioctl interface for NX GZIP access
from user space. This interface is used to open GZIP send window and
mmap region which can be used by userspace to send requests to NX
directly with copy/paste instructions.

Signed-off-by: Haren Myneni 
---
 Documentation/userspace-api/ioctl/ioctl-number.rst |  1 +
 arch/powerpc/include/uapi/asm/vas-api.h| 22 ++
 2 files changed, 23 insertions(+)
 create mode 100644 arch/powerpc/include/uapi/asm/vas-api.h

diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst 
b/Documentation/userspace-api/ioctl/ioctl-number.rst
index f759eda..f18accb 100644
--- a/Documentation/userspace-api/ioctl/ioctl-number.rst
+++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
@@ -286,6 +286,7 @@ Code  Seq#Include File  
 Comments
 'v'   00-1F  linux/fs.h  conflict!
 'v'   00-0F  linux/sonypi.h  conflict!
 'v'   00-0F  media/v4l2-subdev.h conflict!
+'v'   20-27  arch/powerpc/include/uapi/asm/vas-api.hVAS API
 'v'   C0-FF  linux/meye.hconflict!
 'w'   allCERN SCI 
driver
 'y'   00-1F  packet 
based user level communications
diff --git a/arch/powerpc/include/uapi/asm/vas-api.h 
b/arch/powerpc/include/uapi/asm/vas-api.h
new file mode 100644
index 000..fe95d67
--- /dev/null
+++ b/arch/powerpc/include/uapi/asm/vas-api.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
+/*
+ * Copyright 2019 IBM Corp.
+ */
+
+#ifndef _UAPI_MISC_VAS_H
+#define _UAPI_MISC_VAS_H
+
+#include 
+
+#define VAS_MAGIC  'v'
+#define VAS_TX_WIN_OPEN_IOW(VAS_MAGIC, 0x20, struct 
vas_tx_win_open_attr)
+
+struct vas_tx_win_open_attr {
+   __u32   version;
+   __s16   vas_id; /* specific instance of vas or -1 for default */
+   __u16   reserved1;
+   __u64   flags;  /* Future use */
+   __u64   reserved2[6];
+};
+
+#endif /* _UAPI_MISC_VAS_H */
-- 
1.8.3.1

[PATCH v6 5/9] crypto/nx: Rename nx-842-powernv file name to nx-common-powernv

2020-04-17 Thread Haren Myneni



Rename nx-842-powernv.c to nx-common-powernv.c to add code for setup
and enable new GZIP compression type. The actual functionality is not
changed in this patch.

Signed-off-by: Haren Myneni 
Acked-by: Herbert Xu 
---
 drivers/crypto/nx/Makefile|2 +-
 drivers/crypto/nx/nx-842-powernv.c| 1062 -
 drivers/crypto/nx/nx-common-powernv.c | 1062 +
 3 files changed, 1063 insertions(+), 1063 deletions(-)
 delete mode 100644 drivers/crypto/nx/nx-842-powernv.c
 create mode 100644 drivers/crypto/nx/nx-common-powernv.c

diff --git a/drivers/crypto/nx/Makefile b/drivers/crypto/nx/Makefile
index 015155d..bc89a20 100644
--- a/drivers/crypto/nx/Makefile
+++ b/drivers/crypto/nx/Makefile
@@ -15,4 +15,4 @@ obj-$(CONFIG_CRYPTO_DEV_NX_COMPRESS_PSERIES) += 
nx-compress-pseries.o nx-compres
 obj-$(CONFIG_CRYPTO_DEV_NX_COMPRESS_POWERNV) += nx-compress-powernv.o 
nx-compress.o
 nx-compress-objs := nx-842.o
 nx-compress-pseries-objs := nx-842-pseries.o
-nx-compress-powernv-objs := nx-842-powernv.o
+nx-compress-powernv-objs := nx-common-powernv.o
diff --git a/drivers/crypto/nx/nx-842-powernv.c 
b/drivers/crypto/nx/nx-842-powernv.c
deleted file mode 100644
index 8e63326..000
--- a/drivers/crypto/nx/nx-842-powernv.c
+++ /dev/null
@@ -1,1062 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Driver for IBM PowerNV 842 compression accelerator
- *
- * Copyright (C) 2015 Dan Streetman, IBM Corp
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include "nx-842.h"
-
-#include 
-
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Dan Streetman ");
-MODULE_DESCRIPTION("842 H/W Compression driver for IBM PowerNV processors");
-MODULE_ALIAS_CRYPTO("842");
-MODULE_ALIAS_CRYPTO("842-nx");
-
-#define WORKMEM_ALIGN  (CRB_ALIGN)
-#define CSB_WAIT_MAX   (5000) /* ms */
-#define VAS_RETRIES(10)
-
-struct nx842_workmem {
-   /* Below fields must be properly aligned */
-   struct coprocessor_request_block crb; /* CRB_ALIGN align */
-   struct data_descriptor_entry ddl_in[DDL_LEN_MAX]; /* DDE_ALIGN align */
-   struct data_descriptor_entry ddl_out[DDL_LEN_MAX]; /* DDE_ALIGN align */
-   /* Above fields must be properly aligned */
-
-   ktime_t start;
-
-   char padding[WORKMEM_ALIGN]; /* unused, to allow alignment */
-} __packed __aligned(WORKMEM_ALIGN);
-
-struct nx842_coproc {
-   unsigned int chip_id;
-   unsigned int ct;
-   unsigned int ci;/* Coprocessor instance, used with icswx */
-   struct {
-   struct vas_window *rxwin;
-   int id;
-   } vas;
-   struct list_head list;
-};
-
-/*
- * Send the request to NX engine on the chip for the corresponding CPU
- * where the process is executing. Use with VAS function.
- */
-static DEFINE_PER_CPU(struct vas_window *, cpu_txwin);
-
-/* no cpu hotplug on powernv, so this list never changes after init */
-static LIST_HEAD(nx842_coprocs);
-static unsigned int nx842_ct;  /* used in icswx function */
-
-static int (*nx842_powernv_exec)(const unsigned char *in,
-   unsigned int inlen, unsigned char *out,
-   unsigned int *outlenp, void *workmem, int fc);
-
-/**
- * setup_indirect_dde - Setup an indirect DDE
- *
- * The DDE is setup with the the DDE count, byte count, and address of
- * first direct DDE in the list.
- */
-static void setup_indirect_dde(struct data_descriptor_entry *dde,
-  struct data_descriptor_entry *ddl,
-  unsigned int dde_count, unsigned int byte_count)
-{
-   dde->flags = 0;
-   dde->count = dde_count;
-   dde->index = 0;
-   dde->length = cpu_to_be32(byte_count);
-   dde->address = cpu_to_be64(nx842_get_pa(ddl));
-}
-
-/**
- * setup_direct_dde - Setup single DDE from buffer
- *
- * The DDE is setup with the buffer and length.  The buffer must be properly
- * aligned.  The used length is returned.
- * Returns:
- *   NSuccessfully set up DDE with N bytes
- */
-static unsigned int setup_direct_dde(struct data_descriptor_entry *dde,
-unsigned long pa, unsigned int len)
-{
-   unsigned int l = min_t(unsigned int, len, LEN_ON_PAGE(pa));
-
-   dde->flags = 0;
-   dde->count = 0;
-   dde->index = 0;
-   dde->length = cpu_to_be32(l);
-   dde->address = cpu_to_be64(pa);
-
-   return l;
-}
-
-/**
- * setup_ddl - Setup DDL from buffer
- *
- * Returns:
- *   0 Successfully set up DDL
- */
-static int setup_ddl(struct data_descriptor_entry *dde,
-struct data_descriptor_entry *ddl,
-unsigned char *buf, unsigned int len,
-bool in)
-{
-   unsigned long pa = nx842_get_pa(buf);
-   int i, ret, total_len = len;
-
-   if (!IS_ALIGNED(pa, DDE_BUFFER_ALIGN)) {
-   pr_debug("%s

Re: [PATCH V2] vhost: do not enable VHOST_MENU by default

2020-04-17 Thread Geert Uytterhoeven

Hi Michael,

On Fri, Apr 17, 2020 at 10:57 AM Michael S. Tsirkin  wrote:
> On Fri, Apr 17, 2020 at 04:51:19PM +0800, Jason Wang wrote:
> > On 2020/4/17 下午4:46, Michael S. Tsirkin wrote:
> > > On Fri, Apr 17, 2020 at 04:39:49PM +0800, Jason Wang wrote:
> > > > On 2020/4/17 下午4:29, Michael S. Tsirkin wrote:
> > > > > On Fri, Apr 17, 2020 at 03:36:52PM +0800, Jason Wang wrote:
> > > > > > On 2020/4/17 下午2:33, Michael S. Tsirkin wrote:
> > > > > > > On Fri, Apr 17, 2020 at 11:12:14AM +0800, Jason Wang wrote:
> > > > > > > > On 2020/4/17 上午6:55, Michael S. Tsirkin wrote:
> > > > > > > > > On Wed, Apr 15, 2020 at 10:43:56AM +0800, Jason Wang wrote:
> > > > > > > > > > We try to keep the defconfig untouched after decoupling 
> > > > > > > > > > CONFIG_VHOST
> > > > > > > > > > out of CONFIG_VIRTUALIZATION in commit 20c384f1ea1a
> > > > > > > > > > ("vhost: refine vhost and vringh kconfig") by enabling 
> > > > > > > > > > VHOST_MENU by
> > > > > > > > > > default. Then the defconfigs can keep enabling 
> > > > > > > > > > CONFIG_VHOST_NET
> > > > > > > > > > without the caring of CONFIG_VHOST.
> > > > > > > > > >
> > > > > > > > > > But this will leave a "CONFIG_VHOST_MENU=y" in all 
> > > > > > > > > > defconfigs and even
> > > > > > > > > > for the ones that doesn't want vhost. So it actually shifts 
> > > > > > > > > > the
> > > > > > > > > > burdens to the maintainers of all other to add 
> > > > > > > > > > "CONFIG_VHOST_MENU is
> > > > > > > > > > not set". So this patch tries to enable CONFIG_VHOST 
> > > > > > > > > > explicitly in
> > > > > > > > > > defconfigs that enables CONFIG_VHOST_NET and 
> > > > > > > > > > CONFIG_VHOST_VSOCK.
> > > > > > > > > >
> > > > > > > > > > Acked-by: Christian Borntraeger   
> > > > > > > > > > (s390)
> > > > > > > > > > Acked-by: Michael Ellerman   (powerpc)
> > > > > > > > > > Cc: Thomas Bogendoerfer
> > > > > > > > > > Cc: Benjamin Herrenschmidt
> > > > > > > > > > Cc: Paul Mackerras
> > > > > > > > > > Cc: Michael Ellerman
> > > > > > > > > > Cc: Heiko Carstens
> > > > > > > > > > Cc: Vasily Gorbik
> > > > > > > > > > Cc: Christian Borntraeger
> > > > > > > > > > Reported-by: Geert Uytterhoeven
> > > > > > > > > > Signed-off-by: Jason Wang
> > > > > > > > > I rebased this on top of OABI fix since that
> > > > > > > > > seems more orgent to fix.
> > > > > > > > > Pushed to my vhost branch pls take a look and
> > > > > > > > > if possible test.
> > > > > > > > > Thanks!
> > > > > > > > I test this patch by generating the defconfigs that wants 
> > > > > > > > vhost_net or
> > > > > > > > vhost_vsock. All looks fine.
> > > > > > > >
> > > > > > > > But having CONFIG_VHOST_DPN=y may end up with the similar 
> > > > > > > > situation that
> > > > > > > > this patch want to address.
> > > > > > > > Maybe we can let CONFIG_VHOST depends on !ARM || AEABI then add 
> > > > > > > > another
> > > > > > > > menuconfig for VHOST_RING and do something similar?
> > > > > > > >
> > > > > > > > Thanks
> > > > > > > Sorry I don't understand. After this patch CONFIG_VHOST_DPN is 
> > > > > > > just
> > > > > > > an internal variable for the OABI fix. I kept it separate
> > > > > > > so it's easy to revert for 5.8. Yes we could squash it into
> > > > > > > VHOST directly but I don't see how that changes logic at all.
> > > > > > Sorry for being unclear.
> > > > > >
> > > > > > I meant since it was enabled by default, "CONFIG_VHOST_DPN=y" will 
> > > > > > be left
> > > > > > in the defconfigs.
> > > > > But who cares?
> > > > FYI, please seehttps://www.spinics.net/lists/kvm/msg212685.html
> > > The complaint was not about the symbol IIUC.  It was that we caused
> > > everyone to build vhost unless they manually disabled it.
> >
> > There could be some misunderstanding here. I thought it's somehow similar: a
> > CONFIG_VHOST_MENU=y will be left in the defconfigs even if CONFIG_VHOST is
> > not set.
> >
> > Thanks
>
> Hmm. So looking at Documentation/kbuild/kconfig-language.rst :
>
> Things that merit "default y/m" include:
>
> a) A new Kconfig option for something that used to always be built
>should be "default y".
>
> b) A new gatekeeping Kconfig option that hides/shows other Kconfig
>options (but does not generate any code of its own), should be
>"default y" so people will see those other options.
>
> c) Sub-driver behavior or similar options for a driver that is
>"default n". This allows you to provide sane defaults.
>
>
> So it looks like VHOST_MENU is actually matching rule b).
> So what's the problem we are trying to solve with this patch, exactly?
>
> Geert could you clarify pls?

I can confirm VHOST_MENU is matching rule b), so it is safe to always
enable it.

Gr{oetje,eeting}s,

Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- ge...@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to

Re: [PATCH V2] vhost: do not enable VHOST_MENU by default

2020-04-17 Thread Jason Wang




On 2020/4/17 下午5:01, Michael S. Tsirkin wrote:

There could be some misunderstanding here. I thought it's somehow similar: a
CONFIG_VHOST_MENU=y will be left in the defconfigs even if CONFIG_VHOST is
not set.

Thanks


BTW do entries with no prompt actually appear in defconfig?



Yes. I can see CONFIG_VHOST_DPN=y after make ARCH=m68k defconfig

Thanks

Re: [PATCH V2] vhost: do not enable VHOST_MENU by default

2020-04-17 Thread Michael S. Tsirkin

On Fri, Apr 17, 2020 at 11:12:14AM +0800, Jason Wang wrote:
> 
> On 2020/4/17 上午6:55, Michael S. Tsirkin wrote:
> > On Wed, Apr 15, 2020 at 10:43:56AM +0800, Jason Wang wrote:
> > > We try to keep the defconfig untouched after decoupling CONFIG_VHOST
> > > out of CONFIG_VIRTUALIZATION in commit 20c384f1ea1a
> > > ("vhost: refine vhost and vringh kconfig") by enabling VHOST_MENU by
> > > default. Then the defconfigs can keep enabling CONFIG_VHOST_NET
> > > without the caring of CONFIG_VHOST.
> > > 
> > > But this will leave a "CONFIG_VHOST_MENU=y" in all defconfigs and even
> > > for the ones that doesn't want vhost. So it actually shifts the
> > > burdens to the maintainers of all other to add "CONFIG_VHOST_MENU is
> > > not set". So this patch tries to enable CONFIG_VHOST explicitly in
> > > defconfigs that enables CONFIG_VHOST_NET and CONFIG_VHOST_VSOCK.
> > > 
> > > Acked-by: Christian Borntraeger  (s390)
> > > Acked-by: Michael Ellerman  (powerpc)
> > > Cc: Thomas Bogendoerfer
> > > Cc: Benjamin Herrenschmidt
> > > Cc: Paul Mackerras
> > > Cc: Michael Ellerman
> > > Cc: Heiko Carstens
> > > Cc: Vasily Gorbik
> > > Cc: Christian Borntraeger
> > > Reported-by: Geert Uytterhoeven
> > > Signed-off-by: Jason Wang
> > I rebased this on top of OABI fix since that
> > seems more orgent to fix.
> > Pushed to my vhost branch pls take a look and
> > if possible test.
> > Thanks!
> 
> 
> I test this patch by generating the defconfigs that wants vhost_net or
> vhost_vsock. All looks fine.
> 
> But having CONFIG_VHOST_DPN=y may end up with the similar situation that
> this patch want to address.
> Maybe we can let CONFIG_VHOST depends on !ARM || AEABI then add another
> menuconfig for VHOST_RING and do something similar?
> 
> Thanks

Sorry I don't understand. After this patch CONFIG_VHOST_DPN is just
an internal variable for the OABI fix. I kept it separate
so it's easy to revert for 5.8. Yes we could squash it into
VHOST directly but I don't see how that changes logic at all.

-- 
MST

[PATCH v5,0/4] drivers: uio: new driver uio_fsl_85xx_cache_sram

2020-04-17 Thread Wang Wenhu

This series add a new uio driver for freescale 85xx platforms to
access the Cache-Sram form user level. This is extremely helpful
for the user-space applications that require high performance memory
accesses.

It fixes the compile errors and warning of the hardware level drivers
and implements the uio driver in uio_fsl_85xx_cache_sram.c.

Changes since v1:
 * Addressed comments from Greg K-H
 * Moved kfree(info->name) into uio_info_free_internal()

Changes since v2:
 * Drop the patch that modifies Kconfigs of arch/powerpc/platforms
   and modified the sequence of patches:
01:dropped, 02->03, 03->02, 04->01, 05->04
 * Addressed comments from Greg, Scott and Christophe
 * Use "uiomem->internal_addr" as if condition for sram memory free,
   and memset the uiomem entry
 * Modified of_match_table make the driver apart from Cache-Sram HW info
   which belong to the HW level driver fsl_85xx_cache_sram to match
 * Use roundup_pow_of_two for align calc
 * Remove useless clear block of uiomem entries.
 * Use UIO_INFO_VER micro for info->version, and define it as
   "devicetree,pseudo", meaning this is pseudo device and probed from
   device tree configuration
 * Select FSL_85XX_CACHE_SRAM rather than depends on it

Changes since v3:
 * Addressed comments from Christophe(use devm_xxx memory alloc interfaces)
 
Changes since v4:
 * Use module_param_string for of_match_table, no binding to devicetree

Wang Wenhu (4):
  powerpc: sysdev: fix compile error for fsl_85xx_l2ctlr
  powerpc: sysdev: fix compile error for fsl_85xx_cache_sram
  powerpc: sysdev: fix compile warning for fsl_85xx_cache_sram
  drivers: uio: new driver for fsl_85xx_cache_sram

 arch/powerpc/sysdev/fsl_85xx_cache_sram.c |   3 +-
 arch/powerpc/sysdev/fsl_85xx_l2ctlr.c |   1 +
 drivers/uio/Kconfig   |   9 ++
 drivers/uio/Makefile  |   1 +
 drivers/uio/uio_fsl_85xx_cache_sram.c | 154 ++
 5 files changed, 167 insertions(+), 1 deletion(-)
 create mode 100644 drivers/uio/uio_fsl_85xx_cache_sram.c

-- 
2.17.1

[PATCH v5, 1/4] powerpc: sysdev: fix compile error for fsl_85xx_l2ctlr

2020-04-17 Thread Wang Wenhu

Include "linux/of_address.h" to fix the compile error for
mpc85xx_l2ctlr_of_probe() when compiling fsl_85xx_cache_sram.c.

  CC  arch/powerpc/sysdev/fsl_85xx_l2ctlr.o
arch/powerpc/sysdev/fsl_85xx_l2ctlr.c: In function ‘mpc85xx_l2ctlr_of_probe’:
arch/powerpc/sysdev/fsl_85xx_l2ctlr.c:90:11: error: implicit declaration of 
function ‘of_iomap’; did you mean ‘pci_iomap’? 
[-Werror=implicit-function-declaration]
  l2ctlr = of_iomap(dev->dev.of_node, 0);
   ^~~~
   pci_iomap
arch/powerpc/sysdev/fsl_85xx_l2ctlr.c:90:9: error: assignment makes pointer 
from integer without a cast [-Werror=int-conversion]
  l2ctlr = of_iomap(dev->dev.of_node, 0);
 ^
cc1: all warnings being treated as errors
scripts/Makefile.build:267: recipe for target 
'arch/powerpc/sysdev/fsl_85xx_l2ctlr.o' failed
make[2]: *** [arch/powerpc/sysdev/fsl_85xx_l2ctlr.o] Error 1

Cc: Greg Kroah-Hartman 
Cc: Christophe Leroy 
Cc: Scott Wood 
Cc: Michael Ellerman 
Cc: linuxppc-dev@lists.ozlabs.org
Fixes: 6db92cc9d07d ("powerpc/85xx: add cache-sram support")
Reviewed-by: Christophe Leroy 
Signed-off-by: Wang Wenhu 
---
Changes since v1:
 * None
Changes since v2:
 * None
Changes since v3:
 * None
Changes since v4:
 * None
---
 arch/powerpc/sysdev/fsl_85xx_l2ctlr.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/sysdev/fsl_85xx_l2ctlr.c 
b/arch/powerpc/sysdev/fsl_85xx_l2ctlr.c
index 2d0af0c517bb..7533572492f0 100644
--- a/arch/powerpc/sysdev/fsl_85xx_l2ctlr.c
+++ b/arch/powerpc/sysdev/fsl_85xx_l2ctlr.c
@@ -10,6 +10,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #include "fsl_85xx_cache_ctlr.h"
-- 
2.17.1

Re: [PATCH v4,4/4] drivers: uio: new driver for fsl_85xx_cache_sram

2020-04-17 Thread Greg KH

On Thu, Apr 16, 2020 at 11:58:29PM -0500, Scott Wood wrote:
> On Fri, 2020-04-17 at 10:31 +0800, 王文虎 wrote:
> > > > On Thu, 2020-04-16 at 08:35 -0700, Wang Wenhu wrote:
> > > > > +#define UIO_INFO_VER "devicetree,pseudo"
> > > > 
> > > > What does this mean?  Changing a number into a non-obvious string (Why
> > > > "pseudo"?  Why does the UIO user care that the config came from the
> > > > device
> > > > tree?) just to avoid setting off Greg's version number autoresponse
> > > > isn't
> > > > really helping anything.
> > > > 
> > > > > +static const struct of_device_id uio_mpc85xx_l2ctlr_of_match[] = {
> > > > > + {   .compatible = "uio,mpc85xx-cache-sram", },
> > > 
> > > Form is , and "uio" is not a vendor (and never will be).
> > > 
> > 
> > Should have been something like "fsl,mpc85xx-cache-sram-uio", and if it is
> > to be defined with module parameters, this would be user defined.
> > Anyway, , should always be used.
> > 
> > > > > + {},
> > > > > +};
> > > > > +
> > > > > +static struct platform_driver uio_fsl_85xx_cache_sram = {
> > > > > + .probe = uio_fsl_85xx_cache_sram_probe,
> > > > > + .remove = uio_fsl_85xx_cache_sram_remove,
> > > > > + .driver = {
> > > > > + .name = DRIVER_NAME,
> > > > > + .owner = THIS_MODULE,
> > > > > + .of_match_table = uio_mpc85xx_l2ctlr_of_match,
> > > > > + },
> > > > > +};
> > > > 
> > > > Greg's comment notwithstanding, I really don't think this belongs in the
> > > > device tree (and if I do get overruled on that point, it at least needs
> > > > a
> > > > binding document).  Let me try to come up with a patch for dynamic
> > > > allocation.
> > > 
> > > Agreed. "UIO" bindings have long been rejected.
> > > 
> > 
> > Sounds it is. And does the modification below fit well?
> > ---
> > -static const struct of_device_id uio_mpc85xx_l2ctlr_of_match[] = {
> > -   {   .compatible = "uio,mpc85xx-cache-sram", },
> > -   {},
> > +#ifdef CONFIG_OF
> > +static struct of_device_id uio_fsl_85xx_cache_sram_of_match[] = {
> > +   { /* This is filled with module_parm */ },
> > +   { /* Sentinel */ },
> >  };
> > +MODULE_DEVICE_TABLE(of, uio_fsl_85xx_cache_sram_of_match);
> > +module_param_string(of_id, uio_fsl_85xx_cache_sram_of_match[0].compatible,
> > +   sizeof(uio_fsl_85xx_cache_sram_of_match[0].compa
> > tible), 0);
> > +MODULE_PARM_DESC(of_id, "platform device id to be handled by cache-sram-
> > uio");
> > +#endif
> 
> No.  The point is that you wouldn't be configuring this with the device tree
> at all.

Wait, why not?  Don't force people to use module parameters, that is
crazy.  DT describes the hardware involved, if someone wants to bind to
a specific range of memory, as described by DT, why can't they do so?

I can understand not liking the name "uio" in a dt tree, but there's no
reason that DT can not describe what a driver binds to here.

Remember, module parameters are NEVER the answer, this isn't the 1990's
anymore.

thanks,

greg k-h

Re: [PATCH V2] vhost: do not enable VHOST_MENU by default

2020-04-17 Thread Jason Wang




On 2020/4/17 下午4:29, Michael S. Tsirkin wrote:

On Fri, Apr 17, 2020 at 03:36:52PM +0800, Jason Wang wrote:

On 2020/4/17 下午2:33, Michael S. Tsirkin wrote:

On Fri, Apr 17, 2020 at 11:12:14AM +0800, Jason Wang wrote:

On 2020/4/17 上午6:55, Michael S. Tsirkin wrote:

On Wed, Apr 15, 2020 at 10:43:56AM +0800, Jason Wang wrote:

We try to keep the defconfig untouched after decoupling CONFIG_VHOST
out of CONFIG_VIRTUALIZATION in commit 20c384f1ea1a
("vhost: refine vhost and vringh kconfig") by enabling VHOST_MENU by
default. Then the defconfigs can keep enabling CONFIG_VHOST_NET
without the caring of CONFIG_VHOST.

But this will leave a "CONFIG_VHOST_MENU=y" in all defconfigs and even
for the ones that doesn't want vhost. So it actually shifts the
burdens to the maintainers of all other to add "CONFIG_VHOST_MENU is
not set". So this patch tries to enable CONFIG_VHOST explicitly in
defconfigs that enables CONFIG_VHOST_NET and CONFIG_VHOST_VSOCK.

Acked-by: Christian Borntraeger  (s390)
Acked-by: Michael Ellerman  (powerpc)
Cc: Thomas Bogendoerfer
Cc: Benjamin Herrenschmidt
Cc: Paul Mackerras
Cc: Michael Ellerman
Cc: Heiko Carstens
Cc: Vasily Gorbik
Cc: Christian Borntraeger
Reported-by: Geert Uytterhoeven
Signed-off-by: Jason Wang

I rebased this on top of OABI fix since that
seems more orgent to fix.
Pushed to my vhost branch pls take a look and
if possible test.
Thanks!

I test this patch by generating the defconfigs that wants vhost_net or
vhost_vsock. All looks fine.

But having CONFIG_VHOST_DPN=y may end up with the similar situation that
this patch want to address.
Maybe we can let CONFIG_VHOST depends on !ARM || AEABI then add another
menuconfig for VHOST_RING and do something similar?

Thanks

Sorry I don't understand. After this patch CONFIG_VHOST_DPN is just
an internal variable for the OABI fix. I kept it separate
so it's easy to revert for 5.8. Yes we could squash it into
VHOST directly but I don't see how that changes logic at all.


Sorry for being unclear.

I meant since it was enabled by default, "CONFIG_VHOST_DPN=y" will be left
in the defconfigs.

But who cares?



FYI, please see https://www.spinics.net/lists/kvm/msg212685.html



That does not add any code, does it?



It doesn't.

Thanks





This requires the arch maintainers to add
"CONFIG_VHOST_VDPN is not set". (Geert complains about this)

Thanks

Re: [PATCH V2] vhost: do not enable VHOST_MENU by default

2020-04-17 Thread Michael S. Tsirkin

On Fri, Apr 17, 2020 at 04:39:49PM +0800, Jason Wang wrote:
> 
> On 2020/4/17 下午4:29, Michael S. Tsirkin wrote:
> > On Fri, Apr 17, 2020 at 03:36:52PM +0800, Jason Wang wrote:
> > > On 2020/4/17 下午2:33, Michael S. Tsirkin wrote:
> > > > On Fri, Apr 17, 2020 at 11:12:14AM +0800, Jason Wang wrote:
> > > > > On 2020/4/17 上午6:55, Michael S. Tsirkin wrote:
> > > > > > On Wed, Apr 15, 2020 at 10:43:56AM +0800, Jason Wang wrote:
> > > > > > > We try to keep the defconfig untouched after decoupling 
> > > > > > > CONFIG_VHOST
> > > > > > > out of CONFIG_VIRTUALIZATION in commit 20c384f1ea1a
> > > > > > > ("vhost: refine vhost and vringh kconfig") by enabling VHOST_MENU 
> > > > > > > by
> > > > > > > default. Then the defconfigs can keep enabling CONFIG_VHOST_NET
> > > > > > > without the caring of CONFIG_VHOST.
> > > > > > > 
> > > > > > > But this will leave a "CONFIG_VHOST_MENU=y" in all defconfigs and 
> > > > > > > even
> > > > > > > for the ones that doesn't want vhost. So it actually shifts the
> > > > > > > burdens to the maintainers of all other to add "CONFIG_VHOST_MENU 
> > > > > > > is
> > > > > > > not set". So this patch tries to enable CONFIG_VHOST explicitly in
> > > > > > > defconfigs that enables CONFIG_VHOST_NET and CONFIG_VHOST_VSOCK.
> > > > > > > 
> > > > > > > Acked-by: Christian Borntraeger  (s390)
> > > > > > > Acked-by: Michael Ellerman  (powerpc)
> > > > > > > Cc: Thomas Bogendoerfer
> > > > > > > Cc: Benjamin Herrenschmidt
> > > > > > > Cc: Paul Mackerras
> > > > > > > Cc: Michael Ellerman
> > > > > > > Cc: Heiko Carstens
> > > > > > > Cc: Vasily Gorbik
> > > > > > > Cc: Christian Borntraeger
> > > > > > > Reported-by: Geert Uytterhoeven
> > > > > > > Signed-off-by: Jason Wang
> > > > > > I rebased this on top of OABI fix since that
> > > > > > seems more orgent to fix.
> > > > > > Pushed to my vhost branch pls take a look and
> > > > > > if possible test.
> > > > > > Thanks!
> > > > > I test this patch by generating the defconfigs that wants vhost_net or
> > > > > vhost_vsock. All looks fine.
> > > > > 
> > > > > But having CONFIG_VHOST_DPN=y may end up with the similar situation 
> > > > > that
> > > > > this patch want to address.
> > > > > Maybe we can let CONFIG_VHOST depends on !ARM || AEABI then add 
> > > > > another
> > > > > menuconfig for VHOST_RING and do something similar?
> > > > > 
> > > > > Thanks
> > > > Sorry I don't understand. After this patch CONFIG_VHOST_DPN is just
> > > > an internal variable for the OABI fix. I kept it separate
> > > > so it's easy to revert for 5.8. Yes we could squash it into
> > > > VHOST directly but I don't see how that changes logic at all.
> > > 
> > > Sorry for being unclear.
> > > 
> > > I meant since it was enabled by default, "CONFIG_VHOST_DPN=y" will be left
> > > in the defconfigs.
> > But who cares?
> 
> 
> FYI, please see https://www.spinics.net/lists/kvm/msg212685.html

The complaint was not about the symbol IIUC.  It was that we caused
everyone to build vhost unless they manually disabled it.

> 
> > That does not add any code, does it?
> 
> 
> It doesn't.
> 
> Thanks
> 
> 
> > 
> > > This requires the arch maintainers to add
> > > "CONFIG_VHOST_VDPN is not set". (Geert complains about this)
> > > 
> > > Thanks
> > > 
> > >

[PATCH v6 0/9] crypto/nx: Enable GZIP engine and provide userpace API

2020-04-17 Thread Haren Myneni



Power9 processor supports Virtual Accelerator Switchboard (VAS) which
allows kernel and userspace to send compression requests to Nest
Accelerator (NX) directly. The NX unit comprises of 2 842 compression
engines and 1 GZIP engine. Linux kernel already has 842 compression
support on kernel. This patch series adds GZIP compression support
from user space. The GZIP Compression engine implements the ZLIB and
GZIP compression algorithms. No plans of adding NX-GZIP compression
support in kernel right now.

Applications can send requests to NX directly with COPY/PASTE
instructions. But kernel has to establish channel / window on NX-GZIP
device for the userspace. So userspace access to the GZIP engine is
provided through /dev/crypto/nx-gzip device with several operations.

An application must open the this device to obtain a file descriptor (fd).
Using the fd, application should issue the VAS_TX_WIN_OPEN ioctl to
establish a connection to the engine. Once window is opened, should use
mmap() system call to map the hardware address of engine's request queue
into the application's virtual address space. Then user space forms the
request as co-processor Request Block (CRB) and paste this CRB on the
mapped HW address using COPY/PASTE instructions. Application can poll
on status flags (part of CRB) with timeout for request completion.

For VAS_TX_WIN_OPEN ioctl, if user space passes vas_id = -1 (struct
vas_tx_win_open_attr), kernel determines the VAS instance on the
corresponding chip based on the CPU on which the process is executing.
Otherwise, the specified VAS instance is used if application passes the
proper VAS instance (vas_id listed in /proc/device-tree/vas@*/ibm,vas_id).

Process can open multiple windows with different FDs or can send several
requests to NX on the same window at the same time.

A userspace library libnxz is available:
https://github.com/abalib/power-gzip

Applications that use inflate/deflate calls can link with libNXz and use
NX GZIP compression without any modification.

Tested the available 842 compression on power8 and power9 system to make
sure no regression and tested GZIP compression on power9 with tests
available in the above link.

Thanks to Bulent Abali for nxz library and tests development.

Changelog:

V2:
  - Move user space API code to powerpc as suggested. Also this API
can be extended to any other coprocessor type that VAS can support
in future. Example: Fast thread wakeup feature from VAS
  - Rebased to 5.6-rc3

V3:
  - Fix sparse warnings (patches 3&6)

V4:
  - Remove unused coproc_instid and add only window address in
fp->private_data.
  - Add NX User's manual and Copy/paste links in VAS API documentation
in patch and other changes as Daniel Axtens suggested

V5:
  - Added "NX Fault handling" section in VAS API documentation as Nick
suggested.
  - Documentation: mmap size should be PAGE_SIZE as Daniel Axtens pointed.

V6:
  - Make ioctl generic to support any coprocessor type (Michael Ellerman)
(patches 3&7)

Haren Myneni (9):
  powerpc/vas: Initialize window attributes for GZIP coprocessor type
  powerpc/vas: Define VAS_TX_WIN_OPEN ioctl API
  powerpc/vas: Add VAS user space API
  crypto/nx: Initialize coproc entry with kzalloc
  crypto/nx: Rename nx-842-powernv file name to nx-common-powernv
  crypto/nx: Make enable code generic to add new GZIP compression type
  crypto/nx: Enable and setup GZIP compresstion type
  crypto/nx: Remove 'pid' in vas_tx_win_attr struct
  Documentation/powerpc: VAS API

 Documentation/powerpc/index.rst|1 +
 Documentation/powerpc/vas-api.rst  |  292 +
 Documentation/userspace-api/ioctl/ioctl-number.rst |1 +
 arch/powerpc/include/asm/vas.h |   13 +-
 arch/powerpc/include/uapi/asm/vas-api.h|   22 +
 arch/powerpc/platforms/powernv/Makefile|2 +-
 arch/powerpc/platforms/powernv/vas-api.c   |  278 +
 arch/powerpc/platforms/powernv/vas-window.c|   23 +-
 arch/powerpc/platforms/powernv/vas.h   |2 +
 drivers/crypto/nx/Makefile |2 +-
 drivers/crypto/nx/nx-842-powernv.c | 1062 --
 drivers/crypto/nx/nx-common-powernv.c  | 1136 
 12 files changed, 1761 insertions(+), 1073 deletions(-)
 create mode 100644 Documentation/powerpc/vas-api.rst
 create mode 100644 arch/powerpc/include/uapi/asm/vas-api.h
 create mode 100644 arch/powerpc/platforms/powernv/vas-api.c
 delete mode 100644 drivers/crypto/nx/nx-842-powernv.c
 create mode 100644 drivers/crypto/nx/nx-common-powernv.c

-- 
1.8.3.1

[PATCH 2/4] powerpc/powernv/pci: Re-work bus PE configuration

2020-04-17 Thread Oliver O'Halloran

For normal PHBs IODA PEs are handled on a per-bus basis so all the devices
on that bus will share a PE. Which PE specificly is determined by the location
of the MMIO BARs for the devices on the bus so we can't actually configure the
bus PEs until after MMIO resources are allocated. As a result PEs are currently
configured by pcibios_setup_bridge(), which is called just before the bridge
windows are programmed into the bus' parent bridge. Configuring the bus PE here
causes a few problems:

1. The root bus doesn't have a parent bridge so setting up the PE for the root
   bus requires some hacks.

2. The PELT-V isn't setup correctly because pnv_ioda_set_peltv() assumes that
   PEs will be configured in root-to-leaf order. This assumption is broken
   because resource assignment is performed depth-first so the leaf bridges
   are setup before their parents are. The hack mentioned in 1) results in
   the "correct" PELT-V for busses immediately below the root port, but not
   for devices below a switch.

3. It's possible to break the sysfs PCI rescan feature by removing all
   the devices on a bus. When the last device is removed from a PE its
   will be de-configured. Rescanning the devices on a bus does not cause
   the bridge to be reconfigured rendering the devices on that bus
   unusable.

We can address most of these problems by moving the PE setup out of
pcibios_setup_bridge() and into pcibios_bus_add_device(). This fixes 1)
and 2) because pcibios_bus_add_device() is called on each device in
root-to-leaf order so PEs for parent buses will always be configured
before their children. It also fixes 3) by ensuring the PE is
configured before initialising DMA for the device. In the event the PE
was de-configured due to removing all the devices in that PE it will
now be reconfigured when a new device is added since there's no
dependecy on the bridge_setup() hook being called.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 81 ---
 1 file changed, 30 insertions(+), 51 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 934cbee..2ba730c 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -51,6 +51,7 @@ static const char * const pnv_phb_names[] = { "IODA1", 
"IODA2", "NPU_NVLINK",
  "NPU_OCAPI" };
 
 static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
+static void pnv_pci_configure_bus(struct pci_bus *bus);
 
 void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
const char *fmt, ...)
@@ -1120,34 +1121,6 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct 
pci_dev *dev)
return pe;
 }
 
-static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
-{
-   struct pci_dev *dev;
-
-   list_for_each_entry(dev, >devices, bus_list) {
-   struct pci_dn *pdn = pci_get_pdn(dev);
-
-   if (pdn == NULL) {
-   pr_warn("%s: No device node associated with device !\n",
-   pci_name(dev));
-   continue;
-   }
-
-   /*
-* In partial hotplug case, the PCI device might be still
-* associated with the PE and needn't attach it to the PE
-* again.
-*/
-   if (pdn->pe_number != IODA_INVALID_PE)
-   continue;
-
-   pe->device_count++;
-   pdn->pe_number = pe->pe_number;
-   if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
-   pnv_ioda_setup_same_PE(dev->subordinate, pe);
-   }
-}
-
 /*
  * There're 2 types of PCI bus sensitive PEs: One that is compromised of
  * single PCI bus. Another one that contains the primary PCI bus and its
@@ -1168,7 +1141,6 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct 
pci_bus *bus, bool all)
pe_num = phb->ioda.pe_rmap[bus->number << 8];
if (pe_num != IODA_INVALID_PE) {
pe = >ioda.pe_array[pe_num];
-   pnv_ioda_setup_same_PE(bus, pe);
return NULL;
}
 
@@ -1212,9 +1184,6 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct 
pci_bus *bus, bool all)
return NULL;
}
 
-   /* Associate it with all child devices */
-   pnv_ioda_setup_same_PE(bus, pe);
-
/* Put PE to the list */
list_add_tail(>list, >ioda.pe_list);
 
@@ -1772,15 +1741,32 @@ static void pnv_pci_ioda_dma_dev_setup(struct pci_dev 
*pdev)
struct pci_dn *pdn = pci_get_pdn(pdev);
struct pnv_ioda_pe *pe;
 
-   /*
-* The function can be called while the PE#
-* hasn't been assigned. Do nothing for the
-* case.
-*/
-   if (!pdn || pdn->pe_number == IODA_INVALID_PE)
-

Re:[PATCH v5,4/4] drivers: uio: new driver for fsl_85xx_cache_sram

2020-04-17 Thread 王文虎

>A driver for freescale 85xx platforms to access the Cache-Sram form>user 
>level. This is extremely helpful for some user-space applications
>that require high performance memory accesses.
>
>Cc: Greg Kroah-Hartman 
>Cc: Christophe Leroy 
>Cc: Scott Wood 
>Cc: Michael Ellerman 
>Cc: linuxppc-dev@lists.ozlabs.org
>Reviewed-by: Christophe Leroy 
>Signed-off-by: Wang Wenhu 

Hi, Christophe,
I labeled you Reviewed comment.

Regards,
Wenhu

>---
>Changes since v1:
> * Addressed comments from Greg K-H
> * Moved kfree(info->name) into uio_info_free_internal()
>Changes since v2:
> * Addressed comments from Greg, Scott and Christophe
> * Use "uiomem->internal_addr" as if condition for sram memory free,
>   and memset the uiomem entry
> * of_match_table modified to be apart from HW info which belong to
>   the HW level driver fsl_85xx_cache_sram to match
> * Use roundup_pow_of_two for align calc
> * Remove useless clear block of uiomem entries.
> * Use UIO_INFO_VER micro for info->version, and define it as
>   "devicetree,pseudo", meaning this is pseudo device and probed from
>   device tree configuration
>Changes since v3:
> * Addressed comments from Christophe(use devm_xxx memory alloc interfaces)
>Changes since v4:
> * Use module_param_string for of_match_table, no binding to devicetree
>---
> drivers/uio/Kconfig   |   9 ++
> drivers/uio/Makefile  |   1 +
> drivers/uio/uio_fsl_85xx_cache_sram.c | 154 ++
> 3 files changed, 164 insertions(+)
> create mode 100644 drivers/uio/uio_fsl_85xx_cache_sram.c
>
>diff --git a/drivers/uio/Kconfig b/drivers/uio/Kconfig
>index 202ee81cfc2b..9c3b47461b71 100644
>--- a/drivers/uio/Kconfig
>+++ b/drivers/uio/Kconfig
>@@ -105,6 +105,15 @@ config UIO_NETX
> To compile this driver as a module, choose M here; the module
> will be called uio_netx.
> 
>+config UIO_FSL_85XX_CACHE_SRAM
>+  tristate "Freescale 85xx Cache-Sram driver"
>+  depends on FSL_SOC_BOOKE && PPC32
>+  select FSL_85XX_CACHE_SRAM
>+  help
>+Generic driver for accessing the Cache-Sram form user level. This
>+is extremely helpful for some user-space applications that require
>+high performance memory accesses.
>+
> config UIO_FSL_ELBC_GPCM
>   tristate "eLBC/GPCM driver"
>   depends on FSL_LBC
>diff --git a/drivers/uio/Makefile b/drivers/uio/Makefile
>index c285dd2a4539..be2056cffc21 100644
>--- a/drivers/uio/Makefile
>+++ b/drivers/uio/Makefile
>@@ -10,4 +10,5 @@ obj-$(CONFIG_UIO_NETX)   += uio_netx.o
> obj-$(CONFIG_UIO_PRUSS) += uio_pruss.o
> obj-$(CONFIG_UIO_MF624) += uio_mf624.o
> obj-$(CONFIG_UIO_FSL_ELBC_GPCM)   += uio_fsl_elbc_gpcm.o
>+obj-$(CONFIG_UIO_FSL_85XX_CACHE_SRAM) += uio_fsl_85xx_cache_sram.o
> obj-$(CONFIG_UIO_HV_GENERIC)  += uio_hv_generic.o
>diff --git a/drivers/uio/uio_fsl_85xx_cache_sram.c 
>b/drivers/uio/uio_fsl_85xx_cache_sram.c
>new file mode 100644
>index ..4db3648629b3
>--- /dev/null
>+++ b/drivers/uio/uio_fsl_85xx_cache_sram.c
>@@ -0,0 +1,154 @@
>+// SPDX-License-Identifier: GPL-2.0
>+/*
>+ * Copyright (C) 2020 Vivo Communication Technology Co. Ltd.
>+ * Copyright (C) 2020 Wang Wenhu 
>+ * All rights reserved.
>+ */
>+
>+#include 
>+#include 
>+#include 
>+#include 
>+#include 
>+#include 
>+
>+#define DRIVER_NAME   "uio_fsl_85xx_cache_sram"
>+#define UIO_INFO_VER  "devicetree,pseudo"
>+#define UIO_NAME  "uio_cache_sram"
>+
>+static void uio_info_free_internal(struct uio_info *info)
>+{
>+  int i;
>+
>+  for (i = 0; i < MAX_UIO_MAPS; i++) {
>+  struct uio_mem *uiomem = >mem[i];
>+
>+  if (uiomem->internal_addr) {
>+  mpc85xx_cache_sram_free(uiomem->internal_addr);
>+  memset(uiomem, 0, sizeof(*uiomem));
>+  }
>+  }
>+}
>+
>+static int uio_fsl_85xx_cache_sram_probe(struct platform_device *pdev)
>+{
>+  struct device_node *parent = pdev->dev.of_node;
>+  struct device_node *node = NULL;
>+  struct uio_info *info;
>+  struct uio_mem *uiomem;
>+  const char *dt_name;
>+  u32 mem_size;
>+  int ret;
>+
>+  /* alloc uio_info for one device */
>+  info = devm_kzalloc(>dev, sizeof(*info), GFP_KERNEL);
>+  if (!info)
>+  return -ENOMEM;
>+
>+  /* get optional uio name */
>+  if (of_property_read_string(parent, "uio_name", _name))
>+  dt_name = UIO_NAME;
>+
>+  info->name = devm_kstrdup(>dev, dt_name, GFP_KERNEL);
>+  if (!info->name)
>+  return -ENOMEM;
>+
>+  uiomem = info->mem;
>+  for_each_child_of_node(parent, node) {
>+  void *virt;
>+  phys_addr_t phys;
>+
>+  ret = of_property_read_u32(node, "cache-mem-size", _size);
>+  if (ret) {
>+  ret = -EINVAL;
>+  goto err_out;
>+  }
>+
>+  if (mem_size == 0) {
>+  dev_err(>dev,

Re:Re: [PATCH v4,4/4] drivers: uio: new driver for fsl_85xx_cache_sram

2020-04-17 Thread 王文虎


>> > > On Thu, 2020-04-16 at 08:35 -0700, Wang Wenhu wrote:
>> > > > +#define UIO_INFO_VER  "devicetree,pseudo"
>> > > 
>> > > What does this mean?  Changing a number into a non-obvious string (Why
>> > > "pseudo"?  Why does the UIO user care that the config came from the
>> > > device
>> > > tree?) just to avoid setting off Greg's version number autoresponse
>> > > isn't
>> > > really helping anything.
As I mentioned before, this is not currently meaningful for us, and maybe the 
better
way is to set it optionally for uio, but it belongs to uio core, which is a 
framework for
different kind of drivers or devices, but not only for us. So I guess this is 
not a thing
troubles and arguing about this is really helpless.

>> > > 
>> > > > +static const struct of_device_id uio_mpc85xx_l2ctlr_of_match[] = {
>> > > > +  {   .compatible = "uio,mpc85xx-cache-sram", },
>> > 
>> > Form is , and "uio" is not a vendor (and never will be).
>> > 
>> 
>> Should have been something like "fsl,mpc85xx-cache-sram-uio", and if it is
>> to be defined with module parameters, this would be user defined.
>> Anyway, , should always be used.
>> 
>> > > > +  {},
>> > > > +};
>> > > > +
>> > > > +static struct platform_driver uio_fsl_85xx_cache_sram = {
>> > > > +  .probe = uio_fsl_85xx_cache_sram_probe,
>> > > > +  .remove = uio_fsl_85xx_cache_sram_remove,
>> > > > +  .driver = {
>> > > > +  .name = DRIVER_NAME,
>> > > > +  .owner = THIS_MODULE,
>> > > > +  .of_match_table = uio_mpc85xx_l2ctlr_of_match,
>> > > > +  },
>> > > > +};
>> > > 
>> > > Greg's comment notwithstanding, I really don't think this belongs in the
>> > > device tree (and if I do get overruled on that point, it at least needs
>> > > a
>> > > binding document).  Let me try to come up with a patch for dynamic
>> > > allocation.
>> > 
>> > Agreed. "UIO" bindings have long been rejected.
>> > 
>> 
>> Sounds it is. And does the modification below fit well?
>> ---
>> -static const struct of_device_id uio_mpc85xx_l2ctlr_of_match[] = {
>> -   {   .compatible = "uio,mpc85xx-cache-sram", },
>> -   {},
>> +#ifdef CONFIG_OF
>> +static struct of_device_id uio_fsl_85xx_cache_sram_of_match[] = {
>> +   { /* This is filled with module_parm */ },
>> +   { /* Sentinel */ },
>>  };
>> +MODULE_DEVICE_TABLE(of, uio_fsl_85xx_cache_sram_of_match);
>> +module_param_string(of_id, uio_fsl_85xx_cache_sram_of_match[0].compatible,
>> +   sizeof(uio_fsl_85xx_cache_sram_of_match[0].compa
>> tible), 0);
>> +MODULE_PARM_DESC(of_id, "platform device id to be handled by cache-sram-
>> uio");
>> +#endif
>
>No.  The point is that you wouldn't be configuring this with the device tree
>at all.
>
It was to fit for the unbinding requriement. And I mentioned what if I want to 
create
more than one device and each owns multiple uiomaps?
Devicetree is definitely better choice to solve the problem and make the driver 
more
convenient for users to use. Pseudo device is a device and a device. Or else 
device
tree should be hardware-only-devicetree.

The point is why we left the better choice and write a plenty of redundant codes
to parse the module parameters?

Further more, I don't think there is enough reason for the lower driver mpc85xx 
cache-sram
to restrict other from using it in a way of module param or dynamic allocation 
with ioctl or so.

UIO is there and all these parts cooperate well to make the cache-sram more 
useful and better
resolve user requirement.

I will update the patch with the diff applied in v5.

Thanks,
Wenhu

[PATCH v5,4/4] drivers: uio: new driver for fsl_85xx_cache_sram

2020-04-17 Thread Wang Wenhu

A driver for freescale 85xx platforms to access the Cache-Sram form
user level. This is extremely helpful for some user-space applications
that require high performance memory accesses.

Cc: Greg Kroah-Hartman 
Cc: Christophe Leroy 
Cc: Scott Wood 
Cc: Michael Ellerman 
Cc: linuxppc-dev@lists.ozlabs.org
Reviewed-by: Christophe Leroy 
Signed-off-by: Wang Wenhu 
---
Changes since v1:
 * Addressed comments from Greg K-H
 * Moved kfree(info->name) into uio_info_free_internal()
Changes since v2:
 * Addressed comments from Greg, Scott and Christophe
 * Use "uiomem->internal_addr" as if condition for sram memory free,
   and memset the uiomem entry
 * of_match_table modified to be apart from HW info which belong to
   the HW level driver fsl_85xx_cache_sram to match
 * Use roundup_pow_of_two for align calc
 * Remove useless clear block of uiomem entries.
 * Use UIO_INFO_VER micro for info->version, and define it as
   "devicetree,pseudo", meaning this is pseudo device and probed from
   device tree configuration
Changes since v3:
 * Addressed comments from Christophe(use devm_xxx memory alloc interfaces)
Changes since v4:
 * Use module_param_string for of_match_table, no binding to devicetree
---
 drivers/uio/Kconfig   |   9 ++
 drivers/uio/Makefile  |   1 +
 drivers/uio/uio_fsl_85xx_cache_sram.c | 154 ++
 3 files changed, 164 insertions(+)
 create mode 100644 drivers/uio/uio_fsl_85xx_cache_sram.c

diff --git a/drivers/uio/Kconfig b/drivers/uio/Kconfig
index 202ee81cfc2b..9c3b47461b71 100644
--- a/drivers/uio/Kconfig
+++ b/drivers/uio/Kconfig
@@ -105,6 +105,15 @@ config UIO_NETX
  To compile this driver as a module, choose M here; the module
  will be called uio_netx.
 
+config UIO_FSL_85XX_CACHE_SRAM
+   tristate "Freescale 85xx Cache-Sram driver"
+   depends on FSL_SOC_BOOKE && PPC32
+   select FSL_85XX_CACHE_SRAM
+   help
+ Generic driver for accessing the Cache-Sram form user level. This
+ is extremely helpful for some user-space applications that require
+ high performance memory accesses.
+
 config UIO_FSL_ELBC_GPCM
tristate "eLBC/GPCM driver"
depends on FSL_LBC
diff --git a/drivers/uio/Makefile b/drivers/uio/Makefile
index c285dd2a4539..be2056cffc21 100644
--- a/drivers/uio/Makefile
+++ b/drivers/uio/Makefile
@@ -10,4 +10,5 @@ obj-$(CONFIG_UIO_NETX)+= uio_netx.o
 obj-$(CONFIG_UIO_PRUSS) += uio_pruss.o
 obj-$(CONFIG_UIO_MF624) += uio_mf624.o
 obj-$(CONFIG_UIO_FSL_ELBC_GPCM)+= uio_fsl_elbc_gpcm.o
+obj-$(CONFIG_UIO_FSL_85XX_CACHE_SRAM)  += uio_fsl_85xx_cache_sram.o
 obj-$(CONFIG_UIO_HV_GENERIC)   += uio_hv_generic.o
diff --git a/drivers/uio/uio_fsl_85xx_cache_sram.c 
b/drivers/uio/uio_fsl_85xx_cache_sram.c
new file mode 100644
index ..4db3648629b3
--- /dev/null
+++ b/drivers/uio/uio_fsl_85xx_cache_sram.c
@@ -0,0 +1,154 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2020 Vivo Communication Technology Co. Ltd.
+ * Copyright (C) 2020 Wang Wenhu 
+ * All rights reserved.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define DRIVER_NAME"uio_fsl_85xx_cache_sram"
+#define UIO_INFO_VER   "devicetree,pseudo"
+#define UIO_NAME   "uio_cache_sram"
+
+static void uio_info_free_internal(struct uio_info *info)
+{
+   int i;
+
+   for (i = 0; i < MAX_UIO_MAPS; i++) {
+   struct uio_mem *uiomem = >mem[i];
+
+   if (uiomem->internal_addr) {
+   mpc85xx_cache_sram_free(uiomem->internal_addr);
+   memset(uiomem, 0, sizeof(*uiomem));
+   }
+   }
+}
+
+static int uio_fsl_85xx_cache_sram_probe(struct platform_device *pdev)
+{
+   struct device_node *parent = pdev->dev.of_node;
+   struct device_node *node = NULL;
+   struct uio_info *info;
+   struct uio_mem *uiomem;
+   const char *dt_name;
+   u32 mem_size;
+   int ret;
+
+   /* alloc uio_info for one device */
+   info = devm_kzalloc(>dev, sizeof(*info), GFP_KERNEL);
+   if (!info)
+   return -ENOMEM;
+
+   /* get optional uio name */
+   if (of_property_read_string(parent, "uio_name", _name))
+   dt_name = UIO_NAME;
+
+   info->name = devm_kstrdup(>dev, dt_name, GFP_KERNEL);
+   if (!info->name)
+   return -ENOMEM;
+
+   uiomem = info->mem;
+   for_each_child_of_node(parent, node) {
+   void *virt;
+   phys_addr_t phys;
+
+   ret = of_property_read_u32(node, "cache-mem-size", _size);
+   if (ret) {
+   ret = -EINVAL;
+   goto err_out;
+   }
+
+   if (mem_size == 0) {
+   dev_err(>dev, "error cache-mem-size should not be 
0\n");
+   ret = -EINVAL;
+   goto err_out;
+   }
+
+

[PATCH 1/4] powerpc/powernv/pci: Add helper to find ioda_pe from BDFN

2020-04-17 Thread Oliver O'Halloran

For each PHB we maintain a reverse-map that can be used to find the
PE that a BDFN is currently mapped to. Add a helper for doing this
lookup so we can check if a PE has been configured without looking
at pdn->pe_number.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 10 ++
 arch/powerpc/platforms/powernv/pci.h  |  1 +
 2 files changed, 11 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 8ae8836..934cbee 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -660,6 +660,16 @@ static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int 
pe_no)
return state;
 }
 
+struct pnv_ioda_pe *pnv_pci_bdfn_to_pe(struct pnv_phb *phb, u16 bdfn)
+{
+   int pe_number = phb->ioda.pe_rmap[bdfn];
+
+   if (pe_number == IODA_INVALID_PE)
+   return NULL;
+
+   return >ioda.pe_array[pe_number];
+}
+
 struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev)
 {
struct pci_controller *hose = pci_bus_to_host(dev->bus);
diff --git a/arch/powerpc/platforms/powernv/pci.h 
b/arch/powerpc/platforms/powernv/pci.h
index fc05f9b..83d40a0 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -208,6 +208,7 @@ extern int pnv_eeh_phb_reset(struct pci_controller *hose, 
int option);
 
 extern int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type);
 extern void pnv_teardown_msi_irqs(struct pci_dev *pdev);
+extern struct pnv_ioda_pe *pnv_pci_bdfn_to_pe(struct pnv_phb *phb, u16 bdfn);
 extern struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev);
 extern void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq);
 extern unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
-- 
2.9.5

[PATCH 3/4] powerpc/powernv/pci: Reserve the root bus PE during init

2020-04-17 Thread Oliver O'Halloran

Doing it once during boot rather than doing it on the fly and drop the janky
populated logic.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 26 +-
 arch/powerpc/platforms/powernv/pci.h  |  1 -
 2 files changed, 9 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 2ba730c..05436a9 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1145,8 +1145,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct 
pci_bus *bus, bool all)
}
 
/* PE number for root bus should have been reserved */
-   if (pci_is_root_bus(bus) &&
-   phb->ioda.root_pe_idx != IODA_INVALID_PE)
+   if (pci_is_root_bus(bus))
pe = >ioda.pe_array[phb->ioda.root_pe_idx];
 
/* Check if PE is determined by M64 */
@@ -3224,15 +3223,6 @@ static void pnv_pci_configure_bus(struct pci_bus *bus)
 
dev_info(>dev, "Configuring PE for bus\n");
 
-   /* The PE for root bus should be realized before any one else */
-   if (!phb->ioda.root_pe_populated) {
-   pe = pnv_ioda_setup_bus_PE(phb->hose->bus, false);
-   if (pe) {
-   phb->ioda.root_pe_idx = pe->pe_number;
-   phb->ioda.root_pe_populated = true;
-   }
-   }
-
/* Don't assign PE to PCI bus, which doesn't have subordinate devices */
if (list_empty(>devices))
return;
@@ -3517,11 +3507,10 @@ static void pnv_ioda_release_pe(struct pnv_ioda_pe *pe)
 * that it can be populated again in PCI hot add path. The PE
 * shouldn't be destroyed as it's the global reserved resource.
 */
-   if (phb->ioda.root_pe_populated &&
-   phb->ioda.root_pe_idx == pe->pe_number)
-   phb->ioda.root_pe_populated = false;
-   else
-   pnv_ioda_free_pe(pe);
+   if (phb->ioda.root_pe_idx == pe->pe_number)
+   return;
+
+   pnv_ioda_free_pe(pe);
 }
 
 static void pnv_pci_release_device(struct pci_dev *pdev)
@@ -3629,6 +3618,7 @@ static void __init pnv_pci_init_ioda_phb(struct 
device_node *np,
struct pnv_phb *phb;
unsigned long size, m64map_off, m32map_off, pemap_off;
unsigned long iomap_off = 0, dma32map_off = 0;
+   struct pnv_ioda_pe *root_pe;
struct resource r;
const __be64 *prop64;
const __be32 *prop32;
@@ -3796,7 +3786,9 @@ static void __init pnv_pci_init_ioda_phb(struct 
device_node *np,
phb->ioda.root_pe_idx = phb->ioda.reserved_pe_idx - 1;
pnv_ioda_reserve_pe(phb, phb->ioda.root_pe_idx);
} else {
-   phb->ioda.root_pe_idx = IODA_INVALID_PE;
+   /* otherwise just allocate one */
+   root_pe = pnv_ioda_alloc_pe(phb);
+   phb->ioda.root_pe_idx = root_pe->pe_number;
}
 
INIT_LIST_HEAD(>ioda.pe_list);
diff --git a/arch/powerpc/platforms/powernv/pci.h 
b/arch/powerpc/platforms/powernv/pci.h
index 83d40a0..51c254f2 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -136,7 +136,6 @@ struct pnv_phb {
unsigned inttotal_pe_num;
unsigned intreserved_pe_idx;
unsigned introot_pe_idx;
-   boolroot_pe_populated;
 
/* 32-bit MMIO window */
unsigned intm32_size;
-- 
2.9.5

Re: [PATCH V2] vhost: do not enable VHOST_MENU by default

2020-04-17 Thread Jason Wang




On 2020/4/17 下午2:33, Michael S. Tsirkin wrote:

On Fri, Apr 17, 2020 at 11:12:14AM +0800, Jason Wang wrote:

On 2020/4/17 上午6:55, Michael S. Tsirkin wrote:

On Wed, Apr 15, 2020 at 10:43:56AM +0800, Jason Wang wrote:

We try to keep the defconfig untouched after decoupling CONFIG_VHOST
out of CONFIG_VIRTUALIZATION in commit 20c384f1ea1a
("vhost: refine vhost and vringh kconfig") by enabling VHOST_MENU by
default. Then the defconfigs can keep enabling CONFIG_VHOST_NET
without the caring of CONFIG_VHOST.

But this will leave a "CONFIG_VHOST_MENU=y" in all defconfigs and even
for the ones that doesn't want vhost. So it actually shifts the
burdens to the maintainers of all other to add "CONFIG_VHOST_MENU is
not set". So this patch tries to enable CONFIG_VHOST explicitly in
defconfigs that enables CONFIG_VHOST_NET and CONFIG_VHOST_VSOCK.

Acked-by: Christian Borntraeger  (s390)
Acked-by: Michael Ellerman  (powerpc)
Cc: Thomas Bogendoerfer
Cc: Benjamin Herrenschmidt
Cc: Paul Mackerras
Cc: Michael Ellerman
Cc: Heiko Carstens
Cc: Vasily Gorbik
Cc: Christian Borntraeger
Reported-by: Geert Uytterhoeven
Signed-off-by: Jason Wang

I rebased this on top of OABI fix since that
seems more orgent to fix.
Pushed to my vhost branch pls take a look and
if possible test.
Thanks!


I test this patch by generating the defconfigs that wants vhost_net or
vhost_vsock. All looks fine.

But having CONFIG_VHOST_DPN=y may end up with the similar situation that
this patch want to address.
Maybe we can let CONFIG_VHOST depends on !ARM || AEABI then add another
menuconfig for VHOST_RING and do something similar?

Thanks

Sorry I don't understand. After this patch CONFIG_VHOST_DPN is just
an internal variable for the OABI fix. I kept it separate
so it's easy to revert for 5.8. Yes we could squash it into
VHOST directly but I don't see how that changes logic at all.



Sorry for being unclear.

I meant since it was enabled by default, "CONFIG_VHOST_DPN=y" will be 
left in the defconfigs. This requires the arch maintainers to add 
"CONFIG_VHOST_VDPN is not set". (Geert complains about this)


Thanks

Re: [PATCH V2] vhost: do not enable VHOST_MENU by default

2020-04-17 Thread Jason Wang




On 2020/4/17 下午4:46, Michael S. Tsirkin wrote:

On Fri, Apr 17, 2020 at 04:39:49PM +0800, Jason Wang wrote:

On 2020/4/17 下午4:29, Michael S. Tsirkin wrote:

On Fri, Apr 17, 2020 at 03:36:52PM +0800, Jason Wang wrote:

On 2020/4/17 下午2:33, Michael S. Tsirkin wrote:

On Fri, Apr 17, 2020 at 11:12:14AM +0800, Jason Wang wrote:

On 2020/4/17 上午6:55, Michael S. Tsirkin wrote:

On Wed, Apr 15, 2020 at 10:43:56AM +0800, Jason Wang wrote:

We try to keep the defconfig untouched after decoupling CONFIG_VHOST
out of CONFIG_VIRTUALIZATION in commit 20c384f1ea1a
("vhost: refine vhost and vringh kconfig") by enabling VHOST_MENU by
default. Then the defconfigs can keep enabling CONFIG_VHOST_NET
without the caring of CONFIG_VHOST.

But this will leave a "CONFIG_VHOST_MENU=y" in all defconfigs and even
for the ones that doesn't want vhost. So it actually shifts the
burdens to the maintainers of all other to add "CONFIG_VHOST_MENU is
not set". So this patch tries to enable CONFIG_VHOST explicitly in
defconfigs that enables CONFIG_VHOST_NET and CONFIG_VHOST_VSOCK.

Acked-by: Christian Borntraeger   (s390)
Acked-by: Michael Ellerman   (powerpc)
Cc: Thomas Bogendoerfer
Cc: Benjamin Herrenschmidt
Cc: Paul Mackerras
Cc: Michael Ellerman
Cc: Heiko Carstens
Cc: Vasily Gorbik
Cc: Christian Borntraeger
Reported-by: Geert Uytterhoeven
Signed-off-by: Jason Wang

I rebased this on top of OABI fix since that
seems more orgent to fix.
Pushed to my vhost branch pls take a look and
if possible test.
Thanks!

I test this patch by generating the defconfigs that wants vhost_net or
vhost_vsock. All looks fine.

But having CONFIG_VHOST_DPN=y may end up with the similar situation that
this patch want to address.
Maybe we can let CONFIG_VHOST depends on !ARM || AEABI then add another
menuconfig for VHOST_RING and do something similar?

Thanks

Sorry I don't understand. After this patch CONFIG_VHOST_DPN is just
an internal variable for the OABI fix. I kept it separate
so it's easy to revert for 5.8. Yes we could squash it into
VHOST directly but I don't see how that changes logic at all.

Sorry for being unclear.

I meant since it was enabled by default, "CONFIG_VHOST_DPN=y" will be left
in the defconfigs.

But who cares?

FYI, please seehttps://www.spinics.net/lists/kvm/msg212685.html

The complaint was not about the symbol IIUC.  It was that we caused
everyone to build vhost unless they manually disabled it.



There could be some misunderstanding here. I thought it's somehow 
similar: a CONFIG_VHOST_MENU=y will be left in the defconfigs even if 
CONFIG_VHOST is not set.


Thanks

Re: POWER9 crash due to STRICT_KERNEL_RWX (WAS: Re: Linux-next POWER9 NULL pointer NIP...)

2020-04-17 Thread Naveen N. Rao


Hi Qian,

Qian Cai wrote:

OK, reverted the commit,

c55d7b5e6426 (“powerpc: Remove STRICT_KERNEL_RWX incompatibility with 
RELOCATABLE”)

or set STRICT_KERNEL_RWX=n fixed the crash below and also mentioned in this 
thread,

https://lore.kernel.org/lkml/15ac5b0e-a221-4b8c-9039-fa96b8ef7...@lca.pw/


Do you see any errors logged in dmesg when you see the crash?  
STRICT_KERNEL_RWX changes how patch_instruction() works, so it would be 
interesting to see if there are any ftrace-related errors thrown before 
the crash.



- Naveen

Fix sysfs pci bus rescan on PowerNV (and other things)

2020-04-17 Thread Oliver O'Halloran

This series is based on top of my previously posted series which reworks
how devices are added to their IOMMU groups. The two series are largely
orthogonal to each other, but they both touch pnv_pci_ioda_dma_dev_setup()
so there's a minor merge conflict if they aren't applied together. I can
fix that if people think it's important, but applying them together is
probably easisest for everyone.

Base series: 
https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=168715

With that out of the way, what the bulk of the changes in here are in 2/4
which moves the point where we do the HW configuration to allow a bus to be
used. Currently it's done when we setup the parent bridge for that bus and
we're moving it to be done when we add the first device to that bus.

For an example of why this change is necssary this is what happens on the
current linux-next master. This has one extra patch applied to print an
error when pci_enable_device() is blocked by the platform since it helps
highlight the issue:

/sys/devices/pci0022:00/0022:00:00.0 # echo 1 > 0022\:01\:00.0/remove
e1000e 0022:01:00.0 enP34p1s0: removed PHC
e1000e 0022:01:00.0 enP34p1s0: NIC Link is Down
pci 0022:01:00.0: Removing from iommu group 11

At this point the bus 0022:01 is empty.

/sys/devices/pci0022:00/0022:00:00.0 # echo 1 > rescan
pci 0022:01:00.0: [8086:10d3] type 00 class 0x02
pci 0022:01:00.0: reg 0x10: [mem 0x3fe9000c-0x3fe9000d]
pci 0022:01:00.0: reg 0x14: [mem 0x3fe9-0x3fe90007]
pci 0022:01:00.0: reg 0x18: [io  0x-0x001f]
pci 0022:01:00.0: reg 0x1c: [mem 0x3fe9000e-0x3fe9000e3fff]
pci 0022:01:00.0: reg 0x30: [mem 0x-0x0003 pref]
pci 0022:01:00.0: BAR3 [mem size 0x4000]: requesting alignment to 0x1
pci 0022:01:00.0: PME# supported from D0 D3hot D3cold
pci 0022:00:00.0: BAR 13: no space for [io  size 0x1000]
pci 0022:00:00.0: BAR 13: failed to assign [io  size 0x1000]
pci 0022:01:00.0: BAR 1: assigned [mem 0x3fe9-0x3fe90007]
pci 0022:01:00.0: BAR 6: assigned [mem 0x3fe90008-0x3fe9000b pref]
pci 0022:01:00.0: BAR 0: assigned [mem 0x3fe9000c-0x3fe9000d]
pci 0022:01:00.0: BAR 3: assigned [mem 0x3fe9000e-0x3fe9000e3fff]
pci 0022:01:00.0: BAR 2: no space for [io  size 0x0020]
pci 0022:01:00.0: BAR 2: failed to assign [io  size 0x0020]
e1000e 0022:01:00.0: pci_enable_device() blocked, no PE assigned.
e1000e: probe of 0022:01:00.0 failed with error -22

So on rescan we can re-discover the device, but the driver probe will
always fail at the point where the driver attemps to enable the device
because the PE was deconfigured.

Repeating this same experiment with this series (and dependency) applied:

/sys/devices/pci0022:00/0022:00:00.0 # echo 1 > rescan
pci 0022:01:00.0: [8086:10d3] type 00 class 0x02
pci 0022:01:00.0: reg 0x10: [mem 0x3fe9000c-0x3fe9000d]
pci 0022:01:00.0: reg 0x14: [mem 0x3fe9-0x3fe90007]
pci 0022:01:00.0: reg 0x18: [io  0x-0x001f]
pci 0022:01:00.0: reg 0x1c: [mem 0x3fe9000e-0x3fe9000e3fff]
pci 0022:01:00.0: reg 0x30: [mem 0x-0x0003 pref]
pci 0022:01:00.0: BAR3 [mem size 0x4000]: requesting alignment to 0x1
pci 0022:01:00.0: PME# supported from D0 D3hot D3cold
pci 0022:00:00.0: BAR 13: no space for [io  size 0x1000]
pci 0022:00:00.0: BAR 13: failed to assign [io  size 0x1000]
pci 0022:01:00.0: BAR 1: assigned [mem 0x3fe9-0x3fe90007]
pci 0022:01:00.0: BAR 6: assigned [mem 0x3fe90008-0x3fe9000b pref]
pci 0022:01:00.0: BAR 0: assigned [mem 0x3fe9000c-0x3fe9000d]
pci 0022:01:00.0: BAR 3: assigned [mem 0x3fe9000e-0x3fe9000e3fff]
pci 0022:01:00.0: BAR 2: no space for [io  size 0x0020]
pci 0022:01:00.0: BAR 2: failed to assign [io  size 0x0020]
pci_bus 0022:01: Configuring PE for bus
pci 0022:01 : [PE# fd] Secondary bus 0x0001 associated with 
PE#fd
pci 0022:01 : [PE# fd] Setting up 32-bit TCE table at 0..8000
pci 0022:01 : [PE# fd] Setting up window#0 0..7f pg=1
pci 0022:01 : [PE# fd] Enabling 64-bit DMA bypass
pci 0022:01:00.0: Configured PE#fd
pci 0022:01:00.0: Adding to iommu group 12
e1000e 0022:01:00.0: enabling device (0140 -> 0142)
e1000e 0022:01:00.0: Interrupt Throttling Rate (ints/sec) set to dynamic 
conservative mode
e1000e 0022:01:00.0 0022:01:00.0 (uninitialized): registered PHC clock
e1000e 0022:01:00.0 eth0: (PCI Express:2.5GT/s:Width x1) 68:05:ca:37:9c:d7
e1000e 0022:01:00.0 enP34p1s0: renamed from eth0
e1000e 0022:01:00.0 enP34p1s0: Intel(R) PRO/1000 Network Connection
e1000e 0022:01:00.0 enP34p1s0: MAC: 3, PHY: 8, PBA No: E46981-008
/sys/devices/pci0022:00/0022:00:00.0 #

Now, when the rescan happens we notice the PE was deconfigured after removing
the device and re-configure it. This allows the device to be enabled and
everything works. Probably.

Making this change also lays the groundwork for allowing devices to be
added to a bus PE as they're enabled rather than mapping all 256 devfns
on a bus

Re: [PATCH 1/4] dma-mapping: move the remaining DMA API calls out of line

2020-04-17 Thread Christoph Hellwig

On Wed, Apr 15, 2020 at 09:21:37PM +1000, Alexey Kardashevskiy wrote:
> And the fact they were exported leaves possibility that there is a
> driver somewhere relying on these symbols or distro kernel won't build
> because the symbol disappeared from exports (I do not know what KABI
> guarantees or if mainline kernel cares).

We absolutely do not care.  In fact for abuses of APIs that drivers
should not use we almost care to make them private and break people
abusing them.

> I do not care in particular but
> some might, a line separated with empty lines in the commit log would do.

I'll add a blurb for the next version.

Re: [PATCH V2] vhost: do not enable VHOST_MENU by default

2020-04-17 Thread Michael S. Tsirkin

On Fri, Apr 17, 2020 at 03:36:52PM +0800, Jason Wang wrote:
> 
> On 2020/4/17 下午2:33, Michael S. Tsirkin wrote:
> > On Fri, Apr 17, 2020 at 11:12:14AM +0800, Jason Wang wrote:
> > > On 2020/4/17 上午6:55, Michael S. Tsirkin wrote:
> > > > On Wed, Apr 15, 2020 at 10:43:56AM +0800, Jason Wang wrote:
> > > > > We try to keep the defconfig untouched after decoupling CONFIG_VHOST
> > > > > out of CONFIG_VIRTUALIZATION in commit 20c384f1ea1a
> > > > > ("vhost: refine vhost and vringh kconfig") by enabling VHOST_MENU by
> > > > > default. Then the defconfigs can keep enabling CONFIG_VHOST_NET
> > > > > without the caring of CONFIG_VHOST.
> > > > > 
> > > > > But this will leave a "CONFIG_VHOST_MENU=y" in all defconfigs and even
> > > > > for the ones that doesn't want vhost. So it actually shifts the
> > > > > burdens to the maintainers of all other to add "CONFIG_VHOST_MENU is
> > > > > not set". So this patch tries to enable CONFIG_VHOST explicitly in
> > > > > defconfigs that enables CONFIG_VHOST_NET and CONFIG_VHOST_VSOCK.
> > > > > 
> > > > > Acked-by: Christian Borntraeger  (s390)
> > > > > Acked-by: Michael Ellerman  (powerpc)
> > > > > Cc: Thomas Bogendoerfer
> > > > > Cc: Benjamin Herrenschmidt
> > > > > Cc: Paul Mackerras
> > > > > Cc: Michael Ellerman
> > > > > Cc: Heiko Carstens
> > > > > Cc: Vasily Gorbik
> > > > > Cc: Christian Borntraeger
> > > > > Reported-by: Geert Uytterhoeven
> > > > > Signed-off-by: Jason Wang
> > > > I rebased this on top of OABI fix since that
> > > > seems more orgent to fix.
> > > > Pushed to my vhost branch pls take a look and
> > > > if possible test.
> > > > Thanks!
> > > 
> > > I test this patch by generating the defconfigs that wants vhost_net or
> > > vhost_vsock. All looks fine.
> > > 
> > > But having CONFIG_VHOST_DPN=y may end up with the similar situation that
> > > this patch want to address.
> > > Maybe we can let CONFIG_VHOST depends on !ARM || AEABI then add another
> > > menuconfig for VHOST_RING and do something similar?
> > > 
> > > Thanks
> > Sorry I don't understand. After this patch CONFIG_VHOST_DPN is just
> > an internal variable for the OABI fix. I kept it separate
> > so it's easy to revert for 5.8. Yes we could squash it into
> > VHOST directly but I don't see how that changes logic at all.
> 
> 
> Sorry for being unclear.
> 
> I meant since it was enabled by default, "CONFIG_VHOST_DPN=y" will be left
> in the defconfigs.

But who cares? That does not add any code, does it?

> This requires the arch maintainers to add
> "CONFIG_VHOST_VDPN is not set". (Geert complains about this)
> 
> Thanks
> 
> 
> >

Re: [PATCH V2] vhost: do not enable VHOST_MENU by default

2020-04-17 Thread Michael S. Tsirkin

On Fri, Apr 17, 2020 at 04:51:19PM +0800, Jason Wang wrote:
> 
> On 2020/4/17 下午4:46, Michael S. Tsirkin wrote:
> > On Fri, Apr 17, 2020 at 04:39:49PM +0800, Jason Wang wrote:
> > > On 2020/4/17 下午4:29, Michael S. Tsirkin wrote:
> > > > On Fri, Apr 17, 2020 at 03:36:52PM +0800, Jason Wang wrote:
> > > > > On 2020/4/17 下午2:33, Michael S. Tsirkin wrote:
> > > > > > On Fri, Apr 17, 2020 at 11:12:14AM +0800, Jason Wang wrote:
> > > > > > > On 2020/4/17 上午6:55, Michael S. Tsirkin wrote:
> > > > > > > > On Wed, Apr 15, 2020 at 10:43:56AM +0800, Jason Wang wrote:
> > > > > > > > > We try to keep the defconfig untouched after decoupling 
> > > > > > > > > CONFIG_VHOST
> > > > > > > > > out of CONFIG_VIRTUALIZATION in commit 20c384f1ea1a
> > > > > > > > > ("vhost: refine vhost and vringh kconfig") by enabling 
> > > > > > > > > VHOST_MENU by
> > > > > > > > > default. Then the defconfigs can keep enabling 
> > > > > > > > > CONFIG_VHOST_NET
> > > > > > > > > without the caring of CONFIG_VHOST.
> > > > > > > > > 
> > > > > > > > > But this will leave a "CONFIG_VHOST_MENU=y" in all defconfigs 
> > > > > > > > > and even
> > > > > > > > > for the ones that doesn't want vhost. So it actually shifts 
> > > > > > > > > the
> > > > > > > > > burdens to the maintainers of all other to add 
> > > > > > > > > "CONFIG_VHOST_MENU is
> > > > > > > > > not set". So this patch tries to enable CONFIG_VHOST 
> > > > > > > > > explicitly in
> > > > > > > > > defconfigs that enables CONFIG_VHOST_NET and 
> > > > > > > > > CONFIG_VHOST_VSOCK.
> > > > > > > > > 
> > > > > > > > > Acked-by: Christian Borntraeger   
> > > > > > > > > (s390)
> > > > > > > > > Acked-by: Michael Ellerman   (powerpc)
> > > > > > > > > Cc: Thomas Bogendoerfer
> > > > > > > > > Cc: Benjamin Herrenschmidt
> > > > > > > > > Cc: Paul Mackerras
> > > > > > > > > Cc: Michael Ellerman
> > > > > > > > > Cc: Heiko Carstens
> > > > > > > > > Cc: Vasily Gorbik
> > > > > > > > > Cc: Christian Borntraeger
> > > > > > > > > Reported-by: Geert Uytterhoeven
> > > > > > > > > Signed-off-by: Jason Wang
> > > > > > > > I rebased this on top of OABI fix since that
> > > > > > > > seems more orgent to fix.
> > > > > > > > Pushed to my vhost branch pls take a look and
> > > > > > > > if possible test.
> > > > > > > > Thanks!
> > > > > > > I test this patch by generating the defconfigs that wants 
> > > > > > > vhost_net or
> > > > > > > vhost_vsock. All looks fine.
> > > > > > > 
> > > > > > > But having CONFIG_VHOST_DPN=y may end up with the similar 
> > > > > > > situation that
> > > > > > > this patch want to address.
> > > > > > > Maybe we can let CONFIG_VHOST depends on !ARM || AEABI then add 
> > > > > > > another
> > > > > > > menuconfig for VHOST_RING and do something similar?
> > > > > > > 
> > > > > > > Thanks
> > > > > > Sorry I don't understand. After this patch CONFIG_VHOST_DPN is just
> > > > > > an internal variable for the OABI fix. I kept it separate
> > > > > > so it's easy to revert for 5.8. Yes we could squash it into
> > > > > > VHOST directly but I don't see how that changes logic at all.
> > > > > Sorry for being unclear.
> > > > > 
> > > > > I meant since it was enabled by default, "CONFIG_VHOST_DPN=y" will be 
> > > > > left
> > > > > in the defconfigs.
> > > > But who cares?
> > > FYI, please seehttps://www.spinics.net/lists/kvm/msg212685.html
> > The complaint was not about the symbol IIUC.  It was that we caused
> > everyone to build vhost unless they manually disabled it.
> 
> 
> There could be some misunderstanding here. I thought it's somehow similar: a
> CONFIG_VHOST_MENU=y will be left in the defconfigs even if CONFIG_VHOST is
> not set.
> 
> Thanks

Hmm. So looking at Documentation/kbuild/kconfig-language.rst :

Things that merit "default y/m" include:

a) A new Kconfig option for something that used to always be built
   should be "default y".


b) A new gatekeeping Kconfig option that hides/shows other Kconfig
   options (but does not generate any code of its own), should be
   "default y" so people will see those other options.

c) Sub-driver behavior or similar options for a driver that is
   "default n". This allows you to provide sane defaults.


So it looks like VHOST_MENU is actually matching rule b).
So what's the problem we are trying to solve with this patch, exactly?

Geert could you clarify pls?


> 
> >

Re: POWER9 crash due to STRICT_KERNEL_RWX (WAS: Re: Linux-next POWER9 NULL pointer NIP...)

2020-04-17 Thread Qian Cai




> On Apr 16, 2020, at 10:46 PM, Russell Currey  wrote:
> 
> On Thu, 2020-04-16 at 22:40 -0400, Qian Cai wrote:
>>> On Apr 16, 2020, at 10:27 PM, Russell Currey 
>>> wrote:
>>> 
>>> Reverting the patch with the given config will have the same effect
>>> as
>>> STRICT_KERNEL_RWX=n.  Not discounting that it could be a bug on the
>>> powerpc side (i.e. relocatable kernels with strict RWX on haven't
>>> been
>>> exhaustively tested yet), but we should definitely figure out
>>> what's
>>> going on with this bad access first.
>> 
>> BTW, this bad access only happened once. The overwhelming rest of
>> crashes are with NULL pointer NIP like below. How can you explain
>> that STRICT_KERNEL_RWX=n would also make those NULL NIP disappear if
>> STRICT_KERNEL_RWX is just a messenger?
> 
> What happens if you test with STRICT_KERNEL_RWX=y and RELOCATABLE=n,
> reverting my patch?  This would give us an idea of whether it's
> something broken recently or if there's something else going on.

That combination will crash as well. I don’t think it is broken recently though 
due to
the crash could happen back in 5.6-rc1 when your commit first introduced.

> 
>> 
>> [  215.281666][T16896] LTP: starting chown04_16
>> [  215.424203][T18297] BUG: Unable to handle kernel instruction fetch
>> (NULL pointer?)
>> [  215.424289][T18297] Faulting instruction address: 0x
>> [  215.424313][T18297] Oops: Kernel access of bad area, sig: 11 [#1]
>> [  215.424341][T18297] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=256
>> DEBUG_PAGEALLOC NUMA PowerNV
>> [  215.424383][T18297] Modules linked in: loop kvm_hv kvm ip_tables
>> x_tables xfs sd_mod bnx2x mdio tg3 ahci libahci libphy libata
>> firmware_class dm_mirror dm_region_hash dm_log dm_mod
>> [  215.424459][T18297] CPU: 85 PID: 18297 Comm: chown04_16 Tainted:
>> GW 5.6.0-next-20200405+ #3
>> [  215.424489][T18297] NIP:   LR: c0080fbc0408
>> CTR: 
>> [  215.424530][T18297] REGS: c000200b8606f990 TRAP: 0400   Tainted:
>> GW  (5.6.0-next-20200405+)
>> [  215.424570][T18297] MSR:  900040009033
>>   CR: 84000248  XER: 2004
>> [  215.424619][T18297] CFAR: c0080fbc64f4 IRQMASK: 0 
>> [  215.424619][T18297] GPR00: c06c2238 c000200b8606fc20
>> c165ce00  
>> [  215.424619][T18297] GPR04: c000201a58106400 c000200b8606fcc0
>> 5f037e7d 00013bfb 
>> [  215.424619][T18297] GPR08: c000201a58106400 
>>  c1652ee0 
>> [  215.424619][T18297] GPR12:  c000201fff69a600
>>   
>> [  215.424619][T18297] GPR16:  
>>   
>> [  215.424619][T18297] GPR20:  
>>  0007 
>> [  215.424619][T18297] GPR24:  
>> c0080fbc8688 c000200b8606fcc0 
>> [  215.424619][T18297] GPR28:  7fff
>> c0080fbc0400 c00020068b8c0e70 
>> [  215.424914][T18297] NIP [] 0x0
>> [  215.424953][T18297] LR [c0080fbc0408] find_free_cb+0x8/0x30
>> [loop]
>> find_free_cb at drivers/block/loop.c:2129
>> [  215.424997][T18297] Call Trace:
>> [  215.425036][T18297] [c000200b8606fc20] [c06c2290]
>> idr_for_each+0xf0/0x170 (unreliable)
>> [  215.425073][T18297] [c000200b8606fca0] [c0080fbc2744]
>> loop_lookup.part.2+0x4c/0xb0 [loop]
>> loop_lookup at drivers/block/loop.c:2144
>> [  215.425105][T18297] [c000200b8606fce0] [c0080fbc3558]
>> loop_control_ioctl+0x120/0x1d0 [loop]
>> [  215.425149][T18297] [c000200b8606fd40] [c04eb688]
>> ksys_ioctl+0xd8/0x130
>> [  215.425190][T18297] [c000200b8606fd90] [c04eb708]
>> sys_ioctl+0x28/0x40
>> [  215.425233][T18297] [c000200b8606fdb0] [c003cc30]
>> system_call_exception+0x110/0x1e0
>> [  215.425274][T18297] [c000200b8606fe20] [c000c9f0]
>> system_call_common+0xf0/0x278
>> [  215.425314][T18297] Instruction dump:
>> [  215.425338][T18297]     
>>    
>> [  215.425374][T18297]     
>>    
>> [  215.425422][T18297] ---[ end trace ebed248fad431966 ]---
>> [  215.642114][T18297] 
>> [  216.642220][T18297] Kernel panic - not syncing: Fatal exception

[PATCH] powerpc/mm: Kill the task on KUAP fault

2020-04-17 Thread Christophe Leroy

If a task generates a KUAP fault, even from an acceptable
user access sequence, it is not a simple EFAULT.

Instead of emiting a warning, print a critical message and
kill the task with SIGSEGV.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/book3s/32/kup.h   |  7 +--
 arch/powerpc/include/asm/book3s/64/kup-radix.h | 14 +++---
 arch/powerpc/include/asm/nohash/32/kup-8xx.h   |  8 ++--
 arch/powerpc/mm/fault.c|  6 +-
 4 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/32/kup.h 
b/arch/powerpc/include/asm/book3s/32/kup.h
index 3c0ba22dc360..63422650cb86 100644
--- a/arch/powerpc/include/asm/book3s/32/kup.h
+++ b/arch/powerpc/include/asm/book3s/32/kup.h
@@ -181,12 +181,15 @@ bad_kuap_fault(struct pt_regs *regs, unsigned long 
address, bool is_write)
 {
unsigned long begin = regs->kuap & 0xf000;
unsigned long end = regs->kuap << 28;
+   bool is_fault = address < begin || address >= end;
 
if (!is_write)
return false;
 
-   return WARN(address < begin || address >= end,
-   "Bug: write fault blocked by segment registers !");
+   if (is_fault)
+   pr_crit("Bug: write fault blocked by segment registers !");
+
+   return is_fault;
 }
 
 #endif /* CONFIG_PPC_KUAP */
diff --git a/arch/powerpc/include/asm/book3s/64/kup-radix.h 
b/arch/powerpc/include/asm/book3s/64/kup-radix.h
index 3bcef989a35d..41a450d0aa06 100644
--- a/arch/powerpc/include/asm/book3s/64/kup-radix.h
+++ b/arch/powerpc/include/asm/book3s/64/kup-radix.h
@@ -137,9 +137,17 @@ static inline void restore_user_access(unsigned long flags)
 static inline bool
 bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write)
 {
-   return WARN(mmu_has_feature(MMU_FTR_RADIX_KUAP) &&
-   (regs->kuap & (is_write ? AMR_KUAP_BLOCK_WRITE : 
AMR_KUAP_BLOCK_READ)),
-   "Bug: %s fault blocked by AMR!", is_write ? "Write" : 
"Read");
+   bool is_fault;
+
+   if (!mmu_has_feature(MMU_FTR_RADIX_KUAP))
+   return false;
+
+   is_fault = regs->kuap & (is_write ? AMR_KUAP_BLOCK_WRITE : 
AMR_KUAP_BLOCK_READ);
+
+   if (is_fault)
+   pr_crit("Bug: %s fault blocked by AMR!", is_write ? "Write" : 
"Read");
+
+   return is_fault;
 }
 #else /* CONFIG_PPC_KUAP */
 static inline void kuap_restore_amr(struct pt_regs *regs)
diff --git a/arch/powerpc/include/asm/nohash/32/kup-8xx.h 
b/arch/powerpc/include/asm/nohash/32/kup-8xx.h
index 85ed2390fb99..31419126c2bf 100644
--- a/arch/powerpc/include/asm/nohash/32/kup-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/kup-8xx.h
@@ -63,8 +63,12 @@ static inline void restore_user_access(unsigned long flags)
 static inline bool
 bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write)
 {
-   return WARN(!((regs->kuap ^ MD_APG_KUAP) & 0xf000),
-   "Bug: fault blocked by AP register !");
+   bool is_fault = !((regs->kuap ^ MD_APG_KUAP) & 0xf000);
+
+   if (is_fault)
+   pr_crit("Bug: fault blocked by AP register !\n");
+
+   return is_fault;
 }
 
 #endif /* !__ASSEMBLY__ */
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 84af6c8eecf7..91b458aa666e 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -233,8 +233,12 @@ static bool bad_kernel_fault(struct pt_regs *regs, 
unsigned long error_code,
 
// Read/write fault in a valid region (the exception table search passed
// above), but blocked by KUAP is bad, it can never succeed.
-   if (bad_kuap_fault(regs, address, is_write))
+   if (bad_kuap_fault(regs, address, is_write)) {
+   pr_crit("kernel %s to userspace (%lx) blocked by KUAP\n",
+   is_write ? "write" : "read", address);
+   _exception(SIGSEGV, regs, SEGV_ACCERR, address);
return true;
+   }
 
// What's left? Kernel fault on user in well defined regions (extable
// matched), and allowed by KUAP in the faulting context.
-- 
2.25.0

Re: POWER9 crash due to STRICT_KERNEL_RWX (WAS: Re: Linux-next POWER9 NULL pointer NIP...)

2020-04-17 Thread Michael Ellerman

"Naveen N. Rao"  writes:
> Hi Qian,
>
> Qian Cai wrote:
>> OK, reverted the commit,
>> 
>> c55d7b5e6426 (“powerpc: Remove STRICT_KERNEL_RWX incompatibility with 
>> RELOCATABLE”)
>> 
>> or set STRICT_KERNEL_RWX=n fixed the crash below and also mentioned in this 
>> thread,
>> 
>> https://lore.kernel.org/lkml/15ac5b0e-a221-4b8c-9039-fa96b8ef7...@lca.pw/
>
> Do you see any errors logged in dmesg when you see the crash?  
> STRICT_KERNEL_RWX changes how patch_instruction() works, so it would be 
> interesting to see if there are any ftrace-related errors thrown before 
> the crash.

I've been able to reproduce with STRICT_KERNEL_RWX=y and concurrently
running:

# while true; do echo function > /sys/kernel/debug/tracing/current_tracer ; 
echo nop > /sys/kernel/debug/tracing/current_tracer ; done

and:

# while true; do find /lib/modules/$(uname -r) -name '*.ko' -printf "%f\n" | 
sed -e "s/\.ko//" | xargs -i modprobe -va {}; lsmod | awk '{print $1}' | xargs 
-i modprobe -vr {}; done

ie. stressing module loading/unloading and ftrace at the same time.


It's not 100% but it usually reproduces within 10-20 minutes.

It looks like sometimes our __patch_instruction() fails, and then that
somehow leads to things getting further messed up. Presumably we have
some bad error handling somewhere.

cheers

Re: POWER9 crash due to STRICT_KERNEL_RWX (WAS: Re: Linux-next POWER9 NULL pointer NIP...)

2020-04-17 Thread Michael Ellerman

Steven Rostedt  writes:
> On Thu, 16 Apr 2020 21:19:10 -0400
> Qian Cai  wrote:
>
>> OK, reverted the commit,
>> 
>> c55d7b5e6426 (“powerpc: Remove STRICT_KERNEL_RWX incompatibility with 
>> RELOCATABLE”)
>> 
>> or set STRICT_KERNEL_RWX=n fixed the crash below and also mentioned in this 
>> thread,
>
> This may be a symptom and not a cure.

I think it is a cure.

But we still have a bug, which is that when STRICT_KERNEL_RWX is enabled
we have some sort of corruption going on.

Enabling STRICT_KERNEL_RWX changes our implementation of
patch_instruction() which is used by ftrace, so I suspect this is a
powerpc bug.

>> [  148.110969][T13115] LTP: starting chown04_16
>> [  148.255048][T13380] kernel tried to execute exec-protected page 
>> (c16804ac) - exploit attempt? (uid: 0)
>> [  148.255099][T13380] BUG: Unable to handle kernel instruction fetch
>> [  148.255122][T13380] Faulting instruction address: 0xc16804ac
>> [  148.255136][T13380] Oops: Kernel access of bad area, sig: 11 [#1]
>> [  148.255157][T13380] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=256 
>> DEBUG_PAGEALLOC NUMA PowerNV
>> [  148.255171][T13380] Modules linked in: loop kvm_hv kvm xfs sd_mod bnx2x 
>> mdio ahci tg3 libahci libphy libata firmware_class dm_mirror dm_region_hash 
>> dm_log dm_mod
>> [  148.255213][T13380] CPU: 45 PID: 13380 Comm: chown04_16 Tainted: G
>> W 5.6.0+ #7
>> [  148.255236][T13380] NIP:  c16804ac LR: c0080fa60408 CTR: 
>> c16804ac
>> [  148.255250][T13380] REGS: c010a6fafa00 TRAP: 0400   Tainted: G
>> W  (5.6.0+)
>> [  148.255281][T13380] MSR:  900010009033   CR: 
>> 84000248  XER: 2004
>> [  148.255310][T13380] CFAR: c0080fa66534 IRQMASK: 0 
>> [  148.255310][T13380] GPR00: c0973268 c010a6fafc90 
>> c1648200  
>> [  148.255310][T13380] GPR04: c00d8a22dc00 c010a6fafd30 
>> b5e98331 00012c9f 
>> [  148.255310][T13380] GPR08: c00d8a22dc00  
>>  c163c520 
>> [  148.255310][T13380] GPR12: c16804ac c01dad80 
>>   
>> [  148.255310][T13380] GPR16:   
>>   
>> [  148.255310][T13380] GPR20:   
>>   
>> [  148.255310][T13380] GPR24: 7fff8f5e2e48  
>> c0080fa6a488 c010a6fafd30 
>> [  148.255310][T13380] GPR28:  7fff 
>> c0080fa60400 c00efd0c6780 
>> [  148.255494][T13380] NIP [c16804ac] sysctl_net_busy_read+0x0/0x4
>
> The instruction pointer is on sysctl_net_busy_read? Isn't that data and
> not code?

Yes.

But we're corrupting the text, or data, somewhere, so we can jump
anywhere.

I have another trace where vhost_init() appears to call into
proc_dointvec() before crashing. vhost_init() is an empty function.

cheers

[PATCH] powerpc/mm: Fix CONFIG_PPC_KUAP_DEBUG on PPC32

2020-04-17 Thread Christophe Leroy

CONFIG_PPC_KUAP_DEBUG is not selectable because it depends on PPC_32
which doesn't exists.

Fixing it leads to a deadlock due to a vital register getting
clobbered in _switch().

Change dependency to PPC32 and use r0 instead of r4 in _switch()

Fixes: e2fb9f544431 ("powerpc/32: Prepare for Kernel Userspace Access 
Protection")
Cc: sta...@vger.kernel.org
Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/entry_32.S | 2 +-
 arch/powerpc/platforms/Kconfig.cputype | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index a6371fb8f761..8420abd4ea1c 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -732,7 +732,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_SPE)
stw r10,_CCR(r1)
stw r1,KSP(r3)  /* Set old stack pointer */
 
-   kuap_check r2, r4
+   kuap_check r2, r0
 #ifdef CONFIG_SMP
/* We need a sync somewhere here to make sure that if the
 * previous task gets rescheduled on another CPU, it sees all
diff --git a/arch/powerpc/platforms/Kconfig.cputype 
b/arch/powerpc/platforms/Kconfig.cputype
index 11412078e732..9fffe99b343d 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -397,7 +397,7 @@ config PPC_KUAP
 
 config PPC_KUAP_DEBUG
bool "Extra debugging for Kernel Userspace Access Protection"
-   depends on PPC_KUAP && (PPC_RADIX_MMU || PPC_32)
+   depends on PPC_KUAP && (PPC_RADIX_MMU || PPC32)
help
  Add extra debugging for Kernel Userspace Access Protection (KUAP)
  If you're unsure, say N.
-- 
2.25.0

Re: POWER9 crash due to STRICT_KERNEL_RWX (WAS: Re: Linux-next POWER9 NULL pointer NIP...)

2020-04-17 Thread Qian Cai




> On Apr 17, 2020, at 3:01 AM, Naveen N. Rao  wrote:
> 
> Hi Qian,
> 
> Qian Cai wrote:
>> OK, reverted the commit,
>> c55d7b5e6426 (“powerpc: Remove STRICT_KERNEL_RWX incompatibility with 
>> RELOCATABLE”)
>> or set STRICT_KERNEL_RWX=n fixed the crash below and also mentioned in this 
>> thread,
>> https://lore.kernel.org/lkml/15ac5b0e-a221-4b8c-9039-fa96b8ef7...@lca.pw/
> 
> Do you see any errors logged in dmesg when you see the crash?  
> STRICT_KERNEL_RWX changes how patch_instruction() works, so it would be 
> interesting to see if there are any ftrace-related errors thrown before the 
> crash.

Yes, looks like there is a warning right after,

echo function > /sys/kernel/debug/tracing/current_tracer
echo nop > /sys/kernel/debug/tracing/current_tracer

and just before the crash,

[ T3454] ftrace-powerpc: Unexpected call sequence at de85f044: 48003d1d 
7c0802a6
[   56.870472][ T3454] [ cut here ]
[   56.870500][ T3454] WARNING: CPU: 52 PID: 3454 at kernel/trace/ftrace.c:2026 
ftrace_bug+0x104/0x310
[   56.870527][ T3454] Modules linked in: kvm_hv kvm ses enclosure 
scsi_transport_sas ip_tables x_tables xfs sd_mod i40e firmware_class aacraid 
dm_mirror dm_region_hash dm_log dm_mod
[   56.870592][ T3454] CPU: 52 PID: 3454 Comm: nip.sh Not tainted 
5.7.0-rc1-next-20200416 #4
[   56.870627][ T3454] NIP:  c02a3ae4 LR: c02a47fc CTR: 
c02436f0
[   56.870661][ T3454] REGS: c0069a9ef710 TRAP: 0700   Not tainted  
(5.7.0-rc1-next-20200416)
[   56.870697][ T3454] MSR:  9282b033 
  CR: 28228222  XER: 
[   56.870748][ T3454] CFAR: c02a3a2c IRQMASK: 0 
[   56.870748][ T3454] GPR00: c02a47fc c0069a9ef9a0 
c12f9000 ffea 
[   56.870748][ T3454] GPR04: c0002004e2160438 c007fedf0ad8 
614ca19d 0007 
[   56.870748][ T3454] GPR08: 0003  
 0002 
[   56.870748][ T3454] GPR12: 4000 c007fffd5600 
4000 000139ae9798 
[   56.870748][ T3454] GPR16: 000139ae9724 000139a86968 
000139a1f230 000139aed568 
[   56.870748][ T3454] GPR20: 0001402af8b0 0009 
000139a996e8 7fffc9186d94 
[   56.870748][ T3454] GPR24:  c0069a9efc00 
c132cd00 c0069a9efc40 
[   56.870748][ T3454] GPR28: c11c29e8 0001 
c0002004e2160438 c00809321a64 
[   56.870969][ T3454] NIP [c02a3ae4] ftrace_bug+0x104/0x310
ftrace_bug at kernel/trace/ftrace.c:2026
[   56.870995][ T3454] LR [c02a47fc] ftrace_modify_all_code+0x16c/0x210
ftrace_modify_all_code at kernel/trace/ftrace.c:2672
[   56.871034][ T3454] Call Trace:
[   56.871057][ T3454] [c0069a9ef9a0] [4b899a9efa00] 0x4b899a9efa00 
(unreliable)
[   56.871086][ T3454] [c0069a9efa20] [c02a47fc] 
ftrace_modify_all_code+0x16c/0x210
[   56.871125][ T3454] [c0069a9efa50] [c0061b68] 
arch_ftrace_update_code+0x18/0x30
[   56.871162][ T3454] [c0069a9efa70] [c02a49c4] 
ftrace_run_update_code+0x44/0xc0
[   56.871199][ T3454] [c0069a9efaa0] [c02aa3c8] 
ftrace_startup+0xe8/0x1b0
[   56.871236][ T3454] [c0069a9efae0] [c02aa4e0] 
register_ftrace_function+0x50/0xc0
[   56.871275][ T3454] [c0069a9efb10] [c02d0468] 
function_trace_init+0x98/0xd0
[   56.871312][ T3454] [c0069a9efb40] [c02c75c0] 
tracing_set_tracer+0x350/0x640
[   56.871349][ T3454] [c0069a9efbe0] [c02c7a90] 
tracing_set_trace_write+0x1e0/0x370
[   56.871388][ T3454] [c0069a9efd00] [c052094c] 
__vfs_write+0x3c/0x70
[   56.871424][ T3454] [c0069a9efd20] [c0523d4c] 
vfs_write+0xcc/0x200
[   56.871461][ T3454] [c0069a9efd70] [c05240ec] 
ksys_write+0x7c/0x140
[   56.871498][ T3454] [c0069a9efdc0] [c0038a94] 
system_call_exception+0x114/0x1e0
[   56.871535][ T3454] [c0069a9efe20] [c000c870] 
system_call_common+0xf0/0x278
[   56.871570][ T3454] Instruction dump:
[   56.871592][ T3454] 7d908120 4e800020 6000 2b890001 409effd4 3c62ff8b 
38631958 4bf4491d 
[   56.871639][ T3454] 6000 4bc0 6000 fba10068 <0fe0> 3901 
3ce20003 3d22fed7 
[   56.871685][ T3454] irq event stamp: 95388
[   56.871708][ T3454] hardirqs last  enabled at (95387): [] 
console_unlock+0x6a4/0x950
[   56.871746][ T3454] hardirqs last disabled at (95388): [] 
program_check_common_virt+0x2bc/0x310
[   56.871785][ T3454] softirqs last  enabled at (91222): [] 
__do_softirq+0x658/0x8d8
[   56.871823][ T3454] softirqs last disabled at (91215): [] 
irq_exit+0x16c/0x1d0
[   56.871859][ T3454] ---[ end trace 48f8445450a4e206 ]---
[   56.871907][ T3454] ftrace failed to modify 
[   56.871913][ T3454] [] 
show_sas_rphy_phy_identifier+0xc/0x60 [scsi_transport_sas]
show_sas_rphy_phy_identifier at drivers/scsi/scsi_transport_sas.c:1221
[   56.871969][ T3454]  actual:   1d:3d:00:48
[   56.871996][ T3454]

Re: [PATCH v6 09/10] powerpc/64s: Implement KUAP for Radix MMU

2020-04-17 Thread Christophe Leroy





Le 17/04/2020 à 12:39, Nicholas Piggin a écrit :

Excerpts from Christophe Leroy's message of April 17, 2020 8:10 pm:



Le 18/04/2019 à 08:51, Michael Ellerman a écrit :

Kernel Userspace Access Prevention utilises a feature of the Radix MMU
which disallows read and write access to userspace addresses. By
utilising this, the kernel is prevented from accessing user data from
outside of trusted paths that perform proper safety checks, such as
copy_{to/from}_user() and friends.

Userspace access is disabled from early boot and is only enabled when
performing an operation like copy_{to/from}_user(). The register that
controls this (AMR) does not prevent userspace from accessing itself,
so there is no need to save and restore when entering and exiting
userspace.

When entering the kernel from the kernel we save AMR and if it is not
blocking user access (because eg. we faulted doing a user access) we
reblock user access for the duration of the exception (ie. the page
fault) and then restore the AMR when returning back to the kernel.

This feature can be tested by using the lkdtm driver (CONFIG_LKDTM=y)
and performing the following:

# (echo ACCESS_USERSPACE) > [debugfs]/provoke-crash/DIRECT

If enabled, this should send SIGSEGV to the thread.

We also add paranoid checking of AMR in switch and syscall return
under CONFIG_PPC_KUAP_DEBUG.

Co-authored-by: Michael Ellerman 
Signed-off-by: Russell Currey 
Signed-off-by: Michael Ellerman 


[...]


diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 15c67d2c0534..7cc25389c6bd 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S


[...]


@@ -594,6 +606,8 @@ _GLOBAL(_switch)
std r23,_CCR(r1)
std r1,KSP(r3)  /* Set old stack pointer */
   
+	kuap_check_amr r9, r10

+
FLUSH_COUNT_CACHE
   
   	/*


I'm having a problem with this check. As you know I implemented the
exact same check in _switch() in entry_32.S. After adding some printk
inside an user_access_begin()/user_access_end() section, I started to
get valid user accesses blocked by KUAP. Then I activated
CONFIG_PPC_KUAP_DEBUG which led me to WARNINGs on this check.

This is due to schedule() being called by printk() inside the section
where user access is unlocked. How is this supposed to work ? When


Unlocked user access sections are supposed to be very constrained,
I think x86's objtool has a checker to verify nothing much gets
called. Printk shouldn't be.

I was hitting the same assertion and it was because the uaccess
macros were pulling lots of things into the user access region.

https://patchwork.ozlabs.org/project/linuxppc-dev/patch/20200407041245.600651-1-npig...@gmail.com/

If that doesn't solve your problem... then now is printk being
called with user access enabled?


Yes indeed, the printk was called with user access enabled, as I wanted 
to print all calls to unsafe_copy_to_user()


Thanks
Christophe

Re: [PATCH v2] sched/core: fix illegal RCU from offline CPUs

2020-04-17 Thread Qian Cai




> On Apr 2, 2020, at 7:24 AM, Michael Ellerman  wrote:
> 
> Qian Cai  writes:
>> From: Peter Zijlstra 
>> 
>> In the CPU-offline process, it calls mmdrop() after idle entry and the
>> subsequent call to cpuhp_report_idle_dead(). Once execution passes the
>> call to rcu_report_dead(), RCU is ignoring the CPU, which results in
>> lockdep complaining when mmdrop() uses RCU from either memcg or
>> debugobjects below.
>> 
>> Fix it by cleaning up the active_mm state from BP instead. Every arch
>> which has CONFIG_HOTPLUG_CPU should have already called idle_task_exit()
>> from AP. The only exception is parisc because it switches them to
>> _mm unconditionally (see smp_boot_one_cpu() and smp_cpu_init()),
>> but the patch will still work there because it calls mmgrab(_mm) in
>> smp_cpu_init() and then should call mmdrop(_mm) in finish_cpu().
> 
> Thanks for debugging this. How did you hit it in the first place?
> 
> A link to the original thread would have helped me:
> 
>  https://lore.kernel.org/lkml/20200113190331.12788-1-...@lca.pw/
> 
>> WARNING: suspicious RCU usage
>> -
>> kernel/workqueue.c:710 RCU or wq_pool_mutex should be held!
>> 
>> other info that might help us debug this:
>> 
>> RCU used illegally from offline CPU!
>> Call Trace:
>> dump_stack+0xf4/0x164 (unreliable)
>> lockdep_rcu_suspicious+0x140/0x164
>> get_work_pool+0x110/0x150
>> __queue_work+0x1bc/0xca0
>> queue_work_on+0x114/0x120
>> css_release+0x9c/0xc0
>> percpu_ref_put_many+0x204/0x230
>> free_pcp_prepare+0x264/0x570
>> free_unref_page+0x38/0xf0
>> __mmdrop+0x21c/0x2c0
>> idle_task_exit+0x170/0x1b0
>> pnv_smp_cpu_kill_self+0x38/0x2e0
>> cpu_die+0x48/0x64
>> arch_cpu_idle_dead+0x30/0x50
>> do_idle+0x2f4/0x470
>> cpu_startup_entry+0x38/0x40
>> start_secondary+0x7a8/0xa80
>> start_secondary_resume+0x10/0x14
> 
> Do we know when this started happening? ie. can we determine a Fixes
> tag?
> 
>> 
>> Signed-off-by: Qian Cai 
>> ---
>> arch/powerpc/platforms/powernv/smp.c |  1 -
>> include/linux/sched/mm.h |  2 ++
>> kernel/cpu.c | 18 +-
>> kernel/sched/core.c  |  5 +++--
>> 4 files changed, 22 insertions(+), 4 deletions(-)
>> 
>> diff --git a/arch/powerpc/platforms/powernv/smp.c 
>> b/arch/powerpc/platforms/powernv/smp.c
>> index 13e251699346..b2ba3e95bda7 100644
>> --- a/arch/powerpc/platforms/powernv/smp.c
>> +++ b/arch/powerpc/platforms/powernv/smp.c
>> @@ -167,7 +167,6 @@ static void pnv_smp_cpu_kill_self(void)
>>  /* Standard hot unplug procedure */
>> 
>>  idle_task_exit();
>> -current->active_mm = NULL; /* for sanity */
> 
> If I'm reading it right, we'll now be running with active_mm == init_mm
> in the offline loop.
> 
> I guess that's fine, I can't think of any reason it would matter, and it
> seems like we were NULL'ing it out just for paranoia's sake not because
> of any actual problem.
> 
> Acked-by: Michael Ellerman  (powerpc)

Peter, can you take a look at this patch when you have a chance?

> 
> 
> cheers
> 
>> diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
>> index c49257a3b510..a132d875d351 100644
>> --- a/include/linux/sched/mm.h
>> +++ b/include/linux/sched/mm.h
>> @@ -49,6 +49,8 @@ static inline void mmdrop(struct mm_struct *mm)
>>  __mmdrop(mm);
>> }
>> 
>> +void mmdrop(struct mm_struct *mm);
>> +
>> /*
>>  * This has to be called after a get_task_mm()/mmget_not_zero()
>>  * followed by taking the mmap_sem for writing before modifying the
>> diff --git a/kernel/cpu.c b/kernel/cpu.c
>> index 2371292f30b0..244d30544377 100644
>> --- a/kernel/cpu.c
>> +++ b/kernel/cpu.c
>> @@ -3,6 +3,7 @@
>>  *
>>  * This code is licenced under the GPL.
>>  */
>> +#include 
>> #include 
>> #include 
>> #include 
>> @@ -564,6 +565,21 @@ static int bringup_cpu(unsigned int cpu)
>>  return bringup_wait_for_ap(cpu);
>> }
>> 
>> +static int finish_cpu(unsigned int cpu)
>> +{
>> +struct task_struct *idle = idle_thread_get(cpu);
>> +struct mm_struct *mm = idle->active_mm;
>> +
>> +/*
>> + * idle_task_exit() will have switched to _mm, now
>> + * clean up any remaining active_mm state.
>> + */
>> +if (mm != _mm)
>> +idle->active_mm = _mm;
>> +mmdrop(mm);
>> +return 0;
>> +}
>> +
>> /*
>>  * Hotplug state machine related functions
>>  */
>> @@ -1549,7 +1565,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
>>  [CPUHP_BRINGUP_CPU] = {
>>  .name   = "cpu:bringup",
>>  .startup.single = bringup_cpu,
>> -.teardown.single= NULL,
>> +.teardown.single= finish_cpu,
>>  .cant_stop  = true,
>>  },
>>  /* Final state before CPU kills itself */
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index a2694ba82874..8787958339d5 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -6200,13 +6200,14 @@ void

Re: [PATCH 4/8] binfmt_elf: open code copy_siginfo_to_user to kernelspace buffer

2020-04-17 Thread Christoph Hellwig

On Wed, Apr 15, 2020 at 10:20:11AM +0200, Arnd Bergmann wrote:
> > I'd rather keep it out of this series and to
> > an interested party.  Then again x32 doesn't seem to have a whole lot
> > of interested parties..
> 
> Fine with me. It's on my mental list of things that we want to kill off
> eventually as soon as the remaining users stop replying to questions
> about it.
> 
> In fact I should really turn that into a properly maintained list in
> Documentation/... that contains any options that someone has
> asked about removing in the past, along with the reasons for keeping
> it around and a time at which we should ask about it again.

To the newly added x86 maintainers:  Arnd brought up the point that
elf_core_dump writes the ABI siginfo format into the core dump. That
format differs for i386 vs x32.  Is there any good way to find out
which is the right format when are not in a syscall?

As far a I can tell x32 vs i386 just seems to be based around what
syscall table was used for the current syscall, but core dumps aren't
always in syscall context.

Re: [PATCH v4,4/4] drivers: uio: new driver for fsl_85xx_cache_sram

2020-04-17 Thread 王文虎

>On Fri, 2020-04-17 at 09:42 +0200, Greg KH wrote:>> On Thu, Apr 16, 2020 at 
>11:58:29PM -0500, Scott Wood wrote:
>> > On Fri, 2020-04-17 at 10:31 +0800, 王文虎 wrote:
>> > > Sounds it is. And does the modification below fit well?
>> > > ---
>> > > -static const struct of_device_id uio_mpc85xx_l2ctlr_of_match[] = {
>> > > -   {   .compatible = "uio,mpc85xx-cache-sram", },
>> > > -   {},
>> > > +#ifdef CONFIG_OF
>> > > +static struct of_device_id uio_fsl_85xx_cache_sram_of_match[] = {
>> > > +   { /* This is filled with module_parm */ },
>> > > +   { /* Sentinel */ },
>> > >  };
>> > > +MODULE_DEVICE_TABLE(of, uio_fsl_85xx_cache_sram_of_match);
>> > > +module_param_string(of_id,
>> > > uio_fsl_85xx_cache_sram_of_match[0].compatible,
>> > > +   sizeof(uio_fsl_85xx_cache_sram_of_match[0].c
>> > > ompa
>> > > tible), 0);
>> > > +MODULE_PARM_DESC(of_id, "platform device id to be handled by cache-
>> > > sram-
>> > > uio");
>> > > +#endif
>> > 
>> > No.  The point is that you wouldn't be configuring this with the device
>> > tree
>> > at all.
>> 
>> Wait, why not?  Don't force people to use module parameters, that is
>> crazy.  DT describes the hardware involved, if someone wants to bind to
>> a specific range of memory, as described by DT, why can't they do so?
>
>Yes, DT describes the hardware, and as I've said a couple times already, this
>isn't hardware description.
>
>I'm not forcing people to use module parameters.  That was a least-effort
>suggestion to avoid abusing the DT.  I later said I'd try to come up with a
>patch that allocates regions dynamically (and most likely doesn't use UIO at
>all).
>
>> I can understand not liking the name "uio" in a dt tree, but there's no
>> reason that DT can not describe what a driver binds to here.
>
>The DT already describes this hardware, and there is already code that binds
>to it.  This patch is trying to add a second node for it with configuration.
>
Hi, Scott, Greg,
Seems like no balance here. How about I implement a driver of uio including
the l2ctrl and cache_sram related implementations?
And this way, the driver would be a hardware level driver and targeted for uio.

Regards,
Wenhu

77 matches

Mail list logo