[PATCHv3 7/7] checkpatch: add pF/pf deprecation warning

2017-09-29 Thread Sergey Senozhatsky
We deprecated '%pF/%pf' printk specifiers, since '%pS/%ps' is now smart
enough to handle function pointer dereference on platforms where such
dereference is required.

checkpatch warning example:

WARNING: Deprecated vsprintf pointer extension '%pF' - use %pS instead

Signed-off-by: Sergey Senozhatsky 
Signed-off-by: Joe Perches 
Cc: Andy Whitcroft 
Tested-by: Helge Deller  # parisc64
Tested-by: Santosh Sivaraj  # powerpc64
Acked-by: Michael Ellerman  # powerpc64
Tested-by: Tony Luck  # ia64
---
 scripts/checkpatch.pl | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 03eb2551477d..387c453413e0 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -5762,18 +5762,25 @@ sub process {
for (my $count = $linenr; $count <= $lc; $count++) {
my $fmt = get_quoted_string($lines[$count - 1], 
raw_line($count, 0));
$fmt =~ s/%%//g;
-   if ($fmt =~ 
/(\%[\*\d\.]*p(?![\WFfSsBKRraEhMmIiUDdgVCbGNO]).)/) {
+   if ($fmt =~ 
/(\%[\*\d\.]*p(?![\WSsBKRraEhMmIiUDdgVCbGNO]).)/) {
$bad_extension = $1;
last;
}
}
if ($bad_extension ne "") {
my $stat_real = raw_line($linenr, 0);
+   my $ext_type = "Invalid";
+   my $use = "";
for (my $count = $linenr + 1; $count <= $lc; 
$count++) {
$stat_real = $stat_real . "\n" . 
raw_line($count, 0);
}
+   if ($bad_extension =~ /p[Ff]/) {
+   $ext_type = "Deprecated";
+   $use = " - use %pS instead";
+   $use =~ s/pS/ps/ if ($bad_extension =~ 
/pf/);
+   }
WARN("VSPRINTF_POINTER_EXTENSION",
-"Invalid vsprintf pointer extension 
'$bad_extension'\n" . "$here\n$stat_real\n");
+"$ext_type vsprintf pointer extension 
'$bad_extension'$use\n" . "$here\n$stat_real\n");
}
}
 
-- 
2.14.2



[PATCHv3 6/7] symbol lookup: use new kernel and module dereference functions

2017-09-29 Thread Sergey Senozhatsky
Call appropriate function descriptor dereference ARCH callbacks:
- dereference_kernel_function_descriptor() if the pointer is a
  kernel symbol;

- dereference_module_function_descriptor() if the pointer is a
  module symbol.

This patch also removes the dereference_function_descriptor() call from
the '%pF/%pf' vsprintf handler, because '%pF/%pf' now behave exactly
like '%pS/%ps'.

Signed-off-by: Sergey Senozhatsky 
Tested-by: Helge Deller  # parisc64
Tested-by: Santosh Sivaraj  # powerpc64
Acked-by: Michael Ellerman  # powerpc64
Tested-by: Tony Luck  # ia64
---
 Documentation/printk-formats.txt | 20 ++--
 kernel/kallsyms.c|  1 +
 kernel/module.c  |  1 +
 lib/vsprintf.c   |  5 +
 4 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/Documentation/printk-formats.txt b/Documentation/printk-formats.txt
index 361789df51ec..3adbc4fdd482 100644
--- a/Documentation/printk-formats.txt
+++ b/Documentation/printk-formats.txt
@@ -50,26 +50,28 @@ Symbols/Function Pointers
 
 ::
 
+   %pS versatile_init+0x0/0x110
+   %ps versatile_init
%pF versatile_init+0x0/0x110
%pf versatile_init
-   %pS versatile_init+0x0/0x110
%pSRversatile_init+0x9/0x110
(with __builtin_extract_return_addr() translation)
-   %ps versatile_init
%pB prev_fn_of_versatile_init+0x88/0x88
 
-The ``F`` and ``f`` specifiers are for printing function pointers,
-for example, f->func,  They have the same result as
-``S`` and ``s`` specifiers. But they do an extra conversion on
-ia64, ppc64 and parisc64 architectures where the function pointers
-are actually function descriptors.
-
 The ``S`` and ``s`` specifiers can be used for printing symbols
 from direct addresses, for example, __builtin_return_address(0),
 (void *)regs->ip. They result in the symbol name with (``S``) or
 without (``s``) offsets. If KALLSYMS are disabled then the symbol
 address is printed instead.
 
+Note, that the ``F`` and ``f`` specifiers are identical to ``S`` (``s``)
+and thus deprecated. We have ``F`` and ``f`` because on ia64, ppc64 and
+parisc64 function pointers are indirect and, in fact, are function
+descriptors, which require additional dereferencing before we can lookup
+the symbol. As of now, ``S`` and ``s`` perform dereferencing on those
+platforms (when needed), so ``F`` and ``f`` exist for compatibility
+reasons only.
+
 The ``B`` specifier results in the symbol name with offsets and should be
 used when printing stack backtraces. The specifier takes into
 consideration the effect of compiler optimisations which may occur
@@ -77,8 +79,6 @@ when tail-call``s are used and marked with the noreturn GCC 
attribute.
 
 Examples::
 
-   printk("Going to call: %pF\n", gettimeofday);
-   printk("Going to call: %pF\n", p->func);
printk("%s: called from %pS\n", __func__, (void *)_RET_IP_);
printk("%s: called from %pS\n", __func__,
(void *)__builtin_return_address(0));
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 127e7cfafa55..e2fc09ea9509 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -322,6 +322,7 @@ const char *kallsyms_lookup(unsigned long addr,
if (is_ksym_addr(addr)) {
unsigned long pos;
 
+   addr = dereference_kernel_function_descriptor(addr);
pos = get_symbol_pos(addr, symbolsize, offset);
/* Grab name */
kallsyms_expand_symbol(get_symbol_offset(pos),
diff --git a/kernel/module.c b/kernel/module.c
index b792e814150a..63361de377ad 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3948,6 +3948,7 @@ const char *module_address_lookup(unsigned long addr,
preempt_disable();
mod = __module_address(addr);
if (mod) {
+   addr = dereference_module_function_descriptor(mod, addr);
if (modname)
*modname = mod->name;
ret = get_ksymbol(mod, addr, size, offset);
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index bcd906a39010..bf04b4f5d8e7 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -40,7 +40,6 @@
 #include "../mm/internal.h"/* For the trace_print_flags arrays */
 
 #include   /* for PAGE_SIZE */
-#include   /* for dereference_function_descriptor() */
 #include  /* cpu_to_le16 */
 
 #include 
@@ -1721,10 +1720,8 @@ char *pointer(const char *fmt, char *buf, char *end, 
void *ptr,
}
 
switch (*fmt) {
-   case 'F':
+   case 'F': /* %pF and %pf are kept for compatibility reasons only */
case 'f':
-   ptr = (void *)dereference_function_descriptor((unsigned 
long)ptr);
-   /* Fallthrough */
case 'S':
case 's':
case 'B':
-- 
2.14.2



[PATCHv3 5/7] parisc64: Add .opd based function descriptor dereference

2017-09-29 Thread Sergey Senozhatsky
We are moving towards separate kernel and module function descriptor
dereference callbacks. This patch enables it for parisc64.

For pointers that belong to the kernel
-  Added __start_opd and __end_opd pointers, to track the kernel
   .opd section address range;

-  Added dereference_kernel_function_descriptor(). Now we
   will dereference only function pointers that are within
   [__start_opd, __end_opd];

For pointers that belong to a module
-  Added dereference_module_function_descriptor() to handle module
   function descriptor dereference. Now we will dereference only
   pointers that are within [module->opd.start, module->opd.end].

Signed-off-by: Sergey Senozhatsky 
Signed-off-by: Helge Deller 
Tested-by: Helge Deller  # parisc64
Tested-by: Santosh Sivaraj  # powerpc64
Acked-by: Michael Ellerman  # powerpc64
Tested-by: Tony Luck  # ia64
---
 arch/parisc/boot/compressed/vmlinux.lds.S |  2 ++
 arch/parisc/include/asm/sections.h|  2 ++
 arch/parisc/kernel/module.c   | 17 +
 arch/parisc/kernel/process.c  |  9 +
 arch/parisc/kernel/vmlinux.lds.S  |  2 ++
 5 files changed, 32 insertions(+)

diff --git a/arch/parisc/boot/compressed/vmlinux.lds.S 
b/arch/parisc/boot/compressed/vmlinux.lds.S
index a4ce3314e78e..4ebd4e65524c 100644
--- a/arch/parisc/boot/compressed/vmlinux.lds.S
+++ b/arch/parisc/boot/compressed/vmlinux.lds.S
@@ -29,7 +29,9 @@ SECTIONS
. = ALIGN(16);
/* Linkage tables */
.opd : {
+   __start_opd = .;
*(.opd)
+   __end_opd = .;
} PROVIDE (__gp = .);
.plt : {
*(.plt)
diff --git a/arch/parisc/include/asm/sections.h 
b/arch/parisc/include/asm/sections.h
index 59fbe0067112..845ddc9a3421 100644
--- a/arch/parisc/include/asm/sections.h
+++ b/arch/parisc/include/asm/sections.h
@@ -7,6 +7,8 @@
 #ifdef CONFIG_64BIT
 #undef dereference_function_descriptor
 unsigned long dereference_function_descriptor(unsigned long);
+#undef dereference_kernel_function_descriptor
+unsigned long dereference_kernel_function_descriptor(unsigned long);
 #endif
 
 #endif
diff --git a/arch/parisc/kernel/module.c b/arch/parisc/kernel/module.c
index f1a76935a314..28f89b3dcc11 100644
--- a/arch/parisc/kernel/module.c
+++ b/arch/parisc/kernel/module.c
@@ -66,6 +66,7 @@
 
 #include 
 #include 
+#include 
 
 #if 0
 #define DEBUGP printk
@@ -954,3 +955,19 @@ void module_arch_cleanup(struct module *mod)
 {
deregister_unwind_table(mod);
 }
+
+#ifdef CONFIG_64BIT
+unsigned long dereference_module_function_descriptor(struct module *mod,
+unsigned long addr)
+{
+   unsigned long start_opd = (Elf64_Addr)mod->core_layout.base +
+  mod->arch.fdesc_offset;
+   unsigned long end_opd = start_opd +
+   mod->arch.fdesc_count * sizeof(Elf64_Fdesc);
+
+   if (addr < start_opd || addr >= end_opd)
+   return addr;
+
+   return dereference_function_descriptor(addr);
+}
+#endif
diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c
index d350aa913acc..423bbfe90e2b 100644
--- a/arch/parisc/kernel/process.c
+++ b/arch/parisc/kernel/process.c
@@ -276,6 +276,15 @@ unsigned long dereference_function_descriptor(unsigned 
long ptr)
ptr = (unsigned long)p;
return ptr;
 }
+
+unsigned long dereference_kernel_function_descriptor(unsigned long addr)
+{
+   if (addr < (unsigned long)__start_opd ||
+   addr >= (unsigned long)__end_opd)
+   return addr;
+
+   return dereference_function_descriptor(addr);
+}
 #endif
 
 static inline unsigned long brk_rnd(void)
diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S
index ffe2cbf52d1a..ab030895dd1e 100644
--- a/arch/parisc/kernel/vmlinux.lds.S
+++ b/arch/parisc/kernel/vmlinux.lds.S
@@ -99,7 +99,9 @@ SECTIONS
. = ALIGN(16);
/* Linkage tables */
.opd : {
+   __start_opd = .;
*(.opd)
+   __end_opd = .;
} PROVIDE (__gp = .);
.plt : {
*(.plt)
-- 
2.14.2



[PATCHv3 4/7] powerpc64: Add .opd based function descriptor dereference

2017-09-29 Thread Sergey Senozhatsky
We are moving towards separate kernel and module function descriptor
dereference callbacks. This patch enables it for powerpc64.

For pointers that belong to the kernel
-  Added __start_opd and __end_opd pointers, to track the kernel
   .opd section address range;

-  Added dereference_kernel_function_descriptor(). Now we
   will dereference only function pointers that are within
   [__start_opd, __end_opd];

For pointers that belong to a module
-  Added dereference_module_function_descriptor() to handle module
   function descriptor dereference. Now we will dereference only
   pointers that are within [module->opd.start, module->opd.end].

Signed-off-by: Sergey Senozhatsky 
Tested-by: Helge Deller  # parisc64
Tested-by: Santosh Sivaraj  # powerpc64
Acked-by: Michael Ellerman  # powerpc64
Tested-by: Tony Luck  # ia64
---
 arch/powerpc/include/asm/module.h   |  3 +++
 arch/powerpc/include/asm/sections.h | 11 +++
 arch/powerpc/kernel/module_64.c | 16 
 arch/powerpc/kernel/vmlinux.lds.S   |  2 ++
 4 files changed, 32 insertions(+)

diff --git a/arch/powerpc/include/asm/module.h 
b/arch/powerpc/include/asm/module.h
index 6c0132c7212f..7e28442827f1 100644
--- a/arch/powerpc/include/asm/module.h
+++ b/arch/powerpc/include/asm/module.h
@@ -45,6 +45,9 @@ struct mod_arch_specific {
unsigned long tramp;
 #endif
 
+   /* For module function descriptor dereference */
+   unsigned long start_opd;
+   unsigned long end_opd;
 #else /* powerpc64 */
/* Indices of PLT sections within module. */
unsigned int core_plt_section;
diff --git a/arch/powerpc/include/asm/sections.h 
b/arch/powerpc/include/asm/sections.h
index 67379b8945e8..6b4ee0d1645f 100644
--- a/arch/powerpc/include/asm/sections.h
+++ b/arch/powerpc/include/asm/sections.h
@@ -75,6 +75,17 @@ static inline unsigned long 
dereference_function_descriptor(unsigned long ptr)
ptr = (unsigned long)p;
return ptr;
 }
+
+#undef dereference_kernel_function_descriptor
+static inline unsigned long
+dereference_kernel_function_descriptor(unsigned long addr)
+{
+   if (addr < (unsigned long)__start_opd ||
+   addr >= (unsigned long)__end_opd)
+   return addr;
+
+   return dereference_function_descriptor(addr);
+}
 #endif /* PPC64_ELF_ABI_v1 */
 
 #endif
diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c
index 0b0f89685b67..94caec045a90 100644
--- a/arch/powerpc/kernel/module_64.c
+++ b/arch/powerpc/kernel/module_64.c
@@ -344,6 +344,11 @@ int module_frob_arch_sections(Elf64_Ehdr *hdr,
else if (strcmp(secstrings+sechdrs[i].sh_name,"__versions")==0)
dedotify_versions((void *)hdr + sechdrs[i].sh_offset,
  sechdrs[i].sh_size);
+   else if (!strcmp(secstrings + sechdrs[i].sh_name, ".opd")) {
+   me->arch.start_opd = sechdrs[i].sh_addr;
+   me->arch.end_opd = sechdrs[i].sh_addr +
+  sechdrs[i].sh_size;
+   }
 
/* We don't handle .init for the moment: rename to _init */
while ((p = strstr(secstrings + sechdrs[i].sh_name, ".init")))
@@ -712,6 +717,17 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
return 0;
 }
 
+#ifdef PPC64_ELF_ABI_v1
+unsigned long dereference_module_function_descriptor(struct module *mod,
+unsigned long addr)
+{
+   if (addr < mod->arch.start_opd || addr >= mod->arch.end_opd)
+   return addr;
+
+   return dereference_function_descriptor(addr);
+}
+#endif /* PPC64_ELF_ABI_v1 */
+
 #ifdef CONFIG_DYNAMIC_FTRACE
 
 #ifdef CC_USING_MPROFILE_KERNEL
diff --git a/arch/powerpc/kernel/vmlinux.lds.S 
b/arch/powerpc/kernel/vmlinux.lds.S
index 882628fa6987..70e10251e083 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -277,7 +277,9 @@ SECTIONS
}
 
.opd : AT(ADDR(.opd) - LOAD_OFFSET) {
+   __start_opd = .;
*(.opd)
+   __end_opd = .;
}
 
. = ALIGN(256);
-- 
2.14.2



[PATCHv3 3/7] ia64: Add .opd based function descriptor dereference

2017-09-29 Thread Sergey Senozhatsky
We are moving towards separate kernel and module function descriptor
dereference callbacks. This patch enables it for IA64.

For pointers that belong to the kernel
-  Added __start_opd and __end_opd pointers, to track the kernel
   .opd section address range;

-  Added dereference_kernel_function_descriptor(). Now we
   will dereference only function pointers that are within
   [__start_opd, __end_opd];

For pointers that belong to a module
-  Added dereference_module_function_descriptor() to handle module
   function descriptor dereference. Now we will dereference only
   pointers that are within [module->opd.start, module->opd.end].

Signed-off-by: Sergey Senozhatsky 
Tested-by: Helge Deller  # parisc64
Tested-by: Santosh Sivaraj  # powerpc64
Acked-by: Michael Ellerman  # powerpc64
Tested-by: Tony Luck  # ia64
---
 arch/ia64/include/asm/sections.h | 10 +-
 arch/ia64/kernel/module.c| 13 +
 arch/ia64/kernel/vmlinux.lds.S   |  2 ++
 3 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/arch/ia64/include/asm/sections.h b/arch/ia64/include/asm/sections.h
index de6bfa1ef8fb..3ba7ce9d8bc8 100644
--- a/arch/ia64/include/asm/sections.h
+++ b/arch/ia64/include/asm/sections.h
@@ -37,6 +37,14 @@ static inline unsigned long 
dereference_function_descriptor(unsigned long ptr)
return ptr;
 }
 
+#undef dereference_kernel_function_descriptor
+static inline unsigned long
+dereference_kernel_function_descriptor(unsigned long addr)
+{
+   if (addr < (unsigned long)__start_opd ||
+   addr >= (unsigned long)__end_opd)
+   return addr;
+   return dereference_function_descriptor(addr);
+}
 
 #endif /* _ASM_IA64_SECTIONS_H */
-
diff --git a/arch/ia64/kernel/module.c b/arch/ia64/kernel/module.c
index d1d945c6bd05..0741ae6fa957 100644
--- a/arch/ia64/kernel/module.c
+++ b/arch/ia64/kernel/module.c
@@ -35,6 +35,7 @@
 
 #include 
 #include 
+#include 
 
 #define ARCH_MODULE_DEBUG 0
 
@@ -917,3 +918,15 @@ module_arch_cleanup (struct module *mod)
if (mod->arch.core_unw_table)
unw_remove_unwind_table(mod->arch.core_unw_table);
 }
+
+unsigned long
+dereference_module_function_descriptor(struct module *mod, unsigned long addr)
+{
+   Elf64_Shdr *opd = mod->arch.opd;
+
+   if (addr < opd->sh_addr ||
+   addr >= (opd->sh_addr + opd->sh_size))
+   return addr;
+
+   return dereference_function_descriptor(addr);
+}
diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S
index 798026dde52e..f872ba5ff82a 100644
--- a/arch/ia64/kernel/vmlinux.lds.S
+++ b/arch/ia64/kernel/vmlinux.lds.S
@@ -107,7 +107,9 @@ SECTIONS {
RODATA
 
.opd : AT(ADDR(.opd) - LOAD_OFFSET) {
+   __start_opd = .;
*(.opd)
+   __end_opd = .;
}
 
/*
-- 
2.14.2



[PATCHv3 2/7] sections: split dereference_function_descriptor()

2017-09-29 Thread Sergey Senozhatsky
There are two format specifiers to print out a pointer in symbolic
format: '%pS/%ps' and '%pF/%pf'. On most architectures, the two
mean exactly the same thing, but some architectures (ia64, ppc64,
parisc64) use an indirect pointer for C function pointers, where
the function pointer points to a function descriptor (which in
turn contains the actual pointer to the code). The '%pF/%pf', when
used appropriately, automatically does the appropriate function
descriptor dereference on such architectures.

The "when used appropriately" part is tricky. Basically this is
a subtle ABI detail, specific to some platforms, that made it to
the API level and people can be unaware of it and miss the whole
"we need to dereference the function" business out. [1] proves
that point (note that it fixes only '%pF' and '%pS', there might
be '%pf' and '%ps' cases as well).

It appears that we can handle everything within the affected
arches and make '%pS/%ps' smart enough to retire '%pF/%pf'.
Function descriptors live in .opd elf section and all affected
arches (ia64, ppc64, parisc64) handle it properly for kernel
and modules. So we, technically, can decide if the dereference
is needed by simply looking at the pointer: if it belongs to
.opd section then we need to dereference it.

The kernel and modules each have their own .opd sections, which is
why we need to split dereference_function_descriptor()
and use separate kernel and module dereference arch callbacks.

This patch does the first step, it
a) adds dereference_kernel_function_descriptor() function.
b) adds a weak alias to dereference_module_function_descriptor()
   function.

So, for the time being, we will have:
1) dereference_function_descriptor()
   A generic function that simply dereferences the pointer. There are
   a bunch of places that call it: kgdbts, init/main.c, extable, etc.

2) dereference_kernel_function_descriptor()
   A function to call on kernel symbols that does kernel .opd section
   address range test.

3) dereference_module_function_descriptor()
   A function to call on modules' symbols that does modules' .opd
   section address range test.

[1] https://marc.info/?l=linux-kernel&m=150472969730573

Signed-off-by: Sergey Senozhatsky 
Tested-by: Helge Deller  # parisc64
Tested-by: Santosh Sivaraj  # powerpc64
Acked-by: Michael Ellerman  # powerpc64
Tested-by: Tony Luck  # ia64
---
 include/asm-generic/sections.h | 8 ++--
 include/linux/moduleloader.h   | 4 
 kernel/module.c| 6 ++
 3 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h
index e5da44eddd2f..387f22c41e0d 100644
--- a/include/asm-generic/sections.h
+++ b/include/asm-generic/sections.h
@@ -29,6 +29,7 @@
  * __ctors_start, __ctors_end
  * __irqentry_text_start, __irqentry_text_end
  * __softirqentry_text_start, __softirqentry_text_end
+ * __start_opd, __end_opd
  */
 extern char _text[], _stext[], _etext[];
 extern char _data[], _sdata[], _edata[];
@@ -47,12 +48,15 @@ extern char __softirqentry_text_start[], 
__softirqentry_text_end[];
 /* Start and end of .ctors section - used for constructor calls. */
 extern char __ctors_start[], __ctors_end[];
 
+/* Start and end of .opd section - used for function descriptors. */
+extern char __start_opd[], __end_opd[];
+
 extern __visible const void __nosave_begin, __nosave_end;
 
-/* function descriptor handling (if any).  Override
- * in asm/sections.h */
+/* Function descriptor handling (if any).  Override in asm/sections.h */
 #ifndef dereference_function_descriptor
 #define dereference_function_descriptor(p) (p)
+#define dereference_kernel_function_descriptor(p) (p)
 #endif
 
 /* random extra sections (if any).  Override
diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h
index 4d0cb9bba93e..172904e9cded 100644
--- a/include/linux/moduleloader.h
+++ b/include/linux/moduleloader.h
@@ -85,6 +85,10 @@ void module_arch_cleanup(struct module *mod);
 /* Any cleanup before freeing mod->module_init */
 void module_arch_freeing_init(struct module *mod);
 
+/* Dereference module function descriptor */
+unsigned long dereference_module_function_descriptor(struct module *mod,
+unsigned long addr);
+
 #ifdef CONFIG_KASAN
 #include 
 #define MODULE_ALIGN (PAGE_SIZE << KASAN_SHADOW_SCALE_SHIFT)
diff --git a/kernel/module.c b/kernel/module.c
index ea77ab13bead..b792e814150a 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2121,6 +2121,12 @@ void __weak module_arch_freeing_init(struct module *mod)
 {
 }
 
+unsigned long __weak dereference_module_function_descriptor(struct module *mod,
+   unsigned long addr)
+{
+   return addr;
+}
+
 /* Free a module, remove from lists, etc. */
 static void free_module(struct 

[PATCHv3 1/7] switch dereference_function_descriptor() to `unsigned long'

2017-09-29 Thread Sergey Senozhatsky
Convert dereference_function_descriptor() to accept and return
`unsigned long'. There will be two new ARCH functions for kernel
and module function pointer dereference, which will work with
`unsigned long', so the patch unifies the interfaces.

Besides, dereference_function_descriptor() callers mostly work with
`unsigned long':

drivers/misc/kgdbts.c:
addr = (unsigned long) dereference_function_descriptor((void *)addr);

init/main.c:
addr = (unsigned long) dereference_function_descriptor(fn);

kernel/extable.c:
addr = (unsigned long) dereference_function_descriptor(ptr);

kernel/module.c:
unsigned long a = (unsigned long)dereference_function_descriptor(addr);

Convert dereference_function_descriptor() users tree-wide.

Signed-off-by: Sergey Senozhatsky 
Tested-by: Helge Deller  # parisc64
Tested-by: Santosh Sivaraj  # powerpc64
Acked-by: Michael Ellerman  # powerpc64
Tested-by: Tony Luck  # ia64
---
 arch/ia64/include/asm/sections.h| 6 +++---
 arch/parisc/include/asm/sections.h  | 2 +-
 arch/parisc/kernel/process.c| 6 +++---
 arch/parisc/mm/init.c   | 4 ++--
 arch/powerpc/include/asm/sections.h | 6 +++---
 drivers/misc/kgdbts.c   | 2 +-
 init/main.c | 2 +-
 kernel/extable.c| 2 +-
 kernel/module.c | 2 +-
 lib/vsprintf.c  | 2 +-
 10 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/arch/ia64/include/asm/sections.h b/arch/ia64/include/asm/sections.h
index 2ab2003698ef..de6bfa1ef8fb 100644
--- a/arch/ia64/include/asm/sections.h
+++ b/arch/ia64/include/asm/sections.h
@@ -27,13 +27,13 @@ extern char __start_unwind[], __end_unwind[];
 extern char __start_ivt_text[], __end_ivt_text[];
 
 #undef dereference_function_descriptor
-static inline void *dereference_function_descriptor(void *ptr)
+static inline unsigned long dereference_function_descriptor(unsigned long ptr)
 {
-   struct fdesc *desc = ptr;
+   struct fdesc *desc = (struct fdesc *)ptr;
void *p;
 
if (!probe_kernel_address(>ip, p))
-   ptr = p;
+   ptr = (unsigned long)p;
return ptr;
 }
 
diff --git a/arch/parisc/include/asm/sections.h 
b/arch/parisc/include/asm/sections.h
index 9d13c3507ad6..59fbe0067112 100644
--- a/arch/parisc/include/asm/sections.h
+++ b/arch/parisc/include/asm/sections.h
@@ -6,7 +6,7 @@
 
 #ifdef CONFIG_64BIT
 #undef dereference_function_descriptor
-void *dereference_function_descriptor(void *);
+unsigned long dereference_function_descriptor(unsigned long);
 #endif
 
 #endif
diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c
index 30f92391a93e..d350aa913acc 100644
--- a/arch/parisc/kernel/process.c
+++ b/arch/parisc/kernel/process.c
@@ -267,13 +267,13 @@ get_wchan(struct task_struct *p)
 }
 
 #ifdef CONFIG_64BIT
-void *dereference_function_descriptor(void *ptr)
+unsigned long dereference_function_descriptor(unsigned long ptr)
 {
-   Elf64_Fdesc *desc = ptr;
+   Elf64_Fdesc *desc = (Elf64_Fdesc *)ptr;
void *p;
 
if (!probe_kernel_address(>addr, p))
-   ptr = p;
+   ptr = (unsigned long)p;
return ptr;
 }
 #endif
diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c
index 1ca9a2b4239f..06e1b79e2946 100644
--- a/arch/parisc/mm/init.c
+++ b/arch/parisc/mm/init.c
@@ -389,10 +389,10 @@ static void __init setup_bootmem(void)
 static int __init parisc_text_address(unsigned long vaddr)
 {
static unsigned long head_ptr __initdata;
+   unsigned long addr = (unsigned long)_kernel_start;
 
if (!head_ptr)
-   head_ptr = PAGE_MASK & (unsigned long)
-   dereference_function_descriptor(_kernel_start);
+   head_ptr = PAGE_MASK & dereference_function_descriptor(addr);
 
return core_kernel_text(vaddr) || vaddr == head_ptr;
 }
diff --git a/arch/powerpc/include/asm/sections.h 
b/arch/powerpc/include/asm/sections.h
index 7902d6358854..67379b8945e8 100644
--- a/arch/powerpc/include/asm/sections.h
+++ b/arch/powerpc/include/asm/sections.h
@@ -66,13 +66,13 @@ static inline int overlaps_kvm_tmp(unsigned long start, 
unsigned long end)
 
 #ifdef PPC64_ELF_ABI_v1
 #undef dereference_function_descriptor
-static inline void *dereference_function_descriptor(void *ptr)
+static inline unsigned long dereference_function_descriptor(unsigned long ptr)
 {
-   struct ppc64_opd_entry *desc = ptr;
+   struct ppc64_opd_entry *desc = (struct ppc64_opd_entry *)ptr;
void *p;
 
if (!probe_kernel_address(>funcaddr, p))
-   ptr = p;
+   ptr = (unsigned long)p;
return ptr;
 }
 #endif /* PPC64_ELF_ABI_v1 */
diff --git a/drivers/misc/kgdbts.c b/drivers/misc/kgdbts.c
index fc7efedbc4be..6a5a159dfb75 100644
--- a/drivers/misc/kgdbts.c
+++ b/drivers/misc/kgdbts.c
@@ -225,7 +225,7 @@ static unsigned long 

[PATCHv3 0/7] printk/ia64/ppc64/parisc64: let's deprecate %pF/%pf printk specifiers

2017-09-29 Thread Sergey Senozhatsky
Hello


Petr, could you please pick up the series?

==

On some arches C function pointers are indirect and point to
a function descriptor, which contains the actual pointer to the code.
This mostly doesn't matter, except for cases when people want to print
out function pointers in symbolic format, because the usual '%pS/%ps'
does not work on those arches as expected. That's the reason why we
have '%pF/%pf', but since it's here because of a subtle ABI detail
specific to some arches (ppc64/ia64/parisc64) it's easy to misuse
'%pF/%pf' and '%pS/%ps' (see [1], for example).

This patch set attempts to move ia64/ppc64/parisc64 C function
pointer ABI details out of printk() to arch code. Function dereference
code now checks if a pointer belongs to a .opd ELF section and dereferences
that pointer only if it does. The kernel and modules have their own .opd
sections, which is why I use two different ARCH functions: one for kernel
and one for module pointer dereference.

I planned to remove dereference_function_descriptor() entirely,
but then I discovered a bunch of other use cases (kgdbts, init/main.c,
extable, etc.), so I decided to keep dereference_function_descriptor()
around because the main point of this patch set is to deprecate %pF/%pf.
But at the same time, I think I can go further and handle both kernel
and module descriptor dereference in dereference_function_descriptor().
We need a module pointer for the module .opd check, so that will come at
the extra cost of a module lookup (maybe there will be some other issues
along the way; I haven't checked).

Right now we've got:

- dereference_function_descriptor(addr)
a generic (old) function. it simply attempts to dereference
whatever pointer we give it.

- dereference_kernel_function_descriptor(addr)
dereferences a kernel pointer if it's within the kernel's .opd
section.

- dereference_module_function_descriptor(module, addr)
dereferences a module pointer if it's within the module's .opd
section.

v3:
-- picked up ACKs and Tested-by
-- tweaked checkpatch warning (Joe)
-- updated Documentation

v2:
-- convert dereference_function_descriptor() to unsigned long
-- fix kernel descriptor range checks (Helge)
-- fix parisc module descriptor range check (Helge)
-- fix ppc64 module range check
-- add checkpatch patch

Sergey Senozhatsky (7):
  switch dereference_function_descriptor() to `unsigned long'
  sections: split dereference_function_descriptor()
  ia64: Add .opd based function descriptor dereference
  powerpc64: Add .opd based function descriptor dereference
  parisc64: Add .opd based function descriptor dereference
  symbol lookup: use new kernel and module dereference functions
  checkpatch: add pF/pf deprecation warning

 Documentation/printk-formats.txt  | 20 ++--
 arch/ia64/include/asm/sections.h  | 16 
 arch/ia64/kernel/module.c | 13 +
 arch/ia64/kernel/vmlinux.lds.S|  2 ++
 arch/parisc/boot/compressed/vmlinux.lds.S |  2 ++
 arch/parisc/include/asm/sections.h|  4 +++-
 arch/parisc/kernel/module.c   | 17 +
 arch/parisc/kernel/process.c  | 15 ---
 arch/parisc/kernel/vmlinux.lds.S  |  2 ++
 arch/parisc/mm/init.c |  4 ++--
 arch/powerpc/include/asm/module.h |  3 +++
 arch/powerpc/include/asm/sections.h   | 17 ++---
 arch/powerpc/kernel/module_64.c   | 16 
 arch/powerpc/kernel/vmlinux.lds.S |  2 ++
 drivers/misc/kgdbts.c |  2 +-
 include/asm-generic/sections.h|  8 ++--
 include/linux/moduleloader.h  |  4 
 init/main.c   |  2 +-
 kernel/extable.c  |  2 +-
 kernel/kallsyms.c |  1 +
 kernel/module.c   |  9 -
 lib/vsprintf.c|  5 +
 scripts/checkpatch.pl | 11 +--
 23 files changed, 142 insertions(+), 35 deletions(-)

-- 
2.14.2



Re: [PATCH kernel v3] vfio/spapr: Add cond_resched() for huge updates

2017-09-29 Thread Alex Williamson
On Thu, 28 Sep 2017 19:16:12 +1000
Alexey Kardashevskiy  wrote:

> Clearing very big IOMMU tables can trigger soft lockups. This adds
> cond_resched() to allow the scheduler to do context switching when
> it decides to.
> 
> Signed-off-by: Alexey Kardashevskiy 
> ---
> 
> The testcase is POWER9 box with 264GB guest, 4 VFIO devices from
> independent IOMMU groups, 64K IOMMU pages. This configuration produces
> 4325376 TCE entries, each entry update incurs 4 OPAL calls to update
> an individual PE TCE cache; this produced lockups for more than 20s.
> Reducing table size to 4194304 (i.e. 256GB guest) or removing one
> of 4 VFIO devices makes the problem go away.
> 
> ---
> Changes:
> v3:
> * cond_resched() checks for should_resched() so we just call resched()
> and let the cpu scheduler decide whether to switch or not
> 
> v2:
> * replaced with time based solution
> ---
>  drivers/vfio/vfio_iommu_spapr_tce.c | 2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
> b/drivers/vfio/vfio_iommu_spapr_tce.c
> index 63112c36ab2d..759a5bdd40e1 100644
> --- a/drivers/vfio/vfio_iommu_spapr_tce.c
> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
> @@ -507,6 +507,8 @@ static int tce_iommu_clear(struct tce_container 
> *container,
>   enum dma_data_direction direction;
>  
>   for ( ; pages; --pages, ++entry) {
> + cond_resched();
> +
>   direction = DMA_NONE;
>   oldhpa = 0;
>   ret = iommu_tce_xchg(tbl, entry, , );

This looks fine to me, I've applied it to my local next branch for
v4.15.  I'll push that branch next week, once I can rebase to
4.14-rc3.  Thanks,

Alex


[PATCH v5 for 4.14 1/3] membarrier: Provide register expedited private command

2017-09-29 Thread Mathieu Desnoyers
Provide a new command allowing processes to register their intent to use
the private expedited command.

This allows PowerPC to skip the full memory barrier in switch_mm(), and
only issue the barrier when scheduling into a task belonging to a
process that has registered to use expedited private.

Processes are now required to register before using
MEMBARRIER_CMD_PRIVATE_EXPEDITED, otherwise that command returns EPERM.

Changes since v1:
- Use test_ti_thread_flag(next, ...) instead of test_thread_flag() in
  powerpc membarrier_arch_sched_in(), given that we want to specifically
  check the next thread state.
- Add missing ARCH_HAS_MEMBARRIER_HOOKS in Kconfig.
- Use task_thread_info() to pass thread_info from task to
  *_ti_thread_flag().

Changes since v2:
- Move membarrier_arch_sched_in() call to finish_task_switch().
- Check for NULL t->mm in membarrier_arch_fork().
- Use membarrier_sched_in() in generic code, which invokes the
  arch-specific membarrier_arch_sched_in(). This fixes allnoconfig
  build on PowerPC.
- Move asm/membarrier.h include under CONFIG_MEMBARRIER, fixing
  allnoconfig build on PowerPC.
- Build and runtime tested on PowerPC.

Changes since v3:
- Simply rely on copy_mm() to copy the membarrier_private_expedited mm
  field on fork.
- powerpc: test thread flag instead of reading
  membarrier_private_expedited in membarrier_arch_fork().
- powerpc: skip memory barrier in membarrier_arch_sched_in() if coming
  from kernel thread, since mmdrop() implies a full barrier.
- Set membarrier_private_expedited to 1 only after arch registration
  code, thus eliminating a race where concurrent commands could succeed
  when they should fail if issued concurrently with process
  registration.
- Use READ_ONCE() for membarrier_private_expedited field access in
  membarrier_private_expedited. Matches WRITE_ONCE() performed in
  process registration.

Changes since v4:
- Move powerpc hook from sched_in() to switch_mm(), based on feedback
  from Nicholas Piggin.

Signed-off-by: Mathieu Desnoyers 
CC: Peter Zijlstra 
CC: Paul E. McKenney 
CC: Boqun Feng 
CC: Andrew Hunter 
CC: Maged Michael 
CC: gro...@google.com
CC: Avi Kivity 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Michael Ellerman 
CC: Dave Watson 
CC: Alan Stern 
CC: Will Deacon 
CC: Andy Lutomirski 
CC: Ingo Molnar 
CC: Alexander Viro 
CC: Nicholas Piggin 
CC: linuxppc-dev@lists.ozlabs.org
CC: linux-a...@vger.kernel.org
---
 MAINTAINERS|  2 ++
 arch/powerpc/Kconfig   |  1 +
 arch/powerpc/include/asm/membarrier.h  | 43 +
 arch/powerpc/include/asm/thread_info.h |  3 ++
 arch/powerpc/kernel/Makefile   |  2 ++
 arch/powerpc/kernel/membarrier.c   | 45 ++
 arch/powerpc/mm/mmu_context.c  |  7 +
 fs/exec.c  |  1 +
 include/linux/mm_types.h   |  3 ++
 include/linux/sched/mm.h   | 50 ++
 include/uapi/linux/membarrier.h| 23 +++-
 init/Kconfig   |  3 ++
 kernel/fork.c  |  2 ++
 kernel/sched/core.c| 10 ---
 kernel/sched/membarrier.c  | 25 ++---
 15 files changed, 199 insertions(+), 21 deletions(-)
 create mode 100644 arch/powerpc/include/asm/membarrier.h
 create mode 100644 arch/powerpc/kernel/membarrier.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 6671f375f7fc..f11d8aece00d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8816,6 +8816,8 @@ L:linux-ker...@vger.kernel.org
 S: Supported
 F: kernel/sched/membarrier.c
 F: include/uapi/linux/membarrier.h
+F: arch/powerpc/kernel/membarrier.c
+F: arch/powerpc/include/asm/membarrier.h
 
 MEMORY MANAGEMENT
 L: linux...@kvack.org
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 809c468edab1..6f44c5f74f71 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -138,6 +138,7 @@ config PPC
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_GCOV_PROFILE_ALL
+   select ARCH_HAS_MEMBARRIER_HOOKS
select ARCH_HAS_SCALED_CPUTIME  if VIRT_CPU_ACCOUNTING_NATIVE
select ARCH_HAS_SG_CHAIN
select ARCH_HAS_TICK_BROADCAST  if GENERIC_CLOCKEVENTS_BROADCAST
diff --git a/arch/powerpc/include/asm/membarrier.h 
b/arch/powerpc/include/asm/membarrier.h
new file mode 100644
index ..61152a7a3cf9
--- /dev/null
+++ b/arch/powerpc/include/asm/membarrier.h
@@ -0,0 +1,43 @@
+#ifndef 

Re: [PATCH v3 00/20] Speculative page faults

2017-09-29 Thread Laurent Dufour

Hi Andrew,

On 28/09/2017 22:38, Andrew Morton wrote:

On Thu, 28 Sep 2017 14:29:02 +0200 Laurent Dufour  
wrote:


Laurent's [0/n] provides some nice-looking performance benefits for
workloads which are chosen to show performance benefits(!) but, alas,
no quantitative testing results for workloads which we may suspect will
be harmed by the changes(?).  Even things as simple as impact upon
single-threaded pagefault-intensive workloads and its effect upon
CONFIG_SMP=n .text size?


I forgot to mention in my previous email the impact on the .text section.

Here are the metrics I got :

.text size  UP  SMP Delta
4.13-mmotm  8444201 8964137 6.16%
'' +spf 8452041 8971929 6.15%
Delta   0.09%   0.09%   

No major impact, as you can see.


8k text increase seems rather a lot actually.  That's a lot more
userspace cachelines that get evicted during a fault...

Is the feature actually beneficial on uniprocessor?


This is useless on uniprocessor, and I will disable it on x86 when !SMP 
by not defining __HAVE_ARCH_CALL_SPF.
So the speculative page fault handler will not be built, but the vm 
sequence counter and the SRCU stuff will still be there. I may also 
disable it through a macro when __HAVE_ARCH_CALL_SPF is not defined, but 
this may obfuscate the code a bit...


On ppc64, as this feature requires book3s, it can't be built without SMP 
support.


I rebuilt the code on my x86 guest with the following patch applied:
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -260,7 +260,7 @@ enum page_cache_mode {
 /*
  * Advertise that we call the Speculative Page Fault handler.
  */
-#ifdef CONFIG_X86_64
+#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
 #define __HAVE_ARCH_CALL_SPF
 #endif

And this time I got the following size on UP :
UP
4.13-mmotm  8444201
'' +spf 8447945 (previously 8452041)
  +3744

If I also disabled all the vm_sequence operations and the SRCU stuff, 
the size delta would drop to 0.


Thanks,
Laurent.



Re: [linux-next] [bisected a4615d11] Memory DLPAR triggers WARN_ONCE() in mm/page_vma_mapped.c

2017-09-29 Thread Zi Yan
Hi Abdul,

I just want to follow up with this.

Did you have a chance to test my patch? Does it fix your original problem?

Thanks.

--
Best Regards
Yan Zi

On 13 Sep 2017, at 1:48, abdul wrote:

> On Mon, 2017-09-11 at 10:53 -0400, Zi Yan wrote:
>> Hi Abdul,
>>
>> Can you try this patch below? I think I missed that pmd entries
>> can be zapped, so the WARN_ONCE is unnecessary.
>>
>> Thanks.
>>
>> diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
>> index 6a03946469a9..eb462e7db0a9 100644
>> --- a/mm/page_vma_mapped.c
>> +++ b/mm/page_vma_mapped.c
>> @@ -167,8 +167,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk 
>> *pvmw)
>> return not_found(pvmw);
>> return true;
>> }
>> -   } else
>> -   WARN_ONCE(1, "Non present huge pmd without 
>> pmd migration enabled!");
>> +   }
>> return not_found(pvmw);
>> } else {
>> /* THP pmd was split under us: handle on pte level */
>>
>> --
>> Best Regards
>> Yan Zi
>>
>> On 11 Sep 2017, at 5:56, abdul wrote:
>
> Kernel Oops a different bug is blocking me to verify the given patch,
> may be unrelated to this.
>
> I can validate once the below one gets fixed.
>
> BUG: Bad page state in process avocado  pfn:74943
> page:f1d250c0 count:1 mapcount:0 mapping:c594a299
> index:0x1
> flags: 0x3380004007c(referenced|uptodate|dirty|lru|active|
> swapbacked)
> raw: 03380004007c c594a299 0001 0001
> raw: 5deadbeef100 5deadbeef200  c0077e391800
> page dumped because: page still charged to cgroup
> page->mem_cgroup:c0077e391800
> bad because of flags: 0x4007c(referenced|uptodate|dirty|lru|active|
> swapbacked)
> Unable to handle kernel paging request for data at address
> 0x5deadbeef108
> Faulting instruction address: 0xc02b5604
> Oops: Kernel access of bad area, sig: 11 [#1]
> LE SMP NR_CPUS=2048 NUMA pSeries
> Modules linked in: xt_addrtype xt_conntrack ipt_MASQUERADE
> nf_nat_masquerade_ipv4 iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4
> nf_nat_ipv4 iptable_filter ip_tables x_tables nf_nat nf_conntrack bridge
> stp llc dm_thin_pool dm_persistent_data dm_bio_prison dm_bufio libcrc32c
> rtc_generic vmx_crypto pseries_rng autofs4
> CPU: 3 PID: 922 Comm: avocado Tainted: GB
> 4.13.0-next-20170907-autotest #2
> task: c00771bc1700 task.stack: c00771c04000
> NIP:  c02b5604 LR: c02b7678 CTR: 
> REGS: c00771c072c0 TRAP: 0380   Tainted: GB
> (4.13.0-next-20170907-autotest)
> MSR:  80010280b033   CR:
> 82228228  XER: 200f
> CFAR: c02b7674 SOFTE: 0
> GPR00: c02b7678 c00771c07540 c1599900
> 
> GPR04: f1d250e0 0001 5deadbeef100
> 5deadbeef200
> GPR08: 5deadbee c0077ff54710 
> 
> GPR12: 22242224 ce741f80 00077eb1
> c0077fbe88f8
> GPR16: c0077ff54600 4000 
> 2000
> GPR20: 0002 c0077fbe8918 c10d88f8
> 
> GPR24: 0001 0040 c0077ff54600
> f1d250c0
> GPR28: 0010  0001
> 
> NIP [c02b5604] __rmqueue+0xd4/0x680
> LR [c02b7678] get_page_from_freelist+0x798/0xe30
> Call Trace:
> [c00771c07540] [f1d250c0] 0xf1d250c0 (unreliable)
> [c00771c075f0] [c02b7678] get_page_from_freelist+0x798/0xe30
> [c00771c07700] [c02b88b8] __alloc_pages_nodemask
> +0x528/0x1120
> [c00771c078f0] [c0358864] new_node_page+0x174/0x200
> [c00771c07950] [c035f170] migrate_pages+0x2d0/0x1160
> [c00771c07a30] [c035b1e4] __offline_pages.constprop.6
> +0x8c4/0xa80
> [c00771c07b70] [c07e2448] memory_subsys_offline+0xa8/0x110
> [c00771c07ba0] [c07b45d4] device_offline+0x104/0x140
> [c00771c07be0] [c07e223c] store_mem_state+0x17c/0x190
> [c00771c07c20] [c07aec28] dev_attr_store+0x68/0xa0
> [c00771c07c60] [c0457680] sysfs_kf_write+0x80/0xb0
> [c00771c07ca0] [c045638c] kernfs_fop_write+0x17c/0x250
> [c00771c07cf0] [c038e20c] __vfs_write+0x6c/0x230
> [c00771c07d90] [c0390170] vfs_write+0xd0/0x270
> [c00771c07de0] [c039214c] SyS_write+0x6c/0x110
> [c00771c07e30] [c000b184] system_call+0x58/0x6c
> Instruction dump:
> 39290100 7c9a482a 7d3a4a14 7fa92040 3764ffe0 419e01d8 41c201d4 3d005dea
> e8e40008 e8c4 6108dbee 790807c6  6508f000 f8c7
> 7d094378
> ---[ end trace ca28dd806080b418 ]---
>
>
>>>
>>> Regard's
>>> Abdul Haleem
>>> IBM Linux Technology Center



Re: powerpc: Fix workaround for spurious MCE on POWER9

2017-09-29 Thread Michael Ellerman
On Fri, 2017-09-29 at 03:37:35 UTC, Michael Neuling wrote:
> In the recent commit:
>   d8bd9f3f09 powerpc: Handle MCE on POWER9 with only DSISR bit 30 set
> I screwed up the bit.  It should be bit 25 (IBM bit 38).
> 
> Signed-off-by: Michael Neuling 

Applied to powerpc fixes, thanks.

https://git.kernel.org/powerpc/c/bca73f595a566f0262967535bb5b2e

cheers


Re: [V3] cxl: Fix memory page not handled

2017-09-29 Thread Michael Ellerman
On Tue, 2017-09-26 at 08:15:21 UTC, Christophe Lombard wrote:
> The in-kernel 'library' API can be called by drivers to help
> interaction with an IBM XSL on a POWER9 system.
> 
> The cxllib_handle_fault() API is used to handle memory fault. All memory
> pages of the specified buffer have to be handled but under certain
> conditions,the last page may not be touched, and the address the
> adapter is trying to access is never sent to the kernel for resolution.
> 
> This patch reworks start address of the loop with an address aligned on
> the page size. In this context, the last page is not missed.
> 
> Signed-off-by: Christophe Lombard 
> Acked-by: Frederic Barrat 
> Acked-by: Andrew Donnellan 

Applied to powerpc fixes, thanks.

https://git.kernel.org/powerpc/c/4fc0870d7e462fe3b86e0f938ae75c

cheers


Re: [PATCH v8 6/6] powerpc/fadump: use the new parse_args callback arguments

2017-09-29 Thread Hari Bathini

In case someone wishes for a changelog:

With the parse_args() callback function, fadump_rework_cmdline_params(),
now taking new arguments - current & next - use them to process the
'fadump_extra_args=' parameter, enforcing the parameters passed through
it for the fadump kernel.


On Tuesday 12 September 2017 09:31 PM, Michal Suchanek wrote:

Signed-off-by: Michal Suchanek 
---
  arch/powerpc/kernel/fadump.c | 47 
  1 file changed, 13 insertions(+), 34 deletions(-)

diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 8778e1cc0380..1678d99ea835 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -481,33 +481,19 @@ struct param_info {
  };

  static void __init fadump_update_params(struct param_info *param_info,
-   char *param, char *val)
+   char *param, char *val,
+   char *currant, char *next)
  {
-   ptrdiff_t param_offset = param - param_info->tmp_cmdline;
+   ptrdiff_t param_offset = currant - param_info->tmp_cmdline;
size_t vallen = val ? strlen(val) : 0;
char *tgt = param_info->cmdline + param_offset
- param_info->shortening;
-   int shortening = 0;
-   int quoted = 0;
+   int shortening = ((next - 1) - (currant))
+   - (FADUMP_EXTRA_ARGS_LEN + 1 + vallen);

if (!val)
return;

-   /* leading '"' removed from parameter */
-   if ((param > param_info->tmp_cmdline) && *(param - 1) == '"') {
-   quoted = 1;
-   shortening += 1;
-   tgt--;
-   }
-
-   /* next_arg removes one leading and one trailing '"' */
-   if ((*(tgt + FADUMP_EXTRA_ARGS_LEN + 1 + vallen + shortening) == '"') &&
-   (quoted || (*(tgt + FADUMP_EXTRA_ARGS_LEN + 1) == '"'))) {
-   shortening += 1;
-   if (!quoted)
-   shortening += 1;
-   }
-
/* remove one leading and one trailing quote if both are present */
if ((val[0] == '"') && (val[vallen - 1] == '"')) {
shortening += 2;
@@ -515,22 +501,15 @@ static void __init fadump_update_params(struct param_info 
*param_info,
val++;
}

-   /* some characters were removed - move the trailing part of cmdline */
-   if (shortening) {
-   char *src;
+   strncpy(tgt, FADUMP_EXTRA_ARGS_PARAM, FADUMP_EXTRA_ARGS_LEN);
+   tgt += FADUMP_EXTRA_ARGS_LEN;
+   *tgt++ = ' ';
+   strncpy(tgt, val, vallen);
+   tgt += vallen;

-   strncpy(tgt, FADUMP_EXTRA_ARGS_PARAM, FADUMP_EXTRA_ARGS_LEN);
-   tgt += FADUMP_EXTRA_ARGS_LEN;
-   *tgt++ = ' ';
-
-   strncpy(tgt, val, vallen);
-   tgt += vallen;
-
-   src = tgt + shortening;
+   if (shortening) {
+   char *src = tgt + shortening;
memmove(tgt, src, strlen(src) + 1);
-   } else {
-   /* remove the '=' */
-   *(tgt + FADUMP_EXTRA_ARGS_LEN) = ' ';
}

param_info->shortening += shortening;
@@ -550,7 +529,7 @@ static int __init fadump_rework_cmdline_params(char *param, 
char *val,
 strlen(FADUMP_EXTRA_ARGS_PARAM) - 1))
return 0;

-   fadump_update_params(param_info, param, val);
+   fadump_update_params(param_info, param, val, currant, next);

return 0;
  }



Thanks
Hari



Re: [PATCH] powerpc/livepatch: Fix livepatch stack access

2017-09-29 Thread Balbir Singh
On Wed, 2017-09-20 at 16:41 +0530, Naveen N . Rao wrote:
> On 2017/09/20 03:49PM, Kamalesh Babulal wrote:
> > While running stress test with livepatch module loaded, kernel
> > bug was triggered.
> > 
> > cpu 0x5: Vector: 400 (Instruction Access) at [c000eb9d3b60]
> > pc: c000eb9d3e30
> > lr: c000eb9d3e30
> > sp: c000eb9d3de0
> >msr: 80001280b033
> >   current = 0xc000dbd38700
> >   paca= 0xcfe01400   softe: 0irq_happened: 0x01
> > pid   = 8618, comm = make
> > Linux version 4.13.0+ (root@ubuntu) (gcc version 6.3.0 20170406 (Ubuntu 
> > 6.3.0-12ubuntu2)) #1 SMP Wed Sep 13 03:49:27 EDT 2017
> > 
> > 5:mon> t
> > [c000eb9d3de0] c000eb9d3e30 (unreliable)
> > [c000eb9d3e30] c0008ab4 hardware_interrupt_common+0x114/0x120
> >  --- Exception: 501 (Hardware Interrupt) at c0053040 
> > livepatch_handler+0x4c/0x74
> > [c000eb9d4120] 57ac6e9d (unreliable)
> > [d89d9f78] 2e0965747962382e
> > SP (965747962342e09) is in userspace
> > 
> > When an interrupt is served in between the livepatch_handler execution,
> > there are chances of the livepatch_stack/task task getting corrupted.
> > 
> > CPU 1
> >  =
> > Task A  Interrupt Handler
> > =   =
> > livepatch_handler:
> >  mr r0, r1
> >  ld r1, TI_livepatch_sp(r12)
> > hardware_interrupt_common
> > |_do_IRQ
> >   |_ call_do_irq:
> > mflr r0
> > std  r0,16(r1)
> > stdu 
> > r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r4)
> > ...
> > lis r2, STACK_END_MAGIC@h
> > ori r2, r2, STACK_END_MAGIC@l
> > ld  r12, -8(r1)  <- livepatch stack is corrupted.
> > 
> > Fix the corruption by using r11 register for livepatch stack manipulation,
> > instead of shuffling task stack and livepatch stack into r1 register.
> > Using r11 register also avoids disabling/enabling irq's while setting
> > up the livepatch stack.
> > 
> > Signed-off-by: Kamalesh Babulal 
> > Cc: Balbir Singh 
> > Cc: Naveen N. Rao 
> > Cc: linuxppc-dev@lists.ozlabs.org
> > ---
> >  arch/powerpc/kernel/trace/ftrace_64_mprofile.S | 45 
> > +-
> >  1 file changed, 15 insertions(+), 30 deletions(-)
> 
> Reviewed-by: Naveen N. Rao 
Reviewed-by: Balbir Singh 



Re: [PATCH v4 for 4.14 1/3] membarrier: Provide register expedited private command

2017-09-29 Thread Peter Zijlstra
On Fri, Sep 29, 2017 at 09:38:53PM +1000, Nicholas Piggin wrote:

> Not really. There is some ability to hold onto a line for a time, but
> there is no way to starve them, let alone starve hundreds of other
> CPUs. They will request the cacheline exclusive and eventually get it.

OK, hardware fairness there is nice.

> I would really prefer to go this way on powerpc first. We could add the
> the registration APIs as basically no-ops, but which would allow the
> locking approach to be changed if we find it causes issues. I'll try to
> find some time and a big system when I can.

Fair enough I suppose.

> > A semi related issue; I suppose we can do a arch upcall to flush_tlb_mm
> > and reset the mm_cpumask when we change cpuset groups.
> 
> For powerpc we have been looking at how mm_cpumask can be improved.
> It has real drawbacks even when you don't consider this new syscall.

What else do you use mm_cpumask for?


Re: [PATCH v4 for 4.14 1/3] membarrier: Provide register expedited private command

2017-09-29 Thread Nicholas Piggin
On Fri, 29 Sep 2017 12:31:31 +0200
Peter Zijlstra  wrote:

> On Fri, Sep 29, 2017 at 02:27:57AM +1000, Nicholas Piggin wrote:
> 
> > The biggest power boxes are more tightly coupled than those big
> > SGI systems, but even so just plodding along taking and releasing
> > locks in turn would be fine on those SGI ones as well really. Not DoS
> > level. This is not a single mega hot cache line or lock that is
> > bouncing over the entire machine, but one process grabbing a line and
> > lock from each of 1000 CPUs.
> > 
> > Slight disturbance sure, but each individual CPU will see it as 1/1000th
> > of a disturbance, most of the cost will be concentrated in the syscall
> > caller.  
> 
> But once the:
> 
>   while (1)
>   sys_membarrier()
> 
> thread has all those (lock) lines in M state locally, it will become
> very hard for the remote CPUs to claim them back, because its constantly

Not really. There is some ability to hold onto a line for a time, but
there is no way to starve them, let alone starve hundreds of other
CPUs. They will request the cacheline exclusive and eventually get it.
Then the membarrier CPU has to pay to get it back. If there is a lot of
activity on the locks, the membarrier will have a difficult time to take
each one.

I don't say there is zero cost or can't interfere with others, only that
it does not seem particularly bad compared with other things. Once you
restrict it to mm_cpumask, then it's quite partitionable.

I would really prefer to go this way on powerpc first. We could add the
registration APIs as basically no-ops, but which would allow the
locking approach to be changed if we find it causes issues. I'll try to
find some time and a big system when I can.

> touching them. Sure it will touch a 1000 other lines before its back to
> this one, but if they're all local that's fairly quick.
> 
> But you're right, your big machines have far smaller NUMA factors.
> 
> > > Bouncing that lock across the machine is *painful*, I have vague
> > > memories of cases where the lock ping-pong was most the time spend.
> > > 
> > > But only Power needs this, all the other architectures are fine with the
> > > lockless approach for MEMBAR_EXPEDITED_PRIVATE.  
> > 
> > Yes, we can add an iterator function that power can override in a few
> > lines. Less arch specific code than this proposal.  
> 
> A semi related issue; I suppose we can do a arch upcall to flush_tlb_mm
> and reset the mm_cpumask when we change cpuset groups.

For powerpc we have been looking at how mm_cpumask can be improved.
It has real drawbacks even when you don't consider this new syscall.

Thanks,
Nick


Re: [linux-next][Oops] memory hot-unplug results fault instruction address at /include/linux/list.h:104

2017-09-29 Thread Abdul Haleem
On Wed, 2017-09-20 at 12:54 -0700, Kees Cook wrote:
> On Wed, Sep 20, 2017 at 12:40 AM, Abdul Haleem
>  wrote:
> > On Tue, 2017-09-12 at 12:11 +0530, abdul wrote:
> >> Hi,
> >>
> >> Memory hot-unplug on PowerVM LPAR running next-20170911 results in
> >> Faulting instruction address: 0xc02b56c4
> >>
> >> which maps to the below code path:
> >>
> >> 0xc02b56c4 is in __rmqueue (./include/linux/list.h:104).
> >> 99 * This is only for internal list manipulation where we know
> >> 100* the prev/next entries already!
> >> 101*/
> >> 102   static inline void __list_del(struct list_head * prev, struct
> >> list_head * next)
> >> 103   {
> >> 104   next->prev = prev;
> >> 105   WRITE_ONCE(prev->next, next);
> >> 106   }
> >> 107
> >> 108   /**
> >>
> >
> > I see another kernel Oops when running transparent hugepages
> > de-fragmentation test.
> >
> > And the faulty instruction address again pointing to same code line
> > 0xc026f9f4 is in compaction_alloc (./include/linux/list.h:104)
> >
> > steps to recreate:
> > -
> > 1. Enable transparent hugepages ("always")
> > 2. Turn off the defrag $ echo 0 > khugepaged/defrag
> > 3. Write random to memory path
> > 4. Set huge pages numbers
> > 5. Turn on defrag $ echo 1 > khugepaged/defrag
> >
> >
> > new trace:
> > --
> > Unable to handle kernel paging request for data at address
> > 0x5deadbeef108
> 
> This looks like use-after-list-removal, that value appears to be LIST_POISON1.
> 
> Try enabling CONFIG_DEBUG_LIST to see if you get better details?

With the above config enabled I see the messages below and also call
traces, but no kernel Oops.

BUG: Bad page state in process drmgr  pfn:770c7
page:f1dc31c0 count:0 mapcount:0 mapping:f1dc31c8
index:0x1
flags: 0x338()
raw: 0338 f1dc31c8 0001 
raw: 5deadbeef100 5deadbeef200  
page dumped because: non-NULL mapping



-- 
Regards

Abdul Haleem
IBM Linux Technology Centre


BUG: Bad page state in process drmgr  pfn:770c7
page:f1dc31c0 count:0 mapcount:0 mapping:f1dc31c8 index:0x1
flags: 0x338()
raw: 0338 f1dc31c8 0001 
raw: 5deadbeef100 5deadbeef200  
page dumped because: non-NULL mapping
BUG: Bad page state in process drmgr  pfn:770c7
page:f1dc31c0 count:0 mapcount:0 mapping:f1dc31c8 index:0x1
flags: 0x338()
raw: 0338 f1dc31c8 0001 
raw: 5deadbeef100 5deadbeef200  
page dumped because: non-NULL mapping
BUG: Bad page state in process drmgr  pfn:770c7
page:f1dc31c0 count:0 mapcount:0 mapping:f1dc31c8 index:0x1
flags: 0x338()
raw: 0338 f1dc31c8 0001 
raw: 5deadbeef100 5deadbeef200  
page dumped because: non-NULL mapping
BUG: Bad page state in process systemd-journal  pfn:770c7
page:f1dc31c0 count:0 mapcount:0 mapping:f1dc31c8 index:0x1
flags: 0x338()
raw: 0338 f1dc31c8 0001 
raw: 5deadbeef100 5deadbeef200  
page dumped because: non-NULL mapping
BUG: Bad page state in process systemd-journal  pfn:770c7
page:f1dc31c0 count:0 mapcount:0 mapping:f1dc31c8 index:0x1
flags: 0x338()
raw: 0338 f1dc31c8 0001 
raw: 5deadbeef100 5deadbeef200  
page dumped because: non-NULL mapping
BUG: Bad page state in process in:imklog  pfn:770c7
page:f1dc31c0 count:0 mapcount:0 mapping:f1dc31c8 index:0x1
flags: 0x338()
raw: 0338 f1dc31c8 0001 
raw: 5deadbeef100 5deadbeef200  
page dumped because: non-NULL mapping
BUG: Bad page state in process systemd-journal  pfn:770c7
page:f1dc31c0 count:0 mapcount:0 mapping:f1dc31c8 index:0x1
flags: 0x338()
raw: 0338 f1dc31c8 0001 
raw: 5deadbeef100 5deadbeef200  
page dumped because: non-NULL mapping
BUG: Bad page state in process in:imklog  pfn:770c7
page:f1dc31c0 count:0 mapcount:0 mapping:f1dc31c8 index:0x1
flags: 0x338()
raw: 0338 f1dc31c8 0001 
raw: 5deadbeef100 5deadbeef200  
page dumped because: non-NULL mapping
BUG: Bad page state in process in:imklog  pfn:770c7
page:f1dc31c0 count:0 mapcount:0 mapping:f1dc31c8 index:0x1
flags: 0x338()
raw: 0338 

Re: [PATCH v4 for 4.14 1/3] membarrier: Provide register expedited private command

2017-09-29 Thread Peter Zijlstra
On Fri, Sep 29, 2017 at 02:27:57AM +1000, Nicholas Piggin wrote:

> The biggest power boxes are more tightly coupled than those big
> SGI systems, but even so just plodding along taking and releasing
> locks in turn would be fine on those SGI ones as well really. Not DoS
> level. This is not a single mega hot cache line or lock that is
> bouncing over the entire machine, but one process grabbing a line and
> lock from each of 1000 CPUs.
> 
> Slight disturbance sure, but each individual CPU will see it as 1/1000th
> of a disturbance, most of the cost will be concentrated in the syscall
> caller.

But once the:

while (1)
sys_membarrier()

thread has all those (lock) lines in M state locally, it will become
very hard for the remote CPUs to claim them back, because it's constantly
touching them. Sure it will touch a 1000 other lines before it's back to
this one, but if they're all local that's fairly quick.

But you're right, your big machines have far smaller NUMA factors.

> > Bouncing that lock across the machine is *painful*, I have vague
> > memories of cases where the lock ping-pong was most the time spend.
> > 
> > But only Power needs this, all the other architectures are fine with the
> > lockless approach for MEMBAR_EXPEDITED_PRIVATE.
> 
> Yes, we can add an iterator function that power can override in a few
> lines. Less arch specific code than this proposal.

A semi related issue; I suppose we can do an arch upcall to flush_tlb_mm
and reset the mm_cpumask when we change cpuset groups.


Re: [PATCH] powernv: Add OCC driver to mmap sensor area

2017-09-29 Thread Oliver
On Fri, Sep 29, 2017 at 4:47 PM, Shilpasri G Bhat
 wrote:
> This driver provides interface to mmap the OCC sensor area
> to userspace to parse and read OCC inband sensors.
>
> Signed-off-by: Shilpasri G Bhat 
> ---
> - The skiboot patch for this is posted here:
> https://lists.ozlabs.org/pipermail/skiboot/2017-September/009209.html
>
>  arch/powerpc/platforms/powernv/Makefile   |  2 +-
>  arch/powerpc/platforms/powernv/opal-occ.c | 88 
> +++
>  arch/powerpc/platforms/powernv/opal.c |  3 ++
>  3 files changed, 92 insertions(+), 1 deletion(-)
>  create mode 100644 arch/powerpc/platforms/powernv/opal-occ.c
>
> diff --git a/arch/powerpc/platforms/powernv/Makefile 
> b/arch/powerpc/platforms/powernv/Makefile
> index 37d60f7..7911295 100644
> --- a/arch/powerpc/platforms/powernv/Makefile
> +++ b/arch/powerpc/platforms/powernv/Makefile
> @@ -2,7 +2,7 @@ obj-y   += setup.o opal-wrappers.o opal.o 
> opal-async.o idle.o
>  obj-y  += opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o
>  obj-y  += rng.o opal-elog.o opal-dump.o opal-sysparam.o 
> opal-sensor.o
>  obj-y  += opal-msglog.o opal-hmi.o opal-power.o 
> opal-irqchip.o
> -obj-y  += opal-kmsg.o opal-powercap.o opal-psr.o 
> opal-sensor-groups.o
> +obj-y  += opal-kmsg.o opal-powercap.o opal-psr.o 
> opal-sensor-groups.o opal-occ.o
>
>  obj-$(CONFIG_SMP)  += smp.o subcore.o subcore-asm.o
>  obj-$(CONFIG_PCI)  += pci.o pci-ioda.o npu-dma.o
> diff --git a/arch/powerpc/platforms/powernv/opal-occ.c 
> b/arch/powerpc/platforms/powernv/opal-occ.c
> new file mode 100644
> index 000..5ca3a41
> --- /dev/null
> +++ b/arch/powerpc/platforms/powernv/opal-occ.c
> @@ -0,0 +1,88 @@
> +/*
> + * Copyright IBM Corporation 2017
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + */
> +
> +#define pr_fmt(fmt) "opal-occ: " fmt
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +static struct miscdevice occ;
> +static u64 sensor_base, sensor_size;
> +
> +static int opal_occ_mmap(struct file *file, struct vm_area_struct *vma)
> +{
> +   if (vma->vm_flags & VM_WRITE)
> +   return -EINVAL;
> +
> +   return vm_iomap_memory(vma, sensor_base, sensor_size);

Hmm, this concerns me slightly. Is this going to create an IO page
mapping (i.e. cache inhibited) or normal memory mapping?

Another possible issue is that we're limited to mapping 64KB pages so
this will expose whatever follows the sensor block in memory to user
space unless the range is 64KB aligned. It looks like this is probably
safe for P9 due to the layout of the sensor memory, but that might not
be true in the future. At the very least you should have a comment
mentioning the issue.

> +}
> +
> +static const struct file_operations opal_occ_fops = {
> +   .mmap   = opal_occ_mmap,
> +   .owner  = THIS_MODULE,
> +};
> +
> +static int opal_occ_probe(struct platform_device *pdev)
> +{
> +   u64 reg[2];
> +   int rc;
> +
> +   if (!pdev || !pdev->dev.of_node)
> +   return -ENODEV;
> +
> +   if (of_property_read_u64_array(pdev->dev.of_node, "occ-sensors",
> +  [0], 2)) {
> +   pr_warn("occ-sensors property not found\n");
> +   return -ENODEV;
> +   }
> +
> +   sensor_base = reg[0];
> +   sensor_size = reg[1];
> +   occ.minor = MISC_DYNAMIC_MINOR;
> +   occ.name = "occ";
> +   occ.fops = _occ_fops;
> +   rc = misc_register();
> +   if (rc)
> +   pr_warn("Failed to register OCC device\n");
> +
> +   return rc;
> +}
> +
> +static int opal_occ_remove(struct platform_device *pdev)
> +{
> +   misc_deregister();
> +   return 0;
> +}
> +
> +static const struct of_device_id opal_occ_match[] = {
> +   { .compatible = "ibm,opal-occ-inband-sensors" },
> +   { },
> +};
> +
> +static struct platform_driver opal_occ_driver = {
> +   .driver = {
> +   .name   = "opal_occ",
> +   .of_match_table = opal_occ_match,
> +},
> +   .probe  = opal_occ_probe,
> +   .remove = opal_occ_remove,
> +};
> +
> +module_platform_driver(opal_occ_driver);
> +
> +MODULE_DESCRIPTION("PowerNV OPAL-OCC driver");
> +MODULE_LICENSE("GPL");
> diff --git a/arch/powerpc/platforms/powernv/opal.c 
> b/arch/powerpc/platforms/powernv/opal.c
> index 65c79ec..a4f977f 100644
> --- 

Re: [PATCH 1/3] powerpc/lib/sstep: Add XER bits introduced in POWER ISA v3.0

2017-09-29 Thread Naveen N. Rao
On 2017/09/29 05:44AM, Sandipan Das wrote:
> This adds definitions for the OV32 and CA32 bits of XER that
> were introduced in POWER ISA v3.0. There are some existing
> instructions that currently set the OV and CA bits based on
> certain conditions.
> 
> The emulation behaviour of all these instructions needs to
> be updated to set these new bits accordingly.
> 
> Signed-off-by: Sandipan Das 

For this series:
Acked-by: Naveen N. Rao 

> ---
>  arch/powerpc/lib/sstep.c | 2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
> index 5e8418c28bd8..16814bfc01da 100644
> --- a/arch/powerpc/lib/sstep.c
> +++ b/arch/powerpc/lib/sstep.c
> @@ -31,6 +31,8 @@ extern char system_call_common[];
>  #define XER_SO   0x8000U
>  #define XER_OV   0x4000U
>  #define XER_CA   0x2000U
> +#define XER_OV32 0x0008U
> +#define XER_CA32 0x0004U
> 
>  #ifdef CONFIG_PPC_FPU
>  /*
> -- 
> 2.13.5
> 



[PATCH] powernv: Add OCC driver to mmap sensor area

2017-09-29 Thread Shilpasri G Bhat
This driver provides an interface to mmap the OCC sensor area
to userspace to parse and read OCC inband sensors.

Signed-off-by: Shilpasri G Bhat 
---
- The skiboot patch for this is posted here:
https://lists.ozlabs.org/pipermail/skiboot/2017-September/009209.html

 arch/powerpc/platforms/powernv/Makefile   |  2 +-
 arch/powerpc/platforms/powernv/opal-occ.c | 88 +++
 arch/powerpc/platforms/powernv/opal.c |  3 ++
 3 files changed, 92 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/platforms/powernv/opal-occ.c

diff --git a/arch/powerpc/platforms/powernv/Makefile 
b/arch/powerpc/platforms/powernv/Makefile
index 37d60f7..7911295 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -2,7 +2,7 @@ obj-y   += setup.o opal-wrappers.o opal.o 
opal-async.o idle.o
 obj-y  += opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o
 obj-y  += rng.o opal-elog.o opal-dump.o opal-sysparam.o 
opal-sensor.o
 obj-y  += opal-msglog.o opal-hmi.o opal-power.o opal-irqchip.o
-obj-y  += opal-kmsg.o opal-powercap.o opal-psr.o 
opal-sensor-groups.o
+obj-y  += opal-kmsg.o opal-powercap.o opal-psr.o 
opal-sensor-groups.o opal-occ.o
 
 obj-$(CONFIG_SMP)  += smp.o subcore.o subcore-asm.o
 obj-$(CONFIG_PCI)  += pci.o pci-ioda.o npu-dma.o
diff --git a/arch/powerpc/platforms/powernv/opal-occ.c 
b/arch/powerpc/platforms/powernv/opal-occ.c
new file mode 100644
index 000..5ca3a41
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-occ.c
@@ -0,0 +1,88 @@
+/*
+ * Copyright IBM Corporation 2017
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#define pr_fmt(fmt) "opal-occ: " fmt
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+static struct miscdevice occ;
+static u64 sensor_base, sensor_size;
+
+static int opal_occ_mmap(struct file *file, struct vm_area_struct *vma)
+{
+   if (vma->vm_flags & VM_WRITE)
+   return -EINVAL;
+
+   return vm_iomap_memory(vma, sensor_base, sensor_size);
+}
+
+static const struct file_operations opal_occ_fops = {
+   .mmap   = opal_occ_mmap,
+   .owner  = THIS_MODULE,
+};
+
+static int opal_occ_probe(struct platform_device *pdev)
+{
+   u64 reg[2];
+   int rc;
+
+   if (!pdev || !pdev->dev.of_node)
+   return -ENODEV;
+
+   if (of_property_read_u64_array(pdev->dev.of_node, "occ-sensors",
+  [0], 2)) {
+   pr_warn("occ-sensors property not found\n");
+   return -ENODEV;
+   }
+
+   sensor_base = reg[0];
+   sensor_size = reg[1];
+   occ.minor = MISC_DYNAMIC_MINOR;
+   occ.name = "occ";
+   occ.fops = _occ_fops;
+   rc = misc_register();
+   if (rc)
+   pr_warn("Failed to register OCC device\n");
+
+   return rc;
+}
+
+static int opal_occ_remove(struct platform_device *pdev)
+{
+   misc_deregister();
+   return 0;
+}
+
+static const struct of_device_id opal_occ_match[] = {
+   { .compatible = "ibm,opal-occ-inband-sensors" },
+   { },
+};
+
+static struct platform_driver opal_occ_driver = {
+   .driver = {
+   .name   = "opal_occ",
+   .of_match_table = opal_occ_match,
+},
+   .probe  = opal_occ_probe,
+   .remove = opal_occ_remove,
+};
+
+module_platform_driver(opal_occ_driver);
+
+MODULE_DESCRIPTION("PowerNV OPAL-OCC driver");
+MODULE_LICENSE("GPL");
diff --git a/arch/powerpc/platforms/powernv/opal.c 
b/arch/powerpc/platforms/powernv/opal.c
index 65c79ec..a4f977f 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -889,6 +889,9 @@ static int __init opal_init(void)
/* Initialise OPAL sensor groups */
opal_sensor_groups_init();
 
+   /* Initialise OCC driver */
+   opal_pdev_init("ibm,opal-occ-inband-sensors");
+
return 0;
 }
 machine_subsys_initcall(powernv, opal_init);
-- 
1.8.3.1



[linux-next][DLPAR] kernel BUG at arch/powerpc/lib/locks.c:34!

2017-09-29 Thread Abdul Haleem
Hi,

Memory hot-unplug operation on linux-next kernel (4K pagesize) results
in BUG_ON() at arch/powerpc/lib/locks.c

/*
 * Waiting for a read lock or a write lock on a rwlock...
 * This turns out to be the same for read and write locks, since
 * we only know the holder if it is write-locked.
 */
void __rw_yield(arch_rwlock_t *rw)
{
int lock_value;
unsigned int holder_cpu, yield_count;

lock_value = rw->lock;
if (lock_value >= 0)
return; /* no write lock at present */
holder_cpu = lock_value & 0xffff;
>>  BUG_ON(holder_cpu >= NR_CPUS);
yield_count = be32_to_cpu(lppaca_of(holder_cpu).yield_count);
if ((yield_count & 1) == 0)
return; /* virtual cpu is currently running */
rmb();


Machine Type: Power 8 PowerVM LPAR
kernel : 4.14.0-rc2-next-20170928
gcc: version 6.3.1
Test : DLPAR Memory
config:
CONFIG_PPC_4K_PAGES=y
# CONFIG_PPC_64K_PAGES is not set


logs:

Offlined Pages 65536
Offlined Pages 65536
Offlined Pages 65536
Offlined Pages 65536
------------[ cut here ]------------
kernel BUG at arch/powerpc/lib/locks.c:34!
Oops: Exception in kernel mode, sig: 5 [#1]
LE SMP NR_CPUS=2048 NUMA pSeries
Dumping ftrace buffer: 
   (ftrace buffer empty)
Modules linked in: rpadlpar_io rpaphp bridge stp llc xt_tcpudp ipt_REJECT 
nf_reject_ipv4 xt_conntrack nfnetlink iptable_mangle iptable_nat 
nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack iptable_filter 
vmx_crypto pseries_rng rng_core binfmt_misc nfsd ip_tables x_tables autofs4
CPU: 0 PID: 12030 Comm: touch Not tainted 4.14.0-rc2-next-20170928-autotest #1
task: c00271aecc00 task.stack: c0026c24c000
NIP:  c16a50d0 LR: c17ff2c4 CTR: c1af4270
REGS: c0026c24f860 TRAP: 0700   Not tainted  
(4.14.0-rc2-next-20170928-autotest)
MSR:  80029033   CR: 42008884  XER:   
CFAR: c17ff2c0 SOFTE: 1 
GPR00: c17ff2c4 c0026c24fae0 c3572500 c0026b7f37f0 
GPR04: 0002 c00270179b10 c3622500 00103265 
GPR08: 0001 a1e0 0323a1e0 c00270060420 
GPR12: 82008288 cfdc   
GPR16:     
GPR20:    0002 
GPR24: c2b252f0 c0026b7f37f0 fffd c00271aecc00 
GPR28: c00270008000 c0026b7f37e8 c0026b7f37f0 c361ff50 
NIP [c16a50d0] __spin_yield+0x60/0x130
LR [c17ff2c4] do_raw_spin_lock+0x2d4/0x2e0
Call Trace:
[c0026c24fae0] [c0026c24fb30] 0xc0026c24fb30 (unreliable)
[c0026c24fb50] [c17ff2c4] do_raw_spin_lock+0x2d4/0x2e0
[c0026c24fb80] [c27ca540] _raw_spin_lock+0x40/0x70
[c0026c24fba0] [c27bfbf0] __mutex_lock.isra.0+0x1a0/0x11f0
[c0026c24fca0] [c27c0f24] __mutex_lock_slowpath+0x44/0x70
[c0026c24fcc0] [c27c0ff4] mutex_lock+0xa4/0xd0
[c0026c24fce0] [c1af42b8] pipe_release+0x48/0x1e0
[c0026c24fd20] [c1ae0efc] __fput+0x12c/0x4f0
[c0026c24fd80] [c1ae12ec] fput+0x2c/0x50
[c0026c24fda0] [c178eb3c] task_work_run+0x17c/0x200
[c0026c24fe00] [c160adb8] do_notify_resume+0x1f8/0x220
[c0026c24fe30] [c15ebec4] ret_from_except_lite+0x70/0x74
Instruction dump:
2faa 39290001 f926da50 419e0078 3ce2000b e8e7da60 5549043e 3cc2000b 
210907ff 79080fe0 38e70001 f8e6da60 <0b08> 3ce20007 38e7ea78 1d290480 
---[ end trace 1343a8353f7a1a73 ]---

Kernel panic - not syncing: Fatal exception
Dumping ftrace buffer: 
   (ftrace buffer empty)
Rebooting in 10 seconds..


Test script to recreate :
https://github.com/avocado-framework-tests/avocado-misc-tests/blob/master/memory/memhotplug.py

$ avocado run memhotplug.py --show-job-log

-- 
Regards,

Abdul Haleem
IBM Linux Technology Centre


#
# Automatically generated file; DO NOT EDIT.
# Linux/powerpc 4.11.0-rc7 Kernel Configuration
#
CONFIG_PPC64=y

#
# Processor support
#
CONFIG_PPC_BOOK3S_64=y
# CONFIG_PPC_BOOK3E_64 is not set
# CONFIG_POWER7_CPU is not set
CONFIG_POWER8_CPU=y
CONFIG_PPC_BOOK3S=y
CONFIG_PPC_FPU=y
CONFIG_ALTIVEC=y
CONFIG_VSX=y
# CONFIG_PPC_ICSWX is not set
CONFIG_PPC_STD_MMU=y
CONFIG_PPC_STD_MMU_64=y
CONFIG_PPC_RADIX_MMU=y
CONFIG_PPC_MM_SLICES=y
CONFIG_PPC_HAVE_PMU_SUPPORT=y
CONFIG_PPC_PERF_CTRS=y
CONFIG_SMP=y
CONFIG_NR_CPUS=2048
CONFIG_PPC_DOORBELL=y
# CONFIG_CPU_BIG_ENDIAN is not set
CONFIG_CPU_LITTLE_ENDIAN=y
CONFIG_PPC64_BOOT_WRAPPER=y
CONFIG_64BIT=y
CONFIG_ARCH_PHYS_ADDR_T_64BIT=y
CONFIG_ARCH_DMA_ADDR_T_64BIT=y
CONFIG_MMU=y
CONFIG_HAVE_SETUP_PER_CPU_AREA=y
CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y
CONFIG_NR_IRQS=512
CONFIG_STACKTRACE_SUPPORT=y
CONFIG_TRACE_IRQFLAGS_SUPPORT=y
CONFIG_LOCKDEP_SUPPORT=y
CONFIG_RWSEM_XCHGADD_ALGORITHM=y
CONFIG_ARCH_HAS_ILOG2_U32=y
CONFIG_ARCH_HAS_ILOG2_U64=y
CONFIG_GENERIC_HWEIGHT=y