[PATCH] nvram: print no error message when nvram is not set as pstore backend

2015-05-11 Thread Hari Bathini
Pstore only supports one backend at a time. The preferred
pstore backend is set by passing the pstore.backend=
argument to the kernel at boot time. Currently, while trying
to register with pstore, nvram throws an error message even
when "pstore.backend != nvram", which is unnecessary. This
patch removes the error message in case "pstore.backend != nvram".

Signed-off-by: Hari Bathini 
---
 arch/powerpc/kernel/nvram_64.c |7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c
index 1e703f8..bfdbcab 100644
--- a/arch/powerpc/kernel/nvram_64.c
+++ b/arch/powerpc/kernel/nvram_64.c
@@ -582,9 +582,10 @@ static int nvram_pstore_init(void)
spin_lock_init(&nvram_pstore_info.buf_lock);
 
rc = pstore_register(&nvram_pstore_info);
-   if (rc != 0)
-   pr_err("nvram: pstore_register() failed, defaults to "
-   "kmsg_dump; returned %d\n", rc);
+   if (rc && (rc != -EPERM))
+   /* Print error only when pstore.backend == nvram */
+   pr_err("nvram: pstore_register() failed, returned %d. "
+   "Defaults to kmsg_dump\n", rc);
 
return rc;
 }

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH] fadump: fix endianess issues in firmware assisted dump handling

2014-09-03 Thread Hari Bathini
Firmware-assisted dump (fadump) kernel code is not little-endian (LE)
compliant. The patch below tries to fix this issue. The patch was tested
with the upstream kernel, and the generated LE fadump vmcore was sanity
tested. The output below shows the crash tool successfully opening an LE
fadump vmcore.

# crash $vmlinux vmcore

crash 7.0.5
Copyright (C) 2002-2014  Red Hat, Inc.
Copyright (C) 2004, 2005, 2006, 2010  IBM Corporation
Copyright (C) 1999-2006  Hewlett-Packard Co
Copyright (C) 2005, 2006, 2011, 2012  Fujitsu Limited
Copyright (C) 2006, 2007  VA Linux Systems Japan K.K.
Copyright (C) 2005, 2011  NEC Corporation
Copyright (C) 1999, 2002, 2007  Silicon Graphics, Inc.
Copyright (C) 1999, 2000, 2001, 2002  Mission Critical Linux, Inc.
This program is free software, covered by the GNU General Public 
License,
and you are welcome to change it and/or distribute copies of it under
certain conditions.  Enter "help copying" to see the conditions.
This program has absolutely no warranty.  Enter "help warranty" for 
details.

crash: /boot/vmlinux-3.16.0-rc7-7-default+: no .gnu_debuglink section
GNU gdb (GDB) 7.6
Copyright (C) 2013 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later 
<http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.  Type "show 
copying"
and "show warranty" for details.
This GDB was configured as "powerpc64le-unknown-linux-gnu"...

  KERNEL: /boot/vmlinux-3.16.0-rc7-7-default+
DUMPFILE: vmcore
CPUS: 16
DATE: Sun Aug 24 14:31:28 2014
  UPTIME: 00:02:57
LOAD AVERAGE: 0.05, 0.08, 0.04
   TASKS: 256
NODENAME: linux-dhr2
 RELEASE: 3.16.0-rc7-7-default+
 VERSION: #54 SMP Mon Aug 18 14:08:23 EDT 2014
 MACHINE: ppc64le  (4116 Mhz)
  MEMORY: 40 GB
   PANIC: "Oops: Kernel access of bad area, sig: 11 [#1]" (check 
log for details)
 PID: 2234
 COMMAND: "bash"
TASK: c009652e4a30  [THREAD_INFO: c0096777c000]
 CPU: 2
       STATE: TASK_RUNNING (PANIC)

crash>

Signed-off-by: Hari Bathini 
---
 arch/powerpc/include/asm/fadump.h |   52 ---
 arch/powerpc/kernel/fadump.c  |  112 +
 arch/powerpc/platforms/pseries/lpar.c |9 ++-
 3 files changed, 89 insertions(+), 84 deletions(-)

diff --git a/arch/powerpc/include/asm/fadump.h 
b/arch/powerpc/include/asm/fadump.h
index a677456..493e72f 100644
--- a/arch/powerpc/include/asm/fadump.h
+++ b/arch/powerpc/include/asm/fadump.h
@@ -70,39 +70,39 @@
 #define CPU_UNKNOWN(~((u32)0))
 
 /* Utility macros */
-#define SKIP_TO_NEXT_CPU(reg_entry)\
-({ \
-   while (reg_entry->reg_id != REG_ID("CPUEND"))   \
-   reg_entry++;\
-   reg_entry++;\
+#define SKIP_TO_NEXT_CPU(reg_entry)\
+({ \
+   while (be64_to_cpu(reg_entry->reg_id) != REG_ID("CPUEND"))  \
+   reg_entry++;\
+   reg_entry++;\
 })
 
 /* Kernel Dump section info */
 struct fadump_section {
-   u32 request_flag;
-   u16 source_data_type;
-   u16 error_flags;
-   u64 source_address;
-   u64 source_len;
-   u64 bytes_dumped;
-   u64 destination_address;
+   __be32  request_flag;
+   __be16  source_data_type;
+   __be16  error_flags;
+   __be64  source_address;
+   __be64  source_len;
+   __be64  bytes_dumped;
+   __be64  destination_address;
 };
 
 /* ibm,configure-kernel-dump header. */
 struct fadump_section_header {
-   u32 dump_format_version;
-   u16 dump_num_sections;
-   u16 dump_status_flag;
-   u32 offset_first_dump_section;
+   __be32  dump_format_version;
+   __be16  dump_num_sections;
+   __be16  dump_status_flag;
+   __be32  offset_first_dump_section;
 
/* Fields for disk dump option. */
-   u32 dd_block_size;
-   u64 dd_block_offset;
-   u64 dd_num_blocks;
-   u32 dd_offset_disk_path;
+   __be32  dd_block_size;
+   __be64  dd_block_offset;
+   __be64  dd_num_blocks;
+   __be32  dd_offset_disk_path;
 
/* Maximum time allowed to prevent an automatic dump-reboot

[PATCH v2] fadump: fix endianess issues in firmware assisted dump handling

2014-10-01 Thread Hari Bathini
Firmware-assisted dump (fadump) kernel code is not little-endian (LE)
compliant. The patch below tries to fix this issue. The patch was tested
with the upstream kernel, and the generated LE fadump vmcore was sanity
tested. The output below shows the crash tool successfully opening an LE
fadump vmcore.

# crash vmlinux vmcore

crash 7.0.5
Copyright (C) 2002-2014  Red Hat, Inc.
Copyright (C) 2004, 2005, 2006, 2010  IBM Corporation
Copyright (C) 1999-2006  Hewlett-Packard Co
Copyright (C) 2005, 2006, 2011, 2012  Fujitsu Limited
Copyright (C) 2006, 2007  VA Linux Systems Japan K.K.
Copyright (C) 2005, 2011  NEC Corporation
Copyright (C) 1999, 2002, 2007  Silicon Graphics, Inc.
Copyright (C) 1999, 2000, 2001, 2002  Mission Critical Linux, Inc.
This program is free software, covered by the GNU General Public 
License,
and you are welcome to change it and/or distribute copies of it under
certain conditions.  Enter "help copying" to see the conditions.
This program has absolutely no warranty.  Enter "help warranty" for 
details.

crash: vmlinux: no .gnu_debuglink section
GNU gdb (GDB) 7.6
Copyright (C) 2013 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later 
<http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.  Type "show 
copying"
and "show warranty" for details.
This GDB was configured as "powerpc64le-unknown-linux-gnu"...

  KERNEL: vmlinux
DUMPFILE: vmcore
CPUS: 16
DATE: Wed Dec 31 19:00:00 1969
  UPTIME: 00:03:28
LOAD AVERAGE: 0.46, 0.86, 0.41
   TASKS: 268
NODENAME: linux-dhr2
 RELEASE: 3.17.0-rc5-7-default
 VERSION: #6 SMP Tue Sep 30 01:06:34 EDT 2014
 MACHINE: ppc64le  (4116 Mhz)
  MEMORY: 40 GB
   PANIC: "Oops: Kernel access of bad area, sig: 11 [#1]" (check 
log for details)
 PID: 6223
 COMMAND: "bash"
TASK: c009661b2500  [THREAD_INFO: c00967ac]
 CPU: 2
   STATE: TASK_RUNNING (PANIC)

crash>

Changes in v2:
1. Addressed casting related warnings.
2. Elaborated on why exceptions should not be changed to big endian
   during fadump boot.

Signed-off-by: Hari Bathini 
---
 arch/powerpc/include/asm/fadump.h |   52 ---
 arch/powerpc/kernel/fadump.c  |  114 +
 arch/powerpc/platforms/pseries/lpar.c |   15 
 3 files changed, 96 insertions(+), 85 deletions(-)

diff --git a/arch/powerpc/include/asm/fadump.h 
b/arch/powerpc/include/asm/fadump.h
index a677456..493e72f 100644
--- a/arch/powerpc/include/asm/fadump.h
+++ b/arch/powerpc/include/asm/fadump.h
@@ -70,39 +70,39 @@
 #define CPU_UNKNOWN(~((u32)0))
 
 /* Utility macros */
-#define SKIP_TO_NEXT_CPU(reg_entry)\
-({ \
-   while (reg_entry->reg_id != REG_ID("CPUEND"))   \
-   reg_entry++;\
-   reg_entry++;\
+#define SKIP_TO_NEXT_CPU(reg_entry)\
+({ \
+   while (be64_to_cpu(reg_entry->reg_id) != REG_ID("CPUEND"))  \
+   reg_entry++;\
+   reg_entry++;\
 })
 
 /* Kernel Dump section info */
 struct fadump_section {
-   u32 request_flag;
-   u16 source_data_type;
-   u16 error_flags;
-   u64 source_address;
-   u64 source_len;
-   u64 bytes_dumped;
-   u64 destination_address;
+   __be32  request_flag;
+   __be16  source_data_type;
+   __be16  error_flags;
+   __be64  source_address;
+   __be64  source_len;
+   __be64  bytes_dumped;
+   __be64  destination_address;
 };
 
 /* ibm,configure-kernel-dump header. */
 struct fadump_section_header {
-   u32 dump_format_version;
-   u16 dump_num_sections;
-   u16 dump_status_flag;
-   u32 offset_first_dump_section;
+   __be32  dump_format_version;
+   __be16  dump_num_sections;
+   __be16  dump_status_flag;
+   __be32  offset_first_dump_section;
 
/* Fields for disk dump option. */
-   u32 dd_block_size;
-   u64 dd_block_offset;
-   u64 dd_num_blocks;
-   u32 dd_offset_disk_path;
+   __be32  dd_block_size;
+   __be64  dd_block_offset;
+   __be64  dd_num_blocks;
+   __be32  d

[PATCH 0/2] powerpc/pstore: Add pstore support for nvram partitions

2014-12-03 Thread Hari Bathini
This patch series adds pstore support on powernv platform to
read different nvram partitions and write compressed data to
oops-log nvram partition. As pseries platform already has
pstore support, this series moves most of the common code
for pseries and powernv platforms to a common file. Tested
the patches successfully on both pseries and powernv
platforms.

---

Hari Bathini (2):
  pstore: Add pstore type id for firmware partition
  pstore: add pstore support on powernv


 arch/powerpc/include/asm/nvram.h|   50 ++
 arch/powerpc/include/asm/rtas.h |2 
 arch/powerpc/kernel/nvram_64.c  |  679 +++
 arch/powerpc/platforms/powernv/opal-nvram.c |   10 
 arch/powerpc/platforms/pseries/nvram.c  |  663 --
 fs/pstore/inode.c   |3 
 include/linux/pstore.h  |1 
 7 files changed, 749 insertions(+), 659 deletions(-)

--
- Hari

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 1/2] pstore: Add pstore type id for firmware partition

2014-12-03 Thread Hari Bathini
This patch adds a pstore type id to be used for opal specific
nvram partitions.

Signed-off-by: Hari Bathini 
---
 fs/pstore/inode.c  |3 +++
 include/linux/pstore.h |1 +
 2 files changed, 4 insertions(+)

diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index fafb7a0..e83bb93 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -337,6 +337,9 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, 
u64 id, int count,
case PSTORE_TYPE_PPC_COMMON:
sprintf(name, "powerpc-common-%s-%lld", psname, id);
break;
+   case PSTORE_TYPE_PPC_OPAL:
+   sprintf(name, "powerpc-opal-%s-%lld", psname, id);
+   break;
case PSTORE_TYPE_UNKNOWN:
sprintf(name, "unknown-%s-%lld", psname, id);
break;
diff --git a/include/linux/pstore.h b/include/linux/pstore.h
index ece0c6b..af44980 100644
--- a/include/linux/pstore.h
+++ b/include/linux/pstore.h
@@ -39,6 +39,7 @@ enum pstore_type_id {
PSTORE_TYPE_PPC_RTAS= 4,
PSTORE_TYPE_PPC_OF  = 5,
PSTORE_TYPE_PPC_COMMON  = 6,
+   PSTORE_TYPE_PPC_OPAL= 7,
PSTORE_TYPE_UNKNOWN = 255
 };
 

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 2/2] pstore: add pstore support on powernv

2014-12-03 Thread Hari Bathini
This patch extends pstore, a generic interface to platform-dependent
persistent storage, to the powernv platform so that certain useful
information can be captured during the system's dying moments. Such
support is already in place for the pseries platform. While adding
pstore support for the powernv platform, this patch also moves the code
common to pseries and powernv into the arch/powerpc/kernel/nvram_64.c
file.

Signed-off-by: Hari Bathini 
---
 arch/powerpc/include/asm/nvram.h|   50 ++
 arch/powerpc/include/asm/rtas.h |2 
 arch/powerpc/kernel/nvram_64.c  |  679 +++
 arch/powerpc/platforms/powernv/opal-nvram.c |   10 
 arch/powerpc/platforms/pseries/nvram.c  |  663 --
 5 files changed, 745 insertions(+), 659 deletions(-)

diff --git a/arch/powerpc/include/asm/nvram.h b/arch/powerpc/include/asm/nvram.h
index b0fe0fe..09a518b 100644
--- a/arch/powerpc/include/asm/nvram.h
+++ b/arch/powerpc/include/asm/nvram.h
@@ -9,12 +9,43 @@
 #ifndef _ASM_POWERPC_NVRAM_H
 #define _ASM_POWERPC_NVRAM_H
 
-
+#include 
 #include 
 #include 
 #include 
 
+/*
+ * Set oops header version to distinguish between old and new format header.
+ * lnx,oops-log partition max size is 4000, header version > 4000 will
+ * help in identifying new header.
+ */
+#define OOPS_HDR_VERSION 5000
+
+struct err_log_info {
+   __be32 error_type;
+   __be32 seq_num;
+};
+
+struct nvram_os_partition {
+   const char *name;
+   int req_size;   /* desired size, in bytes */
+   int min_size;   /* minimum acceptable size (0 means req_size) */
+   long size;  /* size of data portion (excluding err_log_info) */
+   long index; /* offset of data portion of partition */
+   bool os_partition; /* partition initialized by OS, not FW */
+};
+
+struct oops_log_info {
+   __be16 version;
+   __be16 report_length;
+   __be64 timestamp;
+} __attribute__((packed));
+
+extern struct nvram_os_partition oops_log_partition;
+
 #ifdef CONFIG_PPC_PSERIES
+extern struct nvram_os_partition rtas_log_partition;
+
 extern int nvram_write_error_log(char * buff, int length,
 unsigned int err_type, unsigned int 
err_seq);
 extern int nvram_read_error_log(char * buff, int length,
@@ -50,6 +81,23 @@ extern void  pmac_xpram_write(int xpaddr, u8 data);
 /* Synchronize NVRAM */
 extern voidnvram_sync(void);
 
+/* Initialize NVRAM OS partition */
+extern int __init nvram_init_os_partition(struct nvram_os_partition *part);
+
+/* Initialize NVRAM oops partition */
+extern void __init nvram_init_oops_partition(int rtas_partition_exists);
+
+/* Read a NVRAM partition */
+extern int nvram_read_partition(struct nvram_os_partition *part, char *buff,
+   int length, unsigned int *err_type,
+   unsigned int *error_log_cnt);
+
+/* Write to NVRAM OS partition */
+extern int nvram_write_os_partition(struct nvram_os_partition *part,
+   char *buff, int length,
+   unsigned int err_type,
+   unsigned int error_log_cnt);
+
 /* Determine NVRAM size */
 extern ssize_t nvram_get_size(void);
 
diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
index b390f55..a033fe9 100644
--- a/arch/powerpc/include/asm/rtas.h
+++ b/arch/powerpc/include/asm/rtas.h
@@ -343,6 +343,8 @@ extern int early_init_dt_scan_rtas(unsigned long node,
 extern void pSeries_log_error(char *buf, unsigned int err_type, int fatal);
 
 #ifdef CONFIG_PPC_PSERIES
+extern unsigned long last_rtas_event;
+extern int clobbering_unread_rtas_event(void);
 extern int pseries_devicetree_update(s32 scope);
 extern void post_mobility_fixup(void);
 #endif
diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c
index 34f7c9b..8c439a3 100644
--- a/arch/powerpc/kernel/nvram_64.c
+++ b/arch/powerpc/kernel/nvram_64.c
@@ -26,6 +26,9 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
 #include 
 #include 
 #include 
@@ -54,6 +57,682 @@ struct nvram_partition {
 
 static LIST_HEAD(nvram_partitions);
 
+#ifdef CONFIG_PPC_PSERIES
+struct nvram_os_partition rtas_log_partition = {
+   .name = "ibm,rtas-log",
+   .req_size = 2079,
+   .min_size = 1055,
+   .index = -1,
+   .os_partition = true
+};
+#endif
+
+struct nvram_os_partition oops_log_partition = {
+   .name = "lnx,oops-log",
+   .req_size = 4000,
+   .min_size = 2000,
+   .index = -1,
+   .os_partition = true
+};
+
+static const char *nvram_os_partitions[] = {
+#ifdef CONFIG_PPC_PSERIES
+   "ibm,rtas-log",
+#endif
+   "lnx,oops-log",
+   NULL
+};
+
+static void oops_to_nvram(struct kmsg_dumper *dumper,
+ enum kmsg_dump_reason reason);
+
+static struct kmsg_dumper nvram_kmsg_dumper = {
+   .dump = oops_to_nvram
+};
+
+/

Re: [2/2] pstore: add pstore support on powernv

2014-12-04 Thread Hari Bathini

On 12/04/2014 11:07 AM, Michael Ellerman wrote:

On Wed, 2014-03-12 at 11:03:15 UTC, Hari Bathini wrote:

This patch extends pstore, a generic interface to platform dependent
persistent storage, support for powernv  platform to capture certain
useful information, during dying moments. Such support is already in
place for  pseries platform.  This patch while adding pstore support
for  powernv platform,  moves common code for pseries and powernv to
arch/powerpc/kernel/nvram_64.c file.

Please move the common code first in a separate patch. Unless there's some
reason you absolutely can't do that.


Sure, Michael. Let me make the changes as suggested and
post the updated patch series.

Thanks
Hari


cheers
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH] powerpc/kdump: skip enabling big endian exception during crash

2014-12-11 Thread Hari Bathini
In an LE kernel, we currently have a hack for kexec that resets the
exception endian before starting a new kernel, as the kernel that is
loaded could be a big endian or a little endian kernel. In the kdump
case, resetting the exception endian fails when one or more cpus are
disabled. But in the kdump case, we can conveniently skip resetting the
endianness, as the crashkernel is always of the same endianness as the
primary kernel. This patch adds a new inline function that tells whether
this is the kdump path. This function is used at places where such a
check is needed.

Signed-off-by: Hari Bathini 
---
 arch/powerpc/include/asm/kexec.h   |   10 ++
 arch/powerpc/kernel/machine_kexec_64.c |2 +-
 arch/powerpc/platforms/pseries/lpar.c  |7 ++-
 3 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h
index 19c36cb..0d96d4d 100644
--- a/arch/powerpc/include/asm/kexec.h
+++ b/arch/powerpc/include/asm/kexec.h
@@ -86,6 +86,11 @@ extern int overlaps_crashkernel(unsigned long start, 
unsigned long size);
 extern void reserve_crashkernel(void);
 extern void machine_kexec_mask_interrupts(void);
 
+static inline int is_kdump_path(void)
+{
+   return (crashing_cpu >= 0) ? 1 : 0;
+}
+
 #else /* !CONFIG_KEXEC */
 static inline void crash_kexec_secondary(struct pt_regs *regs) { }
 
@@ -106,6 +111,11 @@ static inline int 
crash_shutdown_unregister(crash_shutdown_t handler)
return 0;
 }
 
+static inline int is_kdump_path(void)
+{
+   return 0;
+}
+
 #endif /* CONFIG_KEXEC */
 #endif /* ! __ASSEMBLY__ */
 #endif /* __KERNEL__ */
diff --git a/arch/powerpc/kernel/machine_kexec_64.c 
b/arch/powerpc/kernel/machine_kexec_64.c
index 879b3aa..b4fe804 100644
--- a/arch/powerpc/kernel/machine_kexec_64.c
+++ b/arch/powerpc/kernel/machine_kexec_64.c
@@ -330,7 +330,7 @@ void default_machine_kexec(struct kimage *image)
 * using debugger IPI.
 */
 
-   if (crashing_cpu == -1)
+   if (!is_kdump_path())
kexec_prepare_cpus();
 
pr_debug("kexec: Starting switchover sequence.\n");
diff --git a/arch/powerpc/platforms/pseries/lpar.c 
b/arch/powerpc/platforms/pseries/lpar.c
index f6880d2..be41680 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -43,6 +43,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #include "pseries.h"
@@ -257,8 +258,12 @@ static void pSeries_lpar_hptab_clear(void)
 *
 * This is also called on boot when a fadump happens. In that case we
 * must not change the exception endian mode.
+*
+* This is also called during kdump which doesn't need resetting, as the
+* the crashkernel is of same endainess as primary kernel.
 */
-   if (firmware_has_feature(FW_FEATURE_SET_MODE) && !is_fadump_active()) {
+   if (firmware_has_feature(FW_FEATURE_SET_MODE) && !is_fadump_active() &&
+   !is_kdump_path()) {
long rc;
 
rc = pseries_big_endian_exceptions();

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH v2 0/3] powerpc/pstore: Add pstore support for nvram partitions

2014-12-16 Thread Hari Bathini
This patch series adds pstore support on powernv platform to
read different nvram partitions and write compressed data to
oops-log nvram partition. As pseries platform already has
pstore support, this series moves most of the common code
for pseries and powernv platforms to a common file. Tested
the patches successfully on both pseries and powernv
platforms.

---

Hari Bathini (3):
  powerpc/nvram: move generic code for nvram and pstore
  pstore: Add pstore type id for firmware partition
  pstore: add pstore support on powernv


 arch/powerpc/include/asm/nvram.h|   50 ++
 arch/powerpc/include/asm/rtas.h |2 
 arch/powerpc/kernel/nvram_64.c  |  681 +++
 arch/powerpc/platforms/powernv/opal-nvram.c |   10 
 arch/powerpc/platforms/pseries/nvram.c  |  665 --
 fs/pstore/inode.c   |3 
 include/linux/pstore.h  |1 
 7 files changed, 751 insertions(+), 661 deletions(-)

--
- Hari

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH v2 1/3] powerpc/nvram: move generic code for nvram and pstore

2014-12-16 Thread Hari Bathini
With minor checks, we can move most of the code for nvram
under pseries to a common place to be re-used by other
powerpc platforms like powernv. This patch moves such
common code to arch/powerpc/kernel/nvram_64.c file.

Signed-off-by: Hari Bathini 
---
 arch/powerpc/include/asm/nvram.h   |   50 ++
 arch/powerpc/include/asm/rtas.h|2 
 arch/powerpc/kernel/nvram_64.c |  660 
 arch/powerpc/platforms/pseries/nvram.c |  665 
 4 files changed, 716 insertions(+), 661 deletions(-)

diff --git a/arch/powerpc/include/asm/nvram.h b/arch/powerpc/include/asm/nvram.h
index b0fe0fe..09a518b 100644
--- a/arch/powerpc/include/asm/nvram.h
+++ b/arch/powerpc/include/asm/nvram.h
@@ -9,12 +9,43 @@
 #ifndef _ASM_POWERPC_NVRAM_H
 #define _ASM_POWERPC_NVRAM_H
 
-
+#include 
 #include 
 #include 
 #include 
 
+/*
+ * Set oops header version to distinguish between old and new format header.
+ * lnx,oops-log partition max size is 4000, header version > 4000 will
+ * help in identifying new header.
+ */
+#define OOPS_HDR_VERSION 5000
+
+struct err_log_info {
+   __be32 error_type;
+   __be32 seq_num;
+};
+
+struct nvram_os_partition {
+   const char *name;
+   int req_size;   /* desired size, in bytes */
+   int min_size;   /* minimum acceptable size (0 means req_size) */
+   long size;  /* size of data portion (excluding err_log_info) */
+   long index; /* offset of data portion of partition */
+   bool os_partition; /* partition initialized by OS, not FW */
+};
+
+struct oops_log_info {
+   __be16 version;
+   __be16 report_length;
+   __be64 timestamp;
+} __attribute__((packed));
+
+extern struct nvram_os_partition oops_log_partition;
+
 #ifdef CONFIG_PPC_PSERIES
+extern struct nvram_os_partition rtas_log_partition;
+
 extern int nvram_write_error_log(char * buff, int length,
 unsigned int err_type, unsigned int 
err_seq);
 extern int nvram_read_error_log(char * buff, int length,
@@ -50,6 +81,23 @@ extern void  pmac_xpram_write(int xpaddr, u8 data);
 /* Synchronize NVRAM */
 extern voidnvram_sync(void);
 
+/* Initialize NVRAM OS partition */
+extern int __init nvram_init_os_partition(struct nvram_os_partition *part);
+
+/* Initialize NVRAM oops partition */
+extern void __init nvram_init_oops_partition(int rtas_partition_exists);
+
+/* Read a NVRAM partition */
+extern int nvram_read_partition(struct nvram_os_partition *part, char *buff,
+   int length, unsigned int *err_type,
+   unsigned int *error_log_cnt);
+
+/* Write to NVRAM OS partition */
+extern int nvram_write_os_partition(struct nvram_os_partition *part,
+   char *buff, int length,
+   unsigned int err_type,
+   unsigned int error_log_cnt);
+
 /* Determine NVRAM size */
 extern ssize_t nvram_get_size(void);
 
diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
index b390f55..a033fe9 100644
--- a/arch/powerpc/include/asm/rtas.h
+++ b/arch/powerpc/include/asm/rtas.h
@@ -343,6 +343,8 @@ extern int early_init_dt_scan_rtas(unsigned long node,
 extern void pSeries_log_error(char *buf, unsigned int err_type, int fatal);
 
 #ifdef CONFIG_PPC_PSERIES
+extern unsigned long last_rtas_event;
+extern int clobbering_unread_rtas_event(void);
 extern int pseries_devicetree_update(s32 scope);
 extern void post_mobility_fixup(void);
 #endif
diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c
index 34f7c9b..dbff7f0 100644
--- a/arch/powerpc/kernel/nvram_64.c
+++ b/arch/powerpc/kernel/nvram_64.c
@@ -26,6 +26,9 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
 #include 
 #include 
 #include 
@@ -54,6 +57,663 @@ struct nvram_partition {
 
 static LIST_HEAD(nvram_partitions);
 
+#ifdef CONFIG_PPC_PSERIES
+struct nvram_os_partition rtas_log_partition = {
+   .name = "ibm,rtas-log",
+   .req_size = 2079,
+   .min_size = 1055,
+   .index = -1,
+   .os_partition = true
+};
+#endif
+
+struct nvram_os_partition oops_log_partition = {
+   .name = "lnx,oops-log",
+   .req_size = 4000,
+   .min_size = 2000,
+   .index = -1,
+   .os_partition = true
+};
+
+static const char *nvram_os_partitions[] = {
+#ifdef CONFIG_PPC_PSERIES
+   "ibm,rtas-log",
+#endif
+   "lnx,oops-log",
+   NULL
+};
+
+static void oops_to_nvram(struct kmsg_dumper *dumper,
+ enum kmsg_dump_reason reason);
+
+static struct kmsg_dumper nvram_kmsg_dumper = {
+   .dump = oops_to_nvram
+};
+
+/*
+ * For capturing and compressing an oops or panic report...
+
+ * big_oops_buf[] holds the uncompressed text we're capturing.
+ *
+ * oops_buf[] holds the compressed text, preceded by a oops header.
+ * oops heade

[PATCH v2 2/3] pstore: Add pstore type id for firmware partition

2014-12-16 Thread Hari Bathini
This patch adds a pstore type id to be used for opal specific
nvram partitions.

Signed-off-by: Hari Bathini 
---
 fs/pstore/inode.c  |3 +++
 include/linux/pstore.h |1 +
 2 files changed, 4 insertions(+)

diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 5041660..8e0c009 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -359,6 +359,9 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, 
u64 id, int count,
case PSTORE_TYPE_PPC_COMMON:
sprintf(name, "powerpc-common-%s-%lld", psname, id);
break;
+   case PSTORE_TYPE_PPC_OPAL:
+   sprintf(name, "powerpc-opal-%s-%lld", psname, id);
+   break;
case PSTORE_TYPE_UNKNOWN:
sprintf(name, "unknown-%s-%lld", psname, id);
break;
diff --git a/include/linux/pstore.h b/include/linux/pstore.h
index ece0c6b..af44980 100644
--- a/include/linux/pstore.h
+++ b/include/linux/pstore.h
@@ -39,6 +39,7 @@ enum pstore_type_id {
PSTORE_TYPE_PPC_RTAS= 4,
PSTORE_TYPE_PPC_OF  = 5,
PSTORE_TYPE_PPC_COMMON  = 6,
+   PSTORE_TYPE_PPC_OPAL= 7,
PSTORE_TYPE_UNKNOWN = 255
 };
 

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH v2 3/3] pstore: add pstore support on powernv

2014-12-16 Thread Hari Bathini
This patch extends pstore, a generic interface to platform-dependent
persistent storage, to the powernv platform so that certain useful
information can be captured during the system's dying moments. Such
support is already in place for the pseries platform. This patch re-uses
most of that code.

Signed-off-by: Hari Bathini 
---
 arch/powerpc/kernel/nvram_64.c  |   25 +++--
 arch/powerpc/platforms/powernv/opal-nvram.c |   10 ++
 2 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c
index dbff7f0..3afbc91 100644
--- a/arch/powerpc/kernel/nvram_64.c
+++ b/arch/powerpc/kernel/nvram_64.c
@@ -127,6 +127,14 @@ static size_t oops_data_sz;
 static struct z_stream_s stream;
 
 #ifdef CONFIG_PSTORE
+#ifdef CONFIG_PPC_POWERNV
+static struct nvram_os_partition skiboot_partition = {
+   .name = "ibm,skiboot",
+   .index = -1,
+   .os_partition = false
+};
+#endif
+
 #ifdef CONFIG_PPC_PSERIES
 static struct nvram_os_partition of_config_partition = {
.name = "of-config",
@@ -479,6 +487,16 @@ static ssize_t nvram_pstore_read(u64 *id, enum 
pstore_type_id *type,
time->tv_nsec = 0;
break;
 #endif
+#ifdef CONFIG_PPC_POWERNV
+   case PSTORE_TYPE_PPC_OPAL:
+   sig = NVRAM_SIG_FW;
+   part = &skiboot_partition;
+   *type = PSTORE_TYPE_PPC_OPAL;
+   *id = PSTORE_TYPE_PPC_OPAL;
+   time->tv_sec = 0;
+   time->tv_nsec = 0;
+   break;
+#endif
default:
return 0;
}
@@ -554,8 +572,11 @@ static int nvram_pstore_init(void)
 {
int rc = 0;
 
-   nvram_type_ids[2] = PSTORE_TYPE_PPC_RTAS;
-   nvram_type_ids[3] = PSTORE_TYPE_PPC_OF;
+   if (machine_is(pseries)) {
+   nvram_type_ids[2] = PSTORE_TYPE_PPC_RTAS;
+   nvram_type_ids[3] = PSTORE_TYPE_PPC_OF;
+   } else
+   nvram_type_ids[2] = PSTORE_TYPE_PPC_OPAL;
 
nvram_pstore_info.buf = oops_data;
nvram_pstore_info.bufsize = oops_data_sz;
diff --git a/arch/powerpc/platforms/powernv/opal-nvram.c 
b/arch/powerpc/platforms/powernv/opal-nvram.c
index f9896fd..9db4398 100644
--- a/arch/powerpc/platforms/powernv/opal-nvram.c
+++ b/arch/powerpc/platforms/powernv/opal-nvram.c
@@ -16,6 +16,7 @@
 #include 
 
 #include 
+#include 
 #include 
 
 static unsigned int nvram_size;
@@ -62,6 +63,15 @@ static ssize_t opal_nvram_write(char *buf, size_t count, 
loff_t *index)
return count;
 }
 
+static int __init opal_nvram_init_log_partitions(void)
+{
+   /* Scan nvram for partitions */
+   nvram_scan_partitions();
+   nvram_init_oops_partition(0);
+   return 0;
+}
+machine_arch_initcall(powernv, opal_nvram_init_log_partitions);
+
 void __init opal_nvram_init(void)
 {
struct device_node *np;

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v2 1/3] powerpc/nvram: move generic code for nvram and pstore

2014-12-17 Thread Hari Bathini

On 12/17/2014 05:33 AM, Michael Ellerman wrote:

On Tue, 2014-12-16 at 23:35 +0530, Hari Bathini wrote:

With minor checks, we can move most of the code for nvram
under pseries to a common place to be re-used by other
powerpc platforms like powernv. This patch moves such
common code to arch/powerpc/kernel/nvram_64.c file.

Sharing the code is great.

But, you need to keep in mind that it is very common for us to build kernels
with both POWERNV=y and PSERIES=y.

So you need to make sure you're only using CONFIG_PPC_PSERIES to protect things
that are optional on pseries. Not things that we *shouldn't* be doing on
powernv.

For example the logic in nvram_init_oops_partition() looks like it might do the
wrong thing for PSERIES=y POWERNV=y.


True. It might do the wrong thing when an incorrect value is passed by
the caller. But since the caller is platform-specific code (the
pseries_nvram_init_log_partitions() or opal_nvram_init_log_partitions()
routine) and passes the appropriate parameter, I haven't seen any issues
while testing.



diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
index b390f55..a033fe9 100644
--- a/arch/powerpc/include/asm/rtas.h
+++ b/arch/powerpc/include/asm/rtas.h
@@ -343,6 +343,8 @@ extern int early_init_dt_scan_rtas(unsigned long node,
  extern void pSeries_log_error(char *buf, unsigned int err_type, int fatal);
  
  #ifdef CONFIG_PPC_PSERIES

+extern unsigned long last_rtas_event;
+extern int clobbering_unread_rtas_event(void);

You should add an empty version of this for !PSERIES, so you don't have to
ifdef all the call sites.


Sure. Will update accordingly..

Thanks
Hari


cheers




___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH v2] powerpc/kdump: Ignore failure in enabling big endian exception during crash

2014-12-18 Thread Hari Bathini
In an LE kernel, we currently have a hack for kexec that resets the
exception endian before starting a new kernel, as the kernel that is
loaded could be a big endian or a little endian kernel. In the kdump
case, resetting the exception endian fails when one or more cpus are
disabled. But we can ignore the failure and still go ahead, as in most
cases the crashkernel will be of the same endianness as the primary
kernel, and resetting the endianness is not even needed in those cases.
This patch adds a new inline function that tells whether this is the
kdump path. This function is used at places where such a check is needed.

Changes from v1:
Instead of skipping, ignore failure in enabling big endian exception
during crash

Signed-off-by: Hari Bathini 
---
 arch/powerpc/include/asm/kexec.h   |   10 ++
 arch/powerpc/kernel/machine_kexec_64.c |2 +-
 arch/powerpc/platforms/pseries/lpar.c  |   10 +-
 3 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h
index 19c36cb..0d96d4d 100644
--- a/arch/powerpc/include/asm/kexec.h
+++ b/arch/powerpc/include/asm/kexec.h
@@ -86,6 +86,11 @@ extern int overlaps_crashkernel(unsigned long start, 
unsigned long size);
 extern void reserve_crashkernel(void);
 extern void machine_kexec_mask_interrupts(void);
 
+static inline int is_kdump_path(void)
+{
+   return (crashing_cpu >= 0) ? 1 : 0;
+}
+
 #else /* !CONFIG_KEXEC */
 static inline void crash_kexec_secondary(struct pt_regs *regs) { }
 
@@ -106,6 +111,11 @@ static inline int 
crash_shutdown_unregister(crash_shutdown_t handler)
return 0;
 }
 
+static inline int is_kdump_path(void)
+{
+   return 0;
+}
+
 #endif /* CONFIG_KEXEC */
 #endif /* ! __ASSEMBLY__ */
 #endif /* __KERNEL__ */
diff --git a/arch/powerpc/kernel/machine_kexec_64.c 
b/arch/powerpc/kernel/machine_kexec_64.c
index 879b3aa..b4fe804 100644
--- a/arch/powerpc/kernel/machine_kexec_64.c
+++ b/arch/powerpc/kernel/machine_kexec_64.c
@@ -330,7 +330,7 @@ void default_machine_kexec(struct kimage *image)
 * using debugger IPI.
 */
 
-   if (crashing_cpu == -1)
+   if (!is_kdump_path())
kexec_prepare_cpus();
 
pr_debug("kexec: Starting switchover sequence.\n");
diff --git a/arch/powerpc/platforms/pseries/lpar.c 
b/arch/powerpc/platforms/pseries/lpar.c
index 469751d..63214fa 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -43,6 +43,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #include "pseries.h"
@@ -257,6 +258,7 @@ static void pSeries_lpar_hptab_clear(void)
 *
 * This is also called on boot when a fadump happens. In that case we
 * must not change the exception endian mode.
+*
 */
if (firmware_has_feature(FW_FEATURE_SET_MODE) && !is_fadump_active()) {
long rc;
@@ -267,8 +269,14 @@ static void pSeries_lpar_hptab_clear(void)
 * out to the user, but at least this will stop us from
 * continuing on further and creating an even more
 * difficult to debug situation.
+*
+* But if we reaching here after a crash, no point panicking.
+* Also, in kdump path, resetting endianess may not be needed
+* as the crashkernel most of the times is of same endianess
+* as primary kernel. So, let's ignore the failure and try
+* kdump'ing anyway.
 */
-   if (rc)
+   if (rc && !is_kdump_path())
panic("Could not enable big endian exceptions");
}
 #endif

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH v3 0/3] powerpc/pstore: Add pstore support for nvram partitions

2014-12-24 Thread Hari Bathini
This patch series adds pstore support on powernv platform to
read different nvram partitions and write compressed data to
oops-log nvram partition. As pseries platform already has
pstore support, this series moves most of the common code
for pseries and powernv platforms to a common file. Tested
the patches successfully on both pseries and powernv
platforms.

Changes from v2:
Added an empty version of clobbering_unread_rtas_event()
routine for !PSERIES, to avoid ifdef at the call sites

---

Hari Bathini (3):
  powerpc/nvram: move generic code for nvram and pstore
  pstore: Add pstore type id for firmware partition
  pstore: add pstore support on powernv


 arch/powerpc/include/asm/nvram.h|   50 ++
 arch/powerpc/include/asm/rtas.h |4 
 arch/powerpc/kernel/nvram_64.c  |  677 +++
 arch/powerpc/platforms/powernv/opal-nvram.c |   10 
 arch/powerpc/platforms/pseries/nvram.c  |  665 ---
 fs/pstore/inode.c   |3 
 include/linux/pstore.h  |1 
 7 files changed, 749 insertions(+), 661 deletions(-)

--
- Hari

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH v3 1/3] powerpc/nvram: move generic code for nvram and pstore

2014-12-24 Thread Hari Bathini
With minor checks, we can move most of the code for nvram
under pseries to a common place to be re-used by other
powerpc platforms like powernv. This patch moves such
common code to arch/powerpc/kernel/nvram_64.c file.

Signed-off-by: Hari Bathini 
---
 arch/powerpc/include/asm/nvram.h   |   50 ++
 arch/powerpc/include/asm/rtas.h|4 
 arch/powerpc/kernel/nvram_64.c |  656 
 arch/powerpc/platforms/pseries/nvram.c |  665 
 4 files changed, 714 insertions(+), 661 deletions(-)

diff --git a/arch/powerpc/include/asm/nvram.h b/arch/powerpc/include/asm/nvram.h
index b0fe0fe..09a518b 100644
--- a/arch/powerpc/include/asm/nvram.h
+++ b/arch/powerpc/include/asm/nvram.h
@@ -9,12 +9,43 @@
 #ifndef _ASM_POWERPC_NVRAM_H
 #define _ASM_POWERPC_NVRAM_H
 
-
+#include 
 #include 
 #include 
 #include 
 
+/*
+ * Set oops header version to distinguish between old and new format header.
+ * lnx,oops-log partition max size is 4000, header version > 4000 will
+ * help in identifying new header.
+ */
+#define OOPS_HDR_VERSION 5000
+
+struct err_log_info {
+   __be32 error_type;
+   __be32 seq_num;
+};
+
+struct nvram_os_partition {
+   const char *name;
+   int req_size;   /* desired size, in bytes */
+   int min_size;   /* minimum acceptable size (0 means req_size) */
+   long size;  /* size of data portion (excluding err_log_info) */
+   long index; /* offset of data portion of partition */
+   bool os_partition; /* partition initialized by OS, not FW */
+};
+
+struct oops_log_info {
+   __be16 version;
+   __be16 report_length;
+   __be64 timestamp;
+} __attribute__((packed));
+
+extern struct nvram_os_partition oops_log_partition;
+
 #ifdef CONFIG_PPC_PSERIES
+extern struct nvram_os_partition rtas_log_partition;
+
 extern int nvram_write_error_log(char * buff, int length,
 unsigned int err_type, unsigned int 
err_seq);
 extern int nvram_read_error_log(char * buff, int length,
@@ -50,6 +81,23 @@ extern void  pmac_xpram_write(int xpaddr, u8 data);
 /* Synchronize NVRAM */
 extern voidnvram_sync(void);
 
+/* Initialize NVRAM OS partition */
+extern int __init nvram_init_os_partition(struct nvram_os_partition *part);
+
+/* Initialize NVRAM oops partition */
+extern void __init nvram_init_oops_partition(int rtas_partition_exists);
+
+/* Read a NVRAM partition */
+extern int nvram_read_partition(struct nvram_os_partition *part, char *buff,
+   int length, unsigned int *err_type,
+   unsigned int *error_log_cnt);
+
+/* Write to NVRAM OS partition */
+extern int nvram_write_os_partition(struct nvram_os_partition *part,
+   char *buff, int length,
+   unsigned int err_type,
+   unsigned int error_log_cnt);
+
 /* Determine NVRAM size */
 extern ssize_t nvram_get_size(void);
 
diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
index b390f55..bcf6693 100644
--- a/arch/powerpc/include/asm/rtas.h
+++ b/arch/powerpc/include/asm/rtas.h
@@ -343,8 +343,12 @@ extern int early_init_dt_scan_rtas(unsigned long node,
 extern void pSeries_log_error(char *buf, unsigned int err_type, int fatal);
 
 #ifdef CONFIG_PPC_PSERIES
+extern unsigned long last_rtas_event;
+extern int clobbering_unread_rtas_event(void);
 extern int pseries_devicetree_update(s32 scope);
 extern void post_mobility_fixup(void);
+#else
+int clobbering_unread_rtas_event(void) { return 0; }
 #endif
 
 #ifdef CONFIG_PPC_RTAS_DAEMON
diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c
index 34f7c9b..42e5c6a 100644
--- a/arch/powerpc/kernel/nvram_64.c
+++ b/arch/powerpc/kernel/nvram_64.c
@@ -26,6 +26,9 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
 #include 
 #include 
 #include 
@@ -54,6 +57,659 @@ struct nvram_partition {
 
 static LIST_HEAD(nvram_partitions);
 
+#ifdef CONFIG_PPC_PSERIES
+struct nvram_os_partition rtas_log_partition = {
+   .name = "ibm,rtas-log",
+   .req_size = 2079,
+   .min_size = 1055,
+   .index = -1,
+   .os_partition = true
+};
+#endif
+
+struct nvram_os_partition oops_log_partition = {
+   .name = "lnx,oops-log",
+   .req_size = 4000,
+   .min_size = 2000,
+   .index = -1,
+   .os_partition = true
+};
+
+static const char *nvram_os_partitions[] = {
+#ifdef CONFIG_PPC_PSERIES
+   "ibm,rtas-log",
+#endif
+   "lnx,oops-log",
+   NULL
+};
+
+static void oops_to_nvram(struct kmsg_dumper *dumper,
+ enum kmsg_dump_reason reason);
+
+static struct kmsg_dumper nvram_kmsg_dumper = {
+   .dump = oops_to_nvram
+};
+
+/*
+ * For capturing and compressing an oops or panic report...
+
+ * big_oops_buf[] holds the uncompressed text we're capturi

[PATCH v3 2/3] pstore: Add pstore type id for firmware partition

2014-12-24 Thread Hari Bathini
This patch adds a pstore type id to be used for opal specific
nvram partitions.

Signed-off-by: Hari Bathini 
---
 fs/pstore/inode.c  |3 +++
 include/linux/pstore.h |1 +
 2 files changed, 4 insertions(+)

diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 5041660..8e0c009 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -359,6 +359,9 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, 
u64 id, int count,
case PSTORE_TYPE_PPC_COMMON:
sprintf(name, "powerpc-common-%s-%lld", psname, id);
break;
+   case PSTORE_TYPE_PPC_OPAL:
+   sprintf(name, "powerpc-opal-%s-%lld", psname, id);
+   break;
case PSTORE_TYPE_UNKNOWN:
sprintf(name, "unknown-%s-%lld", psname, id);
break;
diff --git a/include/linux/pstore.h b/include/linux/pstore.h
index ece0c6b..af44980 100644
--- a/include/linux/pstore.h
+++ b/include/linux/pstore.h
@@ -39,6 +39,7 @@ enum pstore_type_id {
PSTORE_TYPE_PPC_RTAS= 4,
PSTORE_TYPE_PPC_OF  = 5,
PSTORE_TYPE_PPC_COMMON  = 6,
+   PSTORE_TYPE_PPC_OPAL= 7,
PSTORE_TYPE_UNKNOWN = 255
 };
 

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH v3 3/3] pstore: add pstore support on powernv

2014-12-24 Thread Hari Bathini
This patch extends pstore, a generic interface to platform-dependent
persistent storage, with support for the powernv platform to capture certain
useful information during the kernel's dying moments. Such support is already
in place for the pseries platform. This patch re-uses most of that code.

Signed-off-by: Hari Bathini 
---
 arch/powerpc/kernel/nvram_64.c  |   25 +++--
 arch/powerpc/platforms/powernv/opal-nvram.c |   10 ++
 2 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c
index 42e5c6a..293da88 100644
--- a/arch/powerpc/kernel/nvram_64.c
+++ b/arch/powerpc/kernel/nvram_64.c
@@ -127,6 +127,14 @@ static size_t oops_data_sz;
 static struct z_stream_s stream;
 
 #ifdef CONFIG_PSTORE
+#ifdef CONFIG_PPC_POWERNV
+static struct nvram_os_partition skiboot_partition = {
+   .name = "ibm,skiboot",
+   .index = -1,
+   .os_partition = false
+};
+#endif
+
 #ifdef CONFIG_PPC_PSERIES
 static struct nvram_os_partition of_config_partition = {
.name = "of-config",
@@ -477,6 +485,16 @@ static ssize_t nvram_pstore_read(u64 *id, enum 
pstore_type_id *type,
time->tv_nsec = 0;
break;
 #endif
+#ifdef CONFIG_PPC_POWERNV
+   case PSTORE_TYPE_PPC_OPAL:
+   sig = NVRAM_SIG_FW;
+   part = &skiboot_partition;
+   *type = PSTORE_TYPE_PPC_OPAL;
+   *id = PSTORE_TYPE_PPC_OPAL;
+   time->tv_sec = 0;
+   time->tv_nsec = 0;
+   break;
+#endif
default:
return 0;
}
@@ -552,8 +570,11 @@ static int nvram_pstore_init(void)
 {
int rc = 0;
 
-   nvram_type_ids[2] = PSTORE_TYPE_PPC_RTAS;
-   nvram_type_ids[3] = PSTORE_TYPE_PPC_OF;
+   if (machine_is(pseries)) {
+   nvram_type_ids[2] = PSTORE_TYPE_PPC_RTAS;
+   nvram_type_ids[3] = PSTORE_TYPE_PPC_OF;
+   } else
+   nvram_type_ids[2] = PSTORE_TYPE_PPC_OPAL;
 
nvram_pstore_info.buf = oops_data;
nvram_pstore_info.bufsize = oops_data_sz;
diff --git a/arch/powerpc/platforms/powernv/opal-nvram.c 
b/arch/powerpc/platforms/powernv/opal-nvram.c
index f9896fd..9db4398 100644
--- a/arch/powerpc/platforms/powernv/opal-nvram.c
+++ b/arch/powerpc/platforms/powernv/opal-nvram.c
@@ -16,6 +16,7 @@
 #include 
 
 #include 
+#include 
 #include 
 
 static unsigned int nvram_size;
@@ -62,6 +63,15 @@ static ssize_t opal_nvram_write(char *buf, size_t count, 
loff_t *index)
return count;
 }
 
+static int __init opal_nvram_init_log_partitions(void)
+{
+   /* Scan nvram for partitions */
+   nvram_scan_partitions();
+   nvram_init_oops_partition(0);
+   return 0;
+}
+machine_arch_initcall(powernv, opal_nvram_init_log_partitions);
+
 void __init opal_nvram_init(void)
 {
struct device_node *np;

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH] PPC64: Adding symbols in vmcoreinfo to facilitate dump filtering

2013-11-15 Thread Hari Bathini
When CONFIG_SPARSEMEM_VMEMMAP option is used in kernel, makedumpfile fails
to filter vmcore dump as it fails to do vmemmap translations. So far
dump filtering on ppc64 never had to deal with vmemmap addresses separately
as vmemmap regions were mapped in zone normal. But with the inclusion of
CONFIG_SPARSEMEM_VMEMMAP config option in kernel, this vmemmap address
translation support becomes necessary for dump filtering. For vmemmap address
translation, a few kernel symbols are needed by the dump filtering tool. This patch
adds those symbols to vmcoreinfo, which a dump filtering tool can use for
filtering the kernel dump. Tested these changes successfully with makedumpfile
tool that supports vmemmap to physical address translation outside zone normal.

Signed-off-by: Hari Bathini 
---
 arch/powerpc/include/asm/pgalloc-64.h |4 
 arch/powerpc/kernel/machine_kexec.c   |   12 
 2 files changed, 16 insertions(+)

diff --git a/arch/powerpc/include/asm/pgalloc-64.h 
b/arch/powerpc/include/asm/pgalloc-64.h
index f65e27b..33e507a 100644
--- a/arch/powerpc/include/asm/pgalloc-64.h
+++ b/arch/powerpc/include/asm/pgalloc-64.h
@@ -17,6 +17,10 @@ struct vmemmap_backing {
unsigned long virt_addr;
 };
 
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+extern struct vmemmap_backing *vmemmap_list;
+#endif /* CONFIG_SPARSEMEM_VMEMMAP */
+
 /*
  * Functions that deal with pagetables that could be at any level of
  * the table need to be passed an "index_size" so they know how to
diff --git a/arch/powerpc/kernel/machine_kexec.c 
b/arch/powerpc/kernel/machine_kexec.c
index e1ec57e..88a7fb4 100644
--- a/arch/powerpc/kernel/machine_kexec.c
+++ b/arch/powerpc/kernel/machine_kexec.c
@@ -18,6 +18,7 @@
 #include 
 
 #include 
+#include 
 #include 
 #include 
 
@@ -75,6 +76,17 @@ void arch_crash_save_vmcoreinfo(void)
 #ifndef CONFIG_NEED_MULTIPLE_NODES
VMCOREINFO_SYMBOL(contig_page_data);
 #endif
+#if defined(CONFIG_PPC64) && defined(CONFIG_SPARSEMEM_VMEMMAP)
+   VMCOREINFO_SYMBOL(vmemmap_list);
+   VMCOREINFO_SYMBOL(mmu_vmemmap_psize);
+   VMCOREINFO_SYMBOL(mmu_psize_defs);
+   VMCOREINFO_STRUCT_SIZE(vmemmap_backing);
+   VMCOREINFO_OFFSET(vmemmap_backing, list);
+   VMCOREINFO_OFFSET(vmemmap_backing, phys);
+   VMCOREINFO_OFFSET(vmemmap_backing, virt_addr);
+   VMCOREINFO_STRUCT_SIZE(mmu_psize_def);
+   VMCOREINFO_OFFSET(mmu_psize_def, shift);
+#endif
 }
 
 /*

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH v2] PPC64: Adding symbols in vmcoreinfo to facilitate dump filtering

2013-11-24 Thread Hari Bathini
When CONFIG_SPARSEMEM_VMEMMAP option is set in kernel, makedumpfile
tool fails to filter vmcore dump as it fails to do translations for
vmemmap addresses that are mapped outside zone normal. For vmemmap
address translation support in this scenario, a few kernel symbols are
needed by the dump filtering tool. This patch adds those symbols to
vmcoreinfo, which a dump filtering tool can use for filtering the
kernel dump. These changes were tested successfully with makedumpfile
tool that supports vmemmap to physical address translation outside
zone normal.

Changes from v1:
Updated patch description and removed #ifdef around extern.

Signed-off-by: Hari Bathini 
---
 arch/powerpc/include/asm/pgalloc-64.h |2 ++
 arch/powerpc/kernel/machine_kexec.c   |   12 
 2 files changed, 14 insertions(+)

diff --git a/arch/powerpc/include/asm/pgalloc-64.h 
b/arch/powerpc/include/asm/pgalloc-64.h
index f65e27b..3973e62 100644
--- a/arch/powerpc/include/asm/pgalloc-64.h
+++ b/arch/powerpc/include/asm/pgalloc-64.h
@@ -17,6 +17,8 @@ struct vmemmap_backing {
unsigned long virt_addr;
 };
 
+extern struct vmemmap_backing *vmemmap_list;
+
 /*
  * Functions that deal with pagetables that could be at any level of
  * the table need to be passed an "index_size" so they know how to
diff --git a/arch/powerpc/kernel/machine_kexec.c 
b/arch/powerpc/kernel/machine_kexec.c
index e1ec57e..88a7fb4 100644
--- a/arch/powerpc/kernel/machine_kexec.c
+++ b/arch/powerpc/kernel/machine_kexec.c
@@ -18,6 +18,7 @@
 #include 
 
 #include 
+#include 
 #include 
 #include 
 
@@ -75,6 +76,17 @@ void arch_crash_save_vmcoreinfo(void)
 #ifndef CONFIG_NEED_MULTIPLE_NODES
VMCOREINFO_SYMBOL(contig_page_data);
 #endif
+#if defined(CONFIG_PPC64) && defined(CONFIG_SPARSEMEM_VMEMMAP)
+   VMCOREINFO_SYMBOL(vmemmap_list);
+   VMCOREINFO_SYMBOL(mmu_vmemmap_psize);
+   VMCOREINFO_SYMBOL(mmu_psize_defs);
+   VMCOREINFO_STRUCT_SIZE(vmemmap_backing);
+   VMCOREINFO_OFFSET(vmemmap_backing, list);
+   VMCOREINFO_OFFSET(vmemmap_backing, phys);
+   VMCOREINFO_OFFSET(vmemmap_backing, virt_addr);
+   VMCOREINFO_STRUCT_SIZE(mmu_psize_def);
+   VMCOREINFO_OFFSET(mmu_psize_def, shift);
+#endif
 }
 
 /*

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH v3 1/3] powerpc/nvram: move generic code for nvram and pstore

2015-01-14 Thread Hari Bathini

On 01/14/2015 10:01 AM, Michael Ellerman wrote:

On Wed, 2014-12-24 at 17:28 +0530, Hari Bathini wrote:

With minor checks, we can move most of the code for nvram
under pseries to a common place to be re-used by other
powerpc platforms like powernv. This patch moves such
common code to arch/powerpc/kernel/nvram_64.c file.

As I said in my reply to the previous version:

 ... you need to keep in mind that it is very common for us to build kernels
 with both POWERNV=y and PSERIES=y.
 
 So you need to make sure you're only using CONFIG_PPC_PSERIES to protect things

 that are optional on pseries. Not things that we *shouldn't* be doing on
 powernv.


We could as well do away with the PPC_PSERIES flag in a couple of places in
arch/powerpc/kernel/nvram_64.c, but doing that will unnecessarily add
a few extra variables for the !PPC_PSERIES case.


Please explain in your commit message how you have dealt with that.



Sure. Will update the changelog


Also, you broke the build for every config that doesn't have
CONFIG_PPC_PSERIES, all 95 of them. This is pasemi_defconfig for example:


My bad! clobbering_unread_rtas_event() should have been static inline
when defined under !PPC_PSERIES

Thanks
Hari


   LD  arch/powerpc/mm/built-in.o
 arch/powerpc/mm/init_64.o: In function `clobbering_unread_rtas_event':
 init_64.c:(.opd+0x48): multiple definition of 
`clobbering_unread_rtas_event'
 arch/powerpc/mm/mem.o:mem.c:(.opd+0x90): first defined here
 arch/powerpc/mm/init_64.o: In function `.clobbering_unread_rtas_event':
 init_64.c:(.text+0x80): multiple definition of 
`.clobbering_unread_rtas_event'
 arch/powerpc/mm/mem.o:mem.c:(.text+0x2c0): first defined here
   CC  arch/powerpc/kernel/udbg.o
 /home/kisskb/slave/src/scripts/Makefile.build:336: recipe for target 
'arch/powerpc/mm/built-in.o' failed
 make[2]: *** [arch/powerpc/mm/built-in.o] Error 1
 /home/kisskb/slave/src/Makefile:938: recipe for target 'arch/powerpc/mm' 
failed
 make[1]: *** [arch/powerpc/mm] Error 2
 make[1]: *** Waiting for unfinished jobs


cheers




___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v3 1/3] powerpc/nvram: move generic code for nvram and pstore

2015-01-16 Thread Hari Bathini

On 01/15/2015 03:58 AM, Michael Ellerman wrote:

On Wed, 2015-01-14 at 23:35 +0530, Hari Bathini wrote:

On 01/14/2015 10:01 AM, Michael Ellerman wrote:


On Wed, 2014-12-24 at 17:28 +0530, Hari Bathini wrote:

With minor checks, we can move most of the code for nvram
under pseries to a common place to be re-used by other
powerpc platforms like powernv. This patch moves such
common code to arch/powerpc/kernel/nvram_64.c file.

As I said in my reply to the previous version:

 ... you need to keep in mind that it is very common for us to build kernels
 with both POWERNV=y and PSERIES=y.
 
 So you need to make sure you're only using CONFIG_PPC_PSERIES to protect things

 that are optional on pseries. Not things that we *shouldn't* be doing on
 powernv.

we could as well do away with the PPC_PSERIES flag in a couple of
places in
arch/powerpc/kernel/nvram_64.c, but doing that will unnecessarily add
few extra variables for !PPC_PSERIES case.

Yep. I'm happy for them to be there, I just want you to explain in the
changelog that you've thought about the PSERIES=y POWERNV=y case and why the
code makes sense for that configuration.


Please explain in your commit message how you have dealt with that.

Sure. Will update the changelog

Thanks.
  

Also, you broke the build for every config that doesn't have
CONFIG_PPC_PSERIES, all 95 of them. This is pasemi_defconfig for example:

My bad! clobbering_unread_rtas_event should have been static inline
while defining under !PPC_PSERIES

Correct.

Please make sure you test build at least some of the other configurations in
future. I realise it's too time consuming to build all of them, but ideally for
every config symbol you use in your patch you need to build a kernel config
where that symbol =y and =n (and =m if it's tristate).


Sure, Michael. I will keep this in mind :)

Thanks
Hari


cheers


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH v4 1/3] powerpc/nvram: move generic code for nvram and pstore

2015-01-30 Thread Hari Bathini
With minor checks, we can move most of the code for nvram
under pseries to a common place to be re-used by other
powerpc platforms like powernv. This patch moves such
common code to arch/powerpc/kernel/nvram_64.c file.

Signed-off-by: Hari Bathini 
---
 arch/powerpc/include/asm/nvram.h   |   50 ++
 arch/powerpc/include/asm/rtas.h|4 
 arch/powerpc/kernel/nvram_64.c |  656 
 arch/powerpc/platforms/pseries/nvram.c |  665 
 4 files changed, 714 insertions(+), 661 deletions(-)

diff --git a/arch/powerpc/include/asm/nvram.h b/arch/powerpc/include/asm/nvram.h
index b0fe0fe..09a518b 100644
--- a/arch/powerpc/include/asm/nvram.h
+++ b/arch/powerpc/include/asm/nvram.h
@@ -9,12 +9,43 @@
 #ifndef _ASM_POWERPC_NVRAM_H
 #define _ASM_POWERPC_NVRAM_H
 
-
+#include 
 #include 
 #include 
 #include 
 
+/*
+ * Set oops header version to distinguish between old and new format header.
+ * lnx,oops-log partition max size is 4000, header version > 4000 will
+ * help in identifying new header.
+ */
+#define OOPS_HDR_VERSION 5000
+
+struct err_log_info {
+   __be32 error_type;
+   __be32 seq_num;
+};
+
+struct nvram_os_partition {
+   const char *name;
+   int req_size;   /* desired size, in bytes */
+   int min_size;   /* minimum acceptable size (0 means req_size) */
+   long size;  /* size of data portion (excluding err_log_info) */
+   long index; /* offset of data portion of partition */
+   bool os_partition; /* partition initialized by OS, not FW */
+};
+
+struct oops_log_info {
+   __be16 version;
+   __be16 report_length;
+   __be64 timestamp;
+} __attribute__((packed));
+
+extern struct nvram_os_partition oops_log_partition;
+
 #ifdef CONFIG_PPC_PSERIES
+extern struct nvram_os_partition rtas_log_partition;
+
 extern int nvram_write_error_log(char * buff, int length,
 unsigned int err_type, unsigned int 
err_seq);
 extern int nvram_read_error_log(char * buff, int length,
@@ -50,6 +81,23 @@ extern void  pmac_xpram_write(int xpaddr, u8 data);
 /* Synchronize NVRAM */
 extern voidnvram_sync(void);
 
+/* Initialize NVRAM OS partition */
+extern int __init nvram_init_os_partition(struct nvram_os_partition *part);
+
+/* Initialize NVRAM oops partition */
+extern void __init nvram_init_oops_partition(int rtas_partition_exists);
+
+/* Read a NVRAM partition */
+extern int nvram_read_partition(struct nvram_os_partition *part, char *buff,
+   int length, unsigned int *err_type,
+   unsigned int *error_log_cnt);
+
+/* Write to NVRAM OS partition */
+extern int nvram_write_os_partition(struct nvram_os_partition *part,
+   char *buff, int length,
+   unsigned int err_type,
+   unsigned int error_log_cnt);
+
 /* Determine NVRAM size */
 extern ssize_t nvram_get_size(void);
 
diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
index b390f55..123d7ff 100644
--- a/arch/powerpc/include/asm/rtas.h
+++ b/arch/powerpc/include/asm/rtas.h
@@ -343,8 +343,12 @@ extern int early_init_dt_scan_rtas(unsigned long node,
 extern void pSeries_log_error(char *buf, unsigned int err_type, int fatal);
 
 #ifdef CONFIG_PPC_PSERIES
+extern unsigned long last_rtas_event;
+extern int clobbering_unread_rtas_event(void);
 extern int pseries_devicetree_update(s32 scope);
 extern void post_mobility_fixup(void);
+#else
+static inline int clobbering_unread_rtas_event(void) { return 0; }
 #endif
 
 #ifdef CONFIG_PPC_RTAS_DAEMON
diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c
index 34f7c9b..42e5c6a 100644
--- a/arch/powerpc/kernel/nvram_64.c
+++ b/arch/powerpc/kernel/nvram_64.c
@@ -26,6 +26,9 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
 #include 
 #include 
 #include 
@@ -54,6 +57,659 @@ struct nvram_partition {
 
 static LIST_HEAD(nvram_partitions);
 
+#ifdef CONFIG_PPC_PSERIES
+struct nvram_os_partition rtas_log_partition = {
+   .name = "ibm,rtas-log",
+   .req_size = 2079,
+   .min_size = 1055,
+   .index = -1,
+   .os_partition = true
+};
+#endif
+
+struct nvram_os_partition oops_log_partition = {
+   .name = "lnx,oops-log",
+   .req_size = 4000,
+   .min_size = 2000,
+   .index = -1,
+   .os_partition = true
+};
+
+static const char *nvram_os_partitions[] = {
+#ifdef CONFIG_PPC_PSERIES
+   "ibm,rtas-log",
+#endif
+   "lnx,oops-log",
+   NULL
+};
+
+static void oops_to_nvram(struct kmsg_dumper *dumper,
+ enum kmsg_dump_reason reason);
+
+static struct kmsg_dumper nvram_kmsg_dumper = {
+   .dump = oops_to_nvram
+};
+
+/*
+ * For capturing and compressing an oops or panic report...
+
+ * big_oops_buf[] holds the uncompressed text we

[PATCH v4 0/3] powerpc/pstore: Add pstore support for nvram partitions

2015-01-30 Thread Hari Bathini
This patch series adds pstore support on powernv platform to
read different nvram partitions and write compressed data to
oops-log nvram partition. As pseries platform already has
pstore support, this series moves most of the common code
for pseries and powernv platforms to a common file. Tested
the patches successfully on both pseries and powernv
platforms. Also, tested the patches successfully, on a kernel
compiled with both CONFIG_PPC_PSERIES=y & CONFIG_PPC_POWERNV=y.

Changes from v3:
1. Updated the changelog
2. Resolved compile issues with !CONFIG_PPC_PSERIES

---

Hari Bathini (3):
  powerpc/nvram: move generic code for nvram and pstore
  pstore: Add pstore type id for PPC64 opal nvram partition
  pstore: add pstore support on powernv


 arch/powerpc/include/asm/nvram.h|   50 ++
 arch/powerpc/include/asm/rtas.h |4 
 arch/powerpc/kernel/nvram_64.c  |  677 +++
 arch/powerpc/platforms/powernv/opal-nvram.c |   10 
 arch/powerpc/platforms/pseries/nvram.c  |  665 ---
 fs/pstore/inode.c   |3 
 include/linux/pstore.h  |1 
 7 files changed, 749 insertions(+), 661 deletions(-)

--
- Hari

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH v4 2/3] pstore: Add pstore type id for PPC64 opal nvram partition

2015-01-30 Thread Hari Bathini
This patch adds a new PPC64 partition type to be used for opal
specific nvram partition. A new partition type is needed as none
of the existing types matches this partition type.

Signed-off-by: Hari Bathini 
Cc: Anton Vorontsov 
Cc: Colin Cross 
Cc: Kees Cook 
Cc: Tony Luck 
---
 fs/pstore/inode.c  |3 +++
 include/linux/pstore.h |1 +
 2 files changed, 4 insertions(+)

diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 5041660..8e0c009 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -359,6 +359,9 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, 
u64 id, int count,
case PSTORE_TYPE_PPC_COMMON:
sprintf(name, "powerpc-common-%s-%lld", psname, id);
break;
+   case PSTORE_TYPE_PPC_OPAL:
+   sprintf(name, "powerpc-opal-%s-%lld", psname, id);
+   break;
case PSTORE_TYPE_UNKNOWN:
sprintf(name, "unknown-%s-%lld", psname, id);
break;
diff --git a/include/linux/pstore.h b/include/linux/pstore.h
index ece0c6b..af44980 100644
--- a/include/linux/pstore.h
+++ b/include/linux/pstore.h
@@ -39,6 +39,7 @@ enum pstore_type_id {
PSTORE_TYPE_PPC_RTAS= 4,
PSTORE_TYPE_PPC_OF  = 5,
PSTORE_TYPE_PPC_COMMON  = 6,
+   PSTORE_TYPE_PPC_OPAL= 7,
PSTORE_TYPE_UNKNOWN = 255
 };
 

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH v4 3/3] pstore: add pstore support on powernv

2015-01-30 Thread Hari Bathini
This patch extends pstore, a generic interface to platform-dependent
persistent storage, with support for the powernv platform to capture certain
useful information during the kernel's dying moments. Such support is already
in place for the pseries platform. This patch re-uses most of that code.

It is a common practice to compile kernels with both CONFIG_PPC_PSERIES=y
and CONFIG_PPC_POWERNV=y. The code in nvram_init_oops_partition() routine
still works as intended, as the caller is platform specific code which
passes the appropriate value for "rtas_partition_exists" parameter.
In all other places where the CONFIG_PPC_PSERIES or CONFIG_PPC_POWERNV
flag is used in this patchset, it is to reduce the kernel size in cases
where this flag is not set, and it has no impact logic-wise.

Signed-off-by: Hari Bathini 
Cc: Anton Vorontsov 
Cc: Colin Cross 
Cc: Kees Cook 
Cc: Tony Luck 
---
 arch/powerpc/kernel/nvram_64.c  |   25 +++--
 arch/powerpc/platforms/powernv/opal-nvram.c |   10 ++
 2 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c
index 42e5c6a..293da88 100644
--- a/arch/powerpc/kernel/nvram_64.c
+++ b/arch/powerpc/kernel/nvram_64.c
@@ -127,6 +127,14 @@ static size_t oops_data_sz;
 static struct z_stream_s stream;
 
 #ifdef CONFIG_PSTORE
+#ifdef CONFIG_PPC_POWERNV
+static struct nvram_os_partition skiboot_partition = {
+   .name = "ibm,skiboot",
+   .index = -1,
+   .os_partition = false
+};
+#endif
+
 #ifdef CONFIG_PPC_PSERIES
 static struct nvram_os_partition of_config_partition = {
.name = "of-config",
@@ -477,6 +485,16 @@ static ssize_t nvram_pstore_read(u64 *id, enum 
pstore_type_id *type,
time->tv_nsec = 0;
break;
 #endif
+#ifdef CONFIG_PPC_POWERNV
+   case PSTORE_TYPE_PPC_OPAL:
+   sig = NVRAM_SIG_FW;
+   part = &skiboot_partition;
+   *type = PSTORE_TYPE_PPC_OPAL;
+   *id = PSTORE_TYPE_PPC_OPAL;
+   time->tv_sec = 0;
+   time->tv_nsec = 0;
+   break;
+#endif
default:
return 0;
}
@@ -552,8 +570,11 @@ static int nvram_pstore_init(void)
 {
int rc = 0;
 
-   nvram_type_ids[2] = PSTORE_TYPE_PPC_RTAS;
-   nvram_type_ids[3] = PSTORE_TYPE_PPC_OF;
+   if (machine_is(pseries)) {
+   nvram_type_ids[2] = PSTORE_TYPE_PPC_RTAS;
+   nvram_type_ids[3] = PSTORE_TYPE_PPC_OF;
+   } else
+   nvram_type_ids[2] = PSTORE_TYPE_PPC_OPAL;
 
nvram_pstore_info.buf = oops_data;
nvram_pstore_info.bufsize = oops_data_sz;
diff --git a/arch/powerpc/platforms/powernv/opal-nvram.c 
b/arch/powerpc/platforms/powernv/opal-nvram.c
index f9896fd..9db4398 100644
--- a/arch/powerpc/platforms/powernv/opal-nvram.c
+++ b/arch/powerpc/platforms/powernv/opal-nvram.c
@@ -16,6 +16,7 @@
 #include 
 
 #include 
+#include 
 #include 
 
 static unsigned int nvram_size;
@@ -62,6 +63,15 @@ static ssize_t opal_nvram_write(char *buf, size_t count, 
loff_t *index)
return count;
 }
 
+static int __init opal_nvram_init_log_partitions(void)
+{
+   /* Scan nvram for partitions */
+   nvram_scan_partitions();
+   nvram_init_oops_partition(0);
+   return 0;
+}
+machine_arch_initcall(powernv, opal_nvram_init_log_partitions);
+
 void __init opal_nvram_init(void)
 {
struct device_node *np;

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v4 1/3] powerpc/nvram: move generic code for nvram and pstore

2015-02-05 Thread Hari Bathini

On 01/30/2015 10:12 PM, Arnd Bergmann wrote:

On Friday 30 January 2015 20:44:00 Hari Bathini wrote:

With minor checks, we can move most of the code for nvram
under pseries to a common place to be re-used by other
powerpc platforms like powernv. This patch moves such
common code to arch/powerpc/kernel/nvram_64.c file.

Signed-off-by: Hari Bathini 

Can you make this y2038-safe in the process, possibly as a
follow-up patch?


Arnd, sorry for the delayed response.
I will add these changes to this patch-set and re-spin..

Thanks
Hari


+extern unsigned long last_rtas_event;

time64_t


+   }
+   oops_hdr->version = cpu_to_be16(OOPS_HDR_VERSION);
+   oops_hdr->report_length = cpu_to_be16(zipped_len);
+   oops_hdr->timestamp = cpu_to_be64(get_seconds());
+   return 0;

ktime_get_real_seconds()


+static ssize_t nvram_pstore_read(u64 *id, enum pstore_type_id *type,
+   int *count, struct timespec *time, char **buf,
+   bool *compressed, struct pstore_info *psi)

This has to remain timespec for now but can later be changed to timespec64
when the API gets changed.


+   oops_hdr->version = cpu_to_be16(OOPS_HDR_VERSION);
+   oops_hdr->report_length = cpu_to_be16(text_len);
+   oops_hdr->timestamp = cpu_to_be64(get_seconds());

ktime_get_real_seconds()

Arnd
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH v5 0/4] powerpc/pstore: Add pstore support for nvram partitions

2015-02-05 Thread Hari Bathini
This patch series adds pstore support on powernv platform to
read different nvram partitions and write compressed data to
oops-log nvram partition. As pseries platform already has
pstore support, this series moves most of the common code
for pseries and powernv platforms to a common file. Tested
the patches successfully on both pseries and powernv
platforms. Also, tested the patches successfully, on a kernel
compiled with both CONFIG_PPC_PSERIES=y & CONFIG_PPC_POWERNV=y.

Changes from v4:
1. Added a patch for y2038-safe code changes

---

Hari Bathini (4):
  powerpc/nvram: move generic code for nvram and pstore
  pstore: Add pstore type id for PPC64 opal nvram partition
  pstore: add pstore support on powernv
  powerpc: make timestamp related code y2038-safe


 arch/powerpc/include/asm/nvram.h|   50 ++
 arch/powerpc/include/asm/rtas.h |5 
 arch/powerpc/kernel/nvram_64.c  |  677 +++
 arch/powerpc/platforms/powernv/opal-nvram.c |   10 
 arch/powerpc/platforms/pseries/nvram.c  |  673 ---
 fs/pstore/inode.c   |3 
 include/linux/pstore.h  |1 
 7 files changed, 754 insertions(+), 665 deletions(-)

--
-Hari

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH v5 1/4] powerpc/nvram: move generic code for nvram and pstore

2015-02-05 Thread Hari Bathini
With minor checks, we can move most of the code for nvram
under pseries to a common place to be re-used by other
powerpc platforms like powernv. This patch moves such
common code to arch/powerpc/kernel/nvram_64.c file.

Signed-off-by: Hari Bathini 
---
 arch/powerpc/include/asm/nvram.h   |   50 ++
 arch/powerpc/include/asm/rtas.h|4 
 arch/powerpc/kernel/nvram_64.c |  656 
 arch/powerpc/platforms/pseries/nvram.c |  665 
 4 files changed, 714 insertions(+), 661 deletions(-)

diff --git a/arch/powerpc/include/asm/nvram.h b/arch/powerpc/include/asm/nvram.h
index b0fe0fe..09a518b 100644
--- a/arch/powerpc/include/asm/nvram.h
+++ b/arch/powerpc/include/asm/nvram.h
@@ -9,12 +9,43 @@
 #ifndef _ASM_POWERPC_NVRAM_H
 #define _ASM_POWERPC_NVRAM_H
 
-
+#include 
 #include 
 #include 
 #include 
 
+/*
+ * Set oops header version to distinguish between old and new format header.
+ * lnx,oops-log partition max size is 4000, header version > 4000 will
+ * help in identifying new header.
+ */
+#define OOPS_HDR_VERSION 5000
+
+struct err_log_info {
+   __be32 error_type;
+   __be32 seq_num;
+};
+
+struct nvram_os_partition {
+   const char *name;
+   int req_size;   /* desired size, in bytes */
+   int min_size;   /* minimum acceptable size (0 means req_size) */
+   long size;  /* size of data portion (excluding err_log_info) */
+   long index; /* offset of data portion of partition */
+   bool os_partition; /* partition initialized by OS, not FW */
+};
+
+struct oops_log_info {
+   __be16 version;
+   __be16 report_length;
+   __be64 timestamp;
+} __attribute__((packed));
+
+extern struct nvram_os_partition oops_log_partition;
+
 #ifdef CONFIG_PPC_PSERIES
+extern struct nvram_os_partition rtas_log_partition;
+
 extern int nvram_write_error_log(char * buff, int length,
 unsigned int err_type, unsigned int 
err_seq);
 extern int nvram_read_error_log(char * buff, int length,
@@ -50,6 +81,23 @@ extern void  pmac_xpram_write(int xpaddr, u8 data);
 /* Synchronize NVRAM */
 extern voidnvram_sync(void);
 
+/* Initialize NVRAM OS partition */
+extern int __init nvram_init_os_partition(struct nvram_os_partition *part);
+
+/* Initialize NVRAM oops partition */
+extern void __init nvram_init_oops_partition(int rtas_partition_exists);
+
+/* Read a NVRAM partition */
+extern int nvram_read_partition(struct nvram_os_partition *part, char *buff,
+   int length, unsigned int *err_type,
+   unsigned int *error_log_cnt);
+
+/* Write to NVRAM OS partition */
+extern int nvram_write_os_partition(struct nvram_os_partition *part,
+   char *buff, int length,
+   unsigned int err_type,
+   unsigned int error_log_cnt);
+
 /* Determine NVRAM size */
 extern ssize_t nvram_get_size(void);
 
diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
index b390f55..123d7ff 100644
--- a/arch/powerpc/include/asm/rtas.h
+++ b/arch/powerpc/include/asm/rtas.h
@@ -343,8 +343,12 @@ extern int early_init_dt_scan_rtas(unsigned long node,
 extern void pSeries_log_error(char *buf, unsigned int err_type, int fatal);
 
 #ifdef CONFIG_PPC_PSERIES
+extern unsigned long last_rtas_event;
+extern int clobbering_unread_rtas_event(void);
 extern int pseries_devicetree_update(s32 scope);
 extern void post_mobility_fixup(void);
+#else
+static inline int clobbering_unread_rtas_event(void) { return 0; }
 #endif
 
 #ifdef CONFIG_PPC_RTAS_DAEMON
diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c
index 34f7c9b..42e5c6a 100644
--- a/arch/powerpc/kernel/nvram_64.c
+++ b/arch/powerpc/kernel/nvram_64.c
@@ -26,6 +26,9 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
 #include 
 #include 
 #include 
@@ -54,6 +57,659 @@ struct nvram_partition {
 
 static LIST_HEAD(nvram_partitions);
 
+#ifdef CONFIG_PPC_PSERIES
+struct nvram_os_partition rtas_log_partition = {
+   .name = "ibm,rtas-log",
+   .req_size = 2079,
+   .min_size = 1055,
+   .index = -1,
+   .os_partition = true
+};
+#endif
+
+struct nvram_os_partition oops_log_partition = {
+   .name = "lnx,oops-log",
+   .req_size = 4000,
+   .min_size = 2000,
+   .index = -1,
+   .os_partition = true
+};
+
+static const char *nvram_os_partitions[] = {
+#ifdef CONFIG_PPC_PSERIES
+   "ibm,rtas-log",
+#endif
+   "lnx,oops-log",
+   NULL
+};
+
+static void oops_to_nvram(struct kmsg_dumper *dumper,
+ enum kmsg_dump_reason reason);
+
+static struct kmsg_dumper nvram_kmsg_dumper = {
+   .dump = oops_to_nvram
+};
+
+/*
+ * For capturing and compressing an oops or panic report...
+
+ * big_oops_buf[] holds the uncompressed text we

[PATCH v5 2/4] pstore: Add pstore type id for PPC64 opal nvram partition

2015-02-05 Thread Hari Bathini
This patch adds a new PPC64 partition type to be used for the
opal-specific nvram partition. A new partition type is needed as
none of the existing types matches this partition type.

Signed-off-by: Hari Bathini 
---
 fs/pstore/inode.c  |3 +++
 include/linux/pstore.h |1 +
 2 files changed, 4 insertions(+)

diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 5041660..8e0c009 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -359,6 +359,9 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, 
u64 id, int count,
case PSTORE_TYPE_PPC_COMMON:
sprintf(name, "powerpc-common-%s-%lld", psname, id);
break;
+   case PSTORE_TYPE_PPC_OPAL:
+   sprintf(name, "powerpc-opal-%s-%lld", psname, id);
+   break;
case PSTORE_TYPE_UNKNOWN:
sprintf(name, "unknown-%s-%lld", psname, id);
break;
diff --git a/include/linux/pstore.h b/include/linux/pstore.h
index ece0c6b..af44980 100644
--- a/include/linux/pstore.h
+++ b/include/linux/pstore.h
@@ -39,6 +39,7 @@ enum pstore_type_id {
PSTORE_TYPE_PPC_RTAS= 4,
PSTORE_TYPE_PPC_OF  = 5,
PSTORE_TYPE_PPC_COMMON  = 6,
+   PSTORE_TYPE_PPC_OPAL= 7,
PSTORE_TYPE_UNKNOWN = 255
 };
 

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH v5 3/4] pstore: add pstore support on powernv

2015-02-05 Thread Hari Bathini
This patch extends pstore, a generic interface to platform dependent
persistent storage, to support the powernv platform, so that certain
useful information can be captured during the kernel's dying moments.
Such support is already in place for the pseries platform. This patch
re-uses most of that code.

It is a common practice to compile kernels with both CONFIG_PPC_PSERIES=y
and CONFIG_PPC_POWERNV=y. The code in nvram_init_oops_partition() routine
still works as intended, as the caller is platform specific code which
passes the appropriate value for "rtas_partition_exists" parameter.
In all other places, where CONFIG_PPC_PSERIES or CONFIG_PPC_POWERNV
flag is used in this patchset, it is to reduce the kernel size in cases
where this flag is not set and doesn't have any impact logic wise.

Signed-off-by: Hari Bathini 
---
 arch/powerpc/kernel/nvram_64.c  |   25 +++--
 arch/powerpc/platforms/powernv/opal-nvram.c |   10 ++
 2 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c
index 42e5c6a..293da88 100644
--- a/arch/powerpc/kernel/nvram_64.c
+++ b/arch/powerpc/kernel/nvram_64.c
@@ -127,6 +127,14 @@ static size_t oops_data_sz;
 static struct z_stream_s stream;
 
 #ifdef CONFIG_PSTORE
+#ifdef CONFIG_PPC_POWERNV
+static struct nvram_os_partition skiboot_partition = {
+   .name = "ibm,skiboot",
+   .index = -1,
+   .os_partition = false
+};
+#endif
+
 #ifdef CONFIG_PPC_PSERIES
 static struct nvram_os_partition of_config_partition = {
.name = "of-config",
@@ -477,6 +485,16 @@ static ssize_t nvram_pstore_read(u64 *id, enum 
pstore_type_id *type,
time->tv_nsec = 0;
break;
 #endif
+#ifdef CONFIG_PPC_POWERNV
+   case PSTORE_TYPE_PPC_OPAL:
+   sig = NVRAM_SIG_FW;
+   part = &skiboot_partition;
+   *type = PSTORE_TYPE_PPC_OPAL;
+   *id = PSTORE_TYPE_PPC_OPAL;
+   time->tv_sec = 0;
+   time->tv_nsec = 0;
+   break;
+#endif
default:
return 0;
}
@@ -552,8 +570,11 @@ static int nvram_pstore_init(void)
 {
int rc = 0;
 
-   nvram_type_ids[2] = PSTORE_TYPE_PPC_RTAS;
-   nvram_type_ids[3] = PSTORE_TYPE_PPC_OF;
+   if (machine_is(pseries)) {
+   nvram_type_ids[2] = PSTORE_TYPE_PPC_RTAS;
+   nvram_type_ids[3] = PSTORE_TYPE_PPC_OF;
+   } else
+   nvram_type_ids[2] = PSTORE_TYPE_PPC_OPAL;
 
nvram_pstore_info.buf = oops_data;
nvram_pstore_info.bufsize = oops_data_sz;
diff --git a/arch/powerpc/platforms/powernv/opal-nvram.c 
b/arch/powerpc/platforms/powernv/opal-nvram.c
index f9896fd..9db4398 100644
--- a/arch/powerpc/platforms/powernv/opal-nvram.c
+++ b/arch/powerpc/platforms/powernv/opal-nvram.c
@@ -16,6 +16,7 @@
 #include 
 
 #include 
+#include 
 #include 
 
 static unsigned int nvram_size;
@@ -62,6 +63,15 @@ static ssize_t opal_nvram_write(char *buf, size_t count, 
loff_t *index)
return count;
 }
 
+static int __init opal_nvram_init_log_partitions(void)
+{
+   /* Scan nvram for partitions */
+   nvram_scan_partitions();
+   nvram_init_oops_partition(0);
+   return 0;
+}
+machine_arch_initcall(powernv, opal_nvram_init_log_partitions);
+
 void __init opal_nvram_init(void)
 {
struct device_node *np;

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH v5 4/4] powerpc: make timestamp related code y2038-safe

2015-02-05 Thread Hari Bathini
While we are here, let us make timestamp related code
y2038-safe.

Suggested-by: Arnd Bergmann 
Signed-off-by: Hari Bathini 
---
 arch/powerpc/include/asm/rtas.h|3 ++-
 arch/powerpc/kernel/nvram_64.c |6 +++---
 arch/powerpc/platforms/pseries/nvram.c |   10 +-
 3 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
index 123d7ff..efa9152 100644
--- a/arch/powerpc/include/asm/rtas.h
+++ b/arch/powerpc/include/asm/rtas.h
@@ -4,6 +4,7 @@
 
 #include 
 #include 
+#include 
 
 /*
  * Definitions for talking to the RTAS on CHRP machines.
@@ -343,7 +344,7 @@ extern int early_init_dt_scan_rtas(unsigned long node,
 extern void pSeries_log_error(char *buf, unsigned int err_type, int fatal);
 
 #ifdef CONFIG_PPC_PSERIES
-extern unsigned long last_rtas_event;
+extern time64_t last_rtas_event;
 extern int clobbering_unread_rtas_event(void);
 extern int pseries_devicetree_update(s32 scope);
 extern void post_mobility_fixup(void);
diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c
index 293da88..1e703f8 100644
--- a/arch/powerpc/kernel/nvram_64.c
+++ b/arch/powerpc/kernel/nvram_64.c
@@ -376,7 +376,7 @@ static int zip_oops(size_t text_len)
}
oops_hdr->version = cpu_to_be16(OOPS_HDR_VERSION);
oops_hdr->report_length = cpu_to_be16(zipped_len);
-   oops_hdr->timestamp = cpu_to_be64(get_seconds());
+   oops_hdr->timestamp = cpu_to_be64(ktime_get_real_seconds());
return 0;
 }
 
@@ -423,7 +423,7 @@ static int nvram_pstore_write(enum pstore_type_id type,
 
oops_hdr->version = cpu_to_be16(OOPS_HDR_VERSION);
oops_hdr->report_length = cpu_to_be16(size);
-   oops_hdr->timestamp = cpu_to_be64(get_seconds());
+   oops_hdr->timestamp = cpu_to_be64(ktime_get_real_seconds());
 
if (compressed)
err_type = ERR_TYPE_KERNEL_PANIC_GZ;
@@ -721,7 +721,7 @@ static void oops_to_nvram(struct kmsg_dumper *dumper,
err_type = ERR_TYPE_KERNEL_PANIC;
oops_hdr->version = cpu_to_be16(OOPS_HDR_VERSION);
oops_hdr->report_length = cpu_to_be16(text_len);
-   oops_hdr->timestamp = cpu_to_be64(get_seconds());
+   oops_hdr->timestamp = cpu_to_be64(ktime_get_real_seconds());
}
 
(void) nvram_write_os_partition(&oops_log_partition, oops_buf,
diff --git a/arch/powerpc/platforms/pseries/nvram.c 
b/arch/powerpc/platforms/pseries/nvram.c
index 97b8fc6..d77713b 100644
--- a/arch/powerpc/platforms/pseries/nvram.c
+++ b/arch/powerpc/platforms/pseries/nvram.c
@@ -37,10 +37,10 @@ static DEFINE_SPINLOCK(nvram_lock);
 
 /* See clobbering_unread_rtas_event() */
 #define NVRAM_RTAS_READ_TIMEOUT 5  /* seconds */
-static unsigned long last_unread_rtas_event;   /* timestamp */
+static time64_t last_unread_rtas_event;/* timestamp */
 
 #ifdef CONFIG_PSTORE
-unsigned long last_rtas_event;
+time64_t last_rtas_event;
 #endif
 
 static ssize_t pSeries_nvram_read(char *buf, size_t count, loff_t *index)
@@ -145,9 +145,9 @@ int nvram_write_error_log(char * buff, int length,
int rc = nvram_write_os_partition(&rtas_log_partition, buff, length,
err_type, error_log_cnt);
if (!rc) {
-   last_unread_rtas_event = get_seconds();
+   last_unread_rtas_event = ktime_get_real_seconds();
 #ifdef CONFIG_PSTORE
-   last_rtas_event = get_seconds();
+   last_rtas_event = ktime_get_real_seconds();
 #endif
}
 
@@ -201,7 +201,7 @@ int clobbering_unread_rtas_event(void)
 {
return (oops_log_partition.index == rtas_log_partition.index
&& last_unread_rtas_event
-   && get_seconds() - last_unread_rtas_event <=
+   && ktime_get_real_seconds() - last_unread_rtas_event <=
NVRAM_RTAS_READ_TIMEOUT);
 }
 

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v5 2/4] pstore: Add pstore type id for PPC64 opal nvram partition

2015-03-04 Thread Hari Bathini

On 02/06/2015 01:06 AM, Hari Bathini wrote:

This patch adds a new PPC64 partition type to be used for opal
specific nvram partition. A new partition type is needed as none
of the existing type matches this partition type.

Signed-off-by: Hari Bathini 


This patch series is reviewed by Kees.
Reference link: https://lkml.org/lkml/2015/2/5/651

Reviewed-by: Kees Cook 

Thanks
Hari


---
  fs/pstore/inode.c  |3 +++
  include/linux/pstore.h |1 +
  2 files changed, 4 insertions(+)

diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 5041660..8e0c009 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -359,6 +359,9 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, 
u64 id, int count,
case PSTORE_TYPE_PPC_COMMON:
sprintf(name, "powerpc-common-%s-%lld", psname, id);
break;
+   case PSTORE_TYPE_PPC_OPAL:
+   sprintf(name, "powerpc-opal-%s-%lld", psname, id);
+   break;
case PSTORE_TYPE_UNKNOWN:
sprintf(name, "unknown-%s-%lld", psname, id);
break;
diff --git a/include/linux/pstore.h b/include/linux/pstore.h
index ece0c6b..af44980 100644
--- a/include/linux/pstore.h
+++ b/include/linux/pstore.h
@@ -39,6 +39,7 @@ enum pstore_type_id {
PSTORE_TYPE_PPC_RTAS= 4,
PSTORE_TYPE_PPC_OF  = 5,
PSTORE_TYPE_PPC_COMMON  = 6,
+   PSTORE_TYPE_PPC_OPAL= 7,
PSTORE_TYPE_UNKNOWN = 255
  };
  


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH] ppc64/book3s: copy interrupts till __end_handlers marker instead of __end_interrupts

2016-03-28 Thread Hari Bathini
Some of the interrupt vectors on 64-bit POWER server processors are
only 32 bytes long (8 instructions), which is not enough for the full
first-level interrupt handler. For these we need to branch to an out-
of-line (OOL) handler. But when we are running a relocatable kernel,
interrupt vectors till __end_interrupts marker are copied down to real
address 0x100. So, branching to labels (read OOL handlers) outside this
section should be handled differently (see LOAD_HANDLER()), considering
a relocatable kernel, which would need at least 4 instructions.

However, branching from interrupt vector means that we corrupt the CFAR
(come-from address register) on POWER7 and later processors as mentioned
in commit 1707dd16. So, EXCEPTION_PROLOG_0
(6 instructions) that contains the part up to the point where the CFAR is
saved in the PACA should be part of the short interrupt vectors before we
branch out to OOL handlers.

But as mentioned already, there are interrupt vectors on 64-bit POWER server
processors that are only 32 bytes long (like vectors 0x4f00, 0x4f20, etc.),
which cannot accommodate the above two cases at the same time owing to space
constraints. Currently, in these interrupt vectors, we simply branch out to
OOL handlers, without using LOAD_HANDLER(), which leaves us vulnerable when
running a relocatable kernel (e.g. kdump case). While this has been the case
for some time now and kdump is used widely, we were fortunate not to see any
problems so far, for three reasons:

1. In almost all cases, the production kernel (relocatable) is used for
   kdump as well, which would mean that the crashed kernel's OOL handler
   would be at the same place where we end up branching to, from the short
   interrupt vector of the kdump kernel.
2. Also, OOL handler was unlikely the reason for crash in almost all
   the kdump scenarios, which meant we had a sane OOL handler from
   crashed kernel that we branched to.
3. On most 64-bit POWER server processors, page size is large enough
   that marking interrupt vector code as executable (see commit
   429d2e83) leads to marking OOL handler code from crashed kernel,
   that sits right below interrupt vector code from kdump kernel, as
   executable as well.

Let us fix this undependable code path: firstly, by moving the __end_handlers
marker down past the OOL handlers; secondly, by copying interrupt vectors down
till the __end_handlers marker instead of __end_interrupts, when running a
relocatable kernel, to make sure we end up in the relocated (kdump) kernel's
OOL handler instead of the crashed kernel's; thirdly, by marking all the
interrupt vector code that is copied down to real address 0x100 as executable,
considering the relocation-on-exception feature that allows exceptions to be
raised in virtual mode (IR=DR=1).

This fix has been tested successfully in kdump scenario, on a lpar with 4K page
size by using different default/production kernel and kdump kernel.

Signed-off-by: Hari Bathini 
Signed-off-by: Mahesh Salgaonkar 
---
 arch/powerpc/include/asm/sections.h  |3 ++-
 arch/powerpc/kernel/exceptions-64s.S |8 
 arch/powerpc/kernel/head_64.S|2 +-
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/sections.h 
b/arch/powerpc/include/asm/sections.h
index abf5866..b4139a5 100644
--- a/arch/powerpc/include/asm/sections.h
+++ b/arch/powerpc/include/asm/sections.h
@@ -10,6 +10,7 @@
 
 extern char __start_interrupts[];
 extern char __end_interrupts[];
+extern char __end_handlers[];
 
 extern char __prom_init_toc_start[];
 extern char __prom_init_toc_end[];
@@ -39,7 +40,7 @@ static inline int overlaps_interrupt_vector_text(unsigned 
long start,
 {
unsigned long real_start, real_end;
real_start = __start_interrupts - _stext;
-   real_end = __end_interrupts - _stext;
+   real_end = __end_handlers - _stext;
 
return start < (unsigned long)__va(real_end) &&
(unsigned long)__va(real_start) < end;
diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 7716ceb..98e2ce5 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1230,10 +1230,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
STD_EXCEPTION_COMMON(0xf60, facility_unavailable, 
facility_unavailable_exception)
STD_EXCEPTION_COMMON(0xf80, hv_facility_unavailable, 
facility_unavailable_exception)
 
-   .align  7
-   .globl  __end_handlers
-__end_handlers:
-
/* Equivalents to the above handlers for relocation-on interrupt 
vectors */
STD_RELON_EXCEPTION_HV_OOL(0xe40, emulation_assist)
MASKABLE_RELON_EXCEPTION_HV_OOL(0xe80, h_doorbell)
@@ -1244,6 +1240,10 @@ __end_handlers:
STD_RELON_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable)
STD_RELON_EXCEPTION_HV_OOL(0xf80, hv_facility_unavailable)
 
+   .align  7
+   .globl  __end_handlers
+_

Re: ppc64/book3s: copy interrupts till __end_handlers marker instead of __end_interrupts

2016-03-29 Thread Hari Bathini



On 03/29/2016 03:47 PM, Michael Ellerman wrote:

Hi Hari,

You win the "Best Change Log of the Year" award.

Some comments below ...

On Mon, 2016-28-03 at 11:23:22 UTC, Hari Bathini wrote:

Some of the interrupt vectors on 64-bit POWER server processors  are
only 32 bytes long (8 instructions), which is not enough for the full
first-level interrupt handler. For these we need to branch to an out-
of-line (OOL) handler. But when we are running a relocatable kernel,
interrupt vectors till __end_interrupts marker are copied down to real
address 0x100. So, branching to labels (read OOL handlers) outside this
section should be handled differently (see LOAD_HANDLER()), considering
relocatable kernel, which would need atleast 4 instructions.

However, branching from interrupt vector means that we corrupt the CFAR
(come-from address register) on POWER7 and later processors as mentioned
in commit 1707dd16. So, EXCEPTION_PROLOG_0
(6 instructions) that contains the part up to the point where the CFAR is
saved in the PACA should be part of the short interrupt vectors before we
branch out to OOL handlers.

But as mentioned already, there are interrupt vectors on 64-bit POWER server
processors that are only 32 bytes long (like vectors 0x4f00, 0x4f20, etc.),
which cannot accomodate the above two cases at the same time owing to space
constraint. Currently, in these interrupt vectors, we simply branch out to
OOL handlers, without using LOAD_HANDLER(), which leaves us vulnerable when
running a relocatable kernel (eg. kdump case). While this has been the case
for sometime now and kdump is used widely, we were fortunate not to see any
problems so far, for three reasons:

 1. In almost all cases, production kernel (relocatable) is used for
kdump as well, which would mean that crashed kernel's OOL handler
would be at the same place where we endup branching to, from short
interrupt vector of kdump kernel.
 2. Also, OOL handler was unlikely the reason for crash in almost all
the kdump scenarios, which meant we had a sane OOL handler from
crashed kernel that we branched to.
 3. On most 64-bit POWER server processors, page size is large enough
that marking interrupt vector code as executable (see commit
429d2e83) leads to marking OOL handler code from crashed kernel,
that sits right below interrupt vector code from kdump kernel, as
executable as well.

Let us fix this undependable code path firstly, by moving down __end_handlers
marker down past OOL handlers. Secondly, copying interrupt vectors down till
__end_handlers marker instead of __end_interrupts, when running a relocatable
kernel, to make sure we endup in relocated (kdump) kernel's OOL handler instead
of crashed kernel's. Thirdly, by marking all the interrupt vector code that is
copied down to real address 0x100 as executable, considering the relocation on
exception feature that allows exceptions to be raised in virtual mode (IR=DR=1).

This fix has been tested successfully in kdump scenario, on a lpar with 4K page
size by using different default/production kernel and kdump kernel.

So I think you've missed one important case.


My bad! I missed out on considering this case..


In do_final_fixups() we recopy the (now patched) kernel code down to zero. That
code uses __end_interrupts as its limit, so I think if you look closely your OOL
handlers down at zero will not have had feature fixups applied to them.

I think perhaps the better fix is just to move __end_interrupts down (up) to the
right location. AFAICS all users of __end_interrupts actually want that address.

It would also mean we could remove __end_handlers as unused.


True. This sounds less complicated.


So can you please check that I'm right about do_final_fixups(), and then try
moving __end_interrupts and check that works?


Yeah. Testing the patch. Will post it soon.
Thanks for the review!

- Hari

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH v2] ppc64/book3s: fix branching to out of line handlers in relocation kernel

2016-03-29 Thread Hari Bathini
Some of the interrupt vectors on 64-bit POWER server processors are
only 32 bytes long (8 instructions), which is not enough for the full
first-level interrupt handler. For these we need to branch to an out-
of-line (OOL) handler. But when we are running a relocatable kernel,
interrupt vectors till __end_interrupts marker are copied down to real
address 0x100. So, branching to labels (read OOL handlers) outside this
section should be handled differently (see LOAD_HANDLER()), considering
a relocatable kernel, which would need at least 4 instructions.

However, branching from interrupt vector means that we corrupt the CFAR
(come-from address register) on POWER7 and later processors as mentioned
in commit 1707dd16. So, EXCEPTION_PROLOG_0 (6 instructions) that contains
the part up to the point where the CFAR is saved in the PACA should be
part of the short interrupt vectors before we branch out to OOL handlers.

But as mentioned already, there are interrupt vectors on 64-bit POWER server
processors that are only 32 bytes long (like vectors 0x4f00, 0x4f20, etc.),
which cannot accommodate the above two cases at the same time owing to space
constraints. Currently, in these interrupt vectors, we simply branch out to
OOL handlers, without using LOAD_HANDLER(), which leaves us vulnerable when
running a relocatable kernel (e.g. kdump case). While this has been the case
for some time now and kdump is used widely, we were fortunate not to see any
problems so far, for three reasons:

1. In almost all cases, the production kernel (relocatable) is used for
   kdump as well, which would mean that the crashed kernel's OOL handler
   would be at the same place where we end up branching to, from the short
   interrupt vector of the kdump kernel.
2. Also, OOL handler was unlikely the reason for crash in almost all
   the kdump scenarios, which meant we had a sane OOL handler from
   crashed kernel that we branched to.
3. On most 64-bit POWER server processors, page size is large enough
   that marking interrupt vector code as executable (see commit
   429d2e83) leads to marking OOL handler code from crashed kernel,
   that sits right below interrupt vector code from kdump kernel, as
   executable as well.

Let us fix this undependable code path by moving the __end_interrupts marker
down past OOL handlers to make sure that we also copy OOL handlers to real
address 0x100 when running a relocatable kernel. This helps in cases discussed
above, where interrupt vectors are not long enough to branch out to OOL handlers
with LOAD_HANDLER(). While we are here, let us remove the virtually
insignificant __end_handlers marker.

This fix has been tested successfully in kdump scenario, on a lpar with 4K page
size by using different default/production kernel and kdump kernel.

Signed-off-by: Hari Bathini 
Signed-off-by: Mahesh Salgaonkar 
---

changes from v1:
1. Changed the subject from "copy interrupts till __end_handlers marker
   instead of __end_interrupts" to a more generic one
2. Used __end_interrupts marker instead of __end_handlers to make the fix
   less complicated.
3. Removed unused __end_handlers marker.


 arch/powerpc/kernel/exceptions-64s.S |   23 ---
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 7716ceb..e598580 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -764,8 +764,8 @@ kvmppc_skip_Hinterrupt:
 #endif
 
 /*
- * Code from here down to __end_handlers is invoked from the
- * exception prologs above.  Because the prologs assemble the
+ * Code from here down to end of out of line handlers is invoked from
+ * the exception prologs above.  Because the prologs assemble the
  * addresses of these handlers using the LOAD_HANDLER macro,
  * which uses an ori instruction, these handlers must be in
  * the first 64k of the kernel image.
@@ -953,11 +953,6 @@ hv_facility_unavailable_relon_trampoline:
 #endif
STD_RELON_EXCEPTION_PSERIES(0x5700, 0x1700, altivec_assist)
 
-   /* Other future vectors */
-   .align  7
-   .globl  __end_interrupts
-__end_interrupts:
-
.align  7
 system_call_entry:
b   system_call_common
@@ -1230,10 +1225,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
STD_EXCEPTION_COMMON(0xf60, facility_unavailable, 
facility_unavailable_exception)
STD_EXCEPTION_COMMON(0xf80, hv_facility_unavailable, 
facility_unavailable_exception)
 
-   .align  7
-   .globl  __end_handlers
-__end_handlers:
-
/* Equivalents to the above handlers for relocation-on interrupt 
vectors */
STD_RELON_EXCEPTION_HV_OOL(0xe40, emulation_assist)
MASKABLE_RELON_EXCEPTION_HV_OOL(0xe80, h_doorbell)
@@ -1244,6 +1235,16 @@ __end_handlers:
STD_RELON_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable)
STD_RELON_EXCEPTION_HV_OOL(0xf80, hv_facil

Re: [v2] ppc64/book3s: fix branching to out of line handlers in relocation kernel

2016-03-30 Thread Hari Bathini



On 03/30/2016 05:55 AM, Michael Ellerman wrote:

On Tue, 2016-29-03 at 18:34:37 UTC, Hari Bathini wrote:

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 7716ceb..e598580 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -764,8 +764,8 @@ kvmppc_skip_Hinterrupt:
  #endif
  
  /*

- * Code from here down to __end_handlers is invoked from the
- * exception prologs above.  Because the prologs assemble the
+ * Code from here down to end of out of line handlers is invoked from
+ * the exception prologs above.  Because the prologs assemble the

I think it would be better to just replace __end_handlers with __end_interrupts,
that way it's entirely clear what location you're talking about.


@@ -953,11 +953,6 @@ hv_facility_unavailable_relon_trampoline:
  #endif
STD_RELON_EXCEPTION_PSERIES(0x5700, 0x1700, altivec_assist)
  
-	/* Other future vectors */

-   .align  7
-   .globl  __end_interrupts
-__end_interrupts:
-
.align  7
  system_call_entry:
b   system_call_common
@@ -1230,10 +1225,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
STD_EXCEPTION_COMMON(0xf60, facility_unavailable, 
facility_unavailable_exception)
STD_EXCEPTION_COMMON(0xf80, hv_facility_unavailable, 
facility_unavailable_exception)
  
-	.align	7

-   .globl  __end_handlers
-__end_handlers:
-

Sorry I wasn't clear in my last mail, please do this as a separate cleanup patch
after this patch.


ok..


@@ -1244,6 +1235,16 @@ __end_handlers:
STD_RELON_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable)
STD_RELON_EXCEPTION_HV_OOL(0xf80, hv_facility_unavailable)
  
+	/* FIXME: For now, let us move the __end_interrupts marker down past

Why is it FIXME?

In general I don't want to merge code that adds a FIXME unless there is some
very good reason.

AFAICS this is a permanent solution isn't it?


Except for a few short interrupt vectors like 0x4f00, 0x4f20, etc., all other
vectors defined till __end_interrupts marker ensure that LOAD_HANDLER() is
used for branching to labels like system_call_entry, data_access_common, 
etc.

that are currently not copied to real 0 in relocation case.

So, we are forced to move the __end_interrupts marker down only to handle
space constraint in the short vectors. So, I added the FIXME to remind the
scope for improvement in the code. But after thinking over again now, moving
the marker down makes us copy an additional 1~2 KB along with the 21~22 KB
that we are copying already. So, not much of an improvement to lose 
sleep over

or to add a FIXME, I guess. Your thoughts?

Also, FIXME is the reason, why I did not replace __end_handlers with
__end_interrupts in the comment earlier.


+* the out-of-line handlers, to make sure we also copy OOL handlers
+* to real adress 0x100 when running a relocatable kernel. This helps

It doesn't "help" it's 100% required.


Yep. Will change the wording.
Thanks for the review!

- Hari


+* in cases where interrupt vectors are not long enough (like 0x4f00,
+* 0x4f20, etc.) to branch out to OOL handlers with LOAD_HANDLER().
+*/
+   .align  7
+   .globl  __end_interrupts
+__end_interrupts:
+
  #if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV)
  /*
   * Data area reserved for FWNMI option.


cheers
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [v2] ppc64/book3s: fix branching to out of line handlers in relocation kernel

2016-03-30 Thread Hari Bathini



On 03/30/2016 12:44 PM, Hari Bathini wrote:



On 03/30/2016 05:55 AM, Michael Ellerman wrote:

On Tue, 2016-29-03 at 18:34:37 UTC, Hari Bathini wrote:
diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S

index 7716ceb..e598580 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -764,8 +764,8 @@ kvmppc_skip_Hinterrupt:
  #endif
/*
- * Code from here down to __end_handlers is invoked from the
- * exception prologs above.  Because the prologs assemble the
+ * Code from here down to end of out of line handlers is invoked from
+ * the exception prologs above.  Because the prologs assemble the
I think it would be better to just replace __end_handlers with 
__end_interrupts,

that way it's entirely clear what location you're talking about.


@@ -953,11 +953,6 @@ hv_facility_unavailable_relon_trampoline:
  #endif
  STD_RELON_EXCEPTION_PSERIES(0x5700, 0x1700, altivec_assist)
  -/* Other future vectors */
-.align7
-.globl__end_interrupts
-__end_interrupts:
-
  .align7
  system_call_entry:
  bsystem_call_common
@@ -1230,10 +1225,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
  STD_EXCEPTION_COMMON(0xf60, facility_unavailable, 
facility_unavailable_exception)
  STD_EXCEPTION_COMMON(0xf80, hv_facility_unavailable, 
facility_unavailable_exception)

  -.align7
-.globl__end_handlers
-__end_handlers:
-
Sorry I wasn't clear in my last mail, please do this as a separate 
cleanup patch

after this patch.


ok..


@@ -1244,6 +1235,16 @@ __end_handlers:
  STD_RELON_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable)
  STD_RELON_EXCEPTION_HV_OOL(0xf80, hv_facility_unavailable)
  +/* FIXME: For now, let us move the __end_interrupts marker 
down past

Why is it FIXME?

In general I don't want to merge code that adds a FIXME unless there 
is some

very good reason.

AFAICS this is a permanent solution isn't it?


Except for a few short interrupt vectors like 0x4f00, 0x4f20, etc., all 
other
vectors defined till __end_interrupts marker ensure that 
LOAD_HANDLER() is
used for branching to labels like system_call_entry, 
data_access_common, etc.

that are currently not copied to real 0 in relocation case.

So, we are forced to move the __end_interrupts marker down only to handle
space constraint in the short vectors. So, I added the FIXME to remind 
the
scope for improvement in the code. But after thinking over again now, 
moving
the marker down makes us copy an additional 1~2 KB along with the 
21~22 KB
that we are copying already. So, not much of an improvement to lose 
sleep over

or to add a FIXME, I guess. Your thoughts?



Alternatively, how about moving the OOLs handlers that can't be branched 
with LOAD_HANDLER
under __end_interrupts. This way we won't be copying more than a few 
absolutely needed handlers.


STD_RELON_EXCEPTION_HV_OOL(0xe40, emulation_assist)
.
.
STD_RELON_EXCEPTION_HV_OOL(0xf80, hv_facility_unavailable)


We can leave __end_handlers marker to indicate code that should be part 
of the

first 64K of kernel image.

Thanks
Hari


Also, FIXME is the reason, why I did not replace __end_handlers with
__end_interrupts in the comment earlier.

+ * the out-of-line handlers, to make sure we also copy OOL 
handlers
+ * to real adress 0x100 when running a relocatable kernel. This 
helps

It doesn't "help" it's 100% required.


Yep. Will change the wording.
Thanks for the review!

- Hari

+ * in cases where interrupt vectors are not long enough (like 
0x4f00,
+ * 0x4f20, etc.) to branch out to OOL handlers with 
LOAD_HANDLER().

+ */
+.align7
+.globl__end_interrupts
+__end_interrupts:
+
  #if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV)
  /*
   * Data area reserved for FWNMI option.


cheers
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [v2] ppc64/book3s: fix branching to out of line handlers in relocation kernel

2016-03-30 Thread Hari Bathini



On 03/30/2016 04:47 PM, Michael Ellerman wrote:

On Wed, 2016-03-30 at 13:14 +0530, Hari Bathini wrote:

Alternatively, how about moving the OOLs handlers that can't be branched with
LOAD_HANDLER under __end_interrupts. This way we won't be copying more than a
few absolutely needed handlers.

STD_RELON_EXCEPTION_HV_OOL(0xe40, emulation_assist)
.
.
STD_RELON_EXCEPTION_HV_OOL(0xf80, hv_facility_unavailable)


We can leave __end_handlers marker to indicate code that should be part
of the first 64K of kernel image.

That might work. But I suspect you will run into issues with ".org backwards",
ie. running out of space in head_64.S

But try it and let me know if it works.


It worked. Doing some sanity testing.
Will post v3 soon with this approach.


I think we also need to write a script or little C program which looks at the
vmlinux and checks that nothing below __end_whatever does a direct branch. So
that we don't break it again in future.


Yep. That would make life easy..
Let me see if I can do something about it.

Thanks
Hari

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH v3] ppc64/book3s: fix branching to out of line handlers in relocation kernel

2016-03-30 Thread Hari Bathini
Some of the interrupt vectors on 64-bit POWER server processors  are
only 32 bytes long (8 instructions), which is not enough for the full
first-level interrupt handler. For these we need to branch to an out-
of-line (OOL) handler. But when we are running a relocatable kernel,
interrupt vectors till __end_interrupts marker are copied down to real
address 0x100. So, branching to labels (read OOL handlers) outside this
section should be handled differently (see LOAD_HANDLER()), considering
relocatable kernel, which would need at least 4 instructions.

However, branching from interrupt vector means that we corrupt the CFAR
(come-from address register) on POWER7 and later processors as mentioned
in commit 1707dd16. So, EXCEPTION_PROLOG_0 (6 instructions) that contains
the part up to the point where the CFAR is saved in the PACA should be
part of the short interrupt vectors before we branch out to OOL handlers.

But as mentioned already, there are interrupt vectors on 64-bit POWER server
processors that are only 32 bytes long (like vectors 0x4f00, 0x4f20, etc.),
which cannot accommodate the above two cases at the same time owing to space
constraints. Currently, in these interrupt vectors, we simply branch out to
OOL handlers, without using LOAD_HANDLER(), which leaves us vulnerable when
running a relocatable kernel (e.g. the kdump case). While this has been the case
for some time now and kdump is used widely, we were fortunate not to see any
problems so far, for three reasons:

1. In almost all cases, the production kernel (relocatable) is used for
   kdump as well, which would mean that the crashed kernel's OOL handler
   would be at the same place where we end up branching to, from the short
   interrupt vector of the kdump kernel.
2. Also, OOL handler was unlikely the reason for crash in almost all
   the kdump scenarios, which meant we had a sane OOL handler from
   crashed kernel that we branched to.
3. On most 64-bit POWER server processors, page size is large enough
   that marking interrupt vector code as executable (see commit
   429d2e83) leads to marking OOL handler code from crashed kernel,
   that sits right below interrupt vector code from kdump kernel, as
   executable as well.

Let us fix this undependable code path by moving these OOL handlers below
the __end_interrupts marker to make sure we also copy these handlers to real
address 0x100 when running a relocatable kernel, because the interrupt
vectors branching to these OOL handlers are not long enough to use
LOAD_HANDLER() for branching, as discussed above.

This fix has been tested successfully in kdump scenario, on a lpar with 4K page
size by using different default/production kernel and kdump kernel.

Signed-off-by: Hari Bathini 
Signed-off-by: Mahesh Salgaonkar 
---

changes from v2:
1. Move the OOL handlers before __end_interrupts marker instead of moving the 
__end_interrupts marker
2. Leave __end_handlers marker as is.

 arch/powerpc/kernel/exceptions-64s.S |   29 +++--
 1 file changed, 19 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 7716ceb..9ac3a38 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -953,6 +953,25 @@ hv_facility_unavailable_relon_trampoline:
 #endif
STD_RELON_EXCEPTION_PSERIES(0x5700, 0x1700, altivec_assist)
 
+   /*
+* Out-Of-Line handlers for relocation-on interrupt vectors
+*
+* We need these OOL handlers to be below __end_interrupts
+* marker to enusre we also copy these OOL handlers along
+* with the interrupt vectors to real address 0x100 when
+* running a relocatable kernel. Because the interrupt
+* vectors branching to these OOL handlers are not long
+* enough to use LOAD_HANDLER() for branching.
+*/
+   STD_RELON_EXCEPTION_HV_OOL(0xe40, emulation_assist)
+   MASKABLE_RELON_EXCEPTION_HV_OOL(0xe80, h_doorbell)
+
+   STD_RELON_EXCEPTION_PSERIES_OOL(0xf00, performance_monitor)
+   STD_RELON_EXCEPTION_PSERIES_OOL(0xf20, altivec_unavailable)
+   STD_RELON_EXCEPTION_PSERIES_OOL(0xf40, vsx_unavailable)
+   STD_RELON_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable)
+   STD_RELON_EXCEPTION_HV_OOL(0xf80, hv_facility_unavailable)
+
/* Other future vectors */
.align  7
.globl  __end_interrupts
@@ -1234,16 +1253,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
.globl  __end_handlers
 __end_handlers:
 
-   /* Equivalents to the above handlers for relocation-on interrupt 
vectors */
-   STD_RELON_EXCEPTION_HV_OOL(0xe40, emulation_assist)
-   MASKABLE_RELON_EXCEPTION_HV_OOL(0xe80, h_doorbell)
-
-   STD_RELON_EXCEPTION_PSERIES_OOL(0xf00, performance_monitor)
-   STD_RELON_EXCEPTION_PSERIES_OOL(0xf20, altivec_unavailable)
-   STD_RELON_EXCEPTION_PSERIES_OOL(0xf40, vsx_unavai

Re: [PATCH v3] ppc64/book3s: fix branching to out of line handlers in relocation kernel

2016-04-01 Thread Hari Bathini



On 04/01/2016 11:44 AM, Michael Ellerman wrote:

On Wed, 2016-03-30 at 23:49 +0530, Hari Bathini wrote:

Some of the interrupt vectors on 64-bit POWER server processors  are
only 32 bytes long (8 instructions), which is not enough for the full

...

Let us fix this undependable code path by moving these OOL handlers below
__end_interrupts marker to make sure we also copy these handlers to real
address 0x100 when running a relocatable kernel. Because the interrupt
vectors branching to these OOL handlers are not long enough to use
LOAD_HANDLER() for branching as discussed above.


...

changes from v2:
2. Move the OOL handlers before __end_interrupts marker instead of moving the 
__end_interrupts marker
3. Leave __end_handlers marker as is.

Hi Hari,

Thanks for trying this. In the end I've decided it's not a good option.

If you build an allmodconfig, and turn on CONFIG_RELOCATABLE, and then look at
the disassembly, you see this:

   c0006ffc:   48 00 29 04 b   c0009900 
<.ret_from_except>
   
   c0007000 <__end_handlers>:


At 0x7000 we have the FWNMI area, which is fixed and can't move. As you see
above we end up with only 4 bytes of space between the end of the handlers and
the FWNMI area.

So any tiny change that adds two more instructions prior to 0x7000 will then
fail to build.


Hi Michael,

I agree. But the OOL handlers that are moved up in v3 were below
0x7000 earlier as well and moving them below __end_interrupts marker
shouldn't make any difference in terms of space consumption at least in
comparison between v2 & v3. So, I guess picking either v2 or v3
doesn't change this for better.

Also, there is code between __end_interrupts and __end_handlers
that is not location dependent as long as it is within 64K (0x10000)
that can be moved above 0x8000, if need be.

For these reasons, I feel v3 is better going forward as it keeps
__start_interrupts to __end_interrupts code compact and
leaves alone the code that doesn't need to be copied to real 0.

Am I missing something here?

Thanks
Hari


None of that's your fault, it's just the nature of the code in there, it's very
space constrained.

For now I'll take your v2, but I'll edit the comment and drop the removal of
__end_handlers.

cheers



___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v3] ppc64/book3s: fix branching to out of line handlers in relocation kernel

2016-04-01 Thread Hari Bathini



On 04/01/2016 04:07 PM, Michael Ellerman wrote:

On Fri, 2016-04-01 at 12:23 +0530, Hari Bathini wrote:

On 04/01/2016 11:44 AM, Michael Ellerman wrote:

On Wed, 2016-03-30 at 23:49 +0530, Hari Bathini wrote:

Some of the interrupt vectors on 64-bit POWER server processors  are
only 32 bytes long (8 instructions), which is not enough for the full

...

Let us fix this undependable code path by moving these OOL handlers below
__end_interrupts marker to make sure we also copy these handlers to real
address 0x100 when running a relocatable kernel. Because the interrupt
vectors branching to these OOL handlers are not long enough to use
LOAD_HANDLER() for branching as discussed above.


...

changes from v2:
2. Move the OOL handlers before __end_interrupts marker instead of moving the 
__end_interrupts marker
3. Leave __end_handlers marker as is.

Hi Hari,

Thanks for trying this. In the end I've decided it's not a good option.

If you build an allmodconfig, and turn on CONFIG_RELOCATABLE, and then look at
the disassembly, you see this:

c0006ffc:   48 00 29 04 b   c0009900 
<.ret_from_except>

c0007000 <__end_handlers>:


At 0x7000 we have the FWNMI area, which is fixed and can't move. As you see
above we end up with only 4 bytes of space between the end of the handlers and
the FWNMI area.

So any tiny change that adds two more instructions prior to 0x7000 will then
fail to build.

Hi Michael,

I agree. But the OOL handlers that are moved up in v3 were below
0x7000 earlier as well and moving them below __end_interrupts marker
shouldn't make any difference in terms of space consumption at least in
comparison between v2 & v3. So, I guess picking either v2 or v3
doesn't change this for better.

It does make a difference, due to alignment. Prior to your patch we have ~24
bytes free.


Hi Michael,

Hmmm.. I thought ~24 bytes was not such a difference but with the scenario
you mentioned it does sound critical. Actually, this patch came into being
for want of another 8~12 bytes. So, I should have known better about
space constraint.




Also, there is code between __end_interrupts and __end_handlers
that is not location dependent as long as it is within 64K (0x10000)
that can be moved above 0x8000, if need be.

That's true, but that sort of change is unlikely to backport well. And we need
to backport this fix to everything.


That does sound like a maintainer's nightmare.


But if you can get that to work I'll consider it. I tried quickly but couldn't
get it working, due to problems with the feature else sections being too far
away from.


Same case. May need some time to get that right.
Also, exploring holes between __start_interrupts & __end_interrupts.
Will try and get back on this soon.
If none of this works, we have v2 anyway.

Thanks
Hari

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH v4 3/3] ppc64/book3s: remove __end_handlers marker

2016-04-07 Thread Hari Bathini
The __end_handlers marker was intended to mark the end of the code that gets
called from exception prologs. But that hasn't kept pace with code
changes. Case in point: slb_miss_realmode is called from exception
prolog code but isn't below the __end_handlers marker. So, __end_handlers
marker is as good as a comment but could be misleading at times if
it isn't in sync with the code, as is the case now. So, let us avoid
this confusion by having a better comment and removing __end_handlers
marker altogether.

Signed-off-by: Hari Bathini 
---
 arch/powerpc/kernel/exceptions-64s.S |   13 -
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index c193ebd..80f9fc4 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -764,11 +764,10 @@ kvmppc_skip_Hinterrupt:
 #endif
 
 /*
- * Code from here down to __end_handlers is invoked from the
- * exception prologs above.  Because the prologs assemble the
- * addresses of these handlers using the LOAD_HANDLER macro,
- * which uses an ori instruction, these handlers must be in
- * the first 64k of the kernel image.
+ * Ensure that any handlers that get invoked from the exception prologs
+ * above are below the first 64KB (0x10000) of the kernel image because
+ * the prologs assemble the addresses of these handlers using the
+ * LOAD_HANDLER macro, which uses an ori instruction.
  */
 
 /*** Common interrupt handlers ***/
@@ -1243,10 +1242,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
bl  vsx_unavailable_exception
b   ret_from_except
 
-   .align  7
-   .globl  __end_handlers
-__end_handlers:
-
 #if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV)
 /*
  * Data area reserved for FWNMI option.

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH v4 2/3] ppc64/book3s: make some room for common interrupt vector code

2016-04-07 Thread Hari Bathini
With the previous patch, we choke out whatever little space is left
below 0x7000 (FWNMI hard block) while there is a hole of ~1400 bytes
below __end_interrupts marker when CONFIG_CBE_RAS is disabled.
Considering CONFIG_CBE_RAS is not enabled by default for BOOK3S, this
is not a desirable scenario especially when we have to worry about
each additional instruction that goes below 0x7000.

Memory region from 0x1800 to 0x4000 is dedicated for common interrupt
vector code. Also, we never hit an interrupt below 0x300 when IR=DR=1
implying memory region between 0x4000 to 0x4300 can also be used for
common interrupt vector code. So, we can effectively use memory region
between 0x1800 to 0x4300 for common interrupt vector code.

This patch tries to free up some space below 0x7000 by rearranging the
common interrupt vector code. The approach here is to avoid large holes
below 0x4300 for any kernel configuration. For this, let us move common
interrupt vector code that only gets enabled with CONFIG_CBE_RAS above
0x8000, as it doesn't need to be too close to the call sites and can be
branched to with LOAD_HANDLER() as long as it is within the first 64KB
(0x10000) of the kernel image. Instead, let's move common interrupt vector
code marked h_instr_storage_common, facility_unavailable_common &
hv_facility_unavailable_common below 0x4300. This leaves ~250 bytes
free below 0x4300 and ~1150 bytes free below 0x7000 - enough space to
stop worrying about every additional instruction that goes below 0x7000.

This patch assumes at least commit 376af594, part of the patch series
that starts with commit 468a3302, is part of the code to avoid messy
compilation issues like:

relocation truncated to fit: R_PPC64_REL14 against `.text'+1c90
Makefile:864: recipe for target 'vmlinux' failed

I tested this patch successfully on ppc64, ppc64le lpars and baremetal
environments. Couldn't test it on IBM cell blade though but expecting no
problems with this patch in IBM cell blade environment as well. If
someone can test this patch in cell platform, it would be great.

Signed-off-by: Hari Bathini 
---
 arch/powerpc/kernel/exceptions-64s.S |   20 ++--
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index f76b2f3..c193ebd 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -786,6 +786,7 @@ kvmppc_skip_Hinterrupt:
STD_EXCEPTION_COMMON(0xb00, trap_0b, unknown_exception)
STD_EXCEPTION_COMMON(0xd00, single_step, single_step_exception)
STD_EXCEPTION_COMMON(0xe00, trap_0e, unknown_exception)
+   STD_EXCEPTION_COMMON(0xe20, h_instr_storage, unknown_exception)
STD_EXCEPTION_COMMON(0xe40, emulation_assist, 
emulation_assist_interrupt)
STD_EXCEPTION_COMMON_ASYNC(0xe60, hmi_exception, handle_hmi_exception)
 #ifdef CONFIG_PPC_DOORBELL
@@ -794,6 +795,9 @@ kvmppc_skip_Hinterrupt:
STD_EXCEPTION_COMMON_ASYNC(0xe80, h_doorbell, unknown_exception)
 #endif
STD_EXCEPTION_COMMON_ASYNC(0xf00, performance_monitor, 
performance_monitor_exception)
+   STD_EXCEPTION_COMMON(0xf60, facility_unavailable, 
facility_unavailable_exception)
+   STD_EXCEPTION_COMMON(0xf80, hv_facility_unavailable, 
facility_unavailable_exception)
+
STD_EXCEPTION_COMMON(0x1300, instruction_breakpoint, 
instruction_breakpoint_exception)
STD_EXCEPTION_COMMON(0x1502, denorm, unknown_exception)
 #ifdef CONFIG_ALTIVEC
@@ -801,11 +805,6 @@ kvmppc_skip_Hinterrupt:
 #else
STD_EXCEPTION_COMMON(0x1700, altivec_assist, unknown_exception)
 #endif
-#ifdef CONFIG_CBE_RAS
-   STD_EXCEPTION_COMMON(0x1200, cbe_system_error, 
cbe_system_error_exception)
-   STD_EXCEPTION_COMMON(0x1600, cbe_maintenance, cbe_maintenance_exception)
-   STD_EXCEPTION_COMMON(0x1800, cbe_thermal, cbe_thermal_exception)
-#endif /* CONFIG_CBE_RAS */
 
/*
 * Relocation-on interrupts: A subset of the interrupts can be delivered
@@ -1029,8 +1028,6 @@ instruction_access_common:
li  r5,0x400
b   do_hash_page/* Try to handle as hpte fault */
 
-   STD_EXCEPTION_COMMON(0xe20, h_instr_storage, unknown_exception)
-
 /*
  * Here is the common SLB miss user that is used when going to virtual
  * mode for SLB misses, that is currently not used
@@ -1246,9 +1243,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
bl  vsx_unavailable_exception
b   ret_from_except
 
-   STD_EXCEPTION_COMMON(0xf60, facility_unavailable, 
facility_unavailable_exception)
-   STD_EXCEPTION_COMMON(0xf80, hv_facility_unavailable, 
facility_unavailable_exception)
-
.align  7
.globl  __end_handlers
 __end_handlers:
@@ -1268,6 +1262,12 @@ fwnmi_data_area:
. = 0x8000
 #endif /* defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) */
 
+#ifdef CONFIG_CBE_RAS
+   STD_EXCEPTION_CO

[PATCH v4 1/3] ppc64/book3s: fix branching to out of line handlers in relocation kernel

2016-04-07 Thread Hari Bathini
Some of the interrupt vectors on 64-bit POWER server processors  are
only 32 bytes long (8 instructions), which is not enough for the full
first-level interrupt handler. For these we need to branch to an out-
of-line (OOL) handler. But when we are running a relocatable kernel,
interrupt vectors till __end_interrupts marker are copied down to real
address 0x100. So, branching to labels (read OOL handlers) outside this
section should be handled differently (see LOAD_HANDLER()), considering
relocatable kernel, which would need at least 4 instructions.

However, branching from interrupt vector means that we corrupt the CFAR
(come-from address register) on POWER7 and later processors as mentioned
in commit 1707dd16. So, EXCEPTION_PROLOG_0 (6 instructions) that contains
the part up to the point where the CFAR is saved in the PACA should be
part of the short interrupt vectors before we branch out to OOL handlers.

But as mentioned already, there are interrupt vectors on 64-bit POWER server
processors that are only 32 bytes long (like vectors 0x4f00, 0x4f20, etc.),
which cannot accommodate the above two cases at the same time owing to space
constraints. Currently, in these interrupt vectors, we simply branch out to
OOL handlers, without using LOAD_HANDLER(), which leaves us vulnerable when
running a relocatable kernel (e.g. the kdump case). While this has been the case
for some time now and kdump is used widely, we were fortunate not to see any
problems so far, for three reasons:

1. In almost all cases, the production kernel (relocatable) is used for
   kdump as well, which would mean that the crashed kernel's OOL handler
   would be at the same place where we end up branching to, from the short
   interrupt vector of the kdump kernel.
2. Also, OOL handler was unlikely the reason for crash in almost all
   the kdump scenarios, which meant we had a sane OOL handler from
   crashed kernel that we branched to.
3. On most 64-bit POWER server processors, page size is large enough
   that marking interrupt vector code as executable (see commit
   429d2e83) leads to marking OOL handler code from crashed kernel,
   that sits right below interrupt vector code from kdump kernel, as
   executable as well.

Let us fix this undependable code path by moving these OOL handlers below
the __end_interrupts marker to make sure we also copy these handlers to real
address 0x100 when running a relocatable kernel, because the interrupt
vectors branching to these OOL handlers are not long enough to use
LOAD_HANDLER() for branching, as discussed above.

This fix has been tested successfully in kdump scenario, on a lpar with 4K page
size by using different default/production kernel and kdump kernel.

Signed-off-by: Hari Bathini 
Signed-off-by: Mahesh Salgaonkar 
---

Michael, I did test this patchset in different scenarios. But if you feel
the change is too radical, we could go with version2. But I thought this was
worth a shot.

changes from v3:
1. No changes in this patch except for a spellcheck
2. A new patch that tries to free up space below 0x7000 (2/3)
3. A new patch to remove __end_handlers marker (3/3)


 arch/powerpc/kernel/exceptions-64s.S |   29 +++--
 1 file changed, 19 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 7716ceb..f76b2f3 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -953,6 +953,25 @@ hv_facility_unavailable_relon_trampoline:
 #endif
STD_RELON_EXCEPTION_PSERIES(0x5700, 0x1700, altivec_assist)
 
+   /*
+* Out-Of-Line handlers for relocation-on interrupt vectors
+*
+* We need these OOL handlers to be below __end_interrupts
+* marker to ensure we also copy these OOL handlers along
+* with the interrupt vectors to real address 0x100 when
+* running a relocatable kernel. Because the interrupt
+* vectors branching to these OOL handlers are not long
+* enough to use LOAD_HANDLER() for branching.
+*/
+   STD_RELON_EXCEPTION_HV_OOL(0xe40, emulation_assist)
+   MASKABLE_RELON_EXCEPTION_HV_OOL(0xe80, h_doorbell)
+
+   STD_RELON_EXCEPTION_PSERIES_OOL(0xf00, performance_monitor)
+   STD_RELON_EXCEPTION_PSERIES_OOL(0xf20, altivec_unavailable)
+   STD_RELON_EXCEPTION_PSERIES_OOL(0xf40, vsx_unavailable)
+   STD_RELON_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable)
+   STD_RELON_EXCEPTION_HV_OOL(0xf80, hv_facility_unavailable)
+
/* Other future vectors */
.align  7
.globl  __end_interrupts
@@ -1234,16 +1253,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
.globl  __end_handlers
 __end_handlers:
 
-   /* Equivalents to the above handlers for relocation-on interrupt 
vectors */
-   STD_RELON_EXCEPTION_HV_OOL(0xe40, emulation_assist)
-   MASKABLE_RELON_EXCEPTION_HV_OOL(0xe80, h_doo

Re: [v4, 2/3] ppc64/book3s: make some room for common interrupt vector code

2016-04-17 Thread Hari Bathini



On 04/15/2016 06:29 PM, Michael Ellerman wrote:

On Fri, 2016-04-15 at 21:06 +1000, Michael Ellerman wrote:

Hi Hari,

Thanks for persisting with this.

On Thu, 2016-07-04 at 21:58:50 UTC, Hari Bathini wrote:

With the previous patch, we choke out whatever little space is left
below 0x7000 (FWNMI hard block) while there is a hole of ~1400 bytes
below __end_interrupts marker when CONFIG_CBE_RAS is disabled.
Considering CONFIG_CBE_RAS is not enabled by default for BOOK3S, this
is not a desirable scenario especially when we have to worry about
each additional instruction that goes below 0x7000.

Memory region from 0x1800 to 0x4000 is dedicated for common interrupt
vector code. Also, we never hit an interrupt below 0x300 when IR=DR=1
implying memory region between 0x4000 to 0x4300 can also be used for
common interrupt vector code. So, we can effectively use memory region
between 0x1800 to 0x4300 for common interrupt vector code.

On Power9 the system-call-vectored instruction will use the region at 0x3000, so
moving code into that space is not a good long term plan.

I'll take your v2 and put it in next next week.

I'll add this fixes line, which I think is correct:

Fixes: c1fb6816fb1b ("powerpc: Add relocation on exception vector handlers")


Yeah. Thanks!


cheers

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH] powerpc: on crash, kexec'ed kernel needs all CPUs are online

2015-11-04 Thread Hari Bathini

On 10/16/2015 12:30 AM, Laurent Vivier wrote:

On kexec, all secondary offline CPUs are onlined before
starting the new kernel, this is not done in the case of kdump.

If kdump is configured and a kernel crash occurs while
some secondary CPUs are offline (SMT=off),
the new kernel is not able to start them and displays some
"Processor X is stuck.".

Starting with POWER8, subcore logic relies on all threads of
core being booted. So, on startup kernel tries to start all
threads, and asks OPAL (or RTAS) to start all CPUs (including
threads). If a CPU has been offlined by the previous kernel,
it has not been returned to OPAL, and thus OPAL cannot restart
it: this CPU has been lost...

Signed-off-by: Laurent Vivier



Hi Laurent,

Sorry for jumping too late into this.
Are you seeing this issue even with the below patches:

pseries:
http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=c1caae3de46a072d0855729aed6e793e536a4a55

opal/powernv:
https://github.com/open-power/skiboot/commit/9ee56b5

Thanks
Hari


---
  arch/powerpc/kernel/crash.c | 20 
  1 file changed, 20 insertions(+)

diff --git a/arch/powerpc/kernel/crash.c b/arch/powerpc/kernel/crash.c
index 51dbace..3ca9452 100644
--- a/arch/powerpc/kernel/crash.c
+++ b/arch/powerpc/kernel/crash.c
@@ -19,6 +19,7 @@
  #include 
  #include 
  #include 
+#include 
  
  #include 

  #include 
@@ -299,11 +300,30 @@ int crash_shutdown_unregister(crash_shutdown_t handler)
  }
  EXPORT_SYMBOL(crash_shutdown_unregister);
  
+/*

+ * The next kernel will try to start all secondary CPUs and if
+ * they are not online it will fail to start them.
+ *
+ */
+static void wake_offline_cpus(void)
+{
+   int cpu = 0;
+
+   for_each_present_cpu(cpu) {
+   if (!cpu_online(cpu)) {
+   pr_info("kexec: Waking offline cpu %d.\n", cpu);
+   cpu_up(cpu);
+   }
+   }
+}
+
  void default_machine_crash_shutdown(struct pt_regs *regs)
  {
unsigned int i;
int (*old_handler)(struct pt_regs *regs);
  
+	wake_offline_cpus();

+
/*
 * This function is only called after the system
 * has panicked or is otherwise in a critical state.


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH] powerpc: on crash, kexec'ed kernel needs all CPUs are online

2015-11-05 Thread Hari Bathini

On 11/05/2015 07:02 AM, David Gibson wrote:

On Wed, 4 Nov 2015 14:54:51 +0100
Laurent Vivier  wrote:



On 04/11/2015 13:34, Hari Bathini wrote:

On 10/16/2015 12:30 AM, Laurent Vivier wrote:

On kexec, all secondary offline CPUs are onlined before
starting the new kernel, this is not done in the case of kdump.

If kdump is configured and a kernel crash occurs while
some secondary CPUs are offline (SMT=off),
the new kernel is not able to start them and displays some
"Processor X is stuck.".

Starting with POWER8, subcore logic relies on all threads of
core being booted. So, on startup kernel tries to start all
threads, and asks OPAL (or RTAS) to start all CPUs (including
threads). If a CPU has been offlined by the previous kernel,
it has not been returned to OPAL, and thus OPAL cannot restart
it: this CPU has been lost...

Signed-off-by: Laurent Vivier


Hi Laurent,

Hi Hari,


Sorry for jumping too late into this.

better late than never :)


Are you seeing this issue even with the below patches:

pseries:
http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=c1caae3de46a072d0855729aed6e793e536a4a55

Unfortunately, this is unlikely to be relevant - this fixes a failure
while setting up the kexec.  The problem we see occurs once we've
booted the second kernel and it's attempting to bring up secondary CPUs.


opal/powernv:
https://github.com/open-power/skiboot/commit/9ee56b5

Very interesting. Is there a way to have a firmware with the fix ?

 From Laurent's analysis of the crash, I don't think this will be
relevant either, but I'm not sure.  It would be very interesting to
know which (if any) released firmwares include this patch so we can
test it.


Hi Laurent/David,

I am not so sure on this. While I get back on this, can you confirm you are
seeing the issue in both PowerVM (pseries) and baremetal (powernv). What is
the kernel version where the issue is seen for PowerVM and/or baremetal.
Also, for baremetal, can you mention the OPAL version on which the issue is
reproducible. If a bug is raised for this, I would be happy to be
pointed to it, to get more information on this.

Thanks
Hari




___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH 00/18] Add FADump support on PowerNV platform

2019-02-27 Thread Hari Bathini



On 27/02/19 9:07 AM, Daniel Axtens wrote:

Hi Hari,



Hi Daniel,



Firmware-Assisted Dump (FADump) is currently supported only on pseries
platform. This patch series adds support for powernv platform too.

The first and third patches refactor the FADump code to make use of common
code across multiple platforms. The fifth patch adds basic FADump support
for powernv platform. Patches seven & eight honour reserved-ranges DT node
while reserving/releasing memory used by FADump. The next patch processes
CPU state data provided by firmware to create and append core notes to the
ELF core file. The tenth patch adds support for preserving crash data for
subsequent boots (useful in cases like petitboot). Patch twelve provides
support to export opalcore. This is to make debugging of failures in OPAL
code easier. The subsequent patch ensures vmcore processing is skipped
when only OPAL core is exported by f/w. The next patch provides option to
release the kernel memory used to export opalcore. Patch seventeen adds
backup area (an area populated before crash and used in the capture kernel
to setup vmcore file robustly) support on PowerNV platform. The remaining
patches update Firmware-Assisted Dump documentation appropriately.

Note that the quantum of increase in robustness due to patch seventeen may
not be worth breaking backward compatibility for older kernel versions.
Would like to hear thoughts from others on it.

The patch series is tested with the latest firmware plus the below skiboot
changes for MPIPL support:

 https://patchwork.ozlabs.org/project/skiboot/list/?series=78497
 ("MPIPL support")


If I want to test this, is there some userspace tooling that will
extract a fadump from a rebooted system and allow me to examine it as I
would with a kdump (e.g. with crash)?  I did look at
Documentation/powerpc/firmware-assisted-dump.txt but it seems to only
cover the kernel layer.


These patches export two dump files: /proc/vmcore (kernel) and /proc/opalcore
(OPAL).
If you are only interested in the kernel dump, then passing fadump=on to the
kernel and enabling/starting the kdump-tools/kdump service shipped with the
distro would ensure the dump is captured to the /var/crash dir and the system
is rebooted. But please be aware that the scripts would not copy /proc/opalcore
to disk yet. The scripts need to be updated once these changes make it
upstream.

Thanks
Hari



Re: [PATCH 00/18] Add FADump support on PowerNV platform

2019-02-27 Thread Hari Bathini

Hi Nick,


On 27/02/19 9:48 AM, Nicholas Piggin wrote:

Hari Bathini's on February 22, 2019 3:35 am:

Firmware-Assisted Dump (FADump) is currently supported only on pseries
platform. This patch series adds support for powernv platform too.

The first and third patches refactor the FADump code to make use of common
code across multiple platforms. The fifth patch adds basic FADump support
for powernv platform. Patches seven & eight honour reserved-ranges DT node
while reserving/releasing memory used by FADump. The next patch processes
CPU state data provided by firmware to create and append core notes to the
ELF core file. The tenth patch adds support for preserving crash data for
subsequent boots (useful in cases like petitboot). Patch twelve provides
support to export opalcore. This is to make debugging of failures in OPAL
code easier. The subsequent patch ensures vmcore processing is skipped
when only OPAL core is exported by f/w. The next patch provides option to
release the kernel memory used to export opalcore. Patch seventeen adds
backup area (an area populated before crash and used in the capture kernel
to setup vmcore file robustly) support on PowerNV platform. The remaining
patches update Firmware-Assisted Dump documentation appropriately.

Note that the quantum of increase in robustness due to patch seventeen may
not be worth breaking backward compatibility for older kernel versions.
Would like to hear thoughts from others on it.

The patch series is tested with the latest firmware plus the below skiboot
changes for MPIPL support:

 https://patchwork.ozlabs.org/project/skiboot/list/?series=78497
 ("MPIPL support")

---

Hari Bathini (18):
   powerpc/fadump: move internal fadump code to a new file
   powerpc/fadump: Improve fadump documentation
   pseries/fadump: move out platform specific support from generic code
   powerpc/fadump: use FADump instead of fadump for how it is pronounced
   powerpc/fadump: enable fadump support on OPAL based POWER platform
   powerpc/fadump: Update documentation about OPAL platform support
   powerpc/fadump: consider reserved ranges while reserving memory
   powerpc/fadump: consider reserved ranges while releasing memory
   powernv/fadump: process architected register state data provided by 
firmware
   powernv/fadump: add support to preserve crash data on FADUMP disabled 
kernel
   powerpc/fadump: update documentation about CONFIG_PRESERVE_FA_DUMP
   powerpc/powernv: export /proc/opalcore for analysing opal crashes
   powernv/fadump: Skip processing /proc/vmcore when only OPAL core exists
   powernv/opalcore: provide an option to invalidate /proc/opalcore file
   powernv/fadump: consider f/w load area
   powernv/fadump: update documentation about option to release opalcore
   powernv/fadump: use backup area to map PIR to logical CPUs

The need to map firmware identifiers like PIR to Linux numbering comes
up in a few places, OPAL msglog, pdbg debugger, etc. I wonder if we
could have Linux register its logical CPU numbers with OPAL after it
boots. Would that help with your usage?


The logical to PIR map of crashing kernel is needed in the capture kernel
(the kernel booted after crash to save the dump) that processes the register
data provided by f/w. Not sure if the logical to PIR map would be guaranteed
to be the same for both the crashing kernel and capture kernel.

Actually, I don't see any value-add in using the logical to PIR map in
processing the register data provided by f/w. pSeries isn't doing that and
has been reliable.
Intention was to get inputs from others on whether it is worth it.


   powerpc/fadump: Update documentation about backup area support


  Documentation/powerpc/firmware-assisted-dump.txt |  208 ++--
  arch/powerpc/Kconfig |   23
  arch/powerpc/include/asm/fadump.h|  190 ---
  arch/powerpc/include/asm/opal-api.h  |   58 +
  arch/powerpc/include/asm/opal.h  |1
  arch/powerpc/kernel/Makefile |6
  arch/powerpc/kernel/fadump.c | 1199 --
  arch/powerpc/kernel/fadump_internal.c|  297 +
  arch/powerpc/kernel/fadump_internal.h|  250 +

I don't have much knowledge of fadump code, so I'll nitpick instead :P

Why are you calling it fadump_internal, what's internal about it? You
have the framework for the ops table etc here, which makes the platform
code have to #include "../kernel/fadump_internal.h", and suggests it's
not so internal. Seems like it would be fine just to go in
include/asm/fadump.h and kernel fadump.c?


Intention was to use that file to put common code used by platform specific code
on both pSeries & PowerNV. How about fadump_common instead of fadump_internal
to put that in perspective?




  arch/powerpc/kernel/prom.

[PATCH v2 00/16] Add FADump support on PowerNV platform

2019-04-16 Thread Hari Bathini
Firmware-Assisted Dump (FADump) is currently supported only on pseries
platform. This patch series adds support for powernv platform too.

The first and third patches refactor the FADump code to make use of common
code across multiple platforms. The fifth patch adds basic FADump support
for powernv platform. Patches seven & eight honour reserved-ranges DT node
while reserving/releasing memory used by FADump. The next patch processes
CPU state data provided by firmware to create and append core notes to the
ELF core file. The tenth patch adds support for preserving crash data for
subsequent boots (useful in cases like petitboot). Patch twelve provides
support to export opalcore. This is to make debugging of failures in OPAL
code easier. The subsequent patch ensures vmcore processing is skipped
when only OPAL core is exported by f/w. The next patch provides option to
release the kernel memory used to export opalcore. The remaining patches
update Firmware-Assisted Dump documentation appropriately.

The patch series is tested with the latest firmware plus the below skiboot
changes for MPIPL support:

https://patchwork.ozlabs.org/project/skiboot/list/?series=102588
("MPIPL support")


Changes in v2:
  * Rebased to latest upstream kernel version.
  * Updated according to latest OPAL changes.
  * Dropped patch seventeen from previous version as the quantum of increase
in robustness due to it doesn't seem worth breaking backward compatibility
for older kernel versions.
---

Hari Bathini (16):
  powerpc/fadump: move internal fadump code to a new file
  powerpc/fadump: Improve fadump documentation
  pseries/fadump: move out platform specific support from generic code
  powerpc/fadump: use FADump instead of fadump for how it is pronounced
  powerpc/fadump: enable fadump support on OPAL based POWER platform
  powerpc/fadump: Update documentation about OPAL platform support
  powerpc/fadump: consider reserved ranges while reserving memory
  powerpc/fadump: consider reserved ranges while releasing memory
  powernv/fadump: process architected register state data provided by 
firmware
  powernv/fadump: add support to preserve crash data on FADUMP disabled 
kernel
  powerpc/fadump: update documentation about CONFIG_PRESERVE_FA_DUMP
  powerpc/powernv: export /proc/opalcore for analysing opal crashes
  powernv/fadump: Skip processing /proc/vmcore when only OPAL core exists
  powernv/opalcore: provide an option to invalidate /proc/opalcore file
  powernv/fadump: consider f/w load area
  powernv/fadump: update documentation about option to release opalcore


 Documentation/powerpc/firmware-assisted-dump.txt |  193 ++--
 arch/powerpc/Kconfig |   23 
 arch/powerpc/include/asm/fadump.h|  190 
 arch/powerpc/include/asm/opal-api.h  |   58 +
 arch/powerpc/include/asm/opal.h  |1 
 arch/powerpc/kernel/Makefile |6 
 arch/powerpc/kernel/fadump-common.c  |  205 
 arch/powerpc/kernel/fadump-common.h  |  222 
 arch/powerpc/kernel/fadump.c | 1163 --
 arch/powerpc/kernel/prom.c   |4 
 arch/powerpc/platforms/powernv/Makefile  |3 
 arch/powerpc/platforms/powernv/opal-call.c   |1 
 arch/powerpc/platforms/powernv/opal-core.c   |  602 +++
 arch/powerpc/platforms/powernv/opal-fadump.c |  562 +++
 arch/powerpc/platforms/powernv/opal-fadump.h |  116 ++
 arch/powerpc/platforms/pseries/Makefile  |1 
 arch/powerpc/platforms/pseries/rtas-fadump.c |  534 ++
 arch/powerpc/platforms/pseries/rtas-fadump.h |   96 ++
 18 files changed, 2998 insertions(+), 982 deletions(-)
 create mode 100644 arch/powerpc/kernel/fadump-common.c
 create mode 100644 arch/powerpc/kernel/fadump-common.h
 create mode 100644 arch/powerpc/platforms/powernv/opal-core.c
 create mode 100644 arch/powerpc/platforms/powernv/opal-fadump.c
 create mode 100644 arch/powerpc/platforms/powernv/opal-fadump.h
 create mode 100644 arch/powerpc/platforms/pseries/rtas-fadump.c
 create mode 100644 arch/powerpc/platforms/pseries/rtas-fadump.h



[PATCH v2 01/16] powerpc/fadump: move internal fadump code to a new file

2019-04-16 Thread Hari Bathini
Refactoring fadump code means internal fadump code is referenced from
different places. For ease, move internal code to a new file.

Signed-off-by: Hari Bathini 
---

Changes in v2:
* Using fadump-common.* instead of fadump_internal.*


 arch/powerpc/include/asm/fadump.h   |  112 
 arch/powerpc/kernel/Makefile|2 
 arch/powerpc/kernel/fadump-common.c |  184 +
 arch/powerpc/kernel/fadump-common.h |  126 +++
 arch/powerpc/kernel/fadump.c|  194 ++-
 5 files changed, 324 insertions(+), 294 deletions(-)
 create mode 100644 arch/powerpc/kernel/fadump-common.c
 create mode 100644 arch/powerpc/kernel/fadump-common.h

diff --git a/arch/powerpc/include/asm/fadump.h 
b/arch/powerpc/include/asm/fadump.h
index 188776b..028a8ef 100644
--- a/arch/powerpc/include/asm/fadump.h
+++ b/arch/powerpc/include/asm/fadump.h
@@ -24,34 +24,6 @@
 
 #ifdef CONFIG_FA_DUMP
 
-/*
- * The RMA region will be saved for later dumping when kernel crashes.
- * RMA is Real Mode Area, the first block of logical memory address owned
- * by logical partition, containing the storage that may be accessed with
- * translate off.
- */
-#define RMA_START  0x0
-#define RMA_END(ppc64_rma_size)
-
-/*
- * On some Power systems where RMO is 128MB, it still requires minimum of
- * 256MB for kernel to boot successfully. When kdump infrastructure is
- * configured to save vmcore over network, we run into OOM issue while
- * loading modules related to network setup. Hence we need aditional 64M
- * of memory to avoid OOM issue.
- */
-#define MIN_BOOT_MEM   (((RMA_END < (0x1UL << 28)) ? (0x1UL << 28) : RMA_END) \
-   + (0x1UL << 26))
-
-/* The upper limit percentage for user specified boot memory size (25%) */
-#define MAX_BOOT_MEM_RATIO 4
-
-#define memblock_num_regions(memblock_type)(memblock.memblock_type.cnt)
-
-/* Alignement per CMA requirement. */
-#define FADUMP_CMA_ALIGNMENT   (PAGE_SIZE <<   \
-   max_t(unsigned long, MAX_ORDER - 1, pageblock_order))
-
 /* Firmware provided dump sections */
 #define FADUMP_CPU_STATE_DATA  0x0001
 #define FADUMP_HPTE_REGION 0x0002
@@ -60,18 +32,9 @@
 /* Dump request flag */
 #define FADUMP_REQUEST_FLAG0x0001
 
-/* FAD commands */
-#define FADUMP_REGISTER1
-#define FADUMP_UNREGISTER  2
-#define FADUMP_INVALIDATE  3
-
 /* Dump status flag */
 #define FADUMP_ERROR_FLAG  0x2000
 
-#define FADUMP_CPU_ID_MASK ((1UL << 32) - 1)
-
-#define CPU_UNKNOWN(~((u32)0))
-
 /* Utility macros */
 #define SKIP_TO_NEXT_CPU(reg_entry)\
 ({ \
@@ -125,59 +88,8 @@ struct fadump_mem_struct {
struct fadump_section   rmr_region;
 };
 
-/* Firmware-assisted dump configuration details. */
-struct fw_dump {
-   unsigned long   cpu_state_data_size;
-   unsigned long   hpte_region_size;
-   unsigned long   boot_memory_size;
-   unsigned long   reserve_dump_area_start;
-   unsigned long   reserve_dump_area_size;
-   /* cmd line option during boot */
-   unsigned long   reserve_bootvar;
-
-   unsigned long   fadumphdr_addr;
-   unsigned long   cpu_notes_buf;
-   unsigned long   cpu_notes_buf_size;
-
-   int ibm_configure_kernel_dump;
-
-   unsigned long   fadump_enabled:1;
-   unsigned long   fadump_supported:1;
-   unsigned long   dump_active:1;
-   unsigned long   dump_registered:1;
-   unsigned long   nocma:1;
-};
-
-/*
- * Copy the ascii values for first 8 characters from a string into u64
- * variable at their respective indexes.
- * e.g.
- *  The string "FADMPINF" will be converted into 0x4641444d50494e46
- */
-static inline u64 str_to_u64(const char *str)
-{
-   u64 val = 0;
-   int i;
-
-   for (i = 0; i < sizeof(val); i++)
-   val = (*str) ? (val << 8) | *str++ : val << 8;
-   return val;
-}
-#define STR_TO_HEX(x)  str_to_u64(x)
-#define REG_ID(x)  str_to_u64(x)
-
-#define FADUMP_CRASH_INFO_MAGICSTR_TO_HEX("FADMPINF")
 #define REGSAVE_AREA_MAGIC STR_TO_HEX("REGSAVE")
 
-/* The firmware-assisted dump format.
- *
- * The register save area is an area in the partition's memory used to preserve
- * the register contents (CPU state data) for the active CPUs during a firmware
- * assisted dump. The dump format contains register save area header followed
- * by register entries. Each list of registers for a CPU starts with
- * "CPUSTRT" and ends with "CPUEND".
- */
-
 /* Register save area header. */
 struct fadump_reg_save_area_header {
__be64  magic_number;
@@ -185,29 +97,9 @@ struct fad

[PATCH v2 02/16] powerpc/fadump: Improve fadump documentation

2019-04-16 Thread Hari Bathini
The figures depicting FADump's (Firmware-Assisted Dump) memory layout
are missing some finer details like different memory regions and what
they represent. Improve the documentation by updating those details.

Signed-off-by: Hari Bathini 
---
 Documentation/powerpc/firmware-assisted-dump.txt |   65 --
 1 file changed, 35 insertions(+), 30 deletions(-)

diff --git a/Documentation/powerpc/firmware-assisted-dump.txt 
b/Documentation/powerpc/firmware-assisted-dump.txt
index 18c5fee..059993b 100644
--- a/Documentation/powerpc/firmware-assisted-dump.txt
+++ b/Documentation/powerpc/firmware-assisted-dump.txt
@@ -74,8 +74,9 @@ as follows:
there is crash data available from a previous boot. During
the early boot OS will reserve rest of the memory above
boot memory size effectively booting with restricted memory
-   size. This will make sure that the second kernel will not
-   touch any of the dump memory area.
+   size. This will make sure that this kernel (also, referred
+   to as second kernel or capture kernel) will not touch any
+   of the dump memory area.
 
 -- User-space tools will read /proc/vmcore to obtain the contents
of memory, which holds the previous crashed kernel dump in ELF
@@ -125,48 +126,52 @@ space memory except the user pages that were present in 
CMA region.
 
   o Memory Reservation during first kernel
 
-  Low memory Top of memory
-  0  boot memory size   |
-  |   ||<--Reserved dump area -->|  |
-  V   V|   Permanent Reservation |  V
-  +---+--/ /---+---++---++--+
-  |   ||CPU|HPTE|  DUMP |ELF |  |
-  +---+--/ /---+---++---++--+
-|   ^
-|   |
-\   /
- ---
-  Boot memory content gets transferred to
-  reserved area by firmware at the time of
-  crash
+  Low memoryTop of memory
+  0  boot memory size  |<--Reserved dump area --->|  |
+  |   ||   Permanent Reservation  |  |
+  V   V|   (Preserve area)|  V
+  +---+--/ /---+---+++---++--+
+  |   ||CPU|HPTE|  DUMP  |HDR|ELF |  |
+  +---+--/ /---+---+++---++--+
+|   ^  ^
+|   |  |
+\   /  |
+ --- FADump Header
+  Boot memory content gets transferred   (meta area)
+  to reserved area by firmware at the
+  time of crash
+
Fig. 1
 
+
   o Memory Reservation during second kernel after crash
 
-  Low memoryTop of memory
-  0  boot memory size   |
-  |   |<- Reserved dump area --- -->|
-  V   V V
-  +---+--/ /---+---++---++--+
-  |   ||CPU|HPTE|  DUMP |ELF |  |
-  +---+--/ /---+---++---++--+
+  Low memoryTop of memory
+  0  boot memory size|
+  |   |<- Reserved dump area --->|
+  V   V|< Preserve area ->|  V
+  +---+--/ /---+---+++---++--+
+  |   ||CPU|HPTE|  DUMP  |HDR|ELF |  |
+  +---+--/ /---+---+++---++--+
 |  |
 V  V
Used by second/proc/vmcore
kernel to boot
Fig. 2
 
-Currently the dump will be copied from /proc/vmcore to a
-a new file upon user intervention. The dump data available through
-/proc/vmcore will be in ELF format. Hence the existing kdump
-infrastructure (kdump scripts) to save the dump works fine with
-minor modifications.
+Currently the dump will be copied from /proc/vmcore to a new file upon
+user intervention. The dump data available through /proc/vmcore will be
+in ELF format. Hence the existing kdump infrastructure (kdump scripts)
+to save the dump works fine with minor modifications. KDump scripts on
+major Distro releases have already been modified to work seamlessly (no
+user intervention in s

[PATCH v2 03/16] pseries/fadump: move out platform specific support from generic code

2019-04-16 Thread Hari Bathini
Introduce callbacks for platform specific operations like register,
unregister, invalidate & such, and move pseries specific code into
platform code.

Signed-off-by: Hari Bathini 
---

Changes in v2:
* pSeries specific fadump code files are named rtas-fadump.*
  instead of pseries_fadump.*


 arch/powerpc/include/asm/fadump.h|   75 
 arch/powerpc/kernel/fadump-common.h  |   39 ++
 arch/powerpc/kernel/fadump.c |  501 ++--
 arch/powerpc/platforms/pseries/Makefile  |1 
 arch/powerpc/platforms/pseries/rtas-fadump.c |  538 ++
 arch/powerpc/platforms/pseries/rtas-fadump.h |   96 +
 6 files changed, 711 insertions(+), 539 deletions(-)
 create mode 100644 arch/powerpc/platforms/pseries/rtas-fadump.c
 create mode 100644 arch/powerpc/platforms/pseries/rtas-fadump.h

diff --git a/arch/powerpc/include/asm/fadump.h 
b/arch/powerpc/include/asm/fadump.h
index 028a8ef..d27cde7 100644
--- a/arch/powerpc/include/asm/fadump.h
+++ b/arch/powerpc/include/asm/fadump.h
@@ -24,79 +24,8 @@
 
 #ifdef CONFIG_FA_DUMP
 
-/* Firmware provided dump sections */
-#define FADUMP_CPU_STATE_DATA  0x0001
-#define FADUMP_HPTE_REGION 0x0002
-#define FADUMP_REAL_MODE_REGION0x0011
-
-/* Dump request flag */
-#define FADUMP_REQUEST_FLAG0x0001
-
-/* Dump status flag */
-#define FADUMP_ERROR_FLAG  0x2000
-
-/* Utility macros */
-#define SKIP_TO_NEXT_CPU(reg_entry)\
-({ \
-   while (be64_to_cpu(reg_entry->reg_id) != REG_ID("CPUEND"))  \
-   reg_entry++;\
-   reg_entry++;\
-})
-
 extern int crashing_cpu;
 
-/* Kernel Dump section info */
-struct fadump_section {
-   __be32  request_flag;
-   __be16  source_data_type;
-   __be16  error_flags;
-   __be64  source_address;
-   __be64  source_len;
-   __be64  bytes_dumped;
-   __be64  destination_address;
-};
-
-/* ibm,configure-kernel-dump header. */
-struct fadump_section_header {
-   __be32  dump_format_version;
-   __be16  dump_num_sections;
-   __be16  dump_status_flag;
-   __be32  offset_first_dump_section;
-
-   /* Fields for disk dump option. */
-   __be32  dd_block_size;
-   __be64  dd_block_offset;
-   __be64  dd_num_blocks;
-   __be32  dd_offset_disk_path;
-
-   /* Maximum time allowed to prevent an automatic dump-reboot. */
-   __be32  max_time_auto;
-};
-
-/*
- * Firmware Assisted dump memory structure. This structure is required for
- * registering future kernel dump with power firmware through rtas call.
- *
- * No disk dump option. Hence disk dump path string section is not included.
- */
-struct fadump_mem_struct {
-   struct fadump_section_headerheader;
-
-   /* Kernel dump sections */
-   struct fadump_section   cpu_state_data;
-   struct fadump_section   hpte_region;
-   struct fadump_section   rmr_region;
-};
-
-#define REGSAVE_AREA_MAGIC STR_TO_HEX("REGSAVE")
-
-/* Register save area header. */
-struct fadump_reg_save_area_header {
-   __be64  magic_number;
-   __be32  version;
-   __be32  num_cpu_offset;
-};
-
 extern int is_fadump_memory_area(u64 addr, ulong size);
 extern int early_init_dt_scan_fw_dump(unsigned long node, const char *uname,
  int depth, void *data);
@@ -111,5 +40,5 @@ extern void fadump_cleanup(void);
 static inline int is_fadump_active(void) { return 0; }
 static inline int should_fadump_crash(void) { return 0; }
 static inline void crash_fadump(struct pt_regs *regs, const char *str) { }
-#endif
-#endif
+#endif /* !CONFIG_FA_DUMP */
+#endif /* __PPC64_FA_DUMP_H__ */
diff --git a/arch/powerpc/kernel/fadump-common.h 
b/arch/powerpc/kernel/fadump-common.h
index 8ccd96d..f926145 100644
--- a/arch/powerpc/kernel/fadump-common.h
+++ b/arch/powerpc/kernel/fadump-common.h
@@ -47,6 +47,12 @@
 #define FADUMP_UNREGISTER  2
 #define FADUMP_INVALIDATE  3
 
+/* Firmware-Assisted Dump platforms */
+enum fadump_platform_type {
+   FADUMP_PLATFORM_UNKNOWN = 0,
+   FADUMP_PLATFORM_PSERIES,
+};
+
 #define FADUMP_CPU_ID_MASK ((1UL << 32) - 1)
 
 #define CPU_UNKNOWN(~((u32)0))
@@ -91,6 +97,9 @@ struct fad_crash_memory_ranges {
unsigned long long  size;
 };
 
+/* Platform specific callback functions */
+struct fadump_ops;
+
 /* Firmware-assisted dump configuration details. */
 struct fw_dump {
unsigned long   cpu_state_data_size;
@@ -98,6 +107,8 @@ struct fw_dump {
unsigned long   boot_memory_size;
unsigned long   reserve_dump_area_start;
unsigned long   reserve_dump_area_size;
+   unsigned long   meta_

[PATCH v2 04/16] powerpc/fadump: use FADump instead of fadump for how it is pronounced

2019-04-16 Thread Hari Bathini
Signed-off-by: Hari Bathini 
---
 Documentation/powerpc/firmware-assisted-dump.txt |   56 +++---
 1 file changed, 28 insertions(+), 28 deletions(-)

diff --git a/Documentation/powerpc/firmware-assisted-dump.txt 
b/Documentation/powerpc/firmware-assisted-dump.txt
index 059993b..62e75ef 100644
--- a/Documentation/powerpc/firmware-assisted-dump.txt
+++ b/Documentation/powerpc/firmware-assisted-dump.txt
@@ -8,18 +8,18 @@ a crashed system, and to do so from a fully-reset system, and
 to minimize the total elapsed time until the system is back
 in production use.
 
-- Firmware assisted dump (fadump) infrastructure is intended to replace
+- Firmware-Assisted Dump (FADump) infrastructure is intended to replace
   the existing phyp assisted dump.
 - Fadump uses the same firmware interfaces and memory reservation model
   as phyp assisted dump.
-- Unlike phyp dump, fadump exports the memory dump through /proc/vmcore
+- Unlike phyp dump, FADump exports the memory dump through /proc/vmcore
   in the ELF format in the same way as kdump. This helps us reuse the
   kdump infrastructure for dump capture and filtering.
 - Unlike phyp dump, userspace tool does not need to refer any sysfs
   interface while reading /proc/vmcore.
-- Unlike phyp dump, fadump allows user to release all the memory reserved
+- Unlike phyp dump, FADump allows user to release all the memory reserved
   for dump, with a single operation of echo 1 > /sys/kernel/fadump_release_mem.
-- Once enabled through kernel boot parameter, fadump can be
+- Once enabled through kernel boot parameter, FADump can be
   started/stopped through /sys/kernel/fadump_registered interface (see
   sysfs files section below) and can be easily integrated with kdump
   service start/stop init scripts.
@@ -33,7 +33,7 @@ dump offers several strong, practical advantages:
in a clean, consistent state.
 -- Once the dump is copied out, the memory that held the dump
is immediately available to the running kernel. And therefore,
-   unlike kdump, fadump doesn't need a 2nd reboot to get back
+   unlike kdump, FADump doesn't need a 2nd reboot to get back
the system to the production configuration.
 
 The above can only be accomplished by coordination with,
@@ -61,7 +61,7 @@ as follows:
  boot successfully. For syntax of crashkernel= parameter,
  refer to Documentation/kdump/kdump.txt. If any offset is
  provided in crashkernel= parameter, it will be ignored
- as fadump uses a predefined offset to reserve memory
+ as FADump uses a predefined offset to reserve memory
  for boot memory dump preservation in case of a crash.
 
 -- After the low memory (boot memory) area has been saved, the
@@ -120,7 +120,7 @@ blocking this significant chunk of memory from production 
kernel.
 Hence, the implementation uses the Linux kernel's Contiguous Memory
 Allocator (CMA) for memory reservation if CMA is configured for kernel.
 With CMA reservation this memory will be available for applications to
-use it, while kernel is prevented from using it. With this fadump will
+use it, while kernel is prevented from using it. With this FADump will
 still be able to capture all of the kernel memory and most of the user
 space memory except the user pages that were present in CMA region.
 
@@ -170,14 +170,14 @@ KDump, as dump mechanism.
 The tools to examine the dump will be same as the ones
 used for kdump.
 
-How to enable firmware-assisted dump (fadump):
+How to enable firmware-assisted dump (FADump):
 -
 
 1. Set config option CONFIG_FA_DUMP=y and build kernel.
-2. Boot into linux kernel with 'fadump=on' kernel cmdline option.
-   By default, fadump reserved memory will be initialized as CMA area.
-   Alternatively, user can boot linux kernel with 'fadump=nocma' to
-   prevent fadump to use CMA.
+2. Boot into linux kernel with 'fadump=on' kernel cmdline option.
+   By default, FADump reserved memory will be initialized as CMA area.
+   Alternatively, user can boot linux kernel with 'fadump=nocma' to
+   prevent FADump to use CMA.
 3. Optionally, user can also set 'crashkernel=' kernel cmdline
to specify size of the memory to reserve for boot memory dump
preservation.
@@ -190,7 +190,7 @@ NOTE: 1. 'fadump_reserve_mem=' parameter has been 
deprecated. Instead
  option is set at kernel cmdline.
   3. if user wants to capture all of user space memory and ok with
  reserved memory not available to production system, then
- 'fadump=nocma' kernel parameter can be used to fallback to
+ 'fadump=nocma' kernel parameter can be used to fallback to
  old behaviour.
 
 Sysfs/debugfs files:
@@ -203,29 +203,29 @@ Here is the list of files under kernel sysfs:
 
  /sys/kernel/fadump_enabled
 
-This is used to display the fadump status.
-0 = fadump is disa

[PATCH v2 05/16] powerpc/fadump: enable fadump support on OPAL based POWER platform

2019-04-16 Thread Hari Bathini
From: Hari Bathini 

Firmware-assisted dump support is enabled for OPAL based POWER platforms
in P9 firmware. Make the corresponding updates in kernel to enable fadump
support for such platforms.

Signed-off-by: Hari Bathini 
---

Changes in v2:
* Updated API number for FADump according to recent OPAL changes


 arch/powerpc/Kconfig |5 
 arch/powerpc/include/asm/opal-api.h  |   35 ++
 arch/powerpc/include/asm/opal.h  |1 
 arch/powerpc/kernel/fadump-common.c  |   27 ++
 arch/powerpc/kernel/fadump-common.h  |   44 ++-
 arch/powerpc/kernel/fadump.c |  259 ++
 arch/powerpc/platforms/powernv/Makefile  |1 
 arch/powerpc/platforms/powernv/opal-call.c   |1 
 arch/powerpc/platforms/powernv/opal-fadump.c |  375 ++
 arch/powerpc/platforms/powernv/opal-fadump.h |   40 +++
 arch/powerpc/platforms/pseries/rtas-fadump.c |   18 -
 11 files changed, 716 insertions(+), 90 deletions(-)
 create mode 100644 arch/powerpc/platforms/powernv/opal-fadump.c
 create mode 100644 arch/powerpc/platforms/powernv/opal-fadump.h

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 2d0be82..2366a84 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -556,7 +556,7 @@ config CRASH_DUMP
 
 config FA_DUMP
bool "Firmware-assisted dump"
-   depends on PPC64 && PPC_RTAS
+   depends on PPC64 && (PPC_RTAS || PPC_POWERNV)
select CRASH_CORE
select CRASH_DUMP
help
@@ -567,7 +567,8 @@ config FA_DUMP
  is meant to be a kdump replacement offering robustness and
  speed not possible without system firmware assistance.
 
- If unsure, say "N"
+ If unsure, say "y". Only special kernels like petitboot may
+ need to say "N" here.
 
 config IRQ_ALL_CPUS
bool "Distribute interrupts on all CPUs by default"
diff --git a/arch/powerpc/include/asm/opal-api.h 
b/arch/powerpc/include/asm/opal-api.h
index 870fb7b..75471c2 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -210,7 +210,8 @@
 #define OPAL_PCI_GET_PBCQ_TUNNEL_BAR   164
 #define OPAL_PCI_SET_PBCQ_TUNNEL_BAR   165
 #define OPAL_NX_COPROC_INIT 167
-#define OPAL_LAST  167
+#define OPAL_CONFIGURE_FADUMP  173
+#define OPAL_LAST  173
 
 #define QUIESCE_HOLD   1 /* Spin all calls at entry */
 #define QUIESCE_REJECT 2 /* Fail all calls with OPAL_BUSY */
@@ -972,6 +973,37 @@ struct opal_sg_list {
 };
 
 /*
+ * Firmware-Assisted Dump (FADump)
+ */
+
+/* The maximum number of dump sections supported by OPAL */
+#define OPAL_FADUMP_NR_SECTIONS 64
+
+/* Kernel Dump section info */
+struct opal_fadump_section {
+   u8  src_type;
+   u8  reserved[7];
+   __be64  src_addr;
+   __be64  src_size;
+   __be64  dest_addr;
+   __be64  dest_size;
+};
+
+/*
+ * FADump memory structure for registering dump support with
+ * POWER f/w through opal call.
+ */
+struct opal_fadump_mem_struct {
+
+   __be16  section_size;   /*sizeof(struct fadump_section) */
+   __be16  section_count;  /* number of sections */
+   __be32  crashing_cpu;   /* Thread on which OPAL crashed */
+   __be64  reserved;
+
+   struct opal_fadump_section  section[OPAL_FADUMP_NR_SECTIONS];
+};
+
+/*
  * Dump region ID range usable by the OS
  */
 #define OPAL_DUMP_REGION_HOST_START0x80
@@ -1051,6 +1083,7 @@ enum {
OPAL_REBOOT_NORMAL  = 0,
OPAL_REBOOT_PLATFORM_ERROR  = 1,
OPAL_REBOOT_FULL_IPL= 2,
+   OPAL_REBOOT_OS_ERROR= 3,
 };
 
 /* Argument to OPAL_PCI_TCE_KILL */
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index a55b01c..2123b3f 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -43,6 +43,7 @@ int64_t opal_npu_spa_clear_cache(uint64_t phb_id, uint32_t 
bdfn,
uint64_t PE_handle);
 int64_t opal_npu_tl_set(uint64_t phb_id, uint32_t bdfn, long cap,
uint64_t rate_phys, uint32_t size);
+int64_t opal_configure_fadump(uint64_t command, void *data, uint64_t 
data_size);
 int64_t opal_console_write(int64_t term_number, __be64 *length,
   const uint8_t *buffer);
 int64_t opal_console_read(int64_t term_number, __be64 *length,
diff --git a/arch/powerpc/kernel/fadump-common.c 
b/arch/powerpc/kernel/fadump-common.c
index 0182886..514bbb5 100644
--- a/arch/powerpc/kernel/fadump-common.c
+++ b/arch/powerpc/kernel/fadump-common.c
@@ -10,6 +10,9 @@
  * 2 of the License, or (at your option) any later version.
  */
 
+#undef DEBUG
+#define pr_fmt(fmt) "f

[PATCH v2 06/16] powerpc/fadump: Update documentation about OPAL platform support

2019-04-16 Thread Hari Bathini
With FADump support now available on both pseries and OPAL platforms,
update FADump documentation with these details.

Signed-off-by: Hari Bathini 
---
 Documentation/powerpc/firmware-assisted-dump.txt |   90 --
 1 file changed, 51 insertions(+), 39 deletions(-)

diff --git a/Documentation/powerpc/firmware-assisted-dump.txt 
b/Documentation/powerpc/firmware-assisted-dump.txt
index 62e75ef..844a229 100644
--- a/Documentation/powerpc/firmware-assisted-dump.txt
+++ b/Documentation/powerpc/firmware-assisted-dump.txt
@@ -70,7 +70,8 @@ as follows:
normal.
 
 -- The freshly booted kernel will notice that there is a new
-   node (ibm,dump-kernel) in the device tree, indicating that
+   node (ibm,dump-kernel on PSeries or ibm,opal/dump/result-table
+   on OPAL platform) in the device tree, indicating that
there is crash data available from a previous boot. During
the early boot OS will reserve rest of the memory above
boot memory size effectively booting with restricted memory
@@ -93,7 +94,9 @@ as follows:
 
 Please note that the firmware-assisted dump feature
 is only available on Power6 and above systems with recent
-firmware versions.
+firmware versions on PSeries (PowerVM) platform and Power9
+and above systems with recent firmware versions on PowerNV
+(OPAL) platform.
 
 Implementation details:
 --
@@ -108,57 +111,66 @@ that are run. If there is dump data, then the
 /sys/kernel/fadump_release_mem file is created, and the reserved
 memory is held.
 
-If there is no waiting dump data, then only the memory required
-to hold CPU state, HPTE region, boot memory dump and elfcore
-header, is usually reserved at an offset greater than boot memory
-size (see Fig. 1). This area is *not* released: this region will
-be kept permanently reserved, so that it can act as a receptacle
-for a copy of the boot memory content in addition to CPU state
-and HPTE region, in the case a crash does occur. Since this reserved
-memory area is used only after the system crash, there is no point in
-blocking this significant chunk of memory from production kernel.
-Hence, the implementation uses the Linux kernel's Contiguous Memory
-Allocator (CMA) for memory reservation if CMA is configured for kernel.
-With CMA reservation this memory will be available for applications to
-use it, while kernel is prevented from using it. With this FADump will
-still be able to capture all of the kernel memory and most of the user
-space memory except the user pages that were present in CMA region.
+If there is no waiting dump data, then only the memory required to
+hold CPU state, HPTE region, boot memory dump, FADump header and
+elfcore header, is usually reserved at an offset greater than boot
+memory size (see Fig. 1). This area is *not* released: this region
+will be kept permanently reserved, so that it can act as a receptacle
+for a copy of the boot memory content in addition to CPU state and
+HPTE region, in the case a crash does occur.
+
+Since this reserved memory area is used only after the system crash,
+there is no point in blocking this significant chunk of memory from
+production kernel. Hence, the implementation uses the Linux kernel's
+Contiguous Memory Allocator (CMA) for memory reservation if CMA is
+configured for kernel. With CMA reservation this memory will be
+available for applications to use it, while kernel is prevented from
+using it. With this FADump will still be able to capture all of the
+kernel memory and most of the user space memory except the user pages
+that were present in CMA region.
 
   o Memory Reservation during first kernel
 
-  Low memoryTop of memory
-  0  boot memory size  |<--Reserved dump area --->|  |
-  |   ||   Permanent Reservation  |  |
-  V   V|   (Preserve area)|  V
-  +---+--/ /---+---+++---++--+
-  |   ||CPU|HPTE|  DUMP  |HDR|ELF |  |
-  +---+--/ /---+---+++---++--+
-|   ^  ^
-|   |  |
-\   /  |
- --- FADump Header
-  Boot memory content gets transferred   (meta area)
-  to reserved area by firmware at the
-  time of crash
-
+  Low memory Top of memory
+  0  boot memory size|<--- Reserved dump area --->|   |
+  |   |  |Permanent Reservatio|   |
+  V   V  |   (Preserve area)  |   V
+  +---+/ /---+---++---+-+-+---+
+  |   |  |///|/

[PATCH v2 07/16] powerpc/fadump: consider reserved ranges while reserving memory

2019-04-16 Thread Hari Bathini
Commit 0962e8004e97 ("powerpc/prom: Scan reserved-ranges node for
memory reservations") enabled support to parse reserved-ranges DT
node and reserve kernel memory falling in these ranges for F/W
purposes. Ensure memory in these ranges is not overlapped with
memory reserved for FADump.

Also, when an attempt to reserve memory for FADump fails due to
memory holes and/or reserved ranges, skip ahead by a smaller fixed
offset, instead of by the size of the memory to be reserved, before
making another attempt. This reduces the likelihood of memory
reservation failure.

Signed-off-by: Hari Bathini 
---
 arch/powerpc/kernel/fadump-common.h |   11 +++
 arch/powerpc/kernel/fadump.c|  137 ++-
 2 files changed, 145 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/fadump-common.h 
b/arch/powerpc/kernel/fadump-common.h
index 8ad98db..ff764d4 100644
--- a/arch/powerpc/kernel/fadump-common.h
+++ b/arch/powerpc/kernel/fadump-common.h
@@ -101,6 +101,17 @@ struct fadump_memory_range {
unsigned long long  size;
 };
 
+/*
+ * Amount of memory (1024MB) to skip before making another attempt at
+ * reserving memory (after the previous attempt to reserve memory for
+ * FADump failed due to memory holes and/or reserved ranges) to reduce
+ * the likelihood of memory reservation failure.
+ */
+#define OFFSET_SIZE 0x40000000U
+
+/* Maximum no. of reserved ranges supported for processing. */
+#define MAX_RESERVED_RANGES 128
+
 /* Maximum no. of real memory regions supported by the kernel */
 #define MAX_REAL_MEM_REGIONS   8
 
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 913ab6e..39b6670 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -53,6 +53,9 @@ int crash_memory_ranges_size;
 int crash_mem_ranges;
 int max_crash_mem_ranges;
 
+struct fadump_memory_range reserved_ranges[MAX_RESERVED_RANGES];
+int reserved_ranges_cnt;
+
 #ifdef CONFIG_CMA
 static struct cma *fadump_cma;
 
@@ -116,12 +119,116 @@ int __init fadump_cma_init(void)
 static int __init fadump_cma_init(void) { return 1; }
 #endif /* CONFIG_CMA */
 
+/*
+ * Sort the reserved ranges in-place and merge adjacent ranges
+ * to minimize the reserved ranges count.
+ */
+static void __init sort_and_merge_reserved_ranges(void)
+{
+   unsigned long long base, size;
+   struct fadump_memory_range tmp_range;
+   int i, j, idx;
+
+   if (!reserved_ranges_cnt)
+   return;
+
+   /* Sort the reserved ranges */
+   for (i = 0; i < reserved_ranges_cnt; i++) {
+   idx = i;
+   for (j = i + 1; j < reserved_ranges_cnt; j++) {
+   if (reserved_ranges[idx].base > reserved_ranges[j].base)
+   idx = j;
+   }
+   if (idx != i) {
+   tmp_range = reserved_ranges[idx];
+   reserved_ranges[idx] = reserved_ranges[i];
+   reserved_ranges[i] = tmp_range;
+   }
+   }
+
+   /* Merge adjacent reserved ranges */
+   idx = 0;
+   for (i = 1; i < reserved_ranges_cnt; i++) {
+   base = reserved_ranges[i-1].base;
+   size = reserved_ranges[i-1].size;
+   if (reserved_ranges[i].base == (base + size))
+   reserved_ranges[idx].size += reserved_ranges[i].size;
+   else {
+   idx++;
+   if (i == idx)
+   continue;
+
+   reserved_ranges[idx] = reserved_ranges[i];
+   }
+   }
+   reserved_ranges_cnt = idx + 1;
+}
+
+static int __init add_reserved_range(unsigned long base,
+unsigned long size)
+{
+   int i;
+
+   if (reserved_ranges_cnt == MAX_RESERVED_RANGES) {
+   /* Compact reserved ranges and try again. */
+   sort_and_merge_reserved_ranges();
+   if (reserved_ranges_cnt == MAX_RESERVED_RANGES)
+   return 0;
+   }
+
+   i = reserved_ranges_cnt++;
+   reserved_ranges[i].base = base;
+   reserved_ranges[i].size = size;
+   return 1;
+}
+
+/*
+ * Scan reserved-ranges to consider them while reserving/releasing
+ * memory for FADump.
+ */
+static void __init early_init_dt_scan_reserved_ranges(unsigned long node)
+{
+   int len, ret;
+   unsigned long i;
+   const __be32 *prop;
+
+   /* reserved-ranges already scanned */
+   if (reserved_ranges_cnt != 0)
+   return;
+
+   prop = of_get_flat_dt_prop(node, "reserved-ranges", &len);
+
+   if (!prop)
+   return;
+
+   /*
+* Each reserved range is an (address,size) pair, 2 cells each,
+* totalling 4 cells per range.
+*/
+   for (i =

[PATCH v2 08/16] powerpc/fadump: consider reserved ranges while releasing memory

2019-04-16 Thread Hari Bathini
Commit 0962e8004e97 ("powerpc/prom: Scan reserved-ranges node for
memory reservations") enabled support to parse 'reserved-ranges' DT
node to reserve kernel memory falling in these ranges for firmware
purposes. Along with the preserved area memory, also ensure memory
in reserved ranges is not overlapped with memory released by capture
kernel after saving vmcore. Also, fix the off-by-one error in
fadump_release_reserved_area function while releasing memory.

Signed-off-by: Hari Bathini 
---
 arch/powerpc/kernel/fadump.c |   59 +-
 1 file changed, 41 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 39b6670..fd06571 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -123,7 +123,7 @@ static int __init fadump_cma_init(void) { return 1; }
  * Sort the reserved ranges in-place and merge adjacent ranges
  * to minimize the reserved ranges count.
  */
-static void __init sort_and_merge_reserved_ranges(void)
+static void sort_and_merge_reserved_ranges(void)
 {
unsigned long long base, size;
struct fadump_memory_range tmp_range;
@@ -164,8 +164,7 @@ static void __init sort_and_merge_reserved_ranges(void)
reserved_ranges_cnt = idx + 1;
 }
 
-static int __init add_reserved_range(unsigned long base,
-unsigned long size)
+static int add_reserved_range(unsigned long base, unsigned long size)
 {
int i;
 
@@ -1126,33 +1125,57 @@ static void fadump_release_reserved_area(unsigned long 
start, unsigned long end)
if (tend == end_pfn)
break;
 
-   start_pfn = tend + 1;
+   start_pfn = tend;
}
}
 }
 
 /*
- * Release the memory that was reserved in early boot to preserve the memory
- * contents. The released memory will be available for general use.
+ * Release the memory that was reserved during early boot to preserve the
+ * crash'ed kernel's memory contents except reserved dump area (permanent
+ * reservation) and reserved ranges used by F/W. The released memory will
+ * be available for general use.
  */
 static void fadump_release_memory(unsigned long begin, unsigned long end)
 {
+   int i;
unsigned long ra_start, ra_end;
-
-   ra_start = fw_dump.reserve_dump_area_start;
-   ra_end = ra_start + fw_dump.reserve_dump_area_size;
+   unsigned long tstart;
 
/*
-* exclude the dump reserve area. Will reuse it for next
-* fadump registration.
+* Add memory to permanently preserve to reserved ranges list
+* and exclude all these ranges while releasing memory.
 */
-   if (begin < ra_end && end > ra_start) {
-   if (begin < ra_start)
-   fadump_release_reserved_area(begin, ra_start);
-   if (end > ra_end)
-   fadump_release_reserved_area(ra_end, end);
-   } else
-   fadump_release_reserved_area(begin, end);
+   i = add_reserved_range(fw_dump.reserve_dump_area_start,
+  fw_dump.reserve_dump_area_size);
+   if (i == 0) {
+   /*
+* Reached the MAX reserved ranges count. To ensure reserved
+* dump area is excluded (as it will be reused for next
+* FADump registration), ignore the last reserved range and
+* add reserved dump area instead.
+*/
+   reserved_ranges_cnt--;
+   add_reserved_range(fw_dump.reserve_dump_area_start,
+  fw_dump.reserve_dump_area_size);
+   }
+   sort_and_merge_reserved_ranges();
+
+   tstart = begin;
+   for (i = 0; i < reserved_ranges_cnt; i++) {
+   ra_start = reserved_ranges[i].base;
+   ra_end = ra_start + reserved_ranges[i].size;
+
+   if (tstart >= ra_end)
+   continue;
+
+   if (tstart < ra_start)
+   fadump_release_reserved_area(tstart, ra_start);
+   tstart = ra_end;
+   }
+
+   if (tstart < end)
+   fadump_release_reserved_area(tstart, end);
 }
 
 static void fadump_invalidate_release_mem(void)



[PATCH v2 09/16] powernv/fadump: process architected register state data provided by firmware

2019-04-16 Thread Hari Bathini
From: Hari Bathini 

Firmware provides architected register state data at the time of crash.
Process this data and build CPU notes to append to ELF core.

Signed-off-by: Hari Bathini 
Signed-off-by: Vasant Hegde 
---

Changes in v2:
* Updated reg type values according to recent OPAL changes


 arch/powerpc/include/asm/opal-api.h  |   23 +++
 arch/powerpc/kernel/fadump-common.h  |3 
 arch/powerpc/platforms/powernv/opal-fadump.c |  187 --
 arch/powerpc/platforms/powernv/opal-fadump.h |4 +
 4 files changed, 206 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/include/asm/opal-api.h 
b/arch/powerpc/include/asm/opal-api.h
index 75471c2..91f2735 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -976,6 +976,29 @@ struct opal_sg_list {
  * Firmware-Assisted Dump (FADump)
  */
 
+/* FADump thread header for register entries */
+struct opal_fadump_thread_hdr {
+   __be32  pir;
+   /* 0x00 - 0x0F - The corresponding stop state of the core */
+   u8  core_state;
+   u8  reserved[3];
+
+   __be32  offset; /* Offset to Register Entries array */
+   __be32  ecnt;   /* Number of entries */
+   __be32  esize;  /* Alloc size of each array entry in bytes */
+   __be32  eactsz; /* Actual size of each array entry in bytes */
+} __packed;
+
+#define OPAL_REG_TYPE_GPR  0x01
+#define OPAL_REG_TYPE_SPR  0x02
+
+/* FADump register entry. */
+struct opal_fadump_reg_entry {
+   __be32  reg_type;
+   __be32  reg_num;
+   __be64  reg_val;
+};
+
 /* The maximum number of dump sections supported by OPAL */
 #define OPAL_FADUMP_NR_SECTIONS 64
 
diff --git a/arch/powerpc/kernel/fadump-common.h 
b/arch/powerpc/kernel/fadump-common.h
index ff764d4..8d47382 100644
--- a/arch/powerpc/kernel/fadump-common.h
+++ b/arch/powerpc/kernel/fadump-common.h
@@ -117,6 +117,9 @@ struct fadump_memory_range {
 
 /* Firmware-assisted dump configuration details. */
 struct fw_dump {
+   unsigned long   cpu_state_destination_addr;
+   unsigned long   cpu_state_data_version;
+   unsigned long   cpu_state_entry_size;
unsigned long   cpu_state_data_size;
unsigned long   hpte_region_size;
unsigned long   boot_memory_size;
diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c 
b/arch/powerpc/platforms/powernv/opal-fadump.c
index da8480d..853f663 100644
--- a/arch/powerpc/platforms/powernv/opal-fadump.c
+++ b/arch/powerpc/platforms/powernv/opal-fadump.c
@@ -94,6 +94,12 @@ static void update_fadump_config(struct fw_dump *fadump_conf,
 
last_end = base + size;
j++;
+   } else if (fdm->section[i].src_type ==
+  OPAL_FADUMP_CPU_STATE_DATA) {
+   fadump_conf->cpu_state_destination_addr =
+   be64_to_cpu(fdm->section[i].dest_addr);
+   fadump_conf->cpu_state_data_size =
+   be64_to_cpu(fdm->section[i].dest_size);
}
}
fadump_conf->rmr_regions_cnt = j;
@@ -199,6 +205,75 @@ static int opal_invalidate_fadump(struct fw_dump 
*fadump_conf)
return 0;
 }
 
+static inline void fadump_set_regval_regnum(struct pt_regs *regs, u32 reg_type,
+   u32 reg_num, u64 reg_val)
+{
+   if (reg_type == OPAL_REG_TYPE_GPR) {
+   if (reg_num < 32)
+   regs->gpr[reg_num] = reg_val;
+   return;
+   }
+
+   switch (reg_num) {
+   case 2000:
+   regs->nip = reg_val;
+   break;
+   case 2001:
+   regs->msr = reg_val;
+   break;
+   case 9:
+   regs->ctr = reg_val;
+   break;
+   case 8:
+   regs->link = reg_val;
+   break;
+   case 1:
+   regs->xer = reg_val;
+   break;
+   case 2002:
+   regs->ccr = reg_val;
+   break;
+   case 19:
+   regs->dar = reg_val;
+   break;
+   case 18:
+   regs->dsisr = reg_val;
+   break;
+   }
+}
+
+static inline void fadump_read_registers(char *bufp, unsigned int regs_cnt,
+unsigned int reg_entry_size,
+struct pt_regs *regs)
+{
+   int i;
+   struct opal_fadump_reg_entry *reg_entry;
+
+   memset(regs, 0, sizeof(struct pt_regs));
+
+   for (i = 0; i < regs_cnt; i++, bufp += reg_entry_size) {
+   reg_entry = (struct opal_fadump_reg_entry *)bufp;
+   fadump_set_regval_regnum(regs,
+

[PATCH v2 10/16] powernv/fadump: add support to preserve crash data on FADUMP disabled kernel

2019-04-16 Thread Hari Bathini
Add a new kernel config option, CONFIG_PRESERVE_FA_DUMP, that ensures
crash data from a previously crashed kernel is preserved. This helps
in cases where FADump is not enabled in the running kernel, but a
subsequent memory-preserving kernel boot is likely to process this
crash data. One typical use case for this config option is the
petitboot kernel.

Signed-off-by: Hari Bathini 
---
 arch/powerpc/Kconfig |9 +
 arch/powerpc/include/asm/fadump.h|9 +++--
 arch/powerpc/kernel/Makefile |6 +++
 arch/powerpc/kernel/fadump-common.h  |8 
 arch/powerpc/kernel/fadump.c |   47 +++---
 arch/powerpc/kernel/prom.c   |4 +-
 arch/powerpc/platforms/powernv/Makefile  |1 +
 arch/powerpc/platforms/powernv/opal-fadump.c |   37 +++-
 8 files changed, 106 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 2366a84..ac3259e 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -570,6 +570,15 @@ config FA_DUMP
  If unsure, say "y". Only special kernels like petitboot may
  need to say "N" here.
 
+config PRESERVE_FA_DUMP
+   bool "Preserve Firmware-assisted dump"
+   depends on PPC64 && PPC_POWERNV && !FA_DUMP
+   help
+ On a kernel with FA_DUMP disabled, this option helps to preserve
+ crash data from a previously crash'ed kernel. Useful when the next
+ memory preserving kernel boot would process this crash data.
+ Petitboot kernel is the typical usecase for this option.
+
 config IRQ_ALL_CPUS
bool "Distribute interrupts on all CPUs by default"
depends on SMP
diff --git a/arch/powerpc/include/asm/fadump.h 
b/arch/powerpc/include/asm/fadump.h
index d27cde7..d09b77b 100644
--- a/arch/powerpc/include/asm/fadump.h
+++ b/arch/powerpc/include/asm/fadump.h
@@ -27,9 +27,6 @@
 extern int crashing_cpu;
 
 extern int is_fadump_memory_area(u64 addr, ulong size);
-extern int early_init_dt_scan_fw_dump(unsigned long node, const char *uname,
- int depth, void *data);
-extern int fadump_reserve_mem(void);
 extern int setup_fadump(void);
 extern int is_fadump_active(void);
 extern int should_fadump_crash(void);
@@ -41,4 +38,10 @@ static inline int is_fadump_active(void) { return 0; }
 static inline int should_fadump_crash(void) { return 0; }
 static inline void crash_fadump(struct pt_regs *regs, const char *str) { }
 #endif /* !CONFIG_FA_DUMP */
+
+#if defined(CONFIG_FA_DUMP) || defined(CONFIG_PRESERVE_FA_DUMP)
+extern int early_init_dt_scan_fw_dump(unsigned long node, const char *uname,
+ int depth, void *data);
+extern int fadump_reserve_mem(void);
+#endif
 #endif /* __PPC64_FA_DUMP_H__ */
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index fbecfba..42c24f8 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -65,7 +65,11 @@ obj-$(CONFIG_EEH)  += eeh.o eeh_pe.o eeh_dev.o 
eeh_cache.o \
  eeh_driver.o eeh_event.o eeh_sysfs.o
 obj-$(CONFIG_GENERIC_TBSYNC)   += smp-tbsync.o
 obj-$(CONFIG_CRASH_DUMP)   += crash_dump.o
-obj-$(CONFIG_FA_DUMP)  += fadump.o fadump-common.o
+ifeq ($(CONFIG_FA_DUMP),y)
+obj-y  += fadump.o fadump-common.o
+else
+obj-$(CONFIG_PRESERVE_FA_DUMP) += fadump.o
+endif
 ifdef CONFIG_PPC32
 obj-$(CONFIG_E500) += idle_e500.o
 endif
diff --git a/arch/powerpc/kernel/fadump-common.h 
b/arch/powerpc/kernel/fadump-common.h
index 8d47382..1bd3aeb 100644
--- a/arch/powerpc/kernel/fadump-common.h
+++ b/arch/powerpc/kernel/fadump-common.h
@@ -13,6 +13,7 @@
 #ifndef __PPC64_FA_DUMP_INTERNAL_H__
 #define __PPC64_FA_DUMP_INTERNAL_H__
 
+#ifndef CONFIG_PRESERVE_FA_DUMP
 /*
  * The RMA region will be saved for later dumping when kernel crashes.
  * RMA is Real Mode Area, the first block of logical memory address owned
@@ -88,6 +89,7 @@ struct fadump_crash_info_header {
 
 /* Platform specific callback functions */
 struct fadump_ops;
+#endif /* !CONFIG_PRESERVE_FA_DUMP */
 
 /* Firmware-Assited Dump platforms */
 enum fadump_platform_type {
@@ -157,9 +159,12 @@ struct fw_dump {
unsigned long   nocma:1;
 
enum fadump_platform_type   fadump_platform;
+#ifndef CONFIG_PRESERVE_FA_DUMP
struct fadump_ops   *ops;
+#endif
 };
 
+#ifndef CONFIG_PRESERVE_FA_DUMP
 struct fadump_ops {
ulong   (*init_fadump_mem_struct)(struct fw_dump *fadump_config);
int (*register_fadump)(struct fw_dump *fadump_config);
@@ -181,8 +186,9 @@ u32 *fadump_regs_to_elf_notes(u32 *buf, struct pt_regs 
*regs);
 void fadump_update_elfcore_header(struct fw_dump *fadump_config, char *bufp);
 int is_boot_memory_area_contiguous(struct fw_dump *fadump_conf);
 int is_res

[PATCH v2 11/16] powerpc/fadump: update documentation about CONFIG_PRESERVE_FA_DUMP

2019-04-16 Thread Hari Bathini
Kernel config option CONFIG_PRESERVE_FA_DUMP is introduced to ensure
crash data, from previously crash'ed kernel, is preserved. Update
documentation with these details.

Signed-off-by: Hari Bathini 
---
 Documentation/powerpc/firmware-assisted-dump.txt |9 +
 1 file changed, 9 insertions(+)

diff --git a/Documentation/powerpc/firmware-assisted-dump.txt 
b/Documentation/powerpc/firmware-assisted-dump.txt
index 844a229..fa35593 100644
--- a/Documentation/powerpc/firmware-assisted-dump.txt
+++ b/Documentation/powerpc/firmware-assisted-dump.txt
@@ -98,6 +98,15 @@ firmware versions on PSeries (PowerVM) platform and Power9
 and above systems with recent firmware versions on PowerNV
 (OPAL) platform.
 
+On OPAL based machines, system first boots into an intermediate
+kernel (referred to as petitboot kernel) before booting into the
+capture kernel. This kernel would have minimal kernel and/or
+userspace support to process crash data. Such kernel needs to
+preserve previously crash'ed kernel's memory for the subsequent
+capture kernel boot to process this crash data. Kernel config
+option CONFIG_PRESERVE_FA_DUMP has to be enabled on such kernel
+to ensure that crash data is preserved to process later.
+
 Implementation details:
 --
 



[PATCH v2 12/16] powerpc/powernv: export /proc/opalcore for analysing opal crashes

2019-04-16 Thread Hari Bathini
From: Hari Bathini 

Export /proc/opalcore file to analyze opal crashes. Since opalcore can
be generated independent of CONFIG_FA_DUMP support in kernel, add this
support under a new kernel config option CONFIG_OPAL_CORE. Also, avoid
code duplication by moving common code used for processing the register
state data to export /proc/vmcore and/or /proc/opalcore file(s).

Signed-off-by: Hari Bathini 
---
 arch/powerpc/Kconfig |9 
 arch/powerpc/platforms/powernv/Makefile  |1 
 arch/powerpc/platforms/powernv/opal-core.c   |  563 ++
 arch/powerpc/platforms/powernv/opal-fadump.c |   94 +---
 arch/powerpc/platforms/powernv/opal-fadump.h |   72 +++
 5 files changed, 669 insertions(+), 70 deletions(-)
 create mode 100644 arch/powerpc/platforms/powernv/opal-core.c

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index ac3259e..2c76203 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -579,6 +579,15 @@ config PRESERVE_FA_DUMP
  memory preserving kernel boot would process this crash data.
  Petitboot kernel is the typical usecase for this option.
 
+config OPAL_CORE
+   bool "Export OPAL memory as /proc/opalcore"
+   depends on PPC64 && PPC_POWERNV
+   help
+ This option uses the MPIPL support in firmware to provide
+ an ELF core of OPAL memory after a crash. The ELF core is
+ exported as /proc/opalcore file which is helpful in debugging
+ opal crashes using GDB.
+
 config IRQ_ALL_CPUS
bool "Distribute interrupts on all CPUs by default"
depends on SMP
diff --git a/arch/powerpc/platforms/powernv/Makefile 
b/arch/powerpc/platforms/powernv/Makefile
index b4a8022..e659afd 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -8,6 +8,7 @@ obj-y   += opal-kmsg.o opal-powercap.o 
opal-psr.o opal-sensor-groups.o
 obj-$(CONFIG_SMP)  += smp.o subcore.o subcore-asm.o
 obj-$(CONFIG_FA_DUMP)  += opal-fadump.o
 obj-$(CONFIG_PRESERVE_FA_DUMP) += opal-fadump.o
+obj-$(CONFIG_OPAL_CORE)+= opal-core.o
 obj-$(CONFIG_PCI)  += pci.o pci-ioda.o npu-dma.o pci-ioda-tce.o
 obj-$(CONFIG_CXL_BASE) += pci-cxl.o
 obj-$(CONFIG_EEH)  += eeh-powernv.o
diff --git a/arch/powerpc/platforms/powernv/opal-core.c 
b/arch/powerpc/platforms/powernv/opal-core.c
new file mode 100644
index 0000000..8bf687d
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-core.c
@@ -0,0 +1,563 @@
+/*
+ * Interface for exporting the OPAL ELF core.
+ * Heavily inspired from fs/proc/vmcore.c
+ *
+ * Copyright 2018-2019, IBM Corp.
+ * Author: Hari Bathini 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#undef DEBUG
+#define pr_fmt(fmt) "opalcore: " fmt
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+
+#include "../../kernel/fadump-common.h"
+#include "opal-fadump.h"
+
+#define MAX_PT_LOAD_CNT8
+
+/* NT_AUXV note related info */
+#define AUXV_CNT   1
+#define AUXV_DESC_SZ   (((2 * AUXV_CNT) + 1) * sizeof(Elf64_Off))
+
+struct opalcore_config {
+   unsigned intnum_cpus;
+   /* PIR value of crashing CPU */
+   unsigned intcrashing_cpu;
+
+   /* CPU state data info from F/W */
+   unsigned long   cpu_state_destination_addr;
+   unsigned long   cpu_state_data_size;
+   unsigned long   cpu_state_entry_size;
+
+   /* OPAL memory to be exported as PT_LOAD segments */
+   unsigned long   ptload_addr[MAX_PT_LOAD_CNT];
+   unsigned long   ptload_size[MAX_PT_LOAD_CNT];
+   unsigned long   ptload_cnt;
+
+   /* Pointer to the first PT_LOAD in the ELF core file */
+   Elf64_Phdr  *ptload_phdr;
+
+   /* Total size of opalcore file. */
+   size_t  opalcore_size;
+
+   struct proc_dir_entry   *proc_opalcore;
+
+   /* Buffer for all the ELF core headers and the PT_NOTE */
+   size_t  opalcorebuf_sz;
+   char*opalcorebuf;
+
+   /* NT_AUXV buffer */
+   charauxv_buf[AUXV_DESC_SZ];
+};
+
+struct opalcore {
+   struct list_head list;
+   unsigned long long paddr;
+   unsigned long long size;
+   loff_t offset;
+};
+
+static LIST_HEAD(opalcore_list);
+static struct opalcore_config *oc_conf;
+static const struct opal_fadump_mem_struct *fdm_active;
+
+/*
+ * Set crashing CPU's signal to SIGUSR1, if the crash is triggered
+ * by kernel, SIGTERM otherwise.
+ */
+bool kernel_initiated;
+
+static struct opalcore * __init get_new

[PATCH v2 13/16] powernv/fadump: Skip processing /proc/vmcore when only OPAL core exists

2019-04-16 Thread Hari Bathini
If OPAL crashes when the kernel is not registered for FADump, F/W still
exports OPAL core through result-table DT node. Make sure '/proc/vmcore'
processing is skipped as only data relevant to OPAL core is exported in
such scenario.

Signed-off-by: Hari Bathini 
---
 arch/powerpc/platforms/powernv/opal-fadump.c |   12 
 1 file changed, 12 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c 
b/arch/powerpc/platforms/powernv/opal-fadump.c
index 65db21a..f530df0 100644
--- a/arch/powerpc/platforms/powernv/opal-fadump.c
+++ b/arch/powerpc/platforms/powernv/opal-fadump.c
@@ -108,6 +108,18 @@ static void update_fadump_config(struct fw_dump 
*fadump_conf,
be64_to_cpu(fdm->section[i].dest_size);
}
}
+
+   /*
+* If dump is active and no kernel memory region is found in
+* result-table, it means OPAL crashed on system with MPIPL
+* support and the kernel was not registered for FADump at the
+* time of crash. Skip processing /proc/vmcore in that case.
+*/
+   if (j == 0) {
+   fadump_conf->dump_active = 0;
+   return;
+   }
+
fadump_conf->rmr_regions_cnt = j;
pr_debug("Real memory regions count: %lu\n",
 fadump_conf->rmr_regions_cnt);



[PATCH v2 14/16] powernv/opalcore: provide an option to invalidate /proc/opalcore file

2019-04-16 Thread Hari Bathini
Writing '1' to /sys/kernel/fadump_release_opalcore would release the
memory held by kernel in exporting /proc/opalcore file.

Signed-off-by: Hari Bathini 
---
 arch/powerpc/platforms/powernv/opal-core.c |   39 
 1 file changed, 39 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/opal-core.c 
b/arch/powerpc/platforms/powernv/opal-core.c
index 8bf687d..5503b8b 100644
--- a/arch/powerpc/platforms/powernv/opal-core.c
+++ b/arch/powerpc/platforms/powernv/opal-core.c
@@ -19,6 +19,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 #include 
 #include 
 #include 
@@ -532,6 +534,36 @@ static void opalcore_cleanup(void)
 }
 __exitcall(opalcore_cleanup);
 
+static ssize_t fadump_release_opalcore_store(struct kobject *kobj,
+struct kobj_attribute *attr,
+const char *buf, size_t count)
+{
+   int input = -1;
+
+   if (kstrtoint(buf, 0, &input))
+   return -EINVAL;
+
+   if (input == 1) {
+   if (oc_conf == NULL) {
+   pr_err("'/proc/opalcore' file does not exist!\n");
+   return -EPERM;
+   }
+
+   /*
+* Take away '/proc/opalcore' and release all memory
+* used for exporting this file.
+*/
+   opalcore_cleanup();
+   } else
+   return -EINVAL;
+
+   return count;
+}
+
+static struct kobj_attribute opalcore_rel_attr = 
__ATTR(fadump_release_opalcore,
+   0200, NULL,
+   fadump_release_opalcore_store);
+
 /* Init function for opalcore module. */
 static int __init opalcore_init(void)
 {
@@ -558,6 +590,13 @@ static int __init opalcore_init(void)
 &proc_opalcore_operations);
if (oc_conf->proc_opalcore)
proc_set_size(oc_conf->proc_opalcore, oc_conf->opalcore_size);
+
+   rc = sysfs_create_file(kernel_kobj, &opalcore_rel_attr.attr);
+   if (rc) {
+   pr_warn("unable to create sysfs file fadump_release_opalcore 
(%d)\n",
+   rc);
+   }
+
return 0;
 }
 fs_initcall(opalcore_init);



[PATCH v2 15/16] powernv/fadump: consider f/w load area

2019-04-16 Thread Hari Bathini
OPAL loads kernel & initrd at 512MB offset (256MB size), also exported
as ibm,opal/dump/fw-load-area. So, if boot memory size of FADump is
less than 768MB, kernel memory to be exported as '/proc/vmcore' would
be overwritten by f/w while loading kernel & initrd. To avoid such a
scenario, enforce a minimum boot memory size of 768MB on OPAL platform.

Also, skip using FADump if a newer F/W version loads kernel & initrd
above 768MB.

Signed-off-by: Hari Bathini 
---
 arch/powerpc/kernel/fadump-common.h  |   15 +--
 arch/powerpc/kernel/fadump.c |8 
 arch/powerpc/platforms/powernv/opal-fadump.c |   23 +++
 3 files changed, 44 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/fadump-common.h 
b/arch/powerpc/kernel/fadump-common.h
index 1bd3aeb..f59fdc7 100644
--- a/arch/powerpc/kernel/fadump-common.h
+++ b/arch/powerpc/kernel/fadump-common.h
@@ -24,14 +24,25 @@
 #define RMA_END(ppc64_rma_size)
 
 /*
+ * With kernel & initrd loaded at 512MB (with 256MB size), enforce a minimum
+ * boot memory size of 768MB to ensure f/w loading kernel and initrd doesn't
+ * mess with crash'ed kernel's memory during MPIPL.
+ */
+#define OPAL_MIN_BOOT_MEM  (0x30000000UL)
+
+/*
  * On some Power systems where RMO is 128MB, it still requires minimum of
  * 256MB for kernel to boot successfully. When kdump infrastructure is
  * configured to save vmcore over network, we run into OOM issue while
  * loading modules related to network setup. Hence we need additional 64M
  * of memory to avoid OOM issue.
  */
-#define MIN_BOOT_MEM   (((RMA_END < (0x1UL << 28)) ? (0x1UL << 28) : RMA_END) \
-   + (0x1UL << 26))
+#define PSERIES_MIN_BOOT_MEM   (((RMA_END < (0x1UL << 28)) ? (0x1UL << 28) : \
+RMA_END) + (0x1UL << 26))
+
+#define MIN_BOOT_MEM   ((fw_dump.fadump_platform ==\
+FADUMP_PLATFORM_POWERNV) ? OPAL_MIN_BOOT_MEM : \
+PSERIES_MIN_BOOT_MEM)
 
 /* The upper limit percentage for user specified boot memory size (25%) */
 #define MAX_BOOT_MEM_RATIO 4
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index ba26169..3c3adc2 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -582,6 +582,14 @@ int __init fadump_reserve_mem(void)
ALIGN(fw_dump.boot_memory_size,
FADUMP_CMA_ALIGNMENT);
 #endif
+
+   if ((fw_dump.fadump_platform == FADUMP_PLATFORM_POWERNV) &&
+   (fw_dump.boot_memory_size < OPAL_MIN_BOOT_MEM)) {
+   pr_err("Can't enable fadump with boot memory size 
(0x%lx) less than 0x%lx\n",
+  fw_dump.boot_memory_size, OPAL_MIN_BOOT_MEM);
+   goto error_out;
+   }
+
fw_dump.rmr_source_len = fw_dump.boot_memory_size;
if (!fadump_get_rmr_regions()) {
pr_err("Too many holes in boot memory area to enable 
fadump\n");
diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c 
b/arch/powerpc/platforms/powernv/opal-fadump.c
index f530df0..0a22257 100644
--- a/arch/powerpc/platforms/powernv/opal-fadump.c
+++ b/arch/powerpc/platforms/powernv/opal-fadump.c
@@ -528,6 +528,29 @@ int __init opal_dt_scan_fadump(struct fw_dump 
*fadump_conf, ulong node)
fadump_conf->cpu_state_entry_size =
of_read_number(prop, 1);
}
+   } else {
+   int i, len;
+
+   prop = of_get_flat_dt_prop(dn, "fw-load-area", &len);
+   if (prop) {
+   /*
+* Each f/w load area is an (address,size) pair,
+* 2 cells each, totalling 4 cells per range.
+*/
+   for (i = 0; i < len / (sizeof(*prop) * 4); i++) {
+   u64 base, end;
+
+   base = of_read_number(prop + (i * 4) + 0, 2);
+   end = base;
+   end += of_read_number(prop + (i * 4) + 2, 2);
+   if (end > OPAL_MIN_BOOT_MEM) {
+   pr_err("F/W load area: 0x%llx-0x%llx\n",
+  base, end);
+   pr_err("F/W version not supported!\n");
+   return 1;
+   }
+   }
+   }
}
 
fadump_conf->ops= &opal_fadump_ops;



[PATCH v2 16/16] powernv/fadump: update documentation about option to release opalcore

2019-04-16 Thread Hari Bathini
With /proc/opalcore support available on OPAL based machines and an
option to release memory used by kernel in exporting /proc/opalcore,
update FADump documentation with these details.

Signed-off-by: Hari Bathini 
---
 Documentation/powerpc/firmware-assisted-dump.txt |   19 +++
 1 file changed, 19 insertions(+)

diff --git a/Documentation/powerpc/firmware-assisted-dump.txt 
b/Documentation/powerpc/firmware-assisted-dump.txt
index fa35593..6411449 100644
--- a/Documentation/powerpc/firmware-assisted-dump.txt
+++ b/Documentation/powerpc/firmware-assisted-dump.txt
@@ -107,6 +107,16 @@ capture kernel boot to process this crash data. Kernel 
config
 option CONFIG_PRESERVE_FA_DUMP has to be enabled on such kernel
 to ensure that crash data is preserved to process later.
 
+-- On OPAL based machines (PowerNV), if the kernel is built with
+   CONFIG_OPAL_CORE=y, OPAL memory at the time of crash is also
+   exported as /proc/opalcore file. This procfs file is helpful
+   in debugging OPAL crashes with GDB. The kernel memory used
+   for exporting this procfs file can be released by echo'ing
+   '1' to /sys/kernel/fadump_release_opalcore node.
+
+   e.g.
+ # echo 1 > /sys/kernel/fadump_release_opalcore
+
 Implementation details:
 --
 
@@ -260,6 +270,15 @@ Here is the list of files under kernel sysfs:
 enhanced to use this interface to release the memory reserved for
 dump and continue without 2nd reboot.
 
+ /sys/kernel/fadump_release_opalcore
+
+This file is available only on OPAL based machines when FADump is
+active during capture kernel. This is used to release the memory
+used by the kernel to export /proc/opalcore file. To release this
+memory, echo '1' to it:
+
+echo 1  > /sys/kernel/fadump_release_opalcore
+
 Here is the list of files under powerpc debugfs:
 (Assuming debugfs is mounted on /sys/kernel/debug directory.)
 



Re: [PATCH 2/2] powerpc/pseries: update device tree before ejecting hotplug uevents

2020-03-02 Thread Hari Bathini



On 11/02/20 8:29 AM, Pingfan Liu wrote:
> A bug is observed on pseries by taking the following steps on rhel:
> -1. drmgr -c mem -r -q 5
> -2. echo c > /proc/sysrq-trigger
> 
> And then, the failure looks like:
> kdump: saving to /sysroot//var/crash/127.0.0.1-2020-01-16-02:06:14/
> kdump: saving vmcore-dmesg.txt
> kdump: saving vmcore-dmesg.txt complete
> kdump: saving vmcore
>  Checking for memory holes : [  0.0 %] /  
>  Checking for memory holes : [100.0 %] |  
>  Excluding unnecessary pages   : [100.0 %] \  
>  Copying data  : [  0.3 %] -  
> eta: 38s[   44.337636] hash-mmu: mm: Hashing failure ! 
> EA=0x7fffba40 access=0x8004 current=makedumpfile
> [   44.337663] hash-mmu: trap=0x300 vsid=0x13a109c ssize=1 base psize=2 
> psize 2 pte=0xc0005504
> [   44.337677] hash-mmu: mm: Hashing failure ! EA=0x7fffba40 
> access=0x8004 current=makedumpfile
> [   44.337692] hash-mmu: trap=0x300 vsid=0x13a109c ssize=1 base psize=2 
> psize 2 pte=0xc0005504
> [   44.337708] makedumpfile[469]: unhandled signal 7 at 7fffba40 nip 
> 7fffbbc4d7fc lr 00011356ca3c code 2
> [   44.338548] Core dump to |/bin/false pipe failed
> /lib/kdump-lib-initramfs.sh: line 98:   469 Bus error   
> $CORE_COLLECTOR /proc/vmcore 
> $_mp/$KDUMP_PATH/$HOST_IP-$DATEDIR/vmcore-incomplete
> kdump: saving vmcore failed
> 
> * Root cause *
>   After analyzing, it turns out that in the current implementation,
> when hot-removing lmb, the KOBJ_REMOVE event ejects before the dt updating as
> the code __remove_memory() comes before drmem_update_dt().
> 
> From a viewpoint of listener and publisher, the publisher notifies the
> listener before data is ready.  This introduces a problem where udev
> launches kexec-tools (due to KOBJ_REMOVE) and loads a stale dt before
> updating. And in capture kernel, makedumpfile will access the memory based
> on the stale dt info, and hit a SIGBUS error due to a non-existent lmb.
> 
> * Fix *
>   In order to fix this issue, update dt before __remove_memory(), and
> accordingly the same rule in hot-add path.
> 
> This will introduce extra dt updating payload for each involved lmb when 
> hotplug.
> But it should be fine since drmem_update_dt() is memory based operation and
> hotplug is not a hot path.
> 
> Signed-off-by: Pingfan Liu 
> Cc: Michael Ellerman 
> Cc: Benjamin Herrenschmidt 
> Cc: Paul Mackerras 
> Cc: Hari Bathini 
> To: linuxppc-dev@lists.ozlabs.org
> Cc: ke...@lists.infradead.org

KDump fails to capture vmcore as we end up looking at a stale elfcore hdr
with udev event happening before DT update. Resolved with these patches.
For the series:

Tested-by: Hari Bathini 



[PATCH 1/2] powerpc/fadump: use static allocation for reserved memory ranges

2020-03-10 Thread Hari Bathini
At times, memory ranges have to be looked up during early boot, when
kernel couldn't be initialized for dynamic memory allocation. In fact,
reserved-ranges look up is needed during FADump memory reservation.
Without accounting for reserved-ranges in reserving memory for FADump,
MPIPL boot fails with memory corruption issues. So, extend memory
ranges handling to support static allocation and populate reserved
memory ranges during early boot.

Fixes: dda9dbfeeb7a ("powerpc/fadump: consider reserved ranges while releasing memory")
Cc: sta...@vger.kernel.org # v5.4+
Signed-off-by: Hari Bathini 
---
 arch/powerpc/include/asm/fadump-internal.h |4 +
 arch/powerpc/kernel/fadump.c   |   77 
 2 files changed, 48 insertions(+), 33 deletions(-)

diff --git a/arch/powerpc/include/asm/fadump-internal.h 
b/arch/powerpc/include/asm/fadump-internal.h
index c814a2b..8d61c8f 100644
--- a/arch/powerpc/include/asm/fadump-internal.h
+++ b/arch/powerpc/include/asm/fadump-internal.h
@@ -64,12 +64,14 @@ struct fadump_memory_range {
 };
 
 /* fadump memory ranges info */
+#define RNG_NAME_SZ16
 struct fadump_mrange_info {
-   charname[16];
+   charname[RNG_NAME_SZ];
struct fadump_memory_range  *mem_ranges;
u32 mem_ranges_sz;
u32 mem_range_cnt;
u32 max_mem_ranges;
+   boolis_static;
 };
 
 /* Platform specific callback functions */
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index ff0114a..7fcf4a8f 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -38,8 +38,17 @@ static void __init fadump_reserve_crash_area(u64 base);
 
 #ifndef CONFIG_PRESERVE_FA_DUMP
 static DEFINE_MUTEX(fadump_mutex);
-struct fadump_mrange_info crash_mrange_info = { "crash", NULL, 0, 0, 0 };
-struct fadump_mrange_info reserved_mrange_info = { "reserved", NULL, 0, 0, 0 };
+struct fadump_mrange_info crash_mrange_info = { "crash", NULL, 0, 0, 0, false 
};
+
+#define RESERVED_RNGS_SZ   16384 /* 16K - 128 entries */
+#define RESERVED_RNGS_CNT  (RESERVED_RNGS_SZ / \
+sizeof(struct fadump_memory_range))
+static struct fadump_memory_range rngs[RESERVED_RNGS_CNT];
+struct fadump_mrange_info reserved_mrange_info = { "reserved", rngs,
+  RESERVED_RNGS_SZ, 0,
+  RESERVED_RNGS_CNT, true };
+
+static void __init early_init_dt_scan_reserved_ranges(unsigned long node);
 
 #ifdef CONFIG_CMA
 static struct cma *fadump_cma;
@@ -108,6 +117,11 @@ static int __init fadump_cma_init(void) { return 1; }
 int __init early_init_dt_scan_fw_dump(unsigned long node, const char *uname,
  int depth, void *data)
 {
+   if (depth == 0) {
+   early_init_dt_scan_reserved_ranges(node);
+   return 0;
+   }
+
if (depth != 1)
return 0;
 
@@ -726,10 +740,14 @@ void fadump_free_cpu_notes_buf(void)
 
 static void fadump_free_mem_ranges(struct fadump_mrange_info *mrange_info)
 {
+   if (mrange_info->is_static) {
+   mrange_info->mem_range_cnt = 0;
+   return;
+   }
+
kfree(mrange_info->mem_ranges);
-   mrange_info->mem_ranges = NULL;
-   mrange_info->mem_ranges_sz = 0;
-   mrange_info->max_mem_ranges = 0;
+   memset((void *)((u64)mrange_info + RNG_NAME_SZ), 0,
+  (sizeof(struct fadump_mrange_info) - RNG_NAME_SZ));
 }
 
 /*
@@ -786,6 +804,12 @@ static inline int fadump_add_mem_range(struct 
fadump_mrange_info *mrange_info,
if (mrange_info->mem_range_cnt == mrange_info->max_mem_ranges) {
int ret;
 
+   if (mrange_info->is_static) {
+   pr_err("Reached array size limit for %s memory 
ranges\n",
+  mrange_info->name);
+   return -ENOSPC;
+   }
+
ret = fadump_alloc_mem_ranges(mrange_info);
if (ret)
return ret;
@@ -1202,20 +1226,19 @@ static void sort_and_merge_mem_ranges(struct 
fadump_mrange_info *mrange_info)
  * Scan reserved-ranges to consider them while reserving/releasing
  * memory for FADump.
  */
-static inline int fadump_scan_reserved_mem_ranges(void)
+static void __init early_init_dt_scan_reserved_ranges(unsigned long node)
 {
-   struct device_node *root;
const __be32 *prop;
int len, ret = -1;
unsigned long i;
 
-   root = of_find_node_by_path("/");
-   if (!root)
-   retu

[PATCH 2/2] powerpc/fadump: consider reserved ranges while reserving memory

2020-03-10 Thread Hari Bathini
Commit 0962e8004e97 ("powerpc/prom: Scan reserved-ranges node for
memory reservations") enabled support to parse reserved-ranges DT
node and reserve kernel memory falling in these ranges for F/W
purposes. Memory reserved for FADump should not overlap with these
ranges as it could corrupt memory meant for F/W or crash'ed kernel
memory to be exported as vmcore.

But since commit 579ca1a27675 ("powerpc/fadump: make use of memblock's
bottom up allocation mode"), memblock_find_in_range() is being used to
find the appropriate area to reserve memory for FADump, which can't
account for reserved-ranges as these ranges are reserved only after
FADump memory reservation.

With reserved-ranges now being populated during early boot, look out
for these memory ranges while reserving memory for FADump. Without
this change, MPIPL on PowerNV systems aborts with hostboot failure,
when memory reserved for FADump is less than 4096MB.

Fixes: 579ca1a27675 ("powerpc/fadump: make use of memblock's bottom up allocation mode")
Cc: sta...@vger.kernel.org # v5.4+
Signed-off-by: Hari Bathini 
---
 arch/powerpc/kernel/fadump.c |   76 --
 1 file changed, 66 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 7fcf4a8f..ab83be9 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -443,10 +443,70 @@ static int __init fadump_get_boot_mem_regions(void)
return ret;
 }
 
+/*
+ * Returns true, if the given range overlaps with reserved memory ranges
+ * starting at idx. Also, updates idx to index of overlapping memory range
+ * with the given memory range.
+ * False, otherwise.
+ */
+static bool overlaps_reserved_ranges(u64 base, u64 end, int *idx)
+{
+   bool ret = false;
+   int i;
+
+   for (i = *idx; i < reserved_mrange_info.mem_range_cnt; i++) {
+   u64 rbase = reserved_mrange_info.mem_ranges[i].base;
+   u64 rend = rbase + reserved_mrange_info.mem_ranges[i].size;
+
+   if (end <= rbase)
+   break;
+
+   if ((end > rbase) &&  (base < rend)) {
+   *idx = i;
+   ret = true;
+   break;
+   }
+   }
+
+   return ret;
+}
+
+/*
+ * Locate a suitable memory area to reserve memory for FADump. While at it,
+ * lookup reserved-ranges & avoid overlap with them, as they are used by F/W.
+ */
+static u64 __init fadump_locate_reserve_mem(u64 base, u64 size)
+{
+   struct fadump_memory_range *mrngs;
+   phys_addr_t mstart, mend;
+   int idx = 0;
+   u64 i;
+
+   mrngs = reserved_mrange_info.mem_ranges;
+   for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE,
+   &mstart, &mend, NULL) {
+   pr_debug("%llu) mstart: %llx, mend: %llx, base: %llx\n",
+i, mstart, mend, base);
+
+   if (mstart > base)
+   base = PAGE_ALIGN(mstart);
+
+   while ((mend > base) && ((mend - base) >= size)) {
+   if (!overlaps_reserved_ranges(base, base + size, &idx))
+   goto out;
+
+   base = mrngs[idx].base + mrngs[idx].size;
+   base = PAGE_ALIGN(base);
+   }
+   }
+
+out:
+   return base;
+}
+
 int __init fadump_reserve_mem(void)
 {
-   u64 base, size, mem_boundary, bootmem_min, align = PAGE_SIZE;
-   bool is_memblock_bottom_up = memblock_bottom_up();
+   u64 base, size, mem_boundary, bootmem_min;
int ret = 1;
 
if (!fw_dump.fadump_enabled)
@@ -467,9 +527,9 @@ int __init fadump_reserve_mem(void)
PAGE_ALIGN(fadump_calculate_reserve_size());
 #ifdef CONFIG_CMA
if (!fw_dump.nocma) {
-   align = FADUMP_CMA_ALIGNMENT;
fw_dump.boot_memory_size =
-   ALIGN(fw_dump.boot_memory_size, align);
+   ALIGN(fw_dump.boot_memory_size,
+ FADUMP_CMA_ALIGNMENT);
}
 #endif
 
@@ -537,13 +597,9 @@ int __init fadump_reserve_mem(void)
 * Reserve memory at an offset closer to bottom of the RAM to
 * minimize the impact of memory hot-remove operation.
 */
-   memblock_set_bottom_up(true);
-   base = memblock_find_in_range(base, mem_boundary, size, align);
-
-   /* Restore the previous allocation mode */
-   memblock_set_bottom_up(is_memblock_bottom_up);
+   base = fadump_locate_reserve_mem(base, size);
 
-   if (!base) {
+   if (base > (mem_boundary - size)) {
pr_err("Failed to find memory chunk for 
reservation!\n");
goto error_out;
}



Re: [PATCHv3] powerpc/crashkernel: take "mem=" option into account

2020-03-26 Thread Hari Bathini
Hello Pingfan,

Thanks for the patch..

On 19/02/20 7:48 PM, Pingfan Liu wrote:
> "mem=" option is an easy way to put high pressure on memory during some
> test. Hence after applying the memory limit, instead of total mem, the
> actual usable memory should be considered when reserving mem for
> crashkernel. Otherwise the boot up may experience OOM issue.
> 
> E.g. it would reserve 4G prior to the change and 512M afterward, if passing
> crashkernel="2G-4G:384M,4G-16G:512M,16G-64G:1G,64G-128G:2G,128G-:4G", and
> mem=5G on a 256G machine.
> 
> This issue is powerpc specific because it puts higher priority on fadump
> and kdump reservation than on "mem=". Referring the following code:
>   if (fadump_reserve_mem() == 0)
>   reserve_crashkernel();
>   ...
>   /* Ensure that total memory size is page-aligned. */
>   limit = ALIGN(memory_limit ?: memblock_phys_mem_size(), PAGE_SIZE);
>   memblock_enforce_memory_limit(limit);
> 
> While on other arches, the effect of "mem=" takes a higher priority and pass
> through memblock_phys_mem_size() before calling reserve_crashkernel().
> 
> Signed-off-by: Pingfan Liu 
> To: linuxppc-dev@lists.ozlabs.org
> Cc: Hari Bathini 
> Cc: Michael Ellerman 
> Cc: ke...@lists.infradead.org
> ---
> v2 -> v3: improve commit log
>  arch/powerpc/kernel/machine_kexec.c | 7 ---
>  1 file changed, 4 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/machine_kexec.c 
> b/arch/powerpc/kernel/machine_kexec.c
> index c4ed328..eec96dc 100644
> --- a/arch/powerpc/kernel/machine_kexec.c
> +++ b/arch/powerpc/kernel/machine_kexec.c
> @@ -114,11 +114,12 @@ void machine_kexec(struct kimage *image)
>  
>  void __init reserve_crashkernel(void)
>  {
> - unsigned long long crash_size, crash_base;
> + unsigned long long crash_size, crash_base, total_mem_sz;
>   int ret;
>  
> + total_mem_sz = memory_limit ? memory_limit : memblock_phys_mem_size();
>   /* use common parsing */
> - ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
> + ret = parse_crashkernel(boot_command_line, total_mem_sz,
>   &crash_size, &crash_base);
>   if (ret == 0 && crash_size > 0) {
>   crashk_res.start = crash_base;

memory_limit is adjusted after this with the below snippet:

/* Crash kernel trumps memory limit */
if (memory_limit && memory_limit <= crashk_res.end) {   

memory_limit = crashk_res.end + 1;
printk("Adjusted memory limit for crashkernel, now 0x%llx\n",   

   memory_limit);   

} 

So, either the above snippet must be dropped, or the print below should use an
updated total_mem_sz based on the adjusted memory_limit. I would prefer the
latter.

> @@ -185,7 +186,7 @@ void __init reserve_crashkernel(void)
>   "for crashkernel (System RAM: %ldMB)\n",
>   (unsigned long)(crash_size >> 20),
>   (unsigned long)(crashk_res.start >> 20),
> - (unsigned long)(memblock_phys_mem_size() >> 20));
> + (unsigned long)(total_mem_sz >> 20));
>  
>   if (!memblock_is_region_memory(crashk_res.start, crash_size) ||
>   memblock_reserve(crashk_res.start, crash_size)) {
> 

-- 
- Hari



Re: [PATCHv4] powerpc/crashkernel: take "mem=" option into account

2020-04-01 Thread Hari Bathini



On 01/04/20 7:30 PM, Pingfan Liu wrote:
> "mem=" option is an easy way to put high pressure on memory during some
> test. Hence after applying the memory limit, instead of total mem, the
> actual usable memory should be considered when reserving mem for
> crashkernel. Otherwise the boot up may experience OOM issue.
> 
> E.g. it would reserve 4G prior to the change and 512M afterward, if passing
> crashkernel="2G-4G:384M,4G-16G:512M,16G-64G:1G,64G-128G:2G,128G-:4G", and
> mem=5G on a 256G machine.
> 
> This issue is powerpc specific because it puts higher priority on fadump
> and kdump reservation than on "mem=". Referring the following code:
> if (fadump_reserve_mem() == 0)
> reserve_crashkernel();
> ...
> /* Ensure that total memory size is page-aligned. */
> limit = ALIGN(memory_limit ?: memblock_phys_mem_size(), PAGE_SIZE);
> memblock_enforce_memory_limit(limit);
> 
> While on other arches, the effect of "mem=" takes a higher priority and pass
> through memblock_phys_mem_size() before calling reserve_crashkernel().
> 
> Signed-off-by: Pingfan Liu 
> To: linuxppc-dev@lists.ozlabs.org
> Cc: Hari Bathini 
> Cc: Michael Ellerman 
> Cc: ke...@lists.infradead.org
> ---
> v3 -> v4: fix total_mem_sz based on adjusted memory_limit


Thanks for the update.

Reviewed-by: Hari Bathini 



Re: [PATCH v2 3/4] Documentation/ABI: mark /sys/kernel/fadump_* sysfs files deprecated

2019-10-21 Thread Hari Bathini



On 18/10/19 6:35 PM, Sourabh Jain wrote:
> The /sys/kernel/fadump_* sysfs files are replicated under

[...]

> +Note: The following FADump sysfs files are deprecated.
> +
> +Deprecated   Alternative
> +
> ---
> +/sys/kernel/fadump_enabled   /sys/kernel/fadump/fadump_enabled
> +/sys/kernel/fadump_registered/sys/kernel/fadump/fadump_registered
> +/sys/kernel/fadump_release_mem   
> /sys/kernel/fadump/fadump_release_mem

/sys/kernel/fadump/* looks tidy instead of /sys/kernel/fadump/fadump_* 
I mean, /sys/kernel/fadump/fadump_enabled => /sys/kernel/fadump/enabled and 
such..

- Hari



Re: [PATCH] powerpc/fadump: Remove duplicate message.

2019-10-24 Thread Hari Bathini


Michal, thanks for looking into this.

On 23/10/19 11:26 PM, Michal Suchanek wrote:
> There is duplicate message about lack of support by firmware in
> fadump_reserve_mem and setup_fadump. Due to different capitalization it
> is clear that the one in setup_fadump is shown on boot. Remove the
> duplicate that is not shown.

Actually, the message in fadump_reserve_mem() is logged. fadump_reserve_mem()
executes first and sets fw_dump.fadump_enabled to `0` if fadump is not
supported. So, with recent changes, the other message in setup_fadump() doesn't
get logged anymore. The right thing to do would be to remove the similar
message in setup_fadump() instead.

- Hari



Re: [PATCH v2 3/4] Documentation/ABI: mark /sys/kernel/fadump_* sysfs files deprecated

2019-11-05 Thread Hari Bathini



On 05/11/19 2:24 PM, Sourabh Jain wrote:
> 
> 
> On 10/21/19 1:11 PM, Hari Bathini wrote:
>>
>>
>> On 18/10/19 6:35 PM, Sourabh Jain wrote:
>>> The /sys/kernel/fadump_* sysfs files are replicated under
>>
>> [...]
>>
>>> +Note: The following FADump sysfs files are deprecated.
>>> +
>>> +Deprecated   Alternative
>>> +
>>> ---
>>> +/sys/kernel/fadump_enabled   /sys/kernel/fadump/fadump_enabled
>>> +/sys/kernel/fadump_registered
>>> /sys/kernel/fadump/fadump_registered
>>> +/sys/kernel/fadump_release_mem   
>>> /sys/kernel/fadump/fadump_release_mem
>>
>> /sys/kernel/fadump/* looks tidy instead of /sys/kernel/fadump/fadump_* 
>> I mean, /sys/kernel/fadump/fadump_enabled => /sys/kernel/fadump/enabled and 
>> such..
> 
> 
> 
> Could you please confirm whether you want to address the sysfs file path 
> differently or
> actually changing the sysfs file name from fadump_enabled to enabled.

I meant, given the path "/sys/kernel/fadump/", the prefix fadump_ is redundant.
If there are no conventions that we should retain the same file name, I suggest
to drop the fadump_ prefix and just call them enabled, registered, etc..

- Hari



Re: [PATCH v3] powerpc/fadump: when fadump is supported register the fadump sysfs files.

2019-11-08 Thread Hari Bathini



On 07/11/19 10:17 PM, Michal Suchanek wrote:
> Currently it is not possible to distinguish the case when fadump is
> supported by firmware and disabled in kernel and completely unsupported
> using the kernel sysfs interface. User can investigate the devicetree
> but it is more reasonable to provide sysfs files in case we get some
> fadumpv2 in the future.
> 
> With this patch sysfs files are available whenever fadump is supported
> by firmware.
> 
> There is duplicate message about lack of support by firmware in
> fadump_reserve_mem and setup_fadump. Remove the duplicate message in
> setup_fadump.

Thanks for doing this, Michal.
Exporting the node will be helpful in finding if FADump is supported,
given FADump is now supported on two different platforms...

Reviewed-by: Hari Bathini 

> 
> Signed-off-by: Michal Suchanek 
> ---
> v2: move the sysfs initialization earlier to avoid condition nesting
> v3: remove duplicate message
> ---
>  arch/powerpc/kernel/fadump.c | 15 ++-
>  1 file changed, 6 insertions(+), 9 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
> index ed59855430b9..ff0114aeba9b 100644
> --- a/arch/powerpc/kernel/fadump.c
> +++ b/arch/powerpc/kernel/fadump.c
> @@ -1466,16 +1466,15 @@ static void fadump_init_files(void)
>   */
>  int __init setup_fadump(void)
>  {
> - if (!fw_dump.fadump_enabled)
> - return 0;
> -
> - if (!fw_dump.fadump_supported) {
> - printk(KERN_ERR "Firmware-assisted dump is not supported on"
> - " this hardware\n");
> + if (!fw_dump.fadump_supported)
>   return 0;
> - }
>  
> + fadump_init_files();
>   fadump_show_config();
> +
> + if (!fw_dump.fadump_enabled)
> + return 1;
> +
>   /*
>* If dump data is available then see if it is valid and prepare for
>* saving it to the disk.
> @@ -1492,8 +1491,6 @@ int __init setup_fadump(void)
>   else if (fw_dump.reserve_dump_area_size)
>   fw_dump.ops->fadump_init_mem_struct(&fw_dump);
>  
> - fadump_init_files();
> -
>   return 1;
>  }
>  subsys_initcall(setup_fadump);
> 

-- 
- Hari



[PATCH v2 1/2] powerpc/fadump: use static allocation for reserved memory ranges

2020-04-20 Thread Hari Bathini
At times, memory ranges have to be looked up during early boot, when
kernel couldn't be initialized for dynamic memory allocation. In fact,
reserved-ranges look up is needed during FADump memory reservation.
Without accounting for reserved-ranges in reserving memory for FADump,
MPIPL boot fails with memory corruption issues. So, extend memory
ranges handling to support static allocation and populate reserved
memory ranges during early boot.

Fixes: dda9dbfeeb7a ("powerpc/fadump: consider reserved ranges while releasing memory")
Cc: sta...@vger.kernel.org
Signed-off-by: Hari Bathini 
---
 arch/powerpc/include/asm/fadump-internal.h |4 +
 arch/powerpc/kernel/fadump.c   |   77 
 2 files changed, 48 insertions(+), 33 deletions(-)

diff --git a/arch/powerpc/include/asm/fadump-internal.h 
b/arch/powerpc/include/asm/fadump-internal.h
index c814a2b..8d61c8f 100644
--- a/arch/powerpc/include/asm/fadump-internal.h
+++ b/arch/powerpc/include/asm/fadump-internal.h
@@ -64,12 +64,14 @@ struct fadump_memory_range {
 };
 
 /* fadump memory ranges info */
+#define RNG_NAME_SZ16
 struct fadump_mrange_info {
-   charname[16];
+   charname[RNG_NAME_SZ];
struct fadump_memory_range  *mem_ranges;
u32 mem_ranges_sz;
u32 mem_range_cnt;
u32 max_mem_ranges;
+   boolis_static;
 };
 
 /* Platform specific callback functions */
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 59e60a9..679277b 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -40,8 +40,17 @@ struct kobject *fadump_kobj;
 
 #ifndef CONFIG_PRESERVE_FA_DUMP
 static DEFINE_MUTEX(fadump_mutex);
-struct fadump_mrange_info crash_mrange_info = { "crash", NULL, 0, 0, 0 };
-struct fadump_mrange_info reserved_mrange_info = { "reserved", NULL, 0, 0, 0 };
+struct fadump_mrange_info crash_mrange_info = { "crash", NULL, 0, 0, 0, false 
};
+
+#define RESERVED_RNGS_SZ   16384 /* 16K - 128 entries */
+#define RESERVED_RNGS_CNT  (RESERVED_RNGS_SZ / \
+sizeof(struct fadump_memory_range))
+static struct fadump_memory_range rngs[RESERVED_RNGS_CNT];
+struct fadump_mrange_info reserved_mrange_info = { "reserved", rngs,
+  RESERVED_RNGS_SZ, 0,
+  RESERVED_RNGS_CNT, true };
+
+static void __init early_init_dt_scan_reserved_ranges(unsigned long node);
 
 #ifdef CONFIG_CMA
 static struct cma *fadump_cma;
@@ -110,6 +119,11 @@ static int __init fadump_cma_init(void) { return 1; }
 int __init early_init_dt_scan_fw_dump(unsigned long node, const char *uname,
  int depth, void *data)
 {
+   if (depth == 0) {
+   early_init_dt_scan_reserved_ranges(node);
+   return 0;
+   }
+
if (depth != 1)
return 0;
 
@@ -728,10 +742,14 @@ void fadump_free_cpu_notes_buf(void)
 
 static void fadump_free_mem_ranges(struct fadump_mrange_info *mrange_info)
 {
+   if (mrange_info->is_static) {
+   mrange_info->mem_range_cnt = 0;
+   return;
+   }
+
kfree(mrange_info->mem_ranges);
-   mrange_info->mem_ranges = NULL;
-   mrange_info->mem_ranges_sz = 0;
-   mrange_info->max_mem_ranges = 0;
+   memset((void *)((u64)mrange_info + RNG_NAME_SZ), 0,
+  (sizeof(struct fadump_mrange_info) - RNG_NAME_SZ));
 }
 
 /*
@@ -788,6 +806,12 @@ static inline int fadump_add_mem_range(struct 
fadump_mrange_info *mrange_info,
if (mrange_info->mem_range_cnt == mrange_info->max_mem_ranges) {
int ret;
 
+   if (mrange_info->is_static) {
+   pr_err("Reached array size limit for %s memory 
ranges\n",
+  mrange_info->name);
+   return -ENOSPC;
+   }
+
ret = fadump_alloc_mem_ranges(mrange_info);
if (ret)
return ret;
@@ -1204,20 +1228,19 @@ static void sort_and_merge_mem_ranges(struct 
fadump_mrange_info *mrange_info)
  * Scan reserved-ranges to consider them while reserving/releasing
  * memory for FADump.
  */
-static inline int fadump_scan_reserved_mem_ranges(void)
+static void __init early_init_dt_scan_reserved_ranges(unsigned long node)
 {
-   struct device_node *root;
const __be32 *prop;
int len, ret = -1;
unsigned long i;
 
-   root = of_find_node_by_path("/");
-   if (!root)
-   return ret;
+   /* reserved-ranges alrea

[PATCH v2 2/2] powerpc/fadump: consider reserved ranges while reserving memory

2020-04-20 Thread Hari Bathini
Commit 0962e8004e97 ("powerpc/prom: Scan reserved-ranges node for
memory reservations") enabled support to parse reserved-ranges DT
node and reserve kernel memory falling in these ranges for F/W
purposes. Memory reserved for FADump should not overlap with these
ranges as it could corrupt memory meant for F/W or crash'ed kernel
memory to be exported as vmcore.

But since commit 579ca1a27675 ("powerpc/fadump: make use of memblock's
bottom up allocation mode"), memblock_find_in_range() is being used to
find the appropriate area to reserve memory for FADump, which can't
account for reserved-ranges as these ranges are reserved only after
FADump memory reservation.

With reserved-ranges now being populated during early boot, look out
for these memory ranges while reserving memory for FADump. Without
this change, MPIPL on PowerNV systems aborts with hostboot failure,
when memory reserved for FADump is less than 4096MB.

Fixes: 579ca1a27675 ("powerpc/fadump: make use of memblock's bottom up 
allocation mode")
Cc: sta...@vger.kernel.org
Signed-off-by: Hari Bathini 
---

Changes in v2:
* Add an out parameter 'found' for fadump_locate_reserve_mem() and set it to 
"true"
  when a suitable memory area is located.


 arch/powerpc/kernel/fadump.c |   81 +-
 1 file changed, 71 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 679277b..0ffe69c 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -445,10 +445,73 @@ static int __init fadump_get_boot_mem_regions(void)
return ret;
 }
 
+/*
+ * Returns true, if the given range overlaps with reserved memory ranges
+ * starting at idx. Also, updates idx to index of overlapping memory range
+ * with the given memory range.
+ * False, otherwise.
+ */
+static bool overlaps_reserved_ranges(u64 base, u64 end, int *idx)
+{
+   bool ret = false;
+   int i;
+
+   for (i = *idx; i < reserved_mrange_info.mem_range_cnt; i++) {
+   u64 rbase = reserved_mrange_info.mem_ranges[i].base;
+   u64 rend = rbase + reserved_mrange_info.mem_ranges[i].size;
+
+   if (end <= rbase)
+   break;
+
+   if ((end > rbase) &&  (base < rend)) {
+   *idx = i;
+   ret = true;
+   break;
+   }
+   }
+
+   return ret;
+}
+
+/*
+ * Locate a suitable memory area to reserve memory for FADump. While at it,
+ * lookup reserved-ranges & avoid overlap with them, as they are used by F/W.
+ */
+static u64 __init fadump_locate_reserve_mem(u64 base, u64 size, bool *found)
+{
+   struct fadump_memory_range *mrngs;
+   phys_addr_t mstart, mend;
+   int idx = 0;
+   u64 i;
+
+   *found = false;
+   mrngs = reserved_mrange_info.mem_ranges;
+   for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE,
+   &mstart, &mend, NULL) {
+   pr_debug("%llu) mstart: %llx, mend: %llx, base: %llx\n",
+i, mstart, mend, base);
+
+   if (mstart > base)
+   base = PAGE_ALIGN(mstart);
+
+   while ((mend > base) && ((mend - base) >= size)) {
+   if (!overlaps_reserved_ranges(base, base+size, &idx)) {
+   *found = true;
+   goto out;
+   }
+
+   base = mrngs[idx].base + mrngs[idx].size;
+   base = PAGE_ALIGN(base);
+   }
+   }
+
+out:
+   return base;
+}
+
 int __init fadump_reserve_mem(void)
 {
-   u64 base, size, mem_boundary, bootmem_min, align = PAGE_SIZE;
-   bool is_memblock_bottom_up = memblock_bottom_up();
+   u64 base, size, mem_boundary, bootmem_min;
int ret = 1;
 
if (!fw_dump.fadump_enabled)
@@ -469,9 +532,9 @@ int __init fadump_reserve_mem(void)
PAGE_ALIGN(fadump_calculate_reserve_size());
 #ifdef CONFIG_CMA
if (!fw_dump.nocma) {
-   align = FADUMP_CMA_ALIGNMENT;
fw_dump.boot_memory_size =
-   ALIGN(fw_dump.boot_memory_size, align);
+   ALIGN(fw_dump.boot_memory_size,
+ FADUMP_CMA_ALIGNMENT);
}
 #endif
 
@@ -535,17 +598,15 @@ int __init fadump_reserve_mem(void)
pr_debug("Reserve dump area start address: 0x%lx\n",
 fw_dump.reserve_dump_area_start);
} else {
+   bool found = false;
+
/*
 * Reserve memory at an offset closer to bottom of the RAM to
 * minimize the impact of memory hot-remove oper

Re: [PATCH 2/2] powerpc/fadump: consider reserved ranges while reserving memory

2020-04-20 Thread Hari Bathini



On 20/04/20 10:50 AM, Mahesh J Salgaonkar wrote:
> On 2020-03-11 01:57:10 Wed, Hari Bathini wrote:
>> Commit 0962e8004e97 ("powerpc/prom: Scan reserved-ranges node for
>> memory reservations") enabled support to parse reserved-ranges DT
>> node and reserve kernel memory falling in these ranges for F/W
>> purposes. Memory reserved for FADump should not overlap with these
>> ranges as it could corrupt memory meant for F/W or crash'ed kernel
>> memory to be exported as vmcore.
>>
>> But since commit 579ca1a27675 ("powerpc/fadump: make use of memblock's
>> bottom up allocation mode"), memblock_find_in_range() is being used to
>> find the appropriate area to reserve memory for FADump, which can't
>> account for reserved-ranges as these ranges are reserved only after
>> FADump memory reservation.
>>
>> With reserved-ranges now being populated during early boot, look out
>> for these memory ranges while reserving memory for FADump. Without
>> this change, MPIPL on PowerNV systems aborts with hostboot failure,
>> when memory reserved for FADump is less than 4096MB.
>>
>> Fixes: 579ca1a27675 ("powerpc/fadump: make use of memblock's bottom up 
>> allocation mode")
>> Cc: sta...@vger.kernel.org # v5.4+
>> Signed-off-by: Hari Bathini 
>> ---
>>  arch/powerpc/kernel/fadump.c |   76 
>> --
>>  1 file changed, 66 insertions(+), 10 deletions(-)
>>
>> diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
>> index 7fcf4a8f..ab83be9 100644
>> --- a/arch/powerpc/kernel/fadump.c
>> +++ b/arch/powerpc/kernel/fadump.c
>> @@ -443,10 +443,70 @@ static int __init fadump_get_boot_mem_regions(void)
>>  return ret;
>>  }
>>  
>> +/*
>> + * Returns true, if the given range overlaps with reserved memory ranges
>> + * starting at idx. Also, updates idx to index of overlapping memory range
>> + * with the given memory range.
>> + * False, otherwise.
>> + */
>> +static bool overlaps_reserved_ranges(u64 base, u64 end, int *idx)
>> +{
>> +bool ret = false;
>> +int i;
>> +
>> +for (i = *idx; i < reserved_mrange_info.mem_range_cnt; i++) {
>> +u64 rbase = reserved_mrange_info.mem_ranges[i].base;
>> +u64 rend = rbase + reserved_mrange_info.mem_ranges[i].size;
>> +
>> +if (end <= rbase)
>> +break;
>> +
>> +if ((end > rbase) &&  (base < rend)) {
>> +*idx = i;
>> +ret = true;
>> +break;
>> +}
>> +}
>> +
>> +return ret;
>> +}
>> +
>> +/*
>> + * Locate a suitable memory area to reserve memory for FADump. While at it,
>> + * lookup reserved-ranges & avoid overlap with them, as they are used by 
>> F/W.
>> + */
>> +static u64 __init fadump_locate_reserve_mem(u64 base, u64 size)
>> +{
>> +struct fadump_memory_range *mrngs;
>> +phys_addr_t mstart, mend;
>> +int idx = 0;
>> +u64 i;
>> +
>> +mrngs = reserved_mrange_info.mem_ranges;
>> +for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE,
>> +&mstart, &mend, NULL) {
>> +pr_debug("%llu) mstart: %llx, mend: %llx, base: %llx\n",
>> + i, mstart, mend, base);
>> +
>> +if (mstart > base)
>> +base = PAGE_ALIGN(mstart);
>> +
>> +while ((mend > base) && ((mend - base) >= size)) {
>> +if (!overlaps_reserved_ranges(base, base + size, &idx))
>> +goto out;
>> +
>> +base = mrngs[idx].base + mrngs[idx].size;
>> +base = PAGE_ALIGN(base);
> 
> What happens when all the memory ranges are found to overlap with
> reserved ranges? Shouldn't this function return NULL? Looks like in
> that case this function returns the last set base address, which is
> either still overlapping or not big enough in size.

Thanks for the review, Mahesh. I overlooked that corner case.
Just posted v2 fixing it.

- Hari



[PATCH v2 1/2] powerpc/fadump: use static allocation for reserved memory ranges

2020-04-20 Thread Hari Bathini
At times, memory ranges have to be looked up during early boot, when
kernel couldn't be initialized for dynamic memory allocation. In fact,
reserved-ranges look up is needed during FADump memory reservation.
Without accounting for reserved-ranges in reserving memory for FADump,
MPIPL boot fails with memory corruption issues. So, extend memory
ranges handling to support static allocation and populate reserved
memory ranges during early boot.

Fixes: dda9dbfeeb7a ("powerpc/fadump: consider reserved ranges while releasing 
memory")
Cc: sta...@vger.kernel.org
Signed-off-by: Hari Bathini 
Reviewed-by: Mahesh Salgaonkar 
---
 arch/powerpc/include/asm/fadump-internal.h |4 +
 arch/powerpc/kernel/fadump.c   |   77 
 2 files changed, 48 insertions(+), 33 deletions(-)

diff --git a/arch/powerpc/include/asm/fadump-internal.h 
b/arch/powerpc/include/asm/fadump-internal.h
index c814a2b..8d61c8f 100644
--- a/arch/powerpc/include/asm/fadump-internal.h
+++ b/arch/powerpc/include/asm/fadump-internal.h
@@ -64,12 +64,14 @@ struct fadump_memory_range {
 };
 
 /* fadump memory ranges info */
+#define RNG_NAME_SZ16
 struct fadump_mrange_info {
-   charname[16];
+   charname[RNG_NAME_SZ];
struct fadump_memory_range  *mem_ranges;
u32 mem_ranges_sz;
u32 mem_range_cnt;
u32 max_mem_ranges;
+   boolis_static;
 };
 
 /* Platform specific callback functions */
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 59e60a9..679277b 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -40,8 +40,17 @@ struct kobject *fadump_kobj;
 
 #ifndef CONFIG_PRESERVE_FA_DUMP
 static DEFINE_MUTEX(fadump_mutex);
-struct fadump_mrange_info crash_mrange_info = { "crash", NULL, 0, 0, 0 };
-struct fadump_mrange_info reserved_mrange_info = { "reserved", NULL, 0, 0, 0 };
+struct fadump_mrange_info crash_mrange_info = { "crash", NULL, 0, 0, 0, false 
};
+
+#define RESERVED_RNGS_SZ   16384 /* 16K - 128 entries */
+#define RESERVED_RNGS_CNT  (RESERVED_RNGS_SZ / \
+sizeof(struct fadump_memory_range))
+static struct fadump_memory_range rngs[RESERVED_RNGS_CNT];
+struct fadump_mrange_info reserved_mrange_info = { "reserved", rngs,
+  RESERVED_RNGS_SZ, 0,
+  RESERVED_RNGS_CNT, true };
+
+static void __init early_init_dt_scan_reserved_ranges(unsigned long node);
 
 #ifdef CONFIG_CMA
 static struct cma *fadump_cma;
@@ -110,6 +119,11 @@ static int __init fadump_cma_init(void) { return 1; }
 int __init early_init_dt_scan_fw_dump(unsigned long node, const char *uname,
  int depth, void *data)
 {
+   if (depth == 0) {
+   early_init_dt_scan_reserved_ranges(node);
+   return 0;
+   }
+
if (depth != 1)
return 0;
 
@@ -728,10 +742,14 @@ void fadump_free_cpu_notes_buf(void)
 
 static void fadump_free_mem_ranges(struct fadump_mrange_info *mrange_info)
 {
+   if (mrange_info->is_static) {
+   mrange_info->mem_range_cnt = 0;
+   return;
+   }
+
kfree(mrange_info->mem_ranges);
-   mrange_info->mem_ranges = NULL;
-   mrange_info->mem_ranges_sz = 0;
-   mrange_info->max_mem_ranges = 0;
+   memset((void *)((u64)mrange_info + RNG_NAME_SZ), 0,
+  (sizeof(struct fadump_mrange_info) - RNG_NAME_SZ));
 }
 
 /*
@@ -788,6 +806,12 @@ static inline int fadump_add_mem_range(struct 
fadump_mrange_info *mrange_info,
if (mrange_info->mem_range_cnt == mrange_info->max_mem_ranges) {
int ret;
 
+   if (mrange_info->is_static) {
+   pr_err("Reached array size limit for %s memory 
ranges\n",
+  mrange_info->name);
+   return -ENOSPC;
+   }
+
ret = fadump_alloc_mem_ranges(mrange_info);
if (ret)
return ret;
@@ -1204,20 +1228,19 @@ static void sort_and_merge_mem_ranges(struct 
fadump_mrange_info *mrange_info)
  * Scan reserved-ranges to consider them while reserving/releasing
  * memory for FADump.
  */
-static inline int fadump_scan_reserved_mem_ranges(void)
+static void __init early_init_dt_scan_reserved_ranges(unsigned long node)
 {
-   struct device_node *root;
const __be32 *prop;
int len, ret = -1;
unsigned long i;
 
-   root = of_find_node_by_path("/");
-   if (!root)
-   retu

[PATCH v2 2/2] powerpc/fadump: consider reserved ranges while reserving memory

2020-04-20 Thread Hari Bathini
Commit 0962e8004e97 ("powerpc/prom: Scan reserved-ranges node for
memory reservations") enabled support to parse reserved-ranges DT
node and reserve kernel memory falling in these ranges for F/W
purposes. Memory reserved for FADump should not overlap with these
ranges as it could corrupt memory meant for F/W or crash'ed kernel
memory to be exported as vmcore.

But since commit 579ca1a27675 ("powerpc/fadump: make use of memblock's
bottom up allocation mode"), memblock_find_in_range() is being used to
find the appropriate area to reserve memory for FADump, which can't
account for reserved-ranges as these ranges are reserved only after
FADump memory reservation.

With reserved-ranges now being populated during early boot, look out
for these memory ranges while reserving memory for FADump. Without
this change, MPIPL on PowerNV systems aborts with hostboot failure,
when memory reserved for FADump is less than 4096MB.

Fixes: 579ca1a27675 ("powerpc/fadump: make use of memblock's bottom up 
allocation mode")
Cc: sta...@vger.kernel.org
Signed-off-by: Hari Bathini 
---

Changes in v2:
* Add an out parameter 'found' for fadump_locate_reserve_mem() and set it to 
"true"
  when a suitable memory area is located.


 arch/powerpc/kernel/fadump.c |   81 +-
 1 file changed, 71 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 679277b..0ffe69c 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -445,10 +445,73 @@ static int __init fadump_get_boot_mem_regions(void)
return ret;
 }
 
+/*
+ * Returns true, if the given range overlaps with reserved memory ranges
+ * starting at idx. Also, updates idx to index of overlapping memory range
+ * with the given memory range.
+ * False, otherwise.
+ */
+static bool overlaps_reserved_ranges(u64 base, u64 end, int *idx)
+{
+   bool ret = false;
+   int i;
+
+   for (i = *idx; i < reserved_mrange_info.mem_range_cnt; i++) {
+   u64 rbase = reserved_mrange_info.mem_ranges[i].base;
+   u64 rend = rbase + reserved_mrange_info.mem_ranges[i].size;
+
+   if (end <= rbase)
+   break;
+
+   if ((end > rbase) &&  (base < rend)) {
+   *idx = i;
+   ret = true;
+   break;
+   }
+   }
+
+   return ret;
+}
+
+/*
+ * Locate a suitable memory area to reserve memory for FADump. While at it,
+ * lookup reserved-ranges & avoid overlap with them, as they are used by F/W.
+ */
+static u64 __init fadump_locate_reserve_mem(u64 base, u64 size, bool *found)
+{
+   struct fadump_memory_range *mrngs;
+   phys_addr_t mstart, mend;
+   int idx = 0;
+   u64 i;
+
+   *found = false;
+   mrngs = reserved_mrange_info.mem_ranges;
+   for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE,
+   &mstart, &mend, NULL) {
+   pr_debug("%llu) mstart: %llx, mend: %llx, base: %llx\n",
+i, mstart, mend, base);
+
+   if (mstart > base)
+   base = PAGE_ALIGN(mstart);
+
+   while ((mend > base) && ((mend - base) >= size)) {
+   if (!overlaps_reserved_ranges(base, base+size, &idx)) {
+   *found = true;
+   goto out;
+   }
+
+   base = mrngs[idx].base + mrngs[idx].size;
+   base = PAGE_ALIGN(base);
+   }
+   }
+
+out:
+   return base;
+}
+
 int __init fadump_reserve_mem(void)
 {
-   u64 base, size, mem_boundary, bootmem_min, align = PAGE_SIZE;
-   bool is_memblock_bottom_up = memblock_bottom_up();
+   u64 base, size, mem_boundary, bootmem_min;
int ret = 1;
 
if (!fw_dump.fadump_enabled)
@@ -469,9 +532,9 @@ int __init fadump_reserve_mem(void)
PAGE_ALIGN(fadump_calculate_reserve_size());
 #ifdef CONFIG_CMA
if (!fw_dump.nocma) {
-   align = FADUMP_CMA_ALIGNMENT;
fw_dump.boot_memory_size =
-   ALIGN(fw_dump.boot_memory_size, align);
+   ALIGN(fw_dump.boot_memory_size,
+ FADUMP_CMA_ALIGNMENT);
}
 #endif
 
@@ -535,17 +598,15 @@ int __init fadump_reserve_mem(void)
pr_debug("Reserve dump area start address: 0x%lx\n",
 fw_dump.reserve_dump_area_start);
} else {
+   bool found = false;
+
/*
 * Reserve memory at an offset closer to bottom of the RAM to
 * minimize the impact of memory hot-remove oper

[PATCH v3 1/2] powerpc/fadump: use static allocation for reserved memory ranges

2020-04-20 Thread Hari Bathini
At times, memory ranges have to be looked up during early boot, when
kernel couldn't be initialized for dynamic memory allocation. In fact,
reserved-ranges look up is needed during FADump memory reservation.
Without accounting for reserved-ranges in reserving memory for FADump,
MPIPL boot fails with memory corruption issues. So, extend memory
ranges handling to support static allocation and populate reserved
memory ranges during early boot.

Fixes: dda9dbfeeb7a ("powerpc/fadump: consider reserved ranges while releasing 
memory")
Cc: sta...@vger.kernel.org
Signed-off-by: Hari Bathini 
Reviewed-by: Mahesh Salgaonkar 
---

Changes in v3:
* No code change. Added Mahesh's 'Reviewed-by' tag.


 arch/powerpc/include/asm/fadump-internal.h |4 +
 arch/powerpc/kernel/fadump.c   |   77 
 2 files changed, 48 insertions(+), 33 deletions(-)

diff --git a/arch/powerpc/include/asm/fadump-internal.h 
b/arch/powerpc/include/asm/fadump-internal.h
index c814a2b..8d61c8f 100644
--- a/arch/powerpc/include/asm/fadump-internal.h
+++ b/arch/powerpc/include/asm/fadump-internal.h
@@ -64,12 +64,14 @@ struct fadump_memory_range {
 };
 
 /* fadump memory ranges info */
+#define RNG_NAME_SZ16
 struct fadump_mrange_info {
-   charname[16];
+   charname[RNG_NAME_SZ];
struct fadump_memory_range  *mem_ranges;
u32 mem_ranges_sz;
u32 mem_range_cnt;
u32 max_mem_ranges;
+   boolis_static;
 };
 
 /* Platform specific callback functions */
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 59e60a9..679277b 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -40,8 +40,17 @@ struct kobject *fadump_kobj;
 
 #ifndef CONFIG_PRESERVE_FA_DUMP
 static DEFINE_MUTEX(fadump_mutex);
-struct fadump_mrange_info crash_mrange_info = { "crash", NULL, 0, 0, 0 };
-struct fadump_mrange_info reserved_mrange_info = { "reserved", NULL, 0, 0, 0 };
+struct fadump_mrange_info crash_mrange_info = { "crash", NULL, 0, 0, 0, false 
};
+
+#define RESERVED_RNGS_SZ   16384 /* 16K - 128 entries */
+#define RESERVED_RNGS_CNT  (RESERVED_RNGS_SZ / \
+sizeof(struct fadump_memory_range))
+static struct fadump_memory_range rngs[RESERVED_RNGS_CNT];
+struct fadump_mrange_info reserved_mrange_info = { "reserved", rngs,
+  RESERVED_RNGS_SZ, 0,
+  RESERVED_RNGS_CNT, true };
+
+static void __init early_init_dt_scan_reserved_ranges(unsigned long node);
 
 #ifdef CONFIG_CMA
 static struct cma *fadump_cma;
@@ -110,6 +119,11 @@ static int __init fadump_cma_init(void) { return 1; }
 int __init early_init_dt_scan_fw_dump(unsigned long node, const char *uname,
  int depth, void *data)
 {
+   if (depth == 0) {
+   early_init_dt_scan_reserved_ranges(node);
+   return 0;
+   }
+
if (depth != 1)
return 0;
 
@@ -728,10 +742,14 @@ void fadump_free_cpu_notes_buf(void)
 
 static void fadump_free_mem_ranges(struct fadump_mrange_info *mrange_info)
 {
+   if (mrange_info->is_static) {
+   mrange_info->mem_range_cnt = 0;
+   return;
+   }
+
kfree(mrange_info->mem_ranges);
-   mrange_info->mem_ranges = NULL;
-   mrange_info->mem_ranges_sz = 0;
-   mrange_info->max_mem_ranges = 0;
+   memset((void *)((u64)mrange_info + RNG_NAME_SZ), 0,
+  (sizeof(struct fadump_mrange_info) - RNG_NAME_SZ));
 }
 
 /*
@@ -788,6 +806,12 @@ static inline int fadump_add_mem_range(struct 
fadump_mrange_info *mrange_info,
if (mrange_info->mem_range_cnt == mrange_info->max_mem_ranges) {
int ret;
 
+   if (mrange_info->is_static) {
+   pr_err("Reached array size limit for %s memory 
ranges\n",
+  mrange_info->name);
+   return -ENOSPC;
+   }
+
ret = fadump_alloc_mem_ranges(mrange_info);
if (ret)
return ret;
@@ -1204,20 +1228,19 @@ static void sort_and_merge_mem_ranges(struct 
fadump_mrange_info *mrange_info)
  * Scan reserved-ranges to consider them while reserving/releasing
  * memory for FADump.
  */
-static inline int fadump_scan_reserved_mem_ranges(void)
+static void __init early_init_dt_scan_reserved_ranges(unsigned long node)
 {
-   struct device_node *root;
const __be32 *prop;
int len, ret = -1;
unsigned long i;
 
-   root 

[PATCH v3 2/2] powerpc/fadump: consider reserved ranges while reserving memory

2020-04-20 Thread Hari Bathini
Commit 0962e8004e97 ("powerpc/prom: Scan reserved-ranges node for
memory reservations") enabled support to parse reserved-ranges DT
node and reserve kernel memory falling in these ranges for F/W
purposes. Memory reserved for FADump should not overlap with these
ranges as it could corrupt memory meant for F/W or crash'ed kernel
memory to be exported as vmcore.

But since commit 579ca1a27675 ("powerpc/fadump: make use of memblock's
bottom up allocation mode"), memblock_find_in_range() is being used to
find the appropriate area to reserve memory for FADump, which can't
account for reserved-ranges as these ranges are reserved only after
FADump memory reservation.

With reserved-ranges now being populated during early boot, look out
for these memory ranges while reserving memory for FADump. Without
this change, MPIPL on PowerNV systems aborts with hostboot failure,
when memory reserved for FADump is less than 4096MB.

Fixes: 579ca1a27675 ("powerpc/fadump: make use of memblock's bottom up 
allocation mode")
Cc: sta...@vger.kernel.org
Signed-off-by: Hari Bathini 
Reviewed-by: Mahesh Salgaonkar 
---

Changes in v3:
* Updated fadump_locate_reserve_mem() to return '0' when no suitable memory
  area is found, instead of using an out parameter, as suggested by Mahesh,
  and added his 'Reviewed-by' tag with that change.


 arch/powerpc/kernel/fadump.c |   76 +-
 1 file changed, 67 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 679277b..63aac8b 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -445,10 +445,72 @@ static int __init fadump_get_boot_mem_regions(void)
return ret;
 }
 
+/*
+ * Returns true, if the given range overlaps with reserved memory ranges
+ * starting at idx. Also, updates idx to index of overlapping memory range
+ * with the given memory range.
+ * False, otherwise.
+ */
+static bool overlaps_reserved_ranges(u64 base, u64 end, int *idx)
+{
+   bool ret = false;
+   int i;
+
+   for (i = *idx; i < reserved_mrange_info.mem_range_cnt; i++) {
+   u64 rbase = reserved_mrange_info.mem_ranges[i].base;
+   u64 rend = rbase + reserved_mrange_info.mem_ranges[i].size;
+
+   if (end <= rbase)
+   break;
+
+   if ((end > rbase) &&  (base < rend)) {
+   *idx = i;
+   ret = true;
+   break;
+   }
+   }
+
+   return ret;
+}
+
+/*
+ * Locate a suitable memory area to reserve memory for FADump. While at it,
+ * lookup reserved-ranges & avoid overlap with them, as they are used by F/W.
+ */
+static u64 __init fadump_locate_reserve_mem(u64 base, u64 size)
+{
+   struct fadump_memory_range *mrngs;
+   phys_addr_t mstart, mend;
+   int idx = 0;
+   u64 i, ret = 0;
+
+   mrngs = reserved_mrange_info.mem_ranges;
+   for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE,
+   &mstart, &mend, NULL) {
+   pr_debug("%llu) mstart: %llx, mend: %llx, base: %llx\n",
+i, mstart, mend, base);
+
+   if (mstart > base)
+   base = PAGE_ALIGN(mstart);
+
+   while ((mend > base) && ((mend - base) >= size)) {
+   if (!overlaps_reserved_ranges(base, base+size, &idx)) {
+   ret = base;
+   goto out;
+   }
+
+   base = mrngs[idx].base + mrngs[idx].size;
+   base = PAGE_ALIGN(base);
+   }
+   }
+
+out:
+   return ret;
+}
+
 int __init fadump_reserve_mem(void)
 {
-   u64 base, size, mem_boundary, bootmem_min, align = PAGE_SIZE;
-   bool is_memblock_bottom_up = memblock_bottom_up();
+   u64 base, size, mem_boundary, bootmem_min;
int ret = 1;
 
if (!fw_dump.fadump_enabled)
@@ -469,9 +531,9 @@ int __init fadump_reserve_mem(void)
PAGE_ALIGN(fadump_calculate_reserve_size());
 #ifdef CONFIG_CMA
if (!fw_dump.nocma) {
-   align = FADUMP_CMA_ALIGNMENT;
fw_dump.boot_memory_size =
-   ALIGN(fw_dump.boot_memory_size, align);
+   ALIGN(fw_dump.boot_memory_size,
+ FADUMP_CMA_ALIGNMENT);
}
 #endif
 
@@ -539,11 +601,7 @@ int __init fadump_reserve_mem(void)
 * Reserve memory at an offset closer to bottom of the RAM to
 * minimize the impact of memory hot-remove operation.
 */
-   memblock_set_bottom_up(true);
-   base = memblock_find_in_range(base, mem_boundary, size, align);
-
-

Re: [PATCH v2 2/2] powerpc/fadump: merge adjacent memory ranges to reduce PT_LOAD segements

2018-08-08 Thread Hari Bathini




On Wednesday 08 August 2018 02:38 PM, Mahesh Jagannath Salgaonkar wrote:

On 08/07/2018 02:12 AM, Hari Bathini wrote:

With dynamic memory allocation support for crash memory ranges array,
there is no hard limit on the no. of crash memory ranges kernel could
export, but program headers count could overflow in the /proc/vmcore
ELF file while exporting each memory range as PT_LOAD segment. Reduce
the likelihood of such a scenario by folding adjacent crash memory
ranges, which minimizes the total number of PT_LOAD segments.

Signed-off-by: Hari Bathini 
---
  arch/powerpc/kernel/fadump.c |   45 ++
  1 file changed, 36 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 2ec5704..cd0c555 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -908,22 +908,41 @@ static int allocate_crash_memory_ranges(void)
  static inline int fadump_add_crash_memory(unsigned long long base,
  unsigned long long end)
  {
+   u64  start, size;
+   bool is_adjacent = false;
+
if (base == end)
return 0;
  
-	if (crash_mem_ranges == max_crash_mem_ranges) {

-   int ret;
+   /*
+* Fold adjacent memory ranges to bring down the memory ranges/
+* PT_LOAD segments count.
+*/
+   if (crash_mem_ranges) {
+   start = crash_memory_ranges[crash_mem_ranges-1].base;
+   size = crash_memory_ranges[crash_mem_ranges-1].size;
  
-		ret = allocate_crash_memory_ranges();

-   if (ret)
-   return ret;
+   if ((start + size) == base)
+   is_adjacent = true;
+   }
+   if (!is_adjacent) {
+   /* resize the array on reaching the limit */
+   if (crash_mem_ranges == max_crash_mem_ranges) {
+   int ret;
+
+   ret = allocate_crash_memory_ranges();
+   if (ret)
+   return ret;
+   }
+
+   start = base;
+   crash_memory_ranges[crash_mem_ranges].base = start;
+   crash_mem_ranges++;
}
  
+	crash_memory_ranges[crash_mem_ranges-1].size = (end - start);

pr_debug("crash_memory_range[%d] [%#016llx-%#016llx], %#llx bytes\n",
-   crash_mem_ranges, base, end - 1, (end - base));
-   crash_memory_ranges[crash_mem_ranges].base = base;
-   crash_memory_ranges[crash_mem_ranges].size = end - base;
-   crash_mem_ranges++;
+   (crash_mem_ranges - 1), start, end - 1, (end - start));
return 0;
  }
  
@@ -999,6 +1018,14 @@ static int fadump_setup_crash_memory_ranges(void)
  
  	pr_debug("Setup crash memory ranges.\n");

crash_mem_ranges = 0;
+
+   /* allocate memory for crash memory ranges for the first time */
+   if (!max_crash_mem_ranges) {
+   ret = allocate_crash_memory_ranges();
+   if (ret)
+   return ret;
+   }
+

I see that the check for (!is_adjacent) in first hunk already handles
the first time allocation. Do we need this ?


Right. This hunk in fadump_setup_crash_memory_ranges() is unnecessary. 
Can be dropped.
Also, I missed out on adding "#include <linux/slab.h>". Though it 
compiles fine with

upstream kernel, will add and post v3 just to be safe..


Rest looks fine to me.

Reviewed-by: Mahesh Salgaonkar 


Thanks for the review

- Hari



[PATCH v3 1/2] powerpc/fadump: handle crash memory ranges array index overflow

2018-08-08 Thread Hari Bathini
Crash memory ranges is an array of memory ranges of the crashing kernel
to be exported as a dump via /proc/vmcore file. The size of the array
is set based on INIT_MEMBLOCK_REGIONS, which works alright in most cases
where memblock memory regions count is less than INIT_MEMBLOCK_REGIONS
value. But this count can grow beyond INIT_MEMBLOCK_REGIONS value since
commit 142b45a72e22 ("memblock: Add array resizing support").

On large memory systems with a few DLPAR operations, the memblock memory
regions count could be larger than INIT_MEMBLOCK_REGIONS value. On such
systems, registering fadump results in crash or other system failures
like below:

  task: c7f39a290010 ti: cb738000 task.ti: cb738000
  NIP: c0047df4 LR: c00f9e58 CTR: c010f180
  REGS: cb73b570 TRAP: 0300   Tainted: G  L   X  (4.4.140+)
  MSR: 80009033   CR: 22004484  XER: 2000
  CFAR: c0008500 DAR: 07a45000 DSISR: 4000 SOFTE: 0
  GPR00: c00f9e58 cb73b7f0 c0f09a00 001a
  GPR04: c7f3bf774c90 0004 c0eb9a00 0800
  GPR08: 0804 07a45000 c0fa9a00 c7ffb169ca20
  GPR12: 22004482 cfa12c00 c7f3a0ea97a8 
  GPR16: c7f3a0ea9a50 cb73bd60 0118 0001fe80
  GPR20: 0118  c0b8c980 00d0
  GPR24: 07ffb0b1 c7ffb169c980  c0b8c980
  GPR28: 0004 c7ffb169c980 001a c7ffb169c980
  NIP [c0047df4] smp_send_reschedule+0x24/0x80
  LR [c00f9e58] resched_curr+0x138/0x160
  Call Trace:
  [cb73b7f0] [c00f9e58] resched_curr+0x138/0x160 (unreliable)
  [cb73b820] [c00fb538] check_preempt_curr+0xc8/0xf0
  [cb73b850] [c00fb598] ttwu_do_wakeup+0x38/0x150
  [cb73b890] [c00fc9c4] try_to_wake_up+0x224/0x4d0
  [cb73b900] [c011ef34] __wake_up_common+0x94/0x100
  [cb73b960] [c034a78c] ep_poll_callback+0xac/0x1c0
  [cb73b9b0] [c011ef34] __wake_up_common+0x94/0x100
  [cb73ba10] [c011f810] __wake_up_sync_key+0x70/0xa0
  [cb73ba60] [c067c3e8] sock_def_readable+0x58/0xa0
  [cb73ba90] [c07848ac] unix_stream_sendmsg+0x2dc/0x4c0
  [cb73bb70] [c0675a38] sock_sendmsg+0x68/0xa0
  [cb73bba0] [c067673c] ___sys_sendmsg+0x2cc/0x2e0
  [cb73bd30] [c0677dbc] __sys_sendmsg+0x5c/0xc0
  [cb73bdd0] [c06789bc] SyS_socketcall+0x36c/0x3f0
  [cb73be30] [c0009488] system_call+0x3c/0x100
  Instruction dump:
  4e800020 6000 6042 3c4c00ec 38421c30 7c0802a6 f8010010 6000
  3d42000a e92ab420 2fa9 4dde0020  2fa9 419e0044 7c0802a6
  ---[ end trace a6d1dd4bab5f8253 ]---

as array index overflow is not checked for while setting up crash memory
ranges causing memory corruption. To resolve this issue, dynamically
allocate memory for crash memory ranges and resize it incrementally,
in units of pagesize, on hitting array size limit.

Fixes: 2df173d9e85d ("fadump: Initialize elfcore header and add PT_LOAD program 
headers.")
Cc: sta...@vger.kernel.org
Cc: Mahesh Salgaonkar 
Signed-off-by: Hari Bathini 
Reviewed-by: Mahesh Salgaonkar 
---

Changes in v3:
* Included <linux/slab.h> for krealloc()


 arch/powerpc/include/asm/fadump.h |4 +-
 arch/powerpc/kernel/fadump.c  |   92 +++--
 2 files changed, 80 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/include/asm/fadump.h 
b/arch/powerpc/include/asm/fadump.h
index 5a23010..3abc738 100644
--- a/arch/powerpc/include/asm/fadump.h
+++ b/arch/powerpc/include/asm/fadump.h
@@ -195,8 +195,8 @@ struct fadump_crash_info_header {
struct cpumask  online_mask;
 };
 
-/* Crash memory ranges */
-#define INIT_CRASHMEM_RANGES   (INIT_MEMBLOCK_REGIONS + 2)
+/* Crash memory ranges size unit (pagesize) */
+#define CRASHMEM_RANGES_ALLOC_SIZE PAGE_SIZE
 
 struct fad_crash_memory_ranges {
unsigned long long  base;
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 07e8396..9f80a78 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -34,6 +34,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -47,8 +48,10 @@ static struct fadump_mem_struct fdm;
 static const struct fadump_mem_struct *fdm_active;
 
 static DEFINE_MUTEX(fadump_mutex);
-struct fad_crash_memory_ranges crash_memory_ranges[INIT_CRASHMEM_RANGES];
+struct fad_crash_memory_ranges *crash_memory_ranges;
+int crash_memory_ranges_size;
 int crash_mem_ranges;
+int max_crash_mem_ranges;
 
 /* Scan the Firmware Assisted dump configuration details. */
 int __init early_init_dt_scan_fw_dump(unsigned long node,
@@ -868,22 +871,67 @@ static int __init process

[PATCH v3 2/2] powerpc/fadump: merge adjacent memory ranges to reduce PT_LOAD segments

2018-08-08 Thread Hari Bathini
With dynamic memory allocation support for crash memory ranges array,
there is no hard limit on the no. of crash memory ranges kernel could
export, but program headers count could overflow in the /proc/vmcore
ELF file while exporting each memory range as PT_LOAD segment. Reduce
the likelihood of such a scenario by folding adjacent crash memory
ranges, which minimizes the total number of PT_LOAD segments.

Signed-off-by: Hari Bathini 
Reviewed-by: Mahesh Salgaonkar 
---

Changes in v3:
* Dropped unnecessary memory allocation hunk in 
fadump_setup_crash_memory_ranges()


 arch/powerpc/kernel/fadump.c |   37 -
 1 file changed, 28 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 9f80a78..5436600c 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -909,22 +909,41 @@ static int allocate_crash_memory_ranges(void)
 static inline int fadump_add_crash_memory(unsigned long long base,
  unsigned long long end)
 {
+   u64  start, size;
+   bool is_adjacent = false;
+
if (base == end)
return 0;
 
-   if (crash_mem_ranges == max_crash_mem_ranges) {
-   int ret;
+   /*
+* Fold adjacent memory ranges to bring down the memory ranges/
+* PT_LOAD segments count.
+*/
+   if (crash_mem_ranges) {
+   start = crash_memory_ranges[crash_mem_ranges-1].base;
+   size = crash_memory_ranges[crash_mem_ranges-1].size;
 
-   ret = allocate_crash_memory_ranges();
-   if (ret)
-   return ret;
+   if ((start + size) == base)
+   is_adjacent = true;
+   }
+   if (!is_adjacent) {
+   /* resize the array on reaching the limit */
+   if (crash_mem_ranges == max_crash_mem_ranges) {
+   int ret;
+
+   ret = allocate_crash_memory_ranges();
+   if (ret)
+   return ret;
+   }
+
+   start = base;
+   crash_memory_ranges[crash_mem_ranges].base = start;
+   crash_mem_ranges++;
}
 
+   crash_memory_ranges[crash_mem_ranges-1].size = (end - start);
pr_debug("crash_memory_range[%d] [%#016llx-%#016llx], %#llx bytes\n",
-   crash_mem_ranges, base, end - 1, (end - base));
-   crash_memory_ranges[crash_mem_ranges].base = base;
-   crash_memory_ranges[crash_mem_ranges].size = end - base;
-   crash_mem_ranges++;
+   (crash_mem_ranges - 1), start, end - 1, (end - start));
return 0;
 }
 



[PATCH] powerpc/fadump: cleanup crash memory ranges support

2018-08-17 Thread Hari Bathini
Commit 1bd6a1c4b80a ("powerpc/fadump: handle crash memory ranges array
index overflow") changed crash memory ranges to a dynamic array that
is reallocated on-demand with krealloc(). The relevant header for this
call was not included. The kernel compiles though. But be cautious and
add the header anyway.

Also, memory allocation logic in fadump_add_crash_memory() takes care
of memory allocation for crash memory ranges in all scenarios. Drop
unnecessary memory allocation in fadump_setup_crash_memory_ranges().

Fixes: 1bd6a1c4b80a ("powerpc/fadump: handle crash memory ranges array index 
overflow")
Cc: Mahesh Salgaonkar 
Signed-off-by: Hari Bathini 
---

* Actually posted a V3 with these changes but V2 made it!
- https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=59839


 arch/powerpc/kernel/fadump.c |8 +---
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 986ec47..a711d22 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -34,6 +34,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -1019,13 +1020,6 @@ static int fadump_setup_crash_memory_ranges(void)
pr_debug("Setup crash memory ranges.\n");
crash_mem_ranges = 0;
 
-   /* allocate memory for crash memory ranges for the first time */
-   if (!max_crash_mem_ranges) {
-   ret = allocate_crash_memory_ranges();
-   if (ret)
-   return ret;
-   }
-
/*
 * add the first memory chunk (RMA_START through boot_memory_size) as
 * a separate memory chunk. The reason is, at the time crash firmware



[PATCH] powerpc/fadump: re-register firmware-assisted dump if already registered

2018-09-14 Thread Hari Bathini
Firmware-Assisted Dump (FADump) needs to be registered again after any
memory hot add/remove operation to update the crash memory ranges. But
currently, the kernel returns '-EEXIST' if we try to register without
unregistering it first. This could expose the system to racing issues
while unregistering and registering FADump from userspace during udev
events. Spare the userspace of this and let it be taken care of in the
kernel space for a simpler interface.

Since this change, running 'echo 1 > /sys/kernel/fadump_registered'
would result in re-registering (unregistering and registering) FADump,
if it was already registered.

Signed-off-by: Hari Bathini 
---
 arch/powerpc/kernel/fadump.c |4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index a711d22..761b28b 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -1444,8 +1444,8 @@ static ssize_t fadump_register_store(struct kobject *kobj,
break;
case 1:
if (fw_dump.dump_registered == 1) {
-   ret = -EEXIST;
-   goto unlock_out;
+   /* Un-register Firmware-assisted dump */
+   fadump_unregister_dump(&fdm);
}
/* Register Firmware-assisted dump */
ret = register_fadump();



Re: [PATCH] powerpc/fadump: re-register firmware-assisted dump if already registered

2018-09-14 Thread Hari Bathini




On Friday 14 September 2018 07:58 PM, Petr Tesarik wrote:

On Fri, 14 Sep 2018 19:36:02 +0530
Hari Bathini  wrote:


Firmware-Assisted Dump (FADump) needs to be registered again after any
memory hot add/remove operation to update the crash memory ranges. But
currently, the kernel returns '-EEXIST' if we try to register without
unregistering it first. This could expose the system to racing issues
while unregistering and registering FADump from userspace during udev
events. Spare the userspace of this and let it be taken care of in the
kernel space for a simpler interface.

Since this change, running 'echo 1 > /sys/kernel/fadump_registered'
would result in re-registering (unregistering and registering) FADump,
if it was already registered.

Great improvement to the API!

Any suggestions what should be done in a client which tries to be
compatible with kernels before this change and after this change?


If `echo 1 > /sys/kernel/fadump_registered` fails, check the output of
`cat /sys/kernel/fadump_registered`. If it is still `1`, that indicates an
old kernel on which FADump is already registered. Treat it as success if
being registered is all you care about, or unregister and then register
again if re-registration is the intention.

Hope that helps..

Thanks
Hari



Re: [PATCH] powerpc/numa: Skip onlining a offline node in kdump path

2018-10-01 Thread Hari Bathini
el) boots properly.

Unlike regular kernels, which mark all available nodes as online, kdump
kernel only marks just enough nodes as online and marks the rest as
offline at boot.  However kdump kernel boots with all available CPUs.
With Commit 2ea626306810 ("powerpc/topology: Get topology for shared
processors at boot"), all CPUs are onlined on their respective nodes at
boot time. try_online_node() tries to online the offline nodes but fails
as all needed subsystems are not yet initialized.

As part of the fix, detect and skip early onlining of an offline node.

Fixes: 2ea626306810 ("powerpc/topology: Get topology for shared processors at 
boot")
Reported-by: Pavithra Prakash 
Signed-off-by: Srikar Dronamraju 


Tested-by: Hari Bathini 


---
  arch/powerpc/mm/numa.c | 5 +++--
  1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index e94148a1d7e4..d88139acdfe6 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1217,9 +1217,10 @@ int find_and_online_cpu_nid(int cpu)
 * Need to ensure that NODE_DATA is initialized for a node from
 * available memory (see memblock_alloc_try_nid). If unable to
 * init the node, then default to nearest node that has memory
-* installed.
+* installed. Skip onlining a node if the subsystems are not
+* yet initialized.
 */
-   if (try_online_node(new_nid))
+   if (!topology_inited || try_online_node(new_nid))
new_nid = first_online_node;
  #else
/*




[PATCH v3 00/16] Add FADump support on PowerNV platform

2019-06-25 Thread Hari Bathini
Firmware-Assisted Dump (FADump) is currently supported only on pseries
platform. This patch series adds support for powernv platform too.

The first and third patches refactor the FADump code to make use of common
code across multiple platforms. The fifth patch adds basic FADump support
for powernv platform. Patches seven & eight honour reserved-ranges DT node
while reserving/releasing memory used by FADump. The next patch processes
CPU state data provided by firmware to create and append core notes to the
ELF core file. The tenth patch adds support for preserving crash data for
subsequent boots (useful in cases like petitboot). Patch twelve provides
support to export opalcore. This is to make debugging of failures in OPAL
code easier. The subsequent patch ensures vmcore processing is skipped
when only OPAL core is exported by f/w. The next patch provides option to
release the kernel memory used to export opalcore. The remaining patches
update Firmware-Assisted Dump documentation appropriately.

The patch series is tested with the latest firmware plus the below skiboot
changes for MPIPL support:

https://patchwork.ozlabs.org/project/skiboot/list/?series=114104
("MPIPL support")


Changes in v3:
  * Rebased to latest upstream kernel version.
  * Updated according to latest OPAL changes.
  * Using metadata tags instead of structs between kernel & OPAL.
  * Exporting OPAL core as /sys/firmware/opal/core (not /proc/opalcore)

---

Hari Bathini (16):
  powerpc/fadump: move internal fadump code to a new file
  powerpc/fadump: Improve fadump documentation
  pseries/fadump: move out platform specific support from generic code
  powerpc/fadump: use FADump instead of fadump for how it is pronounced
  powerpc/fadump: enable fadump support on OPAL based POWER platform
  powerpc/fadump: Update documentation about OPAL platform support
  powerpc/fadump: consider reserved ranges while reserving memory
  powerpc/fadump: consider reserved ranges while releasing memory
  powernv/fadump: process architected register state data provided by 
firmware
  powernv/fadump: add support to preserve crash data on FADUMP disabled 
kernel
  powerpc/fadump: update documentation about CONFIG_PRESERVE_FA_DUMP
  powerpc/powernv: export /sys/firmware/opal/core for analysing opal crashes
  powernv/fadump: Skip processing /proc/vmcore when only OPAL core exists
  powernv/opalcore: provide an option to invalidate /sys/firmware/opal/core 
file
  powernv/fadump: consider f/w load area
  powernv/fadump: update documentation about option to release opalcore


 Documentation/powerpc/firmware-assisted-dump.txt |  193 ++--
 arch/powerpc/Kconfig |   23 
 arch/powerpc/include/asm/fadump.h|  190 
 arch/powerpc/include/asm/opal-api.h  |   89 ++
 arch/powerpc/include/asm/opal.h  |4 
 arch/powerpc/kernel/Makefile |6 
 arch/powerpc/kernel/fadump-common.c  |  196 
 arch/powerpc/kernel/fadump-common.h  |  203 
 arch/powerpc/kernel/fadump.c | 1183 +-
 arch/powerpc/kernel/prom.c   |4 
 arch/powerpc/platforms/powernv/Makefile  |3 
 arch/powerpc/platforms/powernv/opal-call.c   |2 
 arch/powerpc/platforms/powernv/opal-core.c   |  634 
 arch/powerpc/platforms/powernv/opal-fadump.c |  661 
 arch/powerpc/platforms/powernv/opal-fadump.h |  117 ++
 arch/powerpc/platforms/pseries/Makefile  |1 
 arch/powerpc/platforms/pseries/rtas-fadump.c |  557 ++
 arch/powerpc/platforms/pseries/rtas-fadump.h |  106 ++
 18 files changed, 3187 insertions(+), 985 deletions(-)
 create mode 100644 arch/powerpc/kernel/fadump-common.c
 create mode 100644 arch/powerpc/kernel/fadump-common.h
 create mode 100644 arch/powerpc/platforms/powernv/opal-core.c
 create mode 100644 arch/powerpc/platforms/powernv/opal-fadump.c
 create mode 100644 arch/powerpc/platforms/powernv/opal-fadump.h
 create mode 100644 arch/powerpc/platforms/pseries/rtas-fadump.c
 create mode 100644 arch/powerpc/platforms/pseries/rtas-fadump.h



[PATCH v3 01/16] powerpc/fadump: move internal fadump code to a new file

2019-06-25 Thread Hari Bathini
Refactoring fadump code means internal fadump code is referenced from
different places. For ease, move internal code to a new file.

Signed-off-by: Hari Bathini 
---
 arch/powerpc/include/asm/fadump.h   |  112 
 arch/powerpc/kernel/Makefile|2 
 arch/powerpc/kernel/fadump-common.c |  184 +
 arch/powerpc/kernel/fadump-common.h |  126 +++
 arch/powerpc/kernel/fadump.c|  194 ++-
 5 files changed, 324 insertions(+), 294 deletions(-)
 create mode 100644 arch/powerpc/kernel/fadump-common.c
 create mode 100644 arch/powerpc/kernel/fadump-common.h

diff --git a/arch/powerpc/include/asm/fadump.h 
b/arch/powerpc/include/asm/fadump.h
index 17d9b6a..a2d2533 100644
--- a/arch/powerpc/include/asm/fadump.h
+++ b/arch/powerpc/include/asm/fadump.h
@@ -11,34 +11,6 @@
 
 #ifdef CONFIG_FA_DUMP
 
-/*
- * The RMA region will be saved for later dumping when kernel crashes.
- * RMA is Real Mode Area, the first block of logical memory address owned
- * by logical partition, containing the storage that may be accessed with
- * translate off.
- */
-#define RMA_START  0x0
-#define RMA_END(ppc64_rma_size)
-
-/*
- * On some Power systems where RMO is 128MB, it still requires minimum of
- * 256MB for kernel to boot successfully. When kdump infrastructure is
- * configured to save vmcore over network, we run into OOM issue while
- * loading modules related to network setup. Hence we need aditional 64M
- * of memory to avoid OOM issue.
- */
-#define MIN_BOOT_MEM   (((RMA_END < (0x1UL << 28)) ? (0x1UL << 28) : RMA_END) \
-   + (0x1UL << 26))
-
-/* The upper limit percentage for user specified boot memory size (25%) */
-#define MAX_BOOT_MEM_RATIO 4
-
-#define memblock_num_regions(memblock_type)(memblock.memblock_type.cnt)
-
-/* Alignement per CMA requirement. */
-#define FADUMP_CMA_ALIGNMENT   (PAGE_SIZE <<   \
-   max_t(unsigned long, MAX_ORDER - 1, pageblock_order))
-
 /* Firmware provided dump sections */
 #define FADUMP_CPU_STATE_DATA  0x0001
 #define FADUMP_HPTE_REGION 0x0002
@@ -47,18 +19,9 @@
 /* Dump request flag */
 #define FADUMP_REQUEST_FLAG0x0001
 
-/* FAD commands */
-#define FADUMP_REGISTER1
-#define FADUMP_UNREGISTER  2
-#define FADUMP_INVALIDATE  3
-
 /* Dump status flag */
 #define FADUMP_ERROR_FLAG  0x2000
 
-#define FADUMP_CPU_ID_MASK ((1UL << 32) - 1)
-
-#define CPU_UNKNOWN(~((u32)0))
-
 /* Utility macros */
 #define SKIP_TO_NEXT_CPU(reg_entry)\
 ({ \
@@ -112,59 +75,8 @@ struct fadump_mem_struct {
struct fadump_section   rmr_region;
 };
 
-/* Firmware-assisted dump configuration details. */
-struct fw_dump {
-   unsigned long   cpu_state_data_size;
-   unsigned long   hpte_region_size;
-   unsigned long   boot_memory_size;
-   unsigned long   reserve_dump_area_start;
-   unsigned long   reserve_dump_area_size;
-   /* cmd line option during boot */
-   unsigned long   reserve_bootvar;
-
-   unsigned long   fadumphdr_addr;
-   unsigned long   cpu_notes_buf;
-   unsigned long   cpu_notes_buf_size;
-
-   int ibm_configure_kernel_dump;
-
-   unsigned long   fadump_enabled:1;
-   unsigned long   fadump_supported:1;
-   unsigned long   dump_active:1;
-   unsigned long   dump_registered:1;
-   unsigned long   nocma:1;
-};
-
-/*
- * Copy the ascii values for first 8 characters from a string into u64
- * variable at their respective indexes.
- * e.g.
- *  The string "FADMPINF" will be converted into 0x4641444d50494e46
- */
-static inline u64 str_to_u64(const char *str)
-{
-   u64 val = 0;
-   int i;
-
-   for (i = 0; i < sizeof(val); i++)
-   val = (*str) ? (val << 8) | *str++ : val << 8;
-   return val;
-}
-#define STR_TO_HEX(x)  str_to_u64(x)
-#define REG_ID(x)  str_to_u64(x)
-
-#define FADUMP_CRASH_INFO_MAGICSTR_TO_HEX("FADMPINF")
 #define REGSAVE_AREA_MAGIC STR_TO_HEX("REGSAVE")
 
-/* The firmware-assisted dump format.
- *
- * The register save area is an area in the partition's memory used to preserve
- * the register contents (CPU state data) for the active CPUs during a firmware
- * assisted dump. The dump format contains register save area header followed
- * by register entries. Each list of registers for a CPU starts with
- * "CPUSTRT" and ends with "CPUEND".
- */
-
 /* Register save area header. */
 struct fadump_reg_save_area_header {
__be64  magic_number;
@@ -172,29 +84,9 @@ struct fadump_reg_save_area_header {
__be32  num_cpu_offset;
 };
 
-/* Register

[PATCH v3 02/16] powerpc/fadump: Improve fadump documentation

2019-06-25 Thread Hari Bathini
The figures depicting FADump's (Firmware-Assisted Dump) memory layout
are missing some finer details like different memory regions and what
they represent. Improve the documentation by updating those details.

Signed-off-by: Hari Bathini 
---
 Documentation/powerpc/firmware-assisted-dump.txt |   65 --
 1 file changed, 35 insertions(+), 30 deletions(-)

diff --git a/Documentation/powerpc/firmware-assisted-dump.txt 
b/Documentation/powerpc/firmware-assisted-dump.txt
index 18c5fee..059993b 100644
--- a/Documentation/powerpc/firmware-assisted-dump.txt
+++ b/Documentation/powerpc/firmware-assisted-dump.txt
@@ -74,8 +74,9 @@ as follows:
there is crash data available from a previous boot. During
the early boot OS will reserve rest of the memory above
boot memory size effectively booting with restricted memory
-   size. This will make sure that the second kernel will not
-   touch any of the dump memory area.
+   size. This will make sure that this kernel (also, referred
+   to as second kernel or capture kernel) will not touch any
+   of the dump memory area.
 
 -- User-space tools will read /proc/vmcore to obtain the contents
of memory, which holds the previous crashed kernel dump in ELF
@@ -125,48 +126,52 @@ space memory except the user pages that were present in 
CMA region.
 
   o Memory Reservation during first kernel
 
-  Low memory Top of memory
-  0  boot memory size   |
-  |   ||<--Reserved dump area -->|  |
-  V   V|   Permanent Reservation |  V
-  +---+--/ /---+---++---++--+
-  |   ||CPU|HPTE|  DUMP |ELF |  |
-  +---+--/ /---+---++---++--+
-|   ^
-|   |
-\   /
- ---
-  Boot memory content gets transferred to
-  reserved area by firmware at the time of
-  crash
+  Low memoryTop of memory
+  0  boot memory size  |<--Reserved dump area --->|  |
+  |   ||   Permanent Reservation  |  |
+  V   V|   (Preserve area)|  V
+  +---+--/ /---+---+++---++--+
+  |   ||CPU|HPTE|  DUMP  |HDR|ELF |  |
+  +---+--/ /---+---+++---++--+
+|   ^  ^
+|   |  |
+\   /  |
+ --- FADump Header
+  Boot memory content gets transferred   (meta area)
+  to reserved area by firmware at the
+  time of crash
+
Fig. 1
 
+
   o Memory Reservation during second kernel after crash
 
-  Low memoryTop of memory
-  0  boot memory size   |
-  |   |<- Reserved dump area --- -->|
-  V   V V
-  +---+--/ /---+---++---++--+
-  |   ||CPU|HPTE|  DUMP |ELF |  |
-  +---+--/ /---+---++---++--+
+  Low memoryTop of memory
+  0  boot memory size|
+  |   |<- Reserved dump area --->|
+  V   V|< Preserve area ->|  V
+  +---+--/ /---+---+++---++--+
+  |   ||CPU|HPTE|  DUMP  |HDR|ELF |  |
+  +---+--/ /---+---+++---++--+
 |  |
 V  V
Used by second/proc/vmcore
kernel to boot
Fig. 2
 
-Currently the dump will be copied from /proc/vmcore to a
-a new file upon user intervention. The dump data available through
-/proc/vmcore will be in ELF format. Hence the existing kdump
-infrastructure (kdump scripts) to save the dump works fine with
-minor modifications.
+Currently the dump will be copied from /proc/vmcore to a new file upon
+user intervention. The dump data available through /proc/vmcore will be
+in ELF format. Hence the existing kdump infrastructure (kdump scripts)
+to save the dump works fine with minor modifications. KDump scripts on
+major Distro releases have already been modified to work seemlessly (no
+user intervention in s

[PATCH v3 03/16] pseries/fadump: move out platform specific support from generic code

2019-06-25 Thread Hari Bathini
Introduce callbacks for platform specific operations like register,
unregister, invalidate & such, and move pseries specific code into
platform code.

Signed-off-by: Hari Bathini 
---
 arch/powerpc/include/asm/fadump.h|   75 
 arch/powerpc/kernel/fadump-common.h  |   38 ++
 arch/powerpc/kernel/fadump.c |  500 ++---
 arch/powerpc/platforms/pseries/Makefile  |1 
 arch/powerpc/platforms/pseries/rtas-fadump.c |  529 ++
 arch/powerpc/platforms/pseries/rtas-fadump.h |   96 +
 6 files changed, 700 insertions(+), 539 deletions(-)
 create mode 100644 arch/powerpc/platforms/pseries/rtas-fadump.c
 create mode 100644 arch/powerpc/platforms/pseries/rtas-fadump.h

diff --git a/arch/powerpc/include/asm/fadump.h 
b/arch/powerpc/include/asm/fadump.h
index a2d2533..9a7652c 100644
--- a/arch/powerpc/include/asm/fadump.h
+++ b/arch/powerpc/include/asm/fadump.h
@@ -11,79 +11,8 @@
 
 #ifdef CONFIG_FA_DUMP
 
-/* Firmware provided dump sections */
-#define FADUMP_CPU_STATE_DATA  0x0001
-#define FADUMP_HPTE_REGION 0x0002
-#define FADUMP_REAL_MODE_REGION0x0011
-
-/* Dump request flag */
-#define FADUMP_REQUEST_FLAG0x0001
-
-/* Dump status flag */
-#define FADUMP_ERROR_FLAG  0x2000
-
-/* Utility macros */
-#define SKIP_TO_NEXT_CPU(reg_entry)\
-({ \
-   while (be64_to_cpu(reg_entry->reg_id) != REG_ID("CPUEND"))  \
-   reg_entry++;\
-   reg_entry++;\
-})
-
 extern int crashing_cpu;
 
-/* Kernel Dump section info */
-struct fadump_section {
-   __be32  request_flag;
-   __be16  source_data_type;
-   __be16  error_flags;
-   __be64  source_address;
-   __be64  source_len;
-   __be64  bytes_dumped;
-   __be64  destination_address;
-};
-
-/* ibm,configure-kernel-dump header. */
-struct fadump_section_header {
-   __be32  dump_format_version;
-   __be16  dump_num_sections;
-   __be16  dump_status_flag;
-   __be32  offset_first_dump_section;
-
-   /* Fields for disk dump option. */
-   __be32  dd_block_size;
-   __be64  dd_block_offset;
-   __be64  dd_num_blocks;
-   __be32  dd_offset_disk_path;
-
-   /* Maximum time allowed to prevent an automatic dump-reboot. */
-   __be32  max_time_auto;
-};
-
-/*
- * Firmware Assisted dump memory structure. This structure is required for
- * registering future kernel dump with power firmware through rtas call.
- *
- * No disk dump option. Hence disk dump path string section is not included.
- */
-struct fadump_mem_struct {
-   struct fadump_section_headerheader;
-
-   /* Kernel dump sections */
-   struct fadump_section   cpu_state_data;
-   struct fadump_section   hpte_region;
-   struct fadump_section   rmr_region;
-};
-
-#define REGSAVE_AREA_MAGIC STR_TO_HEX("REGSAVE")
-
-/* Register save area header. */
-struct fadump_reg_save_area_header {
-   __be64  magic_number;
-   __be32  version;
-   __be32  num_cpu_offset;
-};
-
 extern int is_fadump_memory_area(u64 addr, ulong size);
 extern int early_init_dt_scan_fw_dump(unsigned long node, const char *uname,
  int depth, void *data);
@@ -99,5 +28,5 @@ static inline int is_fadump_active(void) { return 0; }
 static inline int should_fadump_crash(void) { return 0; }
 static inline void crash_fadump(struct pt_regs *regs, const char *str) { }
 static inline void fadump_cleanup(void) { }
-#endif
-#endif
+#endif /* !CONFIG_FA_DUMP */
+#endif /* __PPC64_FA_DUMP_H__ */
diff --git a/arch/powerpc/kernel/fadump-common.h 
b/arch/powerpc/kernel/fadump-common.h
index 8ccd96d..1eb1397 100644
--- a/arch/powerpc/kernel/fadump-common.h
+++ b/arch/powerpc/kernel/fadump-common.h
@@ -47,6 +47,12 @@
 #define FADUMP_UNREGISTER  2
 #define FADUMP_INVALIDATE  3
 
+/* Firmware-Assited Dump platforms */
+enum fadump_platform_type {
+   FADUMP_PLATFORM_UNKNOWN = 0,
+   FADUMP_PLATFORM_PSERIES,
+};
+
 #define FADUMP_CPU_ID_MASK ((1UL << 32) - 1)
 
 #define CPU_UNKNOWN(~((u32)0))
@@ -91,6 +97,9 @@ struct fad_crash_memory_ranges {
unsigned long long  size;
 };
 
+/* Platform specific callback functions */
+struct fadump_ops;
+
 /* Firmware-assisted dump configuration details. */
 struct fw_dump {
unsigned long   cpu_state_data_size;
@@ -98,6 +107,7 @@ struct fw_dump {
unsigned long   boot_memory_size;
unsigned long   reserve_dump_area_start;
unsigned long   reserve_dump_area_size;
+   unsigned long   preserv_area_start;
/* cmd line option during boot */
unsigned long   reserve_bootvar;
 
@

[PATCH v3 04/16] powerpc/fadump: use FADump instead of fadump for how it is pronounced

2019-06-25 Thread Hari Bathini
Signed-off-by: Hari Bathini 
---
 Documentation/powerpc/firmware-assisted-dump.txt |   56 +++---
 1 file changed, 28 insertions(+), 28 deletions(-)

diff --git a/Documentation/powerpc/firmware-assisted-dump.txt 
b/Documentation/powerpc/firmware-assisted-dump.txt
index 059993b..62e75ef 100644
--- a/Documentation/powerpc/firmware-assisted-dump.txt
+++ b/Documentation/powerpc/firmware-assisted-dump.txt
@@ -8,18 +8,18 @@ a crashed system, and to do so from a fully-reset system, and
 to minimize the total elapsed time until the system is back
 in production use.
 
-- Firmware assisted dump (fadump) infrastructure is intended to replace
+- Firmware-Assisted Dump (FADump) infrastructure is intended to replace
   the existing phyp assisted dump.
 - Fadump uses the same firmware interfaces and memory reservation model
   as phyp assisted dump.
-- Unlike phyp dump, fadump exports the memory dump through /proc/vmcore
+- Unlike phyp dump, FADump exports the memory dump through /proc/vmcore
   in the ELF format in the same way as kdump. This helps us reuse the
   kdump infrastructure for dump capture and filtering.
 - Unlike phyp dump, userspace tool does not need to refer any sysfs
   interface while reading /proc/vmcore.
-- Unlike phyp dump, fadump allows user to release all the memory reserved
+- Unlike phyp dump, FADump allows user to release all the memory reserved
   for dump, with a single operation of echo 1 > /sys/kernel/fadump_release_mem.
-- Once enabled through kernel boot parameter, fadump can be
+- Once enabled through kernel boot parameter, FADump can be
   started/stopped through /sys/kernel/fadump_registered interface (see
   sysfs files section below) and can be easily integrated with kdump
   service start/stop init scripts.
@@ -33,7 +33,7 @@ dump offers several strong, practical advantages:
in a clean, consistent state.
 -- Once the dump is copied out, the memory that held the dump
is immediately available to the running kernel. And therefore,
-   unlike kdump, fadump doesn't need a 2nd reboot to get back
+   unlike kdump, FADump doesn't need a 2nd reboot to get back
the system to the production configuration.
 
 The above can only be accomplished by coordination with,
@@ -61,7 +61,7 @@ as follows:
  boot successfully. For syntax of crashkernel= parameter,
  refer to Documentation/kdump/kdump.txt. If any offset is
  provided in crashkernel= parameter, it will be ignored
- as fadump uses a predefined offset to reserve memory
+ as FADump uses a predefined offset to reserve memory
  for boot memory dump preservation in case of a crash.
 
 -- After the low memory (boot memory) area has been saved, the
@@ -120,7 +120,7 @@ blocking this significant chunk of memory from production 
kernel.
 Hence, the implementation uses the Linux kernel's Contiguous Memory
 Allocator (CMA) for memory reservation if CMA is configured for kernel.
 With CMA reservation this memory will be available for applications to
-use it, while kernel is prevented from using it. With this fadump will
+use it, while kernel is prevented from using it. With this FADump will
 still be able to capture all of the kernel memory and most of the user
 space memory except the user pages that were present in CMA region.
 
@@ -170,14 +170,14 @@ KDump, as dump mechanism.
 The tools to examine the dump will be same as the ones
 used for kdump.
 
-How to enable firmware-assisted dump (fadump):
+How to enable firmware-assisted dump (FADump):
 -
 
 1. Set config option CONFIG_FA_DUMP=y and build kernel.
-2. Boot into linux kernel with 'fadump=on' kernel cmdline option.
-   By default, fadump reserved memory will be initialized as CMA area.
-   Alternatively, user can boot linux kernel with 'fadump=nocma' to
-   prevent fadump to use CMA.
+2. Boot into linux kernel with 'FADump=on' kernel cmdline option.
+   By default, FADump reserved memory will be initialized as CMA area.
+   Alternatively, user can boot linux kernel with 'FADump=nocma' to
+   prevent FADump to use CMA.
 3. Optionally, user can also set 'crashkernel=' kernel cmdline
to specify size of the memory to reserve for boot memory dump
preservation.
@@ -190,7 +190,7 @@ NOTE: 1. 'fadump_reserve_mem=' parameter has been 
deprecated. Instead
  option is set at kernel cmdline.
   3. if user wants to capture all of user space memory and ok with
  reserved memory not available to production system, then
- 'fadump=nocma' kernel parameter can be used to fallback to
+ 'FADump=nocma' kernel parameter can be used to fallback to
  old behaviour.
 
 Sysfs/debugfs files:
@@ -203,29 +203,29 @@ Here is the list of files under kernel sysfs:
 
  /sys/kernel/fadump_enabled
 
-This is used to display the fadump status.
-0 = fadump is disa

[PATCH v3 05/16] powerpc/fadump: enable fadump support on OPAL based POWER platform

2019-06-25 Thread Hari Bathini
From: Hari Bathini 

Firmware-assisted dump support is enabled for OPAL based POWER platforms
in P9 firmware. Make the corresponding updates in kernel to enable fadump
support for such platforms.

Signed-off-by: Hari Bathini 
---
 arch/powerpc/Kconfig |5 
 arch/powerpc/include/asm/opal-api.h  |   58 +++
 arch/powerpc/include/asm/opal.h  |4 
 arch/powerpc/kernel/fadump-common.c  |   18 +
 arch/powerpc/kernel/fadump-common.h  |   46 ++-
 arch/powerpc/kernel/fadump.c |  277 
 arch/powerpc/platforms/powernv/Makefile  |1 
 arch/powerpc/platforms/powernv/opal-call.c   |2 
 arch/powerpc/platforms/powernv/opal-fadump.c |  443 ++
 arch/powerpc/platforms/powernv/opal-fadump.h |   34 ++
 arch/powerpc/platforms/pseries/rtas-fadump.c |   38 ++
 11 files changed, 837 insertions(+), 89 deletions(-)
 create mode 100644 arch/powerpc/platforms/powernv/opal-fadump.c
 create mode 100644 arch/powerpc/platforms/powernv/opal-fadump.h

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 8c1c636..f124a9b 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -557,7 +557,7 @@ config CRASH_DUMP
 
 config FA_DUMP
bool "Firmware-assisted dump"
-   depends on PPC64 && PPC_RTAS
+   depends on PPC64 && (PPC_RTAS || PPC_POWERNV)
select CRASH_CORE
select CRASH_DUMP
help
@@ -568,7 +568,8 @@ config FA_DUMP
  is meant to be a kdump replacement offering robustness and
  speed not possible without system firmware assistance.
 
- If unsure, say "N"
+ If unsure, say "y". Only special kernels like petitboot may
+ need to say "N" here.
 
 config IRQ_ALL_CPUS
bool "Distribute interrupts on all CPUs by default"
diff --git a/arch/powerpc/include/asm/opal-api.h 
b/arch/powerpc/include/asm/opal-api.h
index 09a8553..1762b1e 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -208,7 +208,9 @@
 #define OPAL_HANDLE_HMI2   166
 #define OPAL_NX_COPROC_INIT 167
 #define OPAL_XIVE_GET_VP_STATE 170
-#define OPAL_LAST  170
+#define OPAL_MPIPL_UPDATE  173
+#define OPAL_MPIPL_QUERY_TAG   174
+#define OPAL_LAST  174
 
 #define QUIESCE_HOLD   1 /* Spin all calls at entry */
 #define QUIESCE_REJECT 2 /* Fail all calls with OPAL_BUSY */
@@ -979,6 +981,59 @@ struct opal_sg_list {
 };
 
 /*
+ * Firmware-Assisted Dump (FADump)
+ */
+
+/* The maximum number of dump sections supported by OPAL */
+#define OPAL_FADUMP_NR_SECTIONS 64
+
+/* Kernel Dump region info */
+struct opal_fadump_region {
+   __be64  src;
+   __be64  dest;
+   __be64  size;
+} __attribute__((packed));
+
+/* FADump structure format version */
+#define MPIPL_FADUMP_VERSION   0x01
+
+/*
+ * Metadata type. Kernel uses this field to identify the
+ * type of data
+ */
+#define MPIPL_FADUMP_TYPE_CPU  0x00
+/* OPAL : 0x01 – 0x39 */
+#define MPIPL_FADUMP_TYPE_OPAL 0x01
+/* Firmware/SMF : 0x40 – 0x79 */
+#define MPIPL_FADUMP_TYPE_FW   0x40
+/* Kernel memory region : 0x80 – 0xb9 */
+#define MPIPL_FADUMP_TYPE_KERNEL   0x80
+/* Reserved for future use : 0xc0 – 0xff */
+#define MPIPL_FADUMP_TYPE_RESERVED 0xc0
+
+/* OPAL MPIPL FADump metadata */
+struct opal_mpipl_fadump {
+   u8  type;
+   u8  version;
+   u8  reserved[6];
+   __be32  crashing_pir;
+   __be32  cpu_data_version;
+   __be32  cpu_data_size;
+   __be32  region_cnt;
+
+   struct opal_fadump_region   region[OPAL_FADUMP_NR_SECTIONS];
+} __attribute__((packed));
+
+/* MPIPL update operations */
+enum mpipl_ops {
+   OPAL_MPIPL_REGISTER_TAG = 0,
+   OPAL_MPIPL_ADD_RANGE= 1,
+   OPAL_MPIPL_REMOVE_RANGE = 2,
+   OPAL_MPIPL_REMOVE_ALL   = 3,
+   OPAL_MPIPL_FREE_PRESERVED_MEMORY= 4,
+};
+
+/*
  * Dump region ID range usable by the OS
  */
 #define OPAL_DUMP_REGION_HOST_START 0x80
@@ -1058,6 +1113,7 @@ enum {
OPAL_REBOOT_NORMAL  = 0,
OPAL_REBOOT_PLATFORM_ERROR  = 1,
OPAL_REBOOT_FULL_IPL= 2,
+   OPAL_REBOOT_OS_ERROR= 3,
 };
 
 /* Argument to OPAL_PCI_TCE_KILL */
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 4ed5d57..4c99421 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -39,6 +39,10 @@ int64_t opal_npu_spa_clear_cache(uint64_t phb_id, uint32_t 
bdfn,
uint64_t PE_handle);

[PATCH v3 06/16] powerpc/fadump: Update documentation about OPAL platform support

2019-06-25 Thread Hari Bathini
With FADump support now available on both pseries and OPAL platforms,
update FADump documentation with these details.

Signed-off-by: Hari Bathini 
---
 Documentation/powerpc/firmware-assisted-dump.txt |   90 --
 1 file changed, 51 insertions(+), 39 deletions(-)

diff --git a/Documentation/powerpc/firmware-assisted-dump.txt 
b/Documentation/powerpc/firmware-assisted-dump.txt
index 62e75ef..844a229 100644
--- a/Documentation/powerpc/firmware-assisted-dump.txt
+++ b/Documentation/powerpc/firmware-assisted-dump.txt
@@ -70,7 +70,8 @@ as follows:
normal.
 
 -- The freshly booted kernel will notice that there is a new
-   node (ibm,dump-kernel) in the device tree, indicating that
+   node (ibm,dump-kernel on PSeries or ibm,opal/dump/result-table
+   on OPAL platform) in the device tree, indicating that
there is crash data available from a previous boot. During
the early boot OS will reserve rest of the memory above
boot memory size effectively booting with restricted memory
@@ -93,7 +94,9 @@ as follows:
 
 Please note that the firmware-assisted dump feature
 is only available on Power6 and above systems with recent
-firmware versions.
+firmware versions on PSeries (PowerVM) platform and Power9
+and above systems with recent firmware versions on PowerNV
+(OPAL) platform.
 
 Implementation details:
 --
@@ -108,57 +111,66 @@ that are run. If there is dump data, then the
 /sys/kernel/fadump_release_mem file is created, and the reserved
 memory is held.
 
-If there is no waiting dump data, then only the memory required
-to hold CPU state, HPTE region, boot memory dump and elfcore
-header, is usually reserved at an offset greater than boot memory
-size (see Fig. 1). This area is *not* released: this region will
-be kept permanently reserved, so that it can act as a receptacle
-for a copy of the boot memory content in addition to CPU state
-and HPTE region, in the case a crash does occur. Since this reserved
-memory area is used only after the system crash, there is no point in
-blocking this significant chunk of memory from production kernel.
-Hence, the implementation uses the Linux kernel's Contiguous Memory
-Allocator (CMA) for memory reservation if CMA is configured for kernel.
-With CMA reservation this memory will be available for applications to
-use it, while kernel is prevented from using it. With this FADump will
-still be able to capture all of the kernel memory and most of the user
-space memory except the user pages that were present in CMA region.
+If there is no waiting dump data, then only the memory required to
+hold CPU state, HPTE region, boot memory dump, FADump header and
+elfcore header, is usually reserved at an offset greater than boot
+memory size (see Fig. 1). This area is *not* released: this region
+will be kept permanently reserved, so that it can act as a receptacle
+for a copy of the boot memory content in addition to CPU state and
+HPTE region, in the case a crash does occur.
+
+Since this reserved memory area is used only after the system crash,
+there is no point in blocking this significant chunk of memory from
+production kernel. Hence, the implementation uses the Linux kernel's
+Contiguous Memory Allocator (CMA) for memory reservation if CMA is
+configured for kernel. With CMA reservation this memory will be
+available for applications to use it, while kernel is prevented from
+using it. With this FADump will still be able to capture all of the
+kernel memory and most of the user space memory except the user pages
+that were present in CMA region.
 
   o Memory Reservation during first kernel
 
-  Low memoryTop of memory
-  0  boot memory size  |<--Reserved dump area --->|  |
-  |   ||   Permanent Reservation  |  |
-  V   V|   (Preserve area)|  V
-  +---+--/ /---+---+++---++--+
-  |   ||CPU|HPTE|  DUMP  |HDR|ELF |  |
-  +---+--/ /---+---+++---++--+
-|   ^  ^
-|   |  |
-\   /  |
- --- FADump Header
-  Boot memory content gets transferred   (meta area)
-  to reserved area by firmware at the
-  time of crash
-
+  Low memory Top of memory
+  0  boot memory size|<--- Reserved dump area --->|   |
+  |   |  |Permanent Reservatio|   |
+  V   V  |   (Preserve area)  |   V
+  +---+/ /---+---++---+-+-+---+
+  |   |  |///|/

[PATCH v3 07/16] powerpc/fadump: consider reserved ranges while reserving memory

2019-06-25 Thread Hari Bathini
Commit 0962e8004e97 ("powerpc/prom: Scan reserved-ranges node for
memory reservations") enabled support to parse reserved-ranges DT
node and reserve kernel memory falling in these ranges for F/W
purposes. Ensure memory in these ranges is not overlapped with
memory reserved for FADump.

Also, when an attempt to reserve memory for FADump fails due to memory
holes and/or reserved ranges, skip ahead by a smaller fixed offset,
instead of by the size of the memory to be reserved, before making
another attempt. This reduces the likelihood of memory reservation
failure.

Signed-off-by: Hari Bathini 
---
 arch/powerpc/kernel/fadump-common.h |   11 +++
 arch/powerpc/kernel/fadump.c|  137 ++-
 2 files changed, 145 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/fadump-common.h 
b/arch/powerpc/kernel/fadump-common.h
index 36f4d71..555230e 100644
--- a/arch/powerpc/kernel/fadump-common.h
+++ b/arch/powerpc/kernel/fadump-common.h
@@ -101,6 +101,17 @@ struct fadump_memory_range {
unsigned long long  size;
 };
 
+/*
+ * Amount of memory (1024MB) to skip before making another attempt at
+ * reserving memory (after the previous attempt to reserve memory for
+ * FADump failed due to memory holes and/or reserved ranges) to reduce
+ * the likelihood of memory reservation failure.
+ */
+#define OFFSET_SIZE 0x40000000U
+
+/* Maximum no. of reserved ranges supported for processing. */
+#define MAX_RESERVED_RANGES 128
+
 /* Maximum no. of real memory regions supported by the kernel */
 #define MAX_REAL_MEM_REGIONS   8
 
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index f2c2d4a..1b3df8b 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -41,6 +41,9 @@ int crash_memory_ranges_size;
 int crash_mem_ranges;
 int max_crash_mem_ranges;
 
+struct fadump_memory_range reserved_ranges[MAX_RESERVED_RANGES];
+int reserved_ranges_cnt;
+
 #ifdef CONFIG_CMA
 static struct cma *fadump_cma;
 
@@ -104,12 +107,116 @@ int __init fadump_cma_init(void)
 static int __init fadump_cma_init(void) { return 1; }
 #endif /* CONFIG_CMA */
 
+/*
+ * Sort the reserved ranges in-place and merge adjacent ranges
+ * to minimize the reserved ranges count.
+ */
+static void __init sort_and_merge_reserved_ranges(void)
+{
+   unsigned long long base, size;
+   struct fadump_memory_range tmp_range;
+   int i, j, idx;
+
+   if (!reserved_ranges_cnt)
+   return;
+
+   /* Sort the reserved ranges */
+   for (i = 0; i < reserved_ranges_cnt; i++) {
+   idx = i;
+   for (j = i + 1; j < reserved_ranges_cnt; j++) {
+   if (reserved_ranges[idx].base > reserved_ranges[j].base)
+   idx = j;
+   }
+   if (idx != i) {
+   tmp_range = reserved_ranges[idx];
+   reserved_ranges[idx] = reserved_ranges[i];
+   reserved_ranges[i] = tmp_range;
+   }
+   }
+
+   /* Merge adjacent reserved ranges */
+   idx = 0;
+   for (i = 1; i < reserved_ranges_cnt; i++) {
+   base = reserved_ranges[i-1].base;
+   size = reserved_ranges[i-1].size;
+   if (reserved_ranges[i].base == (base + size))
+   reserved_ranges[idx].size += reserved_ranges[i].size;
+   else {
+   idx++;
+   if (i == idx)
+   continue;
+
+   reserved_ranges[idx] = reserved_ranges[i];
+   }
+   }
+   reserved_ranges_cnt = idx + 1;
+}
+
+static int __init add_reserved_range(unsigned long base,
+unsigned long size)
+{
+   int i;
+
+   if (reserved_ranges_cnt == MAX_RESERVED_RANGES) {
+   /* Compact reserved ranges and try again. */
+   sort_and_merge_reserved_ranges();
+   if (reserved_ranges_cnt == MAX_RESERVED_RANGES)
+   return 0;
+   }
+
+   i = reserved_ranges_cnt++;
+   reserved_ranges[i].base = base;
+   reserved_ranges[i].size = size;
+   return 1;
+}
+
+/*
+ * Scan reserved-ranges to consider them while reserving/releasing
+ * memory for FADump.
+ */
+static void __init early_init_dt_scan_reserved_ranges(unsigned long node)
+{
+   int len, ret;
+   unsigned long i;
+   const __be32 *prop;
+
+   /* reserved-ranges already scanned */
+   if (reserved_ranges_cnt != 0)
+   return;
+
+   prop = of_get_flat_dt_prop(node, "reserved-ranges", &len);
+
+   if (!prop)
+   return;
+
+   /*
+* Each reserved range is an (address,size) pair, 2 cells each,
+* totalling 4 cells per range.
+*/
+   for (i =

[PATCH v3 08/16] powerpc/fadump: consider reserved ranges while releasing memory

2019-06-25 Thread Hari Bathini
Commit 0962e8004e97 ("powerpc/prom: Scan reserved-ranges node for
memory reservations") enabled support to parse 'reserved-ranges' DT
node to reserve kernel memory falling in these ranges for firmware
purposes. Along with the preserved area memory, also ensure memory
in reserved ranges is not overlapped with memory released by capture
kernel after saving vmcore. Also, fix the off-by-one error in
fadump_release_reserved_area function while releasing memory.

Signed-off-by: Hari Bathini 
---
 arch/powerpc/kernel/fadump.c |   59 +-
 1 file changed, 41 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 1b3df8b..ce8c0bf 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -111,7 +111,7 @@ static int __init fadump_cma_init(void) { return 1; }
  * Sort the reserved ranges in-place and merge adjacent ranges
  * to minimize the reserved ranges count.
  */
-static void __init sort_and_merge_reserved_ranges(void)
+static void sort_and_merge_reserved_ranges(void)
 {
unsigned long long base, size;
struct fadump_memory_range tmp_range;
@@ -152,8 +152,7 @@ static void __init sort_and_merge_reserved_ranges(void)
reserved_ranges_cnt = idx + 1;
 }
 
-static int __init add_reserved_range(unsigned long base,
-unsigned long size)
+static int add_reserved_range(unsigned long base, unsigned long size)
 {
int i;
 
@@ -1127,33 +1126,57 @@ static void fadump_release_reserved_area(unsigned long 
start, unsigned long end)
if (tend == end_pfn)
break;
 
-   start_pfn = tend + 1;
+   start_pfn = tend;
}
}
 }
 
 /*
- * Release the memory that was reserved in early boot to preserve the memory
- * contents. The released memory will be available for general use.
+ * Release the memory that was reserved during early boot to preserve the
+ * crash'ed kernel's memory contents except reserved dump area (permanent
+ * reservation) and reserved ranges used by F/W. The released memory will
+ * be available for general use.
  */
 static void fadump_release_memory(unsigned long begin, unsigned long end)
 {
+   int i;
unsigned long ra_start, ra_end;
-
-   ra_start = fw_dump.reserve_dump_area_start;
-   ra_end = ra_start + fw_dump.reserve_dump_area_size;
+   unsigned long tstart;
 
/*
-* exclude the dump reserve area. Will reuse it for next
-* fadump registration.
+* Add memory to permanently preserve to reserved ranges list
+* and exclude all these ranges while releasing memory.
 */
-   if (begin < ra_end && end > ra_start) {
-   if (begin < ra_start)
-   fadump_release_reserved_area(begin, ra_start);
-   if (end > ra_end)
-   fadump_release_reserved_area(ra_end, end);
-   } else
-   fadump_release_reserved_area(begin, end);
+   i = add_reserved_range(fw_dump.reserve_dump_area_start,
+  fw_dump.reserve_dump_area_size);
+   if (i == 0) {
+   /*
+* Reached the MAX reserved ranges count. To ensure reserved
+* dump area is excluded (as it will be reused for next
+* FADump registration), ignore the last reserved range and
+* add reserved dump area instead.
+*/
+   reserved_ranges_cnt--;
+   add_reserved_range(fw_dump.reserve_dump_area_start,
+  fw_dump.reserve_dump_area_size);
+   }
+   sort_and_merge_reserved_ranges();
+
+   tstart = begin;
+   for (i = 0; i < reserved_ranges_cnt; i++) {
+   ra_start = reserved_ranges[i].base;
+   ra_end = ra_start + reserved_ranges[i].size;
+
+   if (tstart >= ra_end)
+   continue;
+
+   if (tstart < ra_start)
+   fadump_release_reserved_area(tstart, ra_start);
+   tstart = ra_end;
+   }
+
+   if (tstart < end)
+   fadump_release_reserved_area(tstart, end);
 }
 
 static void fadump_invalidate_release_mem(void)



[PATCH v3 09/16] powernv/fadump: process architected register state data provided by firmware

2019-06-25 Thread Hari Bathini
From: Hari Bathini 

Firmware provides architected register state data at the time of crash.
Process this data and build CPU notes to append to ELF core.

Signed-off-by: Hari Bathini 
Signed-off-by: Vasant Hegde 
---
 arch/powerpc/include/asm/opal-api.h  |   31 
 arch/powerpc/kernel/fadump-common.h  |3 
 arch/powerpc/platforms/powernv/opal-fadump.c |  197 --
 arch/powerpc/platforms/powernv/opal-fadump.h |2 
 4 files changed, 221 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/include/asm/opal-api.h 
b/arch/powerpc/include/asm/opal-api.h
index 1762b1e..a60b09f 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -984,6 +984,37 @@ struct opal_sg_list {
  * Firmware-Assisted Dump (FADump)
  */
 
+#define CPU_STATE_DATA_VERSION 1
+
+/* FADump thread header for register entries */
+struct opal_fadump_thread_hdr {
+   __be32  pir;
+   /* 0x00 - 0x0F - The corresponding stop state of the core */
+   u8  core_state;
+   u8  reserved[3];
+
+   __be32  offset; /* Offset to Register Entries array */
+   __be32  ecnt;   /* Number of entries */
+   __be32  esize;  /* Alloc size of each array entry in bytes */
+   __be32  eactsz; /* Actual size of each array entry in bytes */
+} __attribute__((packed));
+
+/* Register types populated by f/w */
+#define OPAL_REG_TYPE_GPR  0x01
+#define OPAL_REG_TYPE_SPR  0x02
+
+/* ID numbers used by f/w while populating certain registers */
+#define REG_ID_NIP 0x7D0
+#define REG_ID_MSR 0x7D1
+#define REG_ID_CCR 0x7D2
+
+/* FADump register entry. */
+struct opal_fadump_reg_entry {
+   __be32  reg_type;
+   __be32  reg_num;
+   __be64  reg_val;
+} __attribute__((packed));
+
 /* The maximum number of dump sections supported by OPAL */
 #define OPAL_FADUMP_NR_SECTIONS 64
 
diff --git a/arch/powerpc/kernel/fadump-common.h 
b/arch/powerpc/kernel/fadump-common.h
index 555230e..ebebe4d 100644
--- a/arch/powerpc/kernel/fadump-common.h
+++ b/arch/powerpc/kernel/fadump-common.h
@@ -117,6 +117,9 @@ struct fadump_memory_range {
 
 /* Firmware-assisted dump configuration details. */
 struct fw_dump {
+   unsigned long   cpu_state_destination_addr;
+   unsigned long   cpu_state_data_version;
+   unsigned long   cpu_state_entry_size;
unsigned long   cpu_state_data_size;
unsigned long   hpte_region_size;
unsigned long   boot_memory_size;
diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c 
b/arch/powerpc/platforms/powernv/opal-fadump.c
index 7e6c46a..ed3c35b 100644
--- a/arch/powerpc/platforms/powernv/opal-fadump.c
+++ b/arch/powerpc/platforms/powernv/opal-fadump.c
@@ -29,6 +29,7 @@
 #include "opal-fadump.h"
 
 static const struct opal_fadump_mem_struct *opal_fdm_active;
+static const struct opal_mpipl_fadump *opal_cpu_metadata;
 static struct opal_fadump_mem_struct *opal_fdm;
 
 static void opal_set_preserv_area_start(struct fw_dump *fadump_conf)
@@ -229,6 +230,75 @@ static int opal_invalidate_fadump(struct fw_dump 
*fadump_conf)
return 0;
 }
 
+static inline void fadump_set_regval_regnum(struct pt_regs *regs, u32 reg_type,
+   u32 reg_num, u64 reg_val)
+{
+   if (reg_type == OPAL_REG_TYPE_GPR) {
+   if (reg_num < 32)
+   regs->gpr[reg_num] = reg_val;
+   return;
+   }
+
+   switch (reg_num) {
+   case SPRN_CTR:
+   regs->ctr = reg_val;
+   break;
+   case SPRN_LR:
+   regs->link = reg_val;
+   break;
+   case SPRN_XER:
+   regs->xer = reg_val;
+   break;
+   case SPRN_DAR:
+   regs->dar = reg_val;
+   break;
+   case SPRN_DSISR:
+   regs->dsisr = reg_val;
+   break;
+   case REG_ID_NIP:
+   regs->nip = reg_val;
+   break;
+   case REG_ID_MSR:
+   regs->msr = reg_val;
+   break;
+   case REG_ID_CCR:
+   regs->ccr = reg_val;
+   break;
+   }
+}
+
+static inline void fadump_read_registers(char *bufp, unsigned int regs_cnt,
+unsigned int reg_entry_size,
+struct pt_regs *regs)
+{
+   int i;
+   struct opal_fadump_reg_entry *reg_entry;
+
+   memset(regs, 0, sizeof(struct pt_regs));
+
+   for (i = 0; i < regs_cnt; i++, bufp += reg_entry_size) {
+   reg_entry = (struct opal_fadump_reg_entry *)bufp;
+   fadump_set_regval_regnum(regs,
+be32_to_cpu(reg_entry->reg_type),
+be32_to_cpu(reg_entry->reg_num),
+  

  1   2   3   4   5   6   7   8   9   >