[PATCH] nvram: print no error message when nvram is not set as pstore backend
Pstore only supports one backend at a time. The preferred pstore backend is set by passing the pstore.backend= argument to the kernel at boot time. Currently, while trying to register with pstore, nvram throws an error message even when "pstore.backend != nvram", which is unnecessary. This patch removes the error message in case "pstore.backend != nvram". Signed-off-by: Hari Bathini --- arch/powerpc/kernel/nvram_64.c |7 --- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c index 1e703f8..bfdbcab 100644 --- a/arch/powerpc/kernel/nvram_64.c +++ b/arch/powerpc/kernel/nvram_64.c @@ -582,9 +582,10 @@ static int nvram_pstore_init(void) spin_lock_init(&nvram_pstore_info.buf_lock); rc = pstore_register(&nvram_pstore_info); - if (rc != 0) - pr_err("nvram: pstore_register() failed, defaults to " - "kmsg_dump; returned %d\n", rc); + if (rc && (rc != -EPERM)) + /* Print error only when pstore.backend == nvram */ + pr_err("nvram: pstore_register() failed, returned %d. " + "Defaults to kmsg_dump\n", rc); return rc; } ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH] fadump: fix endianness issues in firmware-assisted dump handling
Firmware-assisted dump (fadump) kernel code is not LE compliant. The below patch tries to fix this issue. Tested this patch with upstream kernel. Did some sanity testing for the LE fadump vmcore generated. Below output shows crash tool successfully opening LE fadump vmcore. # crash $vmlinux vmcore crash 7.0.5 Copyright (C) 2002-2014 Red Hat, Inc. Copyright (C) 2004, 2005, 2006, 2010 IBM Corporation Copyright (C) 1999-2006 Hewlett-Packard Co Copyright (C) 2005, 2006, 2011, 2012 Fujitsu Limited Copyright (C) 2006, 2007 VA Linux Systems Japan K.K. Copyright (C) 2005, 2011 NEC Corporation Copyright (C) 1999, 2002, 2007 Silicon Graphics, Inc. Copyright (C) 1999, 2000, 2001, 2002 Mission Critical Linux, Inc. This program is free software, covered by the GNU General Public License, and you are welcome to change it and/or distribute copies of it under certain conditions. Enter "help copying" to see the conditions. This program has absolutely no warranty. Enter "help warranty" for details. crash: /boot/vmlinux-3.16.0-rc7-7-default+: no .gnu_debuglink section GNU gdb (GDB) 7.6 Copyright (C) 2013 Free Software Foundation, Inc. License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html> This is free software: you are free to change and redistribute it. There is NO WARRANTY, to the extent permitted by law. Type "show copying" and "show warranty" for details. This GDB was configured as "powerpc64le-unknown-linux-gnu"... 
KERNEL: /boot/vmlinux-3.16.0-rc7-7-default+ DUMPFILE: vmcore CPUS: 16 DATE: Sun Aug 24 14:31:28 2014 UPTIME: 00:02:57 LOAD AVERAGE: 0.05, 0.08, 0.04 TASKS: 256 NODENAME: linux-dhr2 RELEASE: 3.16.0-rc7-7-default+ VERSION: #54 SMP Mon Aug 18 14:08:23 EDT 2014 MACHINE: ppc64le (4116 Mhz) MEMORY: 40 GB PANIC: "Oops: Kernel access of bad area, sig: 11 [#1]" (check log for details) PID: 2234 COMMAND: "bash" TASK: c009652e4a30 [THREAD_INFO: c0096777c000] CPU: 2 STATE: TASK_RUNNING (PANIC) crash> Signed-off-by: Hari Bathini --- arch/powerpc/include/asm/fadump.h | 52 --- arch/powerpc/kernel/fadump.c | 112 + arch/powerpc/platforms/pseries/lpar.c |9 ++- 3 files changed, 89 insertions(+), 84 deletions(-) diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h index a677456..493e72f 100644 --- a/arch/powerpc/include/asm/fadump.h +++ b/arch/powerpc/include/asm/fadump.h @@ -70,39 +70,39 @@ #define CPU_UNKNOWN(~((u32)0)) /* Utility macros */ -#define SKIP_TO_NEXT_CPU(reg_entry)\ -({ \ - while (reg_entry->reg_id != REG_ID("CPUEND")) \ - reg_entry++;\ - reg_entry++;\ +#define SKIP_TO_NEXT_CPU(reg_entry)\ +({ \ + while (be64_to_cpu(reg_entry->reg_id) != REG_ID("CPUEND")) \ + reg_entry++;\ + reg_entry++;\ }) /* Kernel Dump section info */ struct fadump_section { - u32 request_flag; - u16 source_data_type; - u16 error_flags; - u64 source_address; - u64 source_len; - u64 bytes_dumped; - u64 destination_address; + __be32 request_flag; + __be16 source_data_type; + __be16 error_flags; + __be64 source_address; + __be64 source_len; + __be64 bytes_dumped; + __be64 destination_address; }; /* ibm,configure-kernel-dump header. */ struct fadump_section_header { - u32 dump_format_version; - u16 dump_num_sections; - u16 dump_status_flag; - u32 offset_first_dump_section; + __be32 dump_format_version; + __be16 dump_num_sections; + __be16 dump_status_flag; + __be32 offset_first_dump_section; /* Fields for disk dump option. 
*/ - u32 dd_block_size; - u64 dd_block_offset; - u64 dd_num_blocks; - u32 dd_offset_disk_path; + __be32 dd_block_size; + __be64 dd_block_offset; + __be64 dd_num_blocks; + __be32 dd_offset_disk_path; /* Maximum time allowed to prevent an automatic dump-reboot
[PATCH v2] fadump: fix endianness issues in firmware-assisted dump handling
Firmware-assisted dump (fadump) kernel code is not LE compliant. The below patch tries to fix this issue. Tested this patch with upstream kernel. Did some sanity testing for the LE fadump vmcore generated. Below output shows crash tool successfully opening LE fadump vmcore. # crash vmlinux vmcore crash 7.0.5 Copyright (C) 2002-2014 Red Hat, Inc. Copyright (C) 2004, 2005, 2006, 2010 IBM Corporation Copyright (C) 1999-2006 Hewlett-Packard Co Copyright (C) 2005, 2006, 2011, 2012 Fujitsu Limited Copyright (C) 2006, 2007 VA Linux Systems Japan K.K. Copyright (C) 2005, 2011 NEC Corporation Copyright (C) 1999, 2002, 2007 Silicon Graphics, Inc. Copyright (C) 1999, 2000, 2001, 2002 Mission Critical Linux, Inc. This program is free software, covered by the GNU General Public License, and you are welcome to change it and/or distribute copies of it under certain conditions. Enter "help copying" to see the conditions. This program has absolutely no warranty. Enter "help warranty" for details. crash: vmlinux: no .gnu_debuglink section GNU gdb (GDB) 7.6 Copyright (C) 2013 Free Software Foundation, Inc. License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html> This is free software: you are free to change and redistribute it. There is NO WARRANTY, to the extent permitted by law. Type "show copying" and "show warranty" for details. This GDB was configured as "powerpc64le-unknown-linux-gnu"... KERNEL: vmlinux DUMPFILE: vmcore CPUS: 16 DATE: Wed Dec 31 19:00:00 1969 UPTIME: 00:03:28 LOAD AVERAGE: 0.46, 0.86, 0.41 TASKS: 268 NODENAME: linux-dhr2 RELEASE: 3.17.0-rc5-7-default VERSION: #6 SMP Tue Sep 30 01:06:34 EDT 2014 MACHINE: ppc64le (4116 Mhz) MEMORY: 40 GB PANIC: "Oops: Kernel access of bad area, sig: 11 [#1]" (check log for details) PID: 6223 COMMAND: "bash" TASK: c009661b2500 [THREAD_INFO: c00967ac] CPU: 2 STATE: TASK_RUNNING (PANIC) crash> Changes in v2: 1. Addressed casting related warnings. 2. 
Elaborated on why exceptions should not be changed to big endian during fadump boot. Signed-off-by: Hari Bathini --- arch/powerpc/include/asm/fadump.h | 52 --- arch/powerpc/kernel/fadump.c | 114 + arch/powerpc/platforms/pseries/lpar.c | 15 3 files changed, 96 insertions(+), 85 deletions(-) diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h index a677456..493e72f 100644 --- a/arch/powerpc/include/asm/fadump.h +++ b/arch/powerpc/include/asm/fadump.h @@ -70,39 +70,39 @@ #define CPU_UNKNOWN(~((u32)0)) /* Utility macros */ -#define SKIP_TO_NEXT_CPU(reg_entry)\ -({ \ - while (reg_entry->reg_id != REG_ID("CPUEND")) \ - reg_entry++;\ - reg_entry++;\ +#define SKIP_TO_NEXT_CPU(reg_entry)\ +({ \ + while (be64_to_cpu(reg_entry->reg_id) != REG_ID("CPUEND")) \ + reg_entry++;\ + reg_entry++;\ }) /* Kernel Dump section info */ struct fadump_section { - u32 request_flag; - u16 source_data_type; - u16 error_flags; - u64 source_address; - u64 source_len; - u64 bytes_dumped; - u64 destination_address; + __be32 request_flag; + __be16 source_data_type; + __be16 error_flags; + __be64 source_address; + __be64 source_len; + __be64 bytes_dumped; + __be64 destination_address; }; /* ibm,configure-kernel-dump header. */ struct fadump_section_header { - u32 dump_format_version; - u16 dump_num_sections; - u16 dump_status_flag; - u32 offset_first_dump_section; + __be32 dump_format_version; + __be16 dump_num_sections; + __be16 dump_status_flag; + __be32 offset_first_dump_section; /* Fields for disk dump option. */ - u32 dd_block_size; - u64 dd_block_offset; - u64 dd_num_blocks; - u32 dd_offset_disk_path; + __be32 dd_block_size; + __be64 dd_block_offset; + __be64 dd_num_blocks; + __be32 d
[PATCH 0/2] powerpc/pstore: Add pstore support for nvram partitions
This patch series adds pstore support on powernv platform to read different nvram partitions and write compressed data to oops-log nvram partition. As pseries platform already has pstore support, this series moves most of the common code for pseries and powernv platforms to a common file. Tested the patches successfully on both pseries and powernv platforms. --- Hari Bathini (2): pstore: Add pstore type id for firmware partition pstore: add pstore support on powernv arch/powerpc/include/asm/nvram.h| 50 ++ arch/powerpc/include/asm/rtas.h |2 arch/powerpc/kernel/nvram_64.c | 679 +++ arch/powerpc/platforms/powernv/opal-nvram.c | 10 arch/powerpc/platforms/pseries/nvram.c | 663 -- fs/pstore/inode.c |3 include/linux/pstore.h |1 7 files changed, 749 insertions(+), 659 deletions(-) -- - Hari ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 1/2] pstore: Add pstore type id for firmware partition
This patch adds a pstore type id to be used for opal specific nvram partitions. Signed-off-by: Hari Bathini --- fs/pstore/inode.c |3 +++ include/linux/pstore.h |1 + 2 files changed, 4 insertions(+) diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index fafb7a0..e83bb93 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -337,6 +337,9 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, case PSTORE_TYPE_PPC_COMMON: sprintf(name, "powerpc-common-%s-%lld", psname, id); break; + case PSTORE_TYPE_PPC_OPAL: + sprintf(name, "powerpc-opal-%s-%lld", psname, id); + break; case PSTORE_TYPE_UNKNOWN: sprintf(name, "unknown-%s-%lld", psname, id); break; diff --git a/include/linux/pstore.h b/include/linux/pstore.h index ece0c6b..af44980 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -39,6 +39,7 @@ enum pstore_type_id { PSTORE_TYPE_PPC_RTAS= 4, PSTORE_TYPE_PPC_OF = 5, PSTORE_TYPE_PPC_COMMON = 6, + PSTORE_TYPE_PPC_OPAL= 7, PSTORE_TYPE_UNKNOWN = 255 }; ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 2/2] pstore: add pstore support on powernv
This patch extends pstore, a generic interface to platform dependent persistent storage, support for powernv platform to capture certain useful information, during dying moments. Such support is already in place for pseries platform. This patch while adding pstore support for powernv platform, moves common code for pseries and powernv to arch/powerpc/kernel/nvram_64.c file. Signed-off-by: Hari Bathini --- arch/powerpc/include/asm/nvram.h| 50 ++ arch/powerpc/include/asm/rtas.h |2 arch/powerpc/kernel/nvram_64.c | 679 +++ arch/powerpc/platforms/powernv/opal-nvram.c | 10 arch/powerpc/platforms/pseries/nvram.c | 663 -- 5 files changed, 745 insertions(+), 659 deletions(-) diff --git a/arch/powerpc/include/asm/nvram.h b/arch/powerpc/include/asm/nvram.h index b0fe0fe..09a518b 100644 --- a/arch/powerpc/include/asm/nvram.h +++ b/arch/powerpc/include/asm/nvram.h @@ -9,12 +9,43 @@ #ifndef _ASM_POWERPC_NVRAM_H #define _ASM_POWERPC_NVRAM_H - +#include #include #include #include +/* + * Set oops header version to distinguish between old and new format header. + * lnx,oops-log partition max size is 4000, header version > 4000 will + * help in identifying new header. 
+ */ +#define OOPS_HDR_VERSION 5000 + +struct err_log_info { + __be32 error_type; + __be32 seq_num; +}; + +struct nvram_os_partition { + const char *name; + int req_size; /* desired size, in bytes */ + int min_size; /* minimum acceptable size (0 means req_size) */ + long size; /* size of data portion (excluding err_log_info) */ + long index; /* offset of data portion of partition */ + bool os_partition; /* partition initialized by OS, not FW */ +}; + +struct oops_log_info { + __be16 version; + __be16 report_length; + __be64 timestamp; +} __attribute__((packed)); + +extern struct nvram_os_partition oops_log_partition; + #ifdef CONFIG_PPC_PSERIES +extern struct nvram_os_partition rtas_log_partition; + extern int nvram_write_error_log(char * buff, int length, unsigned int err_type, unsigned int err_seq); extern int nvram_read_error_log(char * buff, int length, @@ -50,6 +81,23 @@ extern void pmac_xpram_write(int xpaddr, u8 data); /* Synchronize NVRAM */ extern voidnvram_sync(void); +/* Initialize NVRAM OS partition */ +extern int __init nvram_init_os_partition(struct nvram_os_partition *part); + +/* Initialize NVRAM oops partition */ +extern void __init nvram_init_oops_partition(int rtas_partition_exists); + +/* Read a NVRAM partition */ +extern int nvram_read_partition(struct nvram_os_partition *part, char *buff, + int length, unsigned int *err_type, + unsigned int *error_log_cnt); + +/* Write to NVRAM OS partition */ +extern int nvram_write_os_partition(struct nvram_os_partition *part, + char *buff, int length, + unsigned int err_type, + unsigned int error_log_cnt); + /* Determine NVRAM size */ extern ssize_t nvram_get_size(void); diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h index b390f55..a033fe9 100644 --- a/arch/powerpc/include/asm/rtas.h +++ b/arch/powerpc/include/asm/rtas.h @@ -343,6 +343,8 @@ extern int early_init_dt_scan_rtas(unsigned long node, extern void pSeries_log_error(char *buf, unsigned int err_type, int fatal); #ifdef 
CONFIG_PPC_PSERIES +extern unsigned long last_rtas_event; +extern int clobbering_unread_rtas_event(void); extern int pseries_devicetree_update(s32 scope); extern void post_mobility_fixup(void); #endif diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c index 34f7c9b..8c439a3 100644 --- a/arch/powerpc/kernel/nvram_64.c +++ b/arch/powerpc/kernel/nvram_64.c @@ -26,6 +26,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -54,6 +57,682 @@ struct nvram_partition { static LIST_HEAD(nvram_partitions); +#ifdef CONFIG_PPC_PSERIES +struct nvram_os_partition rtas_log_partition = { + .name = "ibm,rtas-log", + .req_size = 2079, + .min_size = 1055, + .index = -1, + .os_partition = true +}; +#endif + +struct nvram_os_partition oops_log_partition = { + .name = "lnx,oops-log", + .req_size = 4000, + .min_size = 2000, + .index = -1, + .os_partition = true +}; + +static const char *nvram_os_partitions[] = { +#ifdef CONFIG_PPC_PSERIES + "ibm,rtas-log", +#endif + "lnx,oops-log", + NULL +}; + +static void oops_to_nvram(struct kmsg_dumper *dumper, + enum kmsg_dump_reason reason); + +static struct kmsg_dumper nvram_kmsg_dumper = { + .dump = oops_to_nvram +}; + +/
Re: [2/2] pstore: add pstore support on powernv
On 12/04/2014 11:07 AM, Michael Ellerman wrote: On Wed, 2014-03-12 at 11:03:15 UTC, Hari Bathini wrote: This patch extends pstore, a generic interface to platform dependent persistent storage, support for powernv platform to capture certain useful information, during dying moments. Such support is already in place for pseries platform. This patch while adding pstore support for powernv platform, moves common code for pseries and powernv to arch/powerpc/kernel/nvram_64.c file. Please move the common code first in a separate patch. Unless there's some reason you absolutely can't do that. Sure, Michael. Let me make the changes as suggested and post the updated patch series. Thanks Hari cheers ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH] powerpc/kdump: skip enabling big endian exception during crash
In LE kernel, we currently have a hack for kexec that resets the exception endian before starting a new kernel as the kernel that is loaded could be a big endian or a little endian kernel. In kdump case, resetting exception endian fails when one or more cpus are disabled. But in case of kdump, we can conveniently ignore resetting endianness as crashkernel is always of same endianness as primary kernel. This patch adds a new inline function to say if this is kdump path. This function is used at places where such a check is needed. Signed-off-by: Hari Bathini --- arch/powerpc/include/asm/kexec.h | 10 ++ arch/powerpc/kernel/machine_kexec_64.c |2 +- arch/powerpc/platforms/pseries/lpar.c |7 ++- 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h index 19c36cb..0d96d4d 100644 --- a/arch/powerpc/include/asm/kexec.h +++ b/arch/powerpc/include/asm/kexec.h @@ -86,6 +86,11 @@ extern int overlaps_crashkernel(unsigned long start, unsigned long size); extern void reserve_crashkernel(void); extern void machine_kexec_mask_interrupts(void); +static inline int is_kdump_path(void) +{ + return (crashing_cpu >= 0) ? 1 : 0; +} + #else /* !CONFIG_KEXEC */ static inline void crash_kexec_secondary(struct pt_regs *regs) { } @@ -106,6 +111,11 @@ static inline int crash_shutdown_unregister(crash_shutdown_t handler) return 0; } +static inline int is_kdump_path(void) +{ + return 0; +} + #endif /* CONFIG_KEXEC */ #endif /* ! __ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c index 879b3aa..b4fe804 100644 --- a/arch/powerpc/kernel/machine_kexec_64.c +++ b/arch/powerpc/kernel/machine_kexec_64.c @@ -330,7 +330,7 @@ void default_machine_kexec(struct kimage *image) * using debugger IPI. 
*/ - if (crashing_cpu == -1) + if (!is_kdump_path()) kexec_prepare_cpus(); pr_debug("kexec: Starting switchover sequence.\n"); diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index f6880d2..be41680 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include "pseries.h" @@ -257,8 +258,12 @@ static void pSeries_lpar_hptab_clear(void) * * This is also called on boot when a fadump happens. In that case we * must not change the exception endian mode. +* +* This is also called during kdump which doesn't need resetting, as the +* crashkernel is of same endianness as primary kernel. */ - if (firmware_has_feature(FW_FEATURE_SET_MODE) && !is_fadump_active()) { + if (firmware_has_feature(FW_FEATURE_SET_MODE) && !is_fadump_active() && + !is_kdump_path()) { long rc; rc = pseries_big_endian_exceptions(); ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v2 0/3] powerpc/pstore: Add pstore support for nvram partitions
This patch series adds pstore support on powernv platform to read different nvram partitions and write compressed data to oops-log nvram partition. As pseries platform already has pstore support, this series moves most of the common code for pseries and powernv platforms to a common file. Tested the patches successfully on both pseries and powernv platforms. --- Hari Bathini (3): powerpc/nvram: move generic code for nvram and pstore pstore: Add pstore type id for firmware partition pstore: add pstore support on powernv arch/powerpc/include/asm/nvram.h| 50 ++ arch/powerpc/include/asm/rtas.h |2 arch/powerpc/kernel/nvram_64.c | 681 +++ arch/powerpc/platforms/powernv/opal-nvram.c | 10 arch/powerpc/platforms/pseries/nvram.c | 665 -- fs/pstore/inode.c |3 include/linux/pstore.h |1 7 files changed, 751 insertions(+), 661 deletions(-) -- - Hari ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v2 1/3] powerpc/nvram: move generic code for nvram and pstore
With minor checks, we can move most of the code for nvram under pseries to a common place to be re-used by other powerpc platforms like powernv. This patch moves such common code to arch/powerpc/kernel/nvram_64.c file. Signed-off-by: Hari Bathini --- arch/powerpc/include/asm/nvram.h | 50 ++ arch/powerpc/include/asm/rtas.h|2 arch/powerpc/kernel/nvram_64.c | 660 arch/powerpc/platforms/pseries/nvram.c | 665 4 files changed, 716 insertions(+), 661 deletions(-) diff --git a/arch/powerpc/include/asm/nvram.h b/arch/powerpc/include/asm/nvram.h index b0fe0fe..09a518b 100644 --- a/arch/powerpc/include/asm/nvram.h +++ b/arch/powerpc/include/asm/nvram.h @@ -9,12 +9,43 @@ #ifndef _ASM_POWERPC_NVRAM_H #define _ASM_POWERPC_NVRAM_H - +#include #include #include #include +/* + * Set oops header version to distinguish between old and new format header. + * lnx,oops-log partition max size is 4000, header version > 4000 will + * help in identifying new header. + */ +#define OOPS_HDR_VERSION 5000 + +struct err_log_info { + __be32 error_type; + __be32 seq_num; +}; + +struct nvram_os_partition { + const char *name; + int req_size; /* desired size, in bytes */ + int min_size; /* minimum acceptable size (0 means req_size) */ + long size; /* size of data portion (excluding err_log_info) */ + long index; /* offset of data portion of partition */ + bool os_partition; /* partition initialized by OS, not FW */ +}; + +struct oops_log_info { + __be16 version; + __be16 report_length; + __be64 timestamp; +} __attribute__((packed)); + +extern struct nvram_os_partition oops_log_partition; + #ifdef CONFIG_PPC_PSERIES +extern struct nvram_os_partition rtas_log_partition; + extern int nvram_write_error_log(char * buff, int length, unsigned int err_type, unsigned int err_seq); extern int nvram_read_error_log(char * buff, int length, @@ -50,6 +81,23 @@ extern void pmac_xpram_write(int xpaddr, u8 data); /* Synchronize NVRAM */ extern voidnvram_sync(void); +/* Initialize NVRAM OS partition */ +extern int 
__init nvram_init_os_partition(struct nvram_os_partition *part); + +/* Initialize NVRAM oops partition */ +extern void __init nvram_init_oops_partition(int rtas_partition_exists); + +/* Read a NVRAM partition */ +extern int nvram_read_partition(struct nvram_os_partition *part, char *buff, + int length, unsigned int *err_type, + unsigned int *error_log_cnt); + +/* Write to NVRAM OS partition */ +extern int nvram_write_os_partition(struct nvram_os_partition *part, + char *buff, int length, + unsigned int err_type, + unsigned int error_log_cnt); + /* Determine NVRAM size */ extern ssize_t nvram_get_size(void); diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h index b390f55..a033fe9 100644 --- a/arch/powerpc/include/asm/rtas.h +++ b/arch/powerpc/include/asm/rtas.h @@ -343,6 +343,8 @@ extern int early_init_dt_scan_rtas(unsigned long node, extern void pSeries_log_error(char *buf, unsigned int err_type, int fatal); #ifdef CONFIG_PPC_PSERIES +extern unsigned long last_rtas_event; +extern int clobbering_unread_rtas_event(void); extern int pseries_devicetree_update(s32 scope); extern void post_mobility_fixup(void); #endif diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c index 34f7c9b..dbff7f0 100644 --- a/arch/powerpc/kernel/nvram_64.c +++ b/arch/powerpc/kernel/nvram_64.c @@ -26,6 +26,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -54,6 +57,663 @@ struct nvram_partition { static LIST_HEAD(nvram_partitions); +#ifdef CONFIG_PPC_PSERIES +struct nvram_os_partition rtas_log_partition = { + .name = "ibm,rtas-log", + .req_size = 2079, + .min_size = 1055, + .index = -1, + .os_partition = true +}; +#endif + +struct nvram_os_partition oops_log_partition = { + .name = "lnx,oops-log", + .req_size = 4000, + .min_size = 2000, + .index = -1, + .os_partition = true +}; + +static const char *nvram_os_partitions[] = { +#ifdef CONFIG_PPC_PSERIES + "ibm,rtas-log", +#endif + "lnx,oops-log", + 
NULL +}; + +static void oops_to_nvram(struct kmsg_dumper *dumper, + enum kmsg_dump_reason reason); + +static struct kmsg_dumper nvram_kmsg_dumper = { + .dump = oops_to_nvram +}; + +/* + * For capturing and compressing an oops or panic report... + + * big_oops_buf[] holds the uncompressed text we're capturing. + * + * oops_buf[] holds the compressed text, preceded by a oops header. + * oops heade
[PATCH v2 2/3] pstore: Add pstore type id for firmware partition
This patch adds a pstore type id to be used for opal specific nvram partitions. Signed-off-by: Hari Bathini --- fs/pstore/inode.c |3 +++ include/linux/pstore.h |1 + 2 files changed, 4 insertions(+) diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 5041660..8e0c009 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -359,6 +359,9 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, case PSTORE_TYPE_PPC_COMMON: sprintf(name, "powerpc-common-%s-%lld", psname, id); break; + case PSTORE_TYPE_PPC_OPAL: + sprintf(name, "powerpc-opal-%s-%lld", psname, id); + break; case PSTORE_TYPE_UNKNOWN: sprintf(name, "unknown-%s-%lld", psname, id); break; diff --git a/include/linux/pstore.h b/include/linux/pstore.h index ece0c6b..af44980 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -39,6 +39,7 @@ enum pstore_type_id { PSTORE_TYPE_PPC_RTAS= 4, PSTORE_TYPE_PPC_OF = 5, PSTORE_TYPE_PPC_COMMON = 6, + PSTORE_TYPE_PPC_OPAL= 7, PSTORE_TYPE_UNKNOWN = 255 }; ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v2 3/3] pstore: add pstore support on powernv
This patch extends pstore, a generic interface to platform dependent persistent storage, support for powernv platform to capture certain useful information, during dying moments. Such support is already in place for pseries platform. This patch re-uses most of that code. Signed-off-by: Hari Bathini --- arch/powerpc/kernel/nvram_64.c | 25 +++-- arch/powerpc/platforms/powernv/opal-nvram.c | 10 ++ 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c index dbff7f0..3afbc91 100644 --- a/arch/powerpc/kernel/nvram_64.c +++ b/arch/powerpc/kernel/nvram_64.c @@ -127,6 +127,14 @@ static size_t oops_data_sz; static struct z_stream_s stream; #ifdef CONFIG_PSTORE +#ifdef CONFIG_PPC_POWERNV +static struct nvram_os_partition skiboot_partition = { + .name = "ibm,skiboot", + .index = -1, + .os_partition = false +}; +#endif + #ifdef CONFIG_PPC_PSERIES static struct nvram_os_partition of_config_partition = { .name = "of-config", @@ -479,6 +487,16 @@ static ssize_t nvram_pstore_read(u64 *id, enum pstore_type_id *type, time->tv_nsec = 0; break; #endif +#ifdef CONFIG_PPC_POWERNV + case PSTORE_TYPE_PPC_OPAL: + sig = NVRAM_SIG_FW; + part = &skiboot_partition; + *type = PSTORE_TYPE_PPC_OPAL; + *id = PSTORE_TYPE_PPC_OPAL; + time->tv_sec = 0; + time->tv_nsec = 0; + break; +#endif default: return 0; } @@ -554,8 +572,11 @@ static int nvram_pstore_init(void) { int rc = 0; - nvram_type_ids[2] = PSTORE_TYPE_PPC_RTAS; - nvram_type_ids[3] = PSTORE_TYPE_PPC_OF; + if (machine_is(pseries)) { + nvram_type_ids[2] = PSTORE_TYPE_PPC_RTAS; + nvram_type_ids[3] = PSTORE_TYPE_PPC_OF; + } else + nvram_type_ids[2] = PSTORE_TYPE_PPC_OPAL; nvram_pstore_info.buf = oops_data; nvram_pstore_info.bufsize = oops_data_sz; diff --git a/arch/powerpc/platforms/powernv/opal-nvram.c b/arch/powerpc/platforms/powernv/opal-nvram.c index f9896fd..9db4398 100644 --- a/arch/powerpc/platforms/powernv/opal-nvram.c +++ 
b/arch/powerpc/platforms/powernv/opal-nvram.c @@ -16,6 +16,7 @@ #include #include +#include #include static unsigned int nvram_size; @@ -62,6 +63,15 @@ static ssize_t opal_nvram_write(char *buf, size_t count, loff_t *index) return count; } +static int __init opal_nvram_init_log_partitions(void) +{ + /* Scan nvram for partitions */ + nvram_scan_partitions(); + nvram_init_oops_partition(0); + return 0; +} +machine_arch_initcall(powernv, opal_nvram_init_log_partitions); + void __init opal_nvram_init(void) { struct device_node *np; ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v2 1/3] powerpc/nvram: move generic code for nvram and pstore
On 12/17/2014 05:33 AM, Michael Ellerman wrote: On Tue, 2014-12-16 at 23:35 +0530, Hari Bathini wrote: With minor checks, we can move most of the code for nvram under pseries to a common place to be re-used by other powerpc platforms like powernv. This patch moves such common code to arch/powerpc/kernel/nvram_64.c file. Sharing the code is great. But, you need to keep in mind that it is very common for us to build kernels with both POWERNV=y and PSERIES=y. So you need to make sure you're only using CONFIG_PPC_PSERIES to protect things that are optional on pseries. Not things that we *shouldn't* be doing on powernv. For example the logic in nvram_init_oops_partition() looks like it might do the wrong thing for PSERIES=y POWERNV=y. True. It might do the wrong thing when an incorrect value is passed by the caller. But since the caller is platform specific code [pseries_nvram_init_log_partitions() or opal_nvram_init_log_partitions() routine], with appropriate parameter passed, I haven't seen any issues while testing. diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h index b390f55..a033fe9 100644 --- a/arch/powerpc/include/asm/rtas.h +++ b/arch/powerpc/include/asm/rtas.h @@ -343,6 +343,8 @@ extern int early_init_dt_scan_rtas(unsigned long node, extern void pSeries_log_error(char *buf, unsigned int err_type, int fatal); #ifdef CONFIG_PPC_PSERIES +extern unsigned long last_rtas_event; +extern int clobbering_unread_rtas_event(void); You should add an empty version of this for !PSERIES, so you don't have to ifdef all the call sites. Sure. Will update accordingly. Thanks Hari cheers ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v2] powerpc/kdump: Ignore failure in enabling big endian exception during crash
In LE kernel, we currently have a hack for kexec that resets the exception endian before starting a new kernel as the kernel that is loaded could be a big endian or a little endian kernel. In kdump case, resetting exception endian fails when one or more cpus are disabled. But we can ignore the failure and still go ahead, as in most cases crashkernel will be of same endianness as primary kernel and resetting endianness is not even needed in those cases. This patch adds a new inline function to say if this is kdump path. This function is used at places where such a check is needed. Changes from v1: Instead of skipping, ignore failure in enabling big endian exception during crash Signed-off-by: Hari Bathini --- arch/powerpc/include/asm/kexec.h | 10 ++ arch/powerpc/kernel/machine_kexec_64.c |2 +- arch/powerpc/platforms/pseries/lpar.c | 10 +- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h index 19c36cb..0d96d4d 100644 --- a/arch/powerpc/include/asm/kexec.h +++ b/arch/powerpc/include/asm/kexec.h @@ -86,6 +86,11 @@ extern int overlaps_crashkernel(unsigned long start, unsigned long size); extern void reserve_crashkernel(void); extern void machine_kexec_mask_interrupts(void); +static inline int is_kdump_path(void) +{ + return (crashing_cpu >= 0) ? 1 : 0; +} + #else /* !CONFIG_KEXEC */ static inline void crash_kexec_secondary(struct pt_regs *regs) { } @@ -106,6 +111,11 @@ static inline int crash_shutdown_unregister(crash_shutdown_t handler) return 0; } +static inline int is_kdump_path(void) +{ + return 0; +} + #endif /* CONFIG_KEXEC */ #endif /* ! __ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c index 879b3aa..b4fe804 100644 --- a/arch/powerpc/kernel/machine_kexec_64.c +++ b/arch/powerpc/kernel/machine_kexec_64.c @@ -330,7 +330,7 @@ void default_machine_kexec(struct kimage *image) * using debugger IPI. 
*/ - if (crashing_cpu == -1) + if (!is_kdump_path()) kexec_prepare_cpus(); pr_debug("kexec: Starting switchover sequence.\n"); diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 469751d..63214fa 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include "pseries.h" @@ -257,6 +258,7 @@ static void pSeries_lpar_hptab_clear(void) * * This is also called on boot when a fadump happens. In that case we * must not change the exception endian mode. +* */ if (firmware_has_feature(FW_FEATURE_SET_MODE) && !is_fadump_active()) { long rc; @@ -267,8 +269,14 @@ static void pSeries_lpar_hptab_clear(void) * out to the user, but at least this will stop us from * continuing on further and creating an even more * difficult to debug situation. +* +* But if we reaching here after a crash, no point panicking. +* Also, in kdump path, resetting endianess may not be needed +* as the crashkernel most of the times is of same endianess +* as primary kernel. So, let's ignore the failure and try +* kdump'ing anyway. */ - if (rc) + if (rc && !is_kdump_path()) panic("Could not enable big endian exceptions"); } #endif ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v3 0/3] powerpc/pstore: Add pstore support for nvram partitions
This patch series adds pstore support on powernv platform to read different nvram partitions and write compressed data to oops-log nvram partition. As pseries platform already has pstore support, this series moves most of the common code for pseries and powernv platforms to a common file. Tested the patches successfully on both pseries and powernv platforms. Changes from v2: Added an empty version of clobbering_unread_rtas_event() routine for !PSERIES, to avoid ifdef at the call sites --- Hari Bathini (3): powerpc/nvram: move generic code for nvram and pstore pstore: Add pstore type id for firmware partition pstore: add pstore support on powernv arch/powerpc/include/asm/nvram.h| 50 ++ arch/powerpc/include/asm/rtas.h |4 arch/powerpc/kernel/nvram_64.c | 677 +++ arch/powerpc/platforms/powernv/opal-nvram.c | 10 arch/powerpc/platforms/pseries/nvram.c | 665 --- fs/pstore/inode.c |3 include/linux/pstore.h |1 7 files changed, 749 insertions(+), 661 deletions(-) -- - Hari ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v3 1/3] powerpc/nvram: move generic code for nvram and pstore
With minor checks, we can move most of the code for nvram under pseries to a common place to be re-used by other powerpc platforms like powernv. This patch moves such common code to arch/powerpc/kernel/nvram_64.c file. Signed-off-by: Hari Bathini --- arch/powerpc/include/asm/nvram.h | 50 ++ arch/powerpc/include/asm/rtas.h|4 arch/powerpc/kernel/nvram_64.c | 656 arch/powerpc/platforms/pseries/nvram.c | 665 4 files changed, 714 insertions(+), 661 deletions(-) diff --git a/arch/powerpc/include/asm/nvram.h b/arch/powerpc/include/asm/nvram.h index b0fe0fe..09a518b 100644 --- a/arch/powerpc/include/asm/nvram.h +++ b/arch/powerpc/include/asm/nvram.h @@ -9,12 +9,43 @@ #ifndef _ASM_POWERPC_NVRAM_H #define _ASM_POWERPC_NVRAM_H - +#include #include #include #include +/* + * Set oops header version to distinguish between old and new format header. + * lnx,oops-log partition max size is 4000, header version > 4000 will + * help in identifying new header. + */ +#define OOPS_HDR_VERSION 5000 + +struct err_log_info { + __be32 error_type; + __be32 seq_num; +}; + +struct nvram_os_partition { + const char *name; + int req_size; /* desired size, in bytes */ + int min_size; /* minimum acceptable size (0 means req_size) */ + long size; /* size of data portion (excluding err_log_info) */ + long index; /* offset of data portion of partition */ + bool os_partition; /* partition initialized by OS, not FW */ +}; + +struct oops_log_info { + __be16 version; + __be16 report_length; + __be64 timestamp; +} __attribute__((packed)); + +extern struct nvram_os_partition oops_log_partition; + #ifdef CONFIG_PPC_PSERIES +extern struct nvram_os_partition rtas_log_partition; + extern int nvram_write_error_log(char * buff, int length, unsigned int err_type, unsigned int err_seq); extern int nvram_read_error_log(char * buff, int length, @@ -50,6 +81,23 @@ extern void pmac_xpram_write(int xpaddr, u8 data); /* Synchronize NVRAM */ extern voidnvram_sync(void); +/* Initialize NVRAM OS partition */ +extern int 
__init nvram_init_os_partition(struct nvram_os_partition *part); + +/* Initialize NVRAM oops partition */ +extern void __init nvram_init_oops_partition(int rtas_partition_exists); + +/* Read a NVRAM partition */ +extern int nvram_read_partition(struct nvram_os_partition *part, char *buff, + int length, unsigned int *err_type, + unsigned int *error_log_cnt); + +/* Write to NVRAM OS partition */ +extern int nvram_write_os_partition(struct nvram_os_partition *part, + char *buff, int length, + unsigned int err_type, + unsigned int error_log_cnt); + /* Determine NVRAM size */ extern ssize_t nvram_get_size(void); diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h index b390f55..bcf6693 100644 --- a/arch/powerpc/include/asm/rtas.h +++ b/arch/powerpc/include/asm/rtas.h @@ -343,8 +343,12 @@ extern int early_init_dt_scan_rtas(unsigned long node, extern void pSeries_log_error(char *buf, unsigned int err_type, int fatal); #ifdef CONFIG_PPC_PSERIES +extern unsigned long last_rtas_event; +extern int clobbering_unread_rtas_event(void); extern int pseries_devicetree_update(s32 scope); extern void post_mobility_fixup(void); +#else +int clobbering_unread_rtas_event(void) { return 0; } #endif #ifdef CONFIG_PPC_RTAS_DAEMON diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c index 34f7c9b..42e5c6a 100644 --- a/arch/powerpc/kernel/nvram_64.c +++ b/arch/powerpc/kernel/nvram_64.c @@ -26,6 +26,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -54,6 +57,659 @@ struct nvram_partition { static LIST_HEAD(nvram_partitions); +#ifdef CONFIG_PPC_PSERIES +struct nvram_os_partition rtas_log_partition = { + .name = "ibm,rtas-log", + .req_size = 2079, + .min_size = 1055, + .index = -1, + .os_partition = true +}; +#endif + +struct nvram_os_partition oops_log_partition = { + .name = "lnx,oops-log", + .req_size = 4000, + .min_size = 2000, + .index = -1, + .os_partition = true +}; + +static const char 
*nvram_os_partitions[] = { +#ifdef CONFIG_PPC_PSERIES + "ibm,rtas-log", +#endif + "lnx,oops-log", + NULL +}; + +static void oops_to_nvram(struct kmsg_dumper *dumper, + enum kmsg_dump_reason reason); + +static struct kmsg_dumper nvram_kmsg_dumper = { + .dump = oops_to_nvram +}; + +/* + * For capturing and compressing an oops or panic report... + + * big_oops_buf[] holds the uncompressed text we're capturi
[PATCH v3 2/3] pstore: Add pstore type id for firmware partition
This patch adds a pstore type id to be used for opal specific nvram partitions. Signed-off-by: Hari Bathini --- fs/pstore/inode.c |3 +++ include/linux/pstore.h |1 + 2 files changed, 4 insertions(+) diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 5041660..8e0c009 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -359,6 +359,9 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, case PSTORE_TYPE_PPC_COMMON: sprintf(name, "powerpc-common-%s-%lld", psname, id); break; + case PSTORE_TYPE_PPC_OPAL: + sprintf(name, "powerpc-opal-%s-%lld", psname, id); + break; case PSTORE_TYPE_UNKNOWN: sprintf(name, "unknown-%s-%lld", psname, id); break; diff --git a/include/linux/pstore.h b/include/linux/pstore.h index ece0c6b..af44980 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -39,6 +39,7 @@ enum pstore_type_id { PSTORE_TYPE_PPC_RTAS= 4, PSTORE_TYPE_PPC_OF = 5, PSTORE_TYPE_PPC_COMMON = 6, + PSTORE_TYPE_PPC_OPAL= 7, PSTORE_TYPE_UNKNOWN = 255 }; ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v3 3/3] pstore: add pstore support on powernv
This patch extends pstore, a generic interface to platform dependent persistent storage, support for powernv platform to capture certain useful information, during dying moments. Such support is already in place for pseries platform. This patch re-uses most of that code. Signed-off-by: Hari Bathini --- arch/powerpc/kernel/nvram_64.c | 25 +++-- arch/powerpc/platforms/powernv/opal-nvram.c | 10 ++ 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c index 42e5c6a..293da88 100644 --- a/arch/powerpc/kernel/nvram_64.c +++ b/arch/powerpc/kernel/nvram_64.c @@ -127,6 +127,14 @@ static size_t oops_data_sz; static struct z_stream_s stream; #ifdef CONFIG_PSTORE +#ifdef CONFIG_PPC_POWERNV +static struct nvram_os_partition skiboot_partition = { + .name = "ibm,skiboot", + .index = -1, + .os_partition = false +}; +#endif + #ifdef CONFIG_PPC_PSERIES static struct nvram_os_partition of_config_partition = { .name = "of-config", @@ -477,6 +485,16 @@ static ssize_t nvram_pstore_read(u64 *id, enum pstore_type_id *type, time->tv_nsec = 0; break; #endif +#ifdef CONFIG_PPC_POWERNV + case PSTORE_TYPE_PPC_OPAL: + sig = NVRAM_SIG_FW; + part = &skiboot_partition; + *type = PSTORE_TYPE_PPC_OPAL; + *id = PSTORE_TYPE_PPC_OPAL; + time->tv_sec = 0; + time->tv_nsec = 0; + break; +#endif default: return 0; } @@ -552,8 +570,11 @@ static int nvram_pstore_init(void) { int rc = 0; - nvram_type_ids[2] = PSTORE_TYPE_PPC_RTAS; - nvram_type_ids[3] = PSTORE_TYPE_PPC_OF; + if (machine_is(pseries)) { + nvram_type_ids[2] = PSTORE_TYPE_PPC_RTAS; + nvram_type_ids[3] = PSTORE_TYPE_PPC_OF; + } else + nvram_type_ids[2] = PSTORE_TYPE_PPC_OPAL; nvram_pstore_info.buf = oops_data; nvram_pstore_info.bufsize = oops_data_sz; diff --git a/arch/powerpc/platforms/powernv/opal-nvram.c b/arch/powerpc/platforms/powernv/opal-nvram.c index f9896fd..9db4398 100644 --- a/arch/powerpc/platforms/powernv/opal-nvram.c +++ 
b/arch/powerpc/platforms/powernv/opal-nvram.c @@ -16,6 +16,7 @@ #include #include +#include #include static unsigned int nvram_size; @@ -62,6 +63,15 @@ static ssize_t opal_nvram_write(char *buf, size_t count, loff_t *index) return count; } +static int __init opal_nvram_init_log_partitions(void) +{ + /* Scan nvram for partitions */ + nvram_scan_partitions(); + nvram_init_oops_partition(0); + return 0; +} +machine_arch_initcall(powernv, opal_nvram_init_log_partitions); + void __init opal_nvram_init(void) { struct device_node *np; ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH] PPC64: Adding symbols in vmcoreinfo to facilitate dump filtering
When CONFIG_SPARSEMEM_VMEMMAP option is used in kernel, makedumpfile fails to filter vmcore dump as it fails to do vmemmap translations. So far dump filtering on ppc64 never had to deal with vmemmap addresses separately as vmemmap regions were mapped in zone normal. But with the inclusion of CONFIG_SPARSEMEM_VMEMMAP config option in kernel, this vmemmap address translation support becomes necessary for dump filtering. For vmemmap address translation, a few kernel symbols are needed by dump filtering tool. This patch adds those symbols to vmcoreinfo, which a dump filtering tool can use for filtering the kernel dump. Tested these changes successfully with makedumpfile tool that supports vmemmap to physical address translation outside zone normal. Signed-off-by: Hari Bathini --- arch/powerpc/include/asm/pgalloc-64.h |4 arch/powerpc/kernel/machine_kexec.c | 12 2 files changed, 16 insertions(+) diff --git a/arch/powerpc/include/asm/pgalloc-64.h b/arch/powerpc/include/asm/pgalloc-64.h index f65e27b..33e507a 100644 --- a/arch/powerpc/include/asm/pgalloc-64.h +++ b/arch/powerpc/include/asm/pgalloc-64.h @@ -17,6 +17,10 @@ struct vmemmap_backing { unsigned long virt_addr; }; +#ifdef CONFIG_SPARSEMEM_VMEMMAP +extern struct vmemmap_backing *vmemmap_list; +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ + /* * Functions that deal with pagetables that could be at any level of * the table need to be passed an "index_size" so they know how to diff --git a/arch/powerpc/kernel/machine_kexec.c b/arch/powerpc/kernel/machine_kexec.c index e1ec57e..88a7fb4 100644 --- a/arch/powerpc/kernel/machine_kexec.c +++ b/arch/powerpc/kernel/machine_kexec.c @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -75,6 +76,17 @@ void arch_crash_save_vmcoreinfo(void) #ifndef CONFIG_NEED_MULTIPLE_NODES VMCOREINFO_SYMBOL(contig_page_data); #endif +#if defined(CONFIG_PPC64) && defined(CONFIG_SPARSEMEM_VMEMMAP) + VMCOREINFO_SYMBOL(vmemmap_list); + VMCOREINFO_SYMBOL(mmu_vmemmap_psize); + 
VMCOREINFO_SYMBOL(mmu_psize_defs); + VMCOREINFO_STRUCT_SIZE(vmemmap_backing); + VMCOREINFO_OFFSET(vmemmap_backing, list); + VMCOREINFO_OFFSET(vmemmap_backing, phys); + VMCOREINFO_OFFSET(vmemmap_backing, virt_addr); + VMCOREINFO_STRUCT_SIZE(mmu_psize_def); + VMCOREINFO_OFFSET(mmu_psize_def, shift); +#endif } /* ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v2] PPC64: Adding symbols in vmcoreinfo to facilitate dump filtering
When CONFIG_SPARSEMEM_VMEMMAP option is set in kernel, makedumpfile tool fails to filter vmcore dump as it fails to do translations for vmemmap addresses that are mapped outside zone normal. For vmemmap address translation support in this scenario, a few kernel symbols are needed by dump filtering tool. This patch adds those symbols to vmcoreinfo, which a dump filtering tool can use for filtering the kernel dump. These changes are tested successfully with makedumpfile tool that supports vmemmap to physical address translation outside zone normal. Changes from v1: Updated patch description and removed #ifdef around extern. Signed-off-by: Hari Bathini --- arch/powerpc/include/asm/pgalloc-64.h |2 ++ arch/powerpc/kernel/machine_kexec.c | 12 2 files changed, 14 insertions(+) diff --git a/arch/powerpc/include/asm/pgalloc-64.h b/arch/powerpc/include/asm/pgalloc-64.h index f65e27b..3973e62 100644 --- a/arch/powerpc/include/asm/pgalloc-64.h +++ b/arch/powerpc/include/asm/pgalloc-64.h @@ -17,6 +17,8 @@ struct vmemmap_backing { unsigned long virt_addr; }; +extern struct vmemmap_backing *vmemmap_list; + /* * Functions that deal with pagetables that could be at any level of * the table need to be passed an "index_size" so they know how to diff --git a/arch/powerpc/kernel/machine_kexec.c b/arch/powerpc/kernel/machine_kexec.c index e1ec57e..88a7fb4 100644 --- a/arch/powerpc/kernel/machine_kexec.c +++ b/arch/powerpc/kernel/machine_kexec.c @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -75,6 +76,17 @@ void arch_crash_save_vmcoreinfo(void) #ifndef CONFIG_NEED_MULTIPLE_NODES VMCOREINFO_SYMBOL(contig_page_data); #endif +#if defined(CONFIG_PPC64) && defined(CONFIG_SPARSEMEM_VMEMMAP) + VMCOREINFO_SYMBOL(vmemmap_list); + VMCOREINFO_SYMBOL(mmu_vmemmap_psize); + VMCOREINFO_SYMBOL(mmu_psize_defs); + VMCOREINFO_STRUCT_SIZE(vmemmap_backing); + VMCOREINFO_OFFSET(vmemmap_backing, list); + VMCOREINFO_OFFSET(vmemmap_backing, phys); + VMCOREINFO_OFFSET(vmemmap_backing, virt_addr); + 
VMCOREINFO_STRUCT_SIZE(mmu_psize_def); + VMCOREINFO_OFFSET(mmu_psize_def, shift); +#endif } /* ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v3 1/3] powerpc/nvram: move generic code for nvram and pstore
On 01/14/2015 10:01 AM, Michael Ellerman wrote: On Wed, 2014-12-24 at 17:28 +0530, Hari Bathini wrote: With minor checks, we can move most of the code for nvram under pseries to a common place to be re-used by other powerpc platforms like powernv. This patch moves such common code to arch/powerpc/kernel/nvram_64.c file. As I said in my reply to the previous version: ... you need to keep in mind that it is very common for us to build kernels with both POWERNV=y and PSERIES=y. So you need to make sure you're only using CONFIG_PPC_PSERIES to protect things that are optional on pseries. Not things that we *shouldn't* be doing on powernv. we could as well do away with the PPC_PSERIES flag in a couple of places in arch/powerpc/kernel/nvram_64.c, but doing that will unnecessarily add few extra variables for !PPC_PSERIES case. Please explain in your commit message how you have dealt with that. Sure. Will update the changelog Also, you broke the build for every config that doesn't have CONFIG_PPC_PSERIES, all 95 of them. 
This is pasemi_defconfig for example: My bad! clobbering_unread_rtas_event should have been static inline while defining under !PPC_PSERIES Thanks Hari LD arch/powerpc/mm/built-in.o arch/powerpc/mm/init_64.o: In function `clobbering_unread_rtas_event': init_64.c:(.opd+0x48): multiple definition of `clobbering_unread_rtas_event' arch/powerpc/mm/mem.o:mem.c:(.opd+0x90): first defined here arch/powerpc/mm/init_64.o: In function `.clobbering_unread_rtas_event': init_64.c:(.text+0x80): multiple definition of `.clobbering_unread_rtas_event' arch/powerpc/mm/mem.o:mem.c:(.text+0x2c0): first defined here CC arch/powerpc/kernel/udbg.o /home/kisskb/slave/src/scripts/Makefile.build:336: recipe for target 'arch/powerpc/mm/built-in.o' failed make[2]: *** [arch/powerpc/mm/built-in.o] Error 1 /home/kisskb/slave/src/Makefile:938: recipe for target 'arch/powerpc/mm' failed make[1]: *** [arch/powerpc/mm] Error 2 make[1]: *** Waiting for unfinished jobs cheers ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v3 1/3] powerpc/nvram: move generic code for nvram and pstore
On 01/15/2015 03:58 AM, Michael Ellerman wrote: On Wed, 2015-01-14 at 23:35 +0530, Hari Bathini wrote: On 01/14/2015 10:01 AM, Michael Ellerman wrote: On Wed, 2014-12-24 at 17:28 +0530, Hari Bathini wrote: With minor checks, we can move most of the code for nvram under pseries to a common place to be re-used by other powerpc platforms like powernv. This patch moves such common code to arch/powerpc/kernel/nvram_64.c file. As I said in my reply to the previous version: ... you need to keep in mind that it is very common for us to build kernels with both POWERNV=y and PSERIES=y. So you need to make sure you're only using CONFIG_PPC_PSERIES to protect things that are optional on pseries. Not things that we *shouldn't* be doing on powernv. we could as well do away with the PPC_PSERIES flag in a couple of places in arch/powerpc/kernel/nvram_64.c, but doing that will unnecessarily add few extra variables for !PPC_PSERIES case. Yep. I'm happy for them to be there, I just want you to explain in the changelog that you've thought about the PSERIES=y POWERNV=y case and why the code makes sense for that configuration. Please explain in your commit message how you have dealt with that. Sure. Will update the changelog Thanks. Also, you broke the build for every config that doesn't have CONFIG_PPC_PSERIES, all 95 of them. This is pasemi_defconfig for example: My bad! clobbering_unread_rtas_event should have been static inline while defining under !PPC_PSERIES Correct. Please make sure you test build at least some of the other configurations in future. I realise it's too time consuming to build all of them, but ideally for every config symbol you use in your patch you need to build a kernel config where that symbol =y and =n (and =m if it's tristate). Sure, Michael. 
I will keep this in mind :) Thanks Hari cheers ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v4 1/3] powerpc/nvram: move generic code for nvram and pstore
With minor checks, we can move most of the code for nvram under pseries to a common place to be re-used by other powerpc platforms like powernv. This patch moves such common code to arch/powerpc/kernel/nvram_64.c file. Signed-off-by: Hari Bathini --- arch/powerpc/include/asm/nvram.h | 50 ++ arch/powerpc/include/asm/rtas.h|4 arch/powerpc/kernel/nvram_64.c | 656 arch/powerpc/platforms/pseries/nvram.c | 665 4 files changed, 714 insertions(+), 661 deletions(-) diff --git a/arch/powerpc/include/asm/nvram.h b/arch/powerpc/include/asm/nvram.h index b0fe0fe..09a518b 100644 --- a/arch/powerpc/include/asm/nvram.h +++ b/arch/powerpc/include/asm/nvram.h @@ -9,12 +9,43 @@ #ifndef _ASM_POWERPC_NVRAM_H #define _ASM_POWERPC_NVRAM_H - +#include #include #include #include +/* + * Set oops header version to distinguish between old and new format header. + * lnx,oops-log partition max size is 4000, header version > 4000 will + * help in identifying new header. + */ +#define OOPS_HDR_VERSION 5000 + +struct err_log_info { + __be32 error_type; + __be32 seq_num; +}; + +struct nvram_os_partition { + const char *name; + int req_size; /* desired size, in bytes */ + int min_size; /* minimum acceptable size (0 means req_size) */ + long size; /* size of data portion (excluding err_log_info) */ + long index; /* offset of data portion of partition */ + bool os_partition; /* partition initialized by OS, not FW */ +}; + +struct oops_log_info { + __be16 version; + __be16 report_length; + __be64 timestamp; +} __attribute__((packed)); + +extern struct nvram_os_partition oops_log_partition; + #ifdef CONFIG_PPC_PSERIES +extern struct nvram_os_partition rtas_log_partition; + extern int nvram_write_error_log(char * buff, int length, unsigned int err_type, unsigned int err_seq); extern int nvram_read_error_log(char * buff, int length, @@ -50,6 +81,23 @@ extern void pmac_xpram_write(int xpaddr, u8 data); /* Synchronize NVRAM */ extern voidnvram_sync(void); +/* Initialize NVRAM OS partition */ +extern int 
__init nvram_init_os_partition(struct nvram_os_partition *part); + +/* Initialize NVRAM oops partition */ +extern void __init nvram_init_oops_partition(int rtas_partition_exists); + +/* Read a NVRAM partition */ +extern int nvram_read_partition(struct nvram_os_partition *part, char *buff, + int length, unsigned int *err_type, + unsigned int *error_log_cnt); + +/* Write to NVRAM OS partition */ +extern int nvram_write_os_partition(struct nvram_os_partition *part, + char *buff, int length, + unsigned int err_type, + unsigned int error_log_cnt); + /* Determine NVRAM size */ extern ssize_t nvram_get_size(void); diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h index b390f55..123d7ff 100644 --- a/arch/powerpc/include/asm/rtas.h +++ b/arch/powerpc/include/asm/rtas.h @@ -343,8 +343,12 @@ extern int early_init_dt_scan_rtas(unsigned long node, extern void pSeries_log_error(char *buf, unsigned int err_type, int fatal); #ifdef CONFIG_PPC_PSERIES +extern unsigned long last_rtas_event; +extern int clobbering_unread_rtas_event(void); extern int pseries_devicetree_update(s32 scope); extern void post_mobility_fixup(void); +#else +static inline int clobbering_unread_rtas_event(void) { return 0; } #endif #ifdef CONFIG_PPC_RTAS_DAEMON diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c index 34f7c9b..42e5c6a 100644 --- a/arch/powerpc/kernel/nvram_64.c +++ b/arch/powerpc/kernel/nvram_64.c @@ -26,6 +26,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -54,6 +57,659 @@ struct nvram_partition { static LIST_HEAD(nvram_partitions); +#ifdef CONFIG_PPC_PSERIES +struct nvram_os_partition rtas_log_partition = { + .name = "ibm,rtas-log", + .req_size = 2079, + .min_size = 1055, + .index = -1, + .os_partition = true +}; +#endif + +struct nvram_os_partition oops_log_partition = { + .name = "lnx,oops-log", + .req_size = 4000, + .min_size = 2000, + .index = -1, + .os_partition = true +}; + +static const 
char *nvram_os_partitions[] = { +#ifdef CONFIG_PPC_PSERIES + "ibm,rtas-log", +#endif + "lnx,oops-log", + NULL +}; + +static void oops_to_nvram(struct kmsg_dumper *dumper, + enum kmsg_dump_reason reason); + +static struct kmsg_dumper nvram_kmsg_dumper = { + .dump = oops_to_nvram +}; + +/* + * For capturing and compressing an oops or panic report... + + * big_oops_buf[] holds the uncompressed text we
[PATCH v4 0/3] powerpc/pstore: Add pstore support for nvram partitions
This patch series adds pstore support on powernv platform to read different nvram partitions and write compressed data to oops-log nvram partition. As pseries platform already has pstore support, this series moves most of the common code for pseries and powernv platforms to a common file. Tested the patches successfully on both pseries and powernv platforms. Also, tested the patches successfully, on a kernel compiled with both CONFIG_PPC_PSERIES=y & CONFIG_PPC_POWERNV=y. Changes from v3: 1. Updated the changelog 2. Resolved compile issues with !CONFIG_PPC_PSERIES --- Hari Bathini (3): powerpc/nvram: move generic code for nvram and pstore pstore: Add pstore type id for PPC64 opal nvram partition pstore: add pstore support on powernv arch/powerpc/include/asm/nvram.h| 50 ++ arch/powerpc/include/asm/rtas.h |4 arch/powerpc/kernel/nvram_64.c | 677 +++ arch/powerpc/platforms/powernv/opal-nvram.c | 10 arch/powerpc/platforms/pseries/nvram.c | 665 --- fs/pstore/inode.c |3 include/linux/pstore.h |1 7 files changed, 749 insertions(+), 661 deletions(-) -- - Hari ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v4 2/3] pstore: Add pstore type id for PPC64 opal nvram partition
This patch adds a new PPC64 partition type to be used for opal specific nvram partition. A new partition type is needed as none of the existing type matches this partition type. Signed-off-by: Hari Bathini Cc: Anton Vorontsov Cc: Colin Cross Cc: Kees Cook Cc: Tony Luck --- fs/pstore/inode.c |3 +++ include/linux/pstore.h |1 + 2 files changed, 4 insertions(+) diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 5041660..8e0c009 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -359,6 +359,9 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, case PSTORE_TYPE_PPC_COMMON: sprintf(name, "powerpc-common-%s-%lld", psname, id); break; + case PSTORE_TYPE_PPC_OPAL: + sprintf(name, "powerpc-opal-%s-%lld", psname, id); + break; case PSTORE_TYPE_UNKNOWN: sprintf(name, "unknown-%s-%lld", psname, id); break; diff --git a/include/linux/pstore.h b/include/linux/pstore.h index ece0c6b..af44980 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -39,6 +39,7 @@ enum pstore_type_id { PSTORE_TYPE_PPC_RTAS= 4, PSTORE_TYPE_PPC_OF = 5, PSTORE_TYPE_PPC_COMMON = 6, + PSTORE_TYPE_PPC_OPAL= 7, PSTORE_TYPE_UNKNOWN = 255 }; ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v4 3/3] pstore: add pstore support on powernv
This patch extends pstore, a generic interface to platform dependent persistent storage, support for powernv platform to capture certain useful information, during dying moments. Such support is already in place for pseries platform. This patch re-uses most of that code. It is a common practice to compile kernels with both CONFIG_PPC_PSERIES=y and CONFIG_PPC_POWERNV=y. The code in nvram_init_oops_partition() routine still works as intended, as the caller is platform specific code which passes the appropriate value for "rtas_partition_exists" parameter. In all other places, where CONFIG_PPC_PSERIES or CONFIG_PPC_POWERNV flag is used in this patchset, it is to reduce the kernel size in cases where this flag is not set and doesn't have any impact logic wise. Signed-off-by: Hari Bathini Cc: Anton Vorontsov Cc: Colin Cross Cc: Kees Cook Cc: Tony Luck --- arch/powerpc/kernel/nvram_64.c | 25 +++-- arch/powerpc/platforms/powernv/opal-nvram.c | 10 ++ 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c index 42e5c6a..293da88 100644 --- a/arch/powerpc/kernel/nvram_64.c +++ b/arch/powerpc/kernel/nvram_64.c @@ -127,6 +127,14 @@ static size_t oops_data_sz; static struct z_stream_s stream; #ifdef CONFIG_PSTORE +#ifdef CONFIG_PPC_POWERNV +static struct nvram_os_partition skiboot_partition = { + .name = "ibm,skiboot", + .index = -1, + .os_partition = false +}; +#endif + #ifdef CONFIG_PPC_PSERIES static struct nvram_os_partition of_config_partition = { .name = "of-config", @@ -477,6 +485,16 @@ static ssize_t nvram_pstore_read(u64 *id, enum pstore_type_id *type, time->tv_nsec = 0; break; #endif +#ifdef CONFIG_PPC_POWERNV + case PSTORE_TYPE_PPC_OPAL: + sig = NVRAM_SIG_FW; + part = &skiboot_partition; + *type = PSTORE_TYPE_PPC_OPAL; + *id = PSTORE_TYPE_PPC_OPAL; + time->tv_sec = 0; + time->tv_nsec = 0; + break; +#endif default: return 0; } @@ -552,8 +570,11 @@ static int nvram_pstore_init(void) { int rc = 0; - 
nvram_type_ids[2] = PSTORE_TYPE_PPC_RTAS; - nvram_type_ids[3] = PSTORE_TYPE_PPC_OF; + if (machine_is(pseries)) { + nvram_type_ids[2] = PSTORE_TYPE_PPC_RTAS; + nvram_type_ids[3] = PSTORE_TYPE_PPC_OF; + } else + nvram_type_ids[2] = PSTORE_TYPE_PPC_OPAL; nvram_pstore_info.buf = oops_data; nvram_pstore_info.bufsize = oops_data_sz; diff --git a/arch/powerpc/platforms/powernv/opal-nvram.c b/arch/powerpc/platforms/powernv/opal-nvram.c index f9896fd..9db4398 100644 --- a/arch/powerpc/platforms/powernv/opal-nvram.c +++ b/arch/powerpc/platforms/powernv/opal-nvram.c @@ -16,6 +16,7 @@ #include #include +#include #include static unsigned int nvram_size; @@ -62,6 +63,15 @@ static ssize_t opal_nvram_write(char *buf, size_t count, loff_t *index) return count; } +static int __init opal_nvram_init_log_partitions(void) +{ + /* Scan nvram for partitions */ + nvram_scan_partitions(); + nvram_init_oops_partition(0); + return 0; +} +machine_arch_initcall(powernv, opal_nvram_init_log_partitions); + void __init opal_nvram_init(void) { struct device_node *np; ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v4 1/3] powerpc/nvram: move generic code for nvram and pstore
On 01/30/2015 10:12 PM, Arnd Bergmann wrote: On Friday 30 January 2015 20:44:00 Hari Bathini wrote: With minor checks, we can move most of the code for nvram under pseries to a common place to be re-used by other powerpc platforms like powernv. This patch moves such common code to arch/powerpc/kernel/nvram_64.c file. Signed-off-by: Hari Bathini Can you make this y2038-safe in the process, possibly as a follow-up patch? Arnd, sorry for the delayed response. I will add these changes to this patch-set and re-spin.. Thanks Hari +extern unsigned long last_rtas_event; time64_t + } + oops_hdr->version = cpu_to_be16(OOPS_HDR_VERSION); + oops_hdr->report_length = cpu_to_be16(zipped_len); + oops_hdr->timestamp = cpu_to_be64(get_seconds()); + return 0; ktime_get_real_seconds() +static ssize_t nvram_pstore_read(u64 *id, enum pstore_type_id *type, + int *count, struct timespec *time, char **buf, + bool *compressed, struct pstore_info *psi) This has to remain timespec for now but can later be changed to timespec64 when the API gets changed. + oops_hdr->version = cpu_to_be16(OOPS_HDR_VERSION); + oops_hdr->report_length = cpu_to_be16(text_len); + oops_hdr->timestamp = cpu_to_be64(get_seconds()); ktime_get_real_seconds() Arnd ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v5 0/4] powerpc/pstore: Add pstore support for nvram partitions
This patch series adds pstore support on powernv platform to read different nvram partitions and write compressed data to oops-log nvram partition. As pseries platform already has pstore support, this series moves most of the common code for pseries and powernv platforms to a common file. Tested the patches successfully on both pseries and powernv platforms. Also, tested the patches successfully, on a kernel compiled with both CONFIG_PPC_PSERIES=y & CONFIG_PPC_POWERNV=y. Changes from v4: 1. Added a patch for y2038-safe code changes --- Hari Bathini (4): powerpc/nvram: move generic code for nvram and pstore pstore: Add pstore type id for PPC64 opal nvram partition pstore: add pstore support on powernv powerpc: make timestamp related code y2038-safe arch/powerpc/include/asm/nvram.h| 50 ++ arch/powerpc/include/asm/rtas.h |5 arch/powerpc/kernel/nvram_64.c | 677 +++ arch/powerpc/platforms/powernv/opal-nvram.c | 10 arch/powerpc/platforms/pseries/nvram.c | 673 --- fs/pstore/inode.c |3 include/linux/pstore.h |1 7 files changed, 754 insertions(+), 665 deletions(-) -- -Hari ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v5 1/4] powerpc/nvram: move generic code for nvram and pstore
With minor checks, we can move most of the code for nvram under pseries to a common place to be re-used by other powerpc platforms like powernv. This patch moves such common code to arch/powerpc/kernel/nvram_64.c file. Signed-off-by: Hari Bathini --- arch/powerpc/include/asm/nvram.h | 50 ++ arch/powerpc/include/asm/rtas.h|4 arch/powerpc/kernel/nvram_64.c | 656 arch/powerpc/platforms/pseries/nvram.c | 665 4 files changed, 714 insertions(+), 661 deletions(-) diff --git a/arch/powerpc/include/asm/nvram.h b/arch/powerpc/include/asm/nvram.h index b0fe0fe..09a518b 100644 --- a/arch/powerpc/include/asm/nvram.h +++ b/arch/powerpc/include/asm/nvram.h @@ -9,12 +9,43 @@ #ifndef _ASM_POWERPC_NVRAM_H #define _ASM_POWERPC_NVRAM_H - +#include #include #include #include +/* + * Set oops header version to distinguish between old and new format header. + * lnx,oops-log partition max size is 4000, header version > 4000 will + * help in identifying new header. + */ +#define OOPS_HDR_VERSION 5000 + +struct err_log_info { + __be32 error_type; + __be32 seq_num; +}; + +struct nvram_os_partition { + const char *name; + int req_size; /* desired size, in bytes */ + int min_size; /* minimum acceptable size (0 means req_size) */ + long size; /* size of data portion (excluding err_log_info) */ + long index; /* offset of data portion of partition */ + bool os_partition; /* partition initialized by OS, not FW */ +}; + +struct oops_log_info { + __be16 version; + __be16 report_length; + __be64 timestamp; +} __attribute__((packed)); + +extern struct nvram_os_partition oops_log_partition; + #ifdef CONFIG_PPC_PSERIES +extern struct nvram_os_partition rtas_log_partition; + extern int nvram_write_error_log(char * buff, int length, unsigned int err_type, unsigned int err_seq); extern int nvram_read_error_log(char * buff, int length, @@ -50,6 +81,23 @@ extern void pmac_xpram_write(int xpaddr, u8 data); /* Synchronize NVRAM */ extern voidnvram_sync(void); +/* Initialize NVRAM OS partition */ +extern int 
__init nvram_init_os_partition(struct nvram_os_partition *part); + +/* Initialize NVRAM oops partition */ +extern void __init nvram_init_oops_partition(int rtas_partition_exists); + +/* Read a NVRAM partition */ +extern int nvram_read_partition(struct nvram_os_partition *part, char *buff, + int length, unsigned int *err_type, + unsigned int *error_log_cnt); + +/* Write to NVRAM OS partition */ +extern int nvram_write_os_partition(struct nvram_os_partition *part, + char *buff, int length, + unsigned int err_type, + unsigned int error_log_cnt); + /* Determine NVRAM size */ extern ssize_t nvram_get_size(void); diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h index b390f55..123d7ff 100644 --- a/arch/powerpc/include/asm/rtas.h +++ b/arch/powerpc/include/asm/rtas.h @@ -343,8 +343,12 @@ extern int early_init_dt_scan_rtas(unsigned long node, extern void pSeries_log_error(char *buf, unsigned int err_type, int fatal); #ifdef CONFIG_PPC_PSERIES +extern unsigned long last_rtas_event; +extern int clobbering_unread_rtas_event(void); extern int pseries_devicetree_update(s32 scope); extern void post_mobility_fixup(void); +#else +static inline int clobbering_unread_rtas_event(void) { return 0; } #endif #ifdef CONFIG_PPC_RTAS_DAEMON diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c index 34f7c9b..42e5c6a 100644 --- a/arch/powerpc/kernel/nvram_64.c +++ b/arch/powerpc/kernel/nvram_64.c @@ -26,6 +26,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -54,6 +57,659 @@ struct nvram_partition { static LIST_HEAD(nvram_partitions); +#ifdef CONFIG_PPC_PSERIES +struct nvram_os_partition rtas_log_partition = { + .name = "ibm,rtas-log", + .req_size = 2079, + .min_size = 1055, + .index = -1, + .os_partition = true +}; +#endif + +struct nvram_os_partition oops_log_partition = { + .name = "lnx,oops-log", + .req_size = 4000, + .min_size = 2000, + .index = -1, + .os_partition = true +}; + +static const 
char *nvram_os_partitions[] = { +#ifdef CONFIG_PPC_PSERIES + "ibm,rtas-log", +#endif + "lnx,oops-log", + NULL +}; + +static void oops_to_nvram(struct kmsg_dumper *dumper, + enum kmsg_dump_reason reason); + +static struct kmsg_dumper nvram_kmsg_dumper = { + .dump = oops_to_nvram +}; + +/* + * For capturing and compressing an oops or panic report... + + * big_oops_buf[] holds the uncompressed text we
[PATCH v5 2/4] pstore: Add pstore type id for PPC64 opal nvram partition
This patch adds a new PPC64 partition type to be used for the OPAL-specific nvram partition. A new partition type is needed as none of the existing types matches this partition type. Signed-off-by: Hari Bathini --- fs/pstore/inode.c |3 +++ include/linux/pstore.h |1 + 2 files changed, 4 insertions(+) diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 5041660..8e0c009 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -359,6 +359,9 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, case PSTORE_TYPE_PPC_COMMON: sprintf(name, "powerpc-common-%s-%lld", psname, id); break; + case PSTORE_TYPE_PPC_OPAL: + sprintf(name, "powerpc-opal-%s-%lld", psname, id); + break; case PSTORE_TYPE_UNKNOWN: sprintf(name, "unknown-%s-%lld", psname, id); break; diff --git a/include/linux/pstore.h b/include/linux/pstore.h index ece0c6b..af44980 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -39,6 +39,7 @@ enum pstore_type_id { PSTORE_TYPE_PPC_RTAS= 4, PSTORE_TYPE_PPC_OF = 5, PSTORE_TYPE_PPC_COMMON = 6, + PSTORE_TYPE_PPC_OPAL= 7, PSTORE_TYPE_UNKNOWN = 255 }; ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v5 3/4] pstore: add pstore support on powernv
This patch extends pstore, a generic interface to platform dependent persistent storage, support for powernv platform to capture certain useful information, during dying moments. Such support is already in place for pseries platform. This patch re-uses most of that code. It is a common practice to compile kernels with both CONFIG_PPC_PSERIES=y and CONFIG_PPC_POWERNV=y. The code in nvram_init_oops_partition() routine still works as intended, as the caller is platform specific code which passes the appropriate value for "rtas_partition_exists" parameter. In all other places, where CONFIG_PPC_PSERIES or CONFIG_PPC_POWERNV flag is used in this patchset, it is to reduce the kernel size in cases where this flag is not set and doesn't have any impact logic wise. Signed-off-by: Hari Bathini --- arch/powerpc/kernel/nvram_64.c | 25 +++-- arch/powerpc/platforms/powernv/opal-nvram.c | 10 ++ 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c index 42e5c6a..293da88 100644 --- a/arch/powerpc/kernel/nvram_64.c +++ b/arch/powerpc/kernel/nvram_64.c @@ -127,6 +127,14 @@ static size_t oops_data_sz; static struct z_stream_s stream; #ifdef CONFIG_PSTORE +#ifdef CONFIG_PPC_POWERNV +static struct nvram_os_partition skiboot_partition = { + .name = "ibm,skiboot", + .index = -1, + .os_partition = false +}; +#endif + #ifdef CONFIG_PPC_PSERIES static struct nvram_os_partition of_config_partition = { .name = "of-config", @@ -477,6 +485,16 @@ static ssize_t nvram_pstore_read(u64 *id, enum pstore_type_id *type, time->tv_nsec = 0; break; #endif +#ifdef CONFIG_PPC_POWERNV + case PSTORE_TYPE_PPC_OPAL: + sig = NVRAM_SIG_FW; + part = &skiboot_partition; + *type = PSTORE_TYPE_PPC_OPAL; + *id = PSTORE_TYPE_PPC_OPAL; + time->tv_sec = 0; + time->tv_nsec = 0; + break; +#endif default: return 0; } @@ -552,8 +570,11 @@ static int nvram_pstore_init(void) { int rc = 0; - nvram_type_ids[2] = PSTORE_TYPE_PPC_RTAS; - nvram_type_ids[3] = 
PSTORE_TYPE_PPC_OF; + if (machine_is(pseries)) { + nvram_type_ids[2] = PSTORE_TYPE_PPC_RTAS; + nvram_type_ids[3] = PSTORE_TYPE_PPC_OF; + } else + nvram_type_ids[2] = PSTORE_TYPE_PPC_OPAL; nvram_pstore_info.buf = oops_data; nvram_pstore_info.bufsize = oops_data_sz; diff --git a/arch/powerpc/platforms/powernv/opal-nvram.c b/arch/powerpc/platforms/powernv/opal-nvram.c index f9896fd..9db4398 100644 --- a/arch/powerpc/platforms/powernv/opal-nvram.c +++ b/arch/powerpc/platforms/powernv/opal-nvram.c @@ -16,6 +16,7 @@ #include #include +#include #include static unsigned int nvram_size; @@ -62,6 +63,15 @@ static ssize_t opal_nvram_write(char *buf, size_t count, loff_t *index) return count; } +static int __init opal_nvram_init_log_partitions(void) +{ + /* Scan nvram for partitions */ + nvram_scan_partitions(); + nvram_init_oops_partition(0); + return 0; +} +machine_arch_initcall(powernv, opal_nvram_init_log_partitions); + void __init opal_nvram_init(void) { struct device_node *np; ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v5 4/4] powerpc: make timestamp related code y2038-safe
While we are here, let us make timestamp related code y2038-safe. Suggested-by: Arnd Bergmann Signed-off-by: Hari Bathini --- arch/powerpc/include/asm/rtas.h|3 ++- arch/powerpc/kernel/nvram_64.c |6 +++--- arch/powerpc/platforms/pseries/nvram.c | 10 +- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h index 123d7ff..efa9152 100644 --- a/arch/powerpc/include/asm/rtas.h +++ b/arch/powerpc/include/asm/rtas.h @@ -4,6 +4,7 @@ #include #include +#include /* * Definitions for talking to the RTAS on CHRP machines. @@ -343,7 +344,7 @@ extern int early_init_dt_scan_rtas(unsigned long node, extern void pSeries_log_error(char *buf, unsigned int err_type, int fatal); #ifdef CONFIG_PPC_PSERIES -extern unsigned long last_rtas_event; +extern time64_t last_rtas_event; extern int clobbering_unread_rtas_event(void); extern int pseries_devicetree_update(s32 scope); extern void post_mobility_fixup(void); diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c index 293da88..1e703f8 100644 --- a/arch/powerpc/kernel/nvram_64.c +++ b/arch/powerpc/kernel/nvram_64.c @@ -376,7 +376,7 @@ static int zip_oops(size_t text_len) } oops_hdr->version = cpu_to_be16(OOPS_HDR_VERSION); oops_hdr->report_length = cpu_to_be16(zipped_len); - oops_hdr->timestamp = cpu_to_be64(get_seconds()); + oops_hdr->timestamp = cpu_to_be64(ktime_get_real_seconds()); return 0; } @@ -423,7 +423,7 @@ static int nvram_pstore_write(enum pstore_type_id type, oops_hdr->version = cpu_to_be16(OOPS_HDR_VERSION); oops_hdr->report_length = cpu_to_be16(size); - oops_hdr->timestamp = cpu_to_be64(get_seconds()); + oops_hdr->timestamp = cpu_to_be64(ktime_get_real_seconds()); if (compressed) err_type = ERR_TYPE_KERNEL_PANIC_GZ; @@ -721,7 +721,7 @@ static void oops_to_nvram(struct kmsg_dumper *dumper, err_type = ERR_TYPE_KERNEL_PANIC; oops_hdr->version = cpu_to_be16(OOPS_HDR_VERSION); oops_hdr->report_length = cpu_to_be16(text_len); - 
oops_hdr->timestamp = cpu_to_be64(get_seconds()); + oops_hdr->timestamp = cpu_to_be64(ktime_get_real_seconds()); } (void) nvram_write_os_partition(&oops_log_partition, oops_buf, diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c index 97b8fc6..d77713b 100644 --- a/arch/powerpc/platforms/pseries/nvram.c +++ b/arch/powerpc/platforms/pseries/nvram.c @@ -37,10 +37,10 @@ static DEFINE_SPINLOCK(nvram_lock); /* See clobbering_unread_rtas_event() */ #define NVRAM_RTAS_READ_TIMEOUT 5 /* seconds */ -static unsigned long last_unread_rtas_event; /* timestamp */ +static time64_t last_unread_rtas_event;/* timestamp */ #ifdef CONFIG_PSTORE -unsigned long last_rtas_event; +time64_t last_rtas_event; #endif static ssize_t pSeries_nvram_read(char *buf, size_t count, loff_t *index) @@ -145,9 +145,9 @@ int nvram_write_error_log(char * buff, int length, int rc = nvram_write_os_partition(&rtas_log_partition, buff, length, err_type, error_log_cnt); if (!rc) { - last_unread_rtas_event = get_seconds(); + last_unread_rtas_event = ktime_get_real_seconds(); #ifdef CONFIG_PSTORE - last_rtas_event = get_seconds(); + last_rtas_event = ktime_get_real_seconds(); #endif } @@ -201,7 +201,7 @@ int clobbering_unread_rtas_event(void) { return (oops_log_partition.index == rtas_log_partition.index && last_unread_rtas_event - && get_seconds() - last_unread_rtas_event <= + && ktime_get_real_seconds() - last_unread_rtas_event <= NVRAM_RTAS_READ_TIMEOUT); } ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v5 2/4] pstore: Add pstore type id for PPC64 opal nvram partition
On 02/06/2015 01:06 AM, Hari Bathini wrote: This patch adds a new PPC64 partition type to be used for opal specific nvram partition. A new partition type is needed as none of the existing type matches this partition type. Signed-off-by: Hari Bathini This patch series is reviewed by Kees. Reference link: https://lkml.org/lkml/2015/2/5/651 Reviewed-by: Kees Cook Thanks Hari --- fs/pstore/inode.c |3 +++ include/linux/pstore.h |1 + 2 files changed, 4 insertions(+) diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 5041660..8e0c009 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -359,6 +359,9 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, case PSTORE_TYPE_PPC_COMMON: sprintf(name, "powerpc-common-%s-%lld", psname, id); break; + case PSTORE_TYPE_PPC_OPAL: + sprintf(name, "powerpc-opal-%s-%lld", psname, id); + break; case PSTORE_TYPE_UNKNOWN: sprintf(name, "unknown-%s-%lld", psname, id); break; diff --git a/include/linux/pstore.h b/include/linux/pstore.h index ece0c6b..af44980 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -39,6 +39,7 @@ enum pstore_type_id { PSTORE_TYPE_PPC_RTAS= 4, PSTORE_TYPE_PPC_OF = 5, PSTORE_TYPE_PPC_COMMON = 6, + PSTORE_TYPE_PPC_OPAL= 7, PSTORE_TYPE_UNKNOWN = 255 }; ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH] ppc64/book3s: copy interrupts till __end_handlers marker instead of __end_interrupts
Some of the interrupt vectors on 64-bit POWER server processors are only 32 bytes long (8 instructions), which is not enough for the full first-level interrupt handler. For these we need to branch to an out-of-line (OOL) handler. But when we are running a relocatable kernel, interrupt vectors till __end_interrupts marker are copied down to real address 0x100. So, branching to labels (read OOL handlers) outside this section should be handled differently (see LOAD_HANDLER()), considering relocatable kernel, which would need at least 4 instructions. However, branching from interrupt vector means that we corrupt the CFAR (come-from address register) on POWER7 and later processors as mentioned in commit 1707dd16. So, EXCEPTION_PROLOG_0 (6 instructions) that contains the part up to the point where the CFAR is saved in the PACA should be part of the short interrupt vectors before we branch out to OOL handlers. But as mentioned already, there are interrupt vectors on 64-bit POWER server processors that are only 32 bytes long (like vectors 0x4f00, 0x4f20, etc.), which cannot accommodate the above two cases at the same time owing to space constraint. Currently, in these interrupt vectors, we simply branch out to OOL handlers, without using LOAD_HANDLER(), which leaves us vulnerable when running a relocatable kernel (e.g. kdump case). While this has been the case for some time now and kdump is used widely, we were fortunate not to see any problems so far, for three reasons: 1. In almost all cases, production kernel (relocatable) is used for kdump as well, which would mean that crashed kernel's OOL handler would be at the same place where we end up branching to, from short interrupt vector of kdump kernel. 2. Also, OOL handler was unlikely the reason for crash in almost all the kdump scenarios, which meant we had a sane OOL handler from crashed kernel that we branched to. 3. 
On most 64-bit POWER server processors, page size is large enough that marking interrupt vector code as executable (see commit 429d2e83) leads to marking OOL handler code from crashed kernel, that sits right below interrupt vector code from kdump kernel, as executable as well. Let us fix this undependable code path firstly, by moving down __end_handlers marker down past OOL handlers. Secondly, copying interrupt vectors down till __end_handlers marker instead of __end_interrupts, when running a relocatable kernel, to make sure we endup in relocated (kdump) kernel's OOL handler instead of crashed kernel's. Thirdly, by marking all the interrupt vector code that is copied down to real address 0x100 as executable, considering the relocation on exception feature that allows exceptions to be raised in virtual mode (IR=DR=1). This fix has been tested successfully in kdump scenario, on a lpar with 4K page size by using different default/production kernel and kdump kernel. Signed-off-by: Hari Bathini Signed-off-by: Mahesh Salgaonkar --- arch/powerpc/include/asm/sections.h |3 ++- arch/powerpc/kernel/exceptions-64s.S |8 arch/powerpc/kernel/head_64.S|2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/sections.h b/arch/powerpc/include/asm/sections.h index abf5866..b4139a5 100644 --- a/arch/powerpc/include/asm/sections.h +++ b/arch/powerpc/include/asm/sections.h @@ -10,6 +10,7 @@ extern char __start_interrupts[]; extern char __end_interrupts[]; +extern char __end_handlers[]; extern char __prom_init_toc_start[]; extern char __prom_init_toc_end[]; @@ -39,7 +40,7 @@ static inline int overlaps_interrupt_vector_text(unsigned long start, { unsigned long real_start, real_end; real_start = __start_interrupts - _stext; - real_end = __end_interrupts - _stext; + real_end = __end_handlers - _stext; return start < (unsigned long)__va(real_end) && (unsigned long)__va(real_start) < end; diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S index 7716ceb..98e2ce5 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1230,10 +1230,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX) STD_EXCEPTION_COMMON(0xf60, facility_unavailable, facility_unavailable_exception) STD_EXCEPTION_COMMON(0xf80, hv_facility_unavailable, facility_unavailable_exception) - .align 7 - .globl __end_handlers -__end_handlers: - /* Equivalents to the above handlers for relocation-on interrupt vectors */ STD_RELON_EXCEPTION_HV_OOL(0xe40, emulation_assist) MASKABLE_RELON_EXCEPTION_HV_OOL(0xe80, h_doorbell) @@ -1244,6 +1240,10 @@ __end_handlers: STD_RELON_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable) STD_RELON_EXCEPTION_HV_OOL(0xf80, hv_facility_unavailable) + .align 7 + .globl __end_handlers +_
Re: ppc64/book3s: copy interrupts till __end_handlers marker instead of __end_interrupts
On 03/29/2016 03:47 PM, Michael Ellerman wrote: Hi Hari, You win the "Best Change Log of the Year" award. Some comments below ... On Mon, 2016-28-03 at 11:23:22 UTC, Hari Bathini wrote: Some of the interrupt vectors on 64-bit POWER server processors are only 32 bytes long (8 instructions), which is not enough for the full first-level interrupt handler. For these we need to branch to an out- of-line (OOL) handler. But when we are running a relocatable kernel, interrupt vectors till __end_interrupts marker are copied down to real address 0x100. So, branching to labels (read OOL handlers) outside this section should be handled differently (see LOAD_HANDLER()), considering relocatable kernel, which would need atleast 4 instructions. However, branching from interrupt vector means that we corrupt the CFAR (come-from address register) on POWER7 and later processors as mentioned in commit 1707dd16. So, EXCEPTION_PROLOG_0 (6 instructions) that contains the part up to the point where the CFAR is saved in the PACA should be part of the short interrupt vectors before we branch out to OOL handlers. But as mentioned already, there are interrupt vectors on 64-bit POWER server processors that are only 32 bytes long (like vectors 0x4f00, 0x4f20, etc.), which cannot accomodate the above two cases at the same time owing to space constraint. Currently, in these interrupt vectors, we simply branch out to OOL handlers, without using LOAD_HANDLER(), which leaves us vulnerable when running a relocatable kernel (eg. kdump case). While this has been the case for sometime now and kdump is used widely, we were fortunate not to see any problems so far, for three reasons: 1. In almost all cases, production kernel (relocatable) is used for kdump as well, which would mean that crashed kernel's OOL handler would be at the same place where we endup branching to, from short interrupt vector of kdump kernel. 2. 
Also, OOL handler was unlikely the reason for crash in almost all the kdump scenarios, which meant we had a sane OOL handler from crashed kernel that we branched to. 3. On most 64-bit POWER server processors, page size is large enough that marking interrupt vector code as executable (see commit 429d2e83) leads to marking OOL handler code from crashed kernel, that sits right below interrupt vector code from kdump kernel, as executable as well. Let us fix this undependable code path firstly, by moving down __end_handlers marker down past OOL handlers. Secondly, copying interrupt vectors down till __end_handlers marker instead of __end_interrupts, when running a relocatable kernel, to make sure we endup in relocated (kdump) kernel's OOL handler instead of crashed kernel's. Thirdly, by marking all the interrupt vector code that is copied down to real address 0x100 as executable, considering the relocation on exception feature that allows exceptions to be raised in virtual mode (IR=DR=1). This fix has been tested successfully in kdump scenario, on a lpar with 4K page size by using different default/production kernel and kdump kernel. So I think you've missed one important case. My bad! I missed out on considering this case.. In do_final_fixups() we recopy the (now patched) kernel code down to zero. That code uses __end_interrupts as its limit, so I think if you look closely your OOL handlers down at zero will not have had feature fixups applied to them. I think perhaps the better fix is just to move __end_interrupts down (up) to the right location. AFAICS all users of __end_interrupts actually want that address. It would also mean we could remove __end_handlers as unused. True. This sounds less complicated. So can you please check that I'm right about do_final_fixups(), and then try moving __end_interrupts and check that works? Yeah. Testing the patch. Will post it soon. Thanks for the review! 
- Hari ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v2] ppc64/book3s: fix branching to out of line handlers in relocation kernel
Some of the interrupt vectors on 64-bit POWER server processors are only 32 bytes long (8 instructions), which is not enough for the full first-level interrupt handler. For these we need to branch to an out-of-line (OOL) handler. But when we are running a relocatable kernel, interrupt vectors till __end_interrupts marker are copied down to real address 0x100. So, branching to labels (read OOL handlers) outside this section should be handled differently (see LOAD_HANDLER()), considering relocatable kernel, which would need at least 4 instructions. However, branching from interrupt vector means that we corrupt the CFAR (come-from address register) on POWER7 and later processors as mentioned in commit 1707dd16. So, EXCEPTION_PROLOG_0 (6 instructions) that contains the part up to the point where the CFAR is saved in the PACA should be part of the short interrupt vectors before we branch out to OOL handlers. But as mentioned already, there are interrupt vectors on 64-bit POWER server processors that are only 32 bytes long (like vectors 0x4f00, 0x4f20, etc.), which cannot accommodate the above two cases at the same time owing to space constraint. Currently, in these interrupt vectors, we simply branch out to OOL handlers, without using LOAD_HANDLER(), which leaves us vulnerable when running a relocatable kernel (e.g. kdump case). While this has been the case for some time now and kdump is used widely, we were fortunate not to see any problems so far, for three reasons: 1. In almost all cases, production kernel (relocatable) is used for kdump as well, which would mean that crashed kernel's OOL handler would be at the same place where we end up branching to, from short interrupt vector of kdump kernel. 2. Also, OOL handler was unlikely the reason for crash in almost all the kdump scenarios, which meant we had a sane OOL handler from crashed kernel that we branched to. 3. 
On most 64-bit POWER server processors, page size is large enough that marking interrupt vector code as executable (see commit 429d2e83) leads to marking OOL handler code from crashed kernel, that sits right below interrupt vector code from kdump kernel, as executable as well. Let us fix this undependable code path by moving the __end_interrupts marker down past OOL handlers to make sure that we also copy OOL handlers to real address 0x100 when running a relocatable kernel. This helps in cases discussed above, where interrupt vectors are not long enough to branch out to OOL handlers with LOAD_HANDLER(). While we are here, let us remove the virtually insignificant __end_handlers marker. This fix has been tested successfully in kdump scenario, on a lpar with 4K page size by using different default/production kernel and kdump kernel. Signed-off-by: Hari Bathini Signed-off-by: Mahesh Salgaonkar --- changes from v1: 1. Changed the subject from "copy interrupts till __end_handlers marker instead of __end_interrupts" to a more generic one 2. Used __end_interrupts marker instead of __end_handlers to make the fix less complicated. 3. Removed unused __end_handlers marker. arch/powerpc/kernel/exceptions-64s.S | 23 --- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 7716ceb..e598580 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -764,8 +764,8 @@ kvmppc_skip_Hinterrupt: #endif /* - * Code from here down to __end_handlers is invoked from the - * exception prologs above. Because the prologs assemble the + * Code from here down to end of out of line handlers is invoked from + * the exception prologs above. Because the prologs assemble the * addresses of these handlers using the LOAD_HANDLER macro, * which uses an ori instruction, these handlers must be in * the first 64k of the kernel image. 
@@ -953,11 +953,6 @@ hv_facility_unavailable_relon_trampoline: #endif STD_RELON_EXCEPTION_PSERIES(0x5700, 0x1700, altivec_assist) - /* Other future vectors */ - .align 7 - .globl __end_interrupts -__end_interrupts: - .align 7 system_call_entry: b system_call_common @@ -1230,10 +1225,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX) STD_EXCEPTION_COMMON(0xf60, facility_unavailable, facility_unavailable_exception) STD_EXCEPTION_COMMON(0xf80, hv_facility_unavailable, facility_unavailable_exception) - .align 7 - .globl __end_handlers -__end_handlers: - /* Equivalents to the above handlers for relocation-on interrupt vectors */ STD_RELON_EXCEPTION_HV_OOL(0xe40, emulation_assist) MASKABLE_RELON_EXCEPTION_HV_OOL(0xe80, h_doorbell) @@ -1244,6 +1235,16 @@ __end_handlers: STD_RELON_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable) STD_RELON_EXCEPTION_HV_OOL(0xf80, hv_facil
Re: [v2] ppc64/book3s: fix branching to out of line handlers in relocation kernel
On 03/30/2016 05:55 AM, Michael Ellerman wrote: On Tue, 2016-29-03 at 18:34:37 UTC, Hari Bathini wrote: diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 7716ceb..e598580 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -764,8 +764,8 @@ kvmppc_skip_Hinterrupt: #endif /* - * Code from here down to __end_handlers is invoked from the - * exception prologs above. Because the prologs assemble the + * Code from here down to end of out of line handlers is invoked from + * the exception prologs above. Because the prologs assemble the I think it would be better to just replace __end_handlers with __end_interrupts, that way it's entirely clear what location you're talking about. @@ -953,11 +953,6 @@ hv_facility_unavailable_relon_trampoline: #endif STD_RELON_EXCEPTION_PSERIES(0x5700, 0x1700, altivec_assist) - /* Other future vectors */ - .align 7 - .globl __end_interrupts -__end_interrupts: - .align 7 system_call_entry: b system_call_common @@ -1230,10 +1225,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX) STD_EXCEPTION_COMMON(0xf60, facility_unavailable, facility_unavailable_exception) STD_EXCEPTION_COMMON(0xf80, hv_facility_unavailable, facility_unavailable_exception) - .align 7 - .globl __end_handlers -__end_handlers: - Sorry I wasn't clear in my last mail, please do this as a separate cleanup patch after this patch. ok.. @@ -1244,6 +1235,16 @@ __end_handlers: STD_RELON_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable) STD_RELON_EXCEPTION_HV_OOL(0xf80, hv_facility_unavailable) + /* FIXME: For now, let us move the __end_interrupts marker down past Why is it FIXME? In general I don't want to merge code that adds a FIXME unless there is some very good reason. AFAICS this is a permanent solution isn't it? 
Except for a few short interrupt vectors like 0x4f00, 04f20, etc., all other vectors defined till __end_interrupts marker ensure that LOAD_HANDLER() is used for branching to labels like system_call_entry, data_access_common, etc. that are currently not copied to real 0 in relocation case. So, we are forced to move the __end_interrupts marker down only to handle space constraint in the short vectors. So, I added the FIXME to remind the scope for improvement in the code. But after thinking over again now, moving the marker down makes us copy an additional 1~2 KB along with the 21~22 KB that we are copying already. So, not much of an improvement to lose sleep over or to add a FIXME, I guess. Your thoughts? Also, FIXME is the reason, why I did not replace __end_handlers with __end_interrupts in the comment earlier. +* the out-of-line handlers, to make sure we also copy OOL handlers +* to real adress 0x100 when running a relocatable kernel. This helps It doesn't "help" it's 100% required. Yep. Will change the wording. Thanks for the review! - Hari +* in cases where interrupt vectors are not long enough (like 0x4f00, +* 0x4f20, etc.) to branch out to OOL handlers with LOAD_HANDLER(). +*/ + .align 7 + .globl __end_interrupts +__end_interrupts: + #if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) /* * Data area reserved for FWNMI option. cheers ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [v2] ppc64/book3s: fix branching to out of line handlers in relocation kernel
On 03/30/2016 12:44 PM, Hari Bathini wrote: On 03/30/2016 05:55 AM, Michael Ellerman wrote: On Tue, 2016-29-03 at 18:34:37 UTC, Hari Bathini wrote: diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 7716ceb..e598580 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -764,8 +764,8 @@ kvmppc_skip_Hinterrupt: #endif /* - * Code from here down to __end_handlers is invoked from the - * exception prologs above. Because the prologs assemble the + * Code from here down to end of out of line handlers is invoked from + * the exception prologs above. Because the prologs assemble the I think it would be better to just replace __end_handlers with __end_interrupts, that way it's entirely clear what location you're talking about. @@ -953,11 +953,6 @@ hv_facility_unavailable_relon_trampoline: #endif STD_RELON_EXCEPTION_PSERIES(0x5700, 0x1700, altivec_assist) -/* Other future vectors */ -.align7 -.globl__end_interrupts -__end_interrupts: - .align7 system_call_entry: bsystem_call_common @@ -1230,10 +1225,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX) STD_EXCEPTION_COMMON(0xf60, facility_unavailable, facility_unavailable_exception) STD_EXCEPTION_COMMON(0xf80, hv_facility_unavailable, facility_unavailable_exception) -.align7 -.globl__end_handlers -__end_handlers: - Sorry I wasn't clear in my last mail, please do this as a separate cleanup patch after this patch. ok.. @@ -1244,6 +1235,16 @@ __end_handlers: STD_RELON_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable) STD_RELON_EXCEPTION_HV_OOL(0xf80, hv_facility_unavailable) +/* FIXME: For now, let us move the __end_interrupts marker down past Why is it FIXME? In general I don't want to merge code that adds a FIXME unless there is some very good reason. AFAICS this is a permanent solution isn't it? 
Except for a few short interrupt vectors like 0x4f00, 04f20, etc., all other vectors defined till __end_interrupts marker ensure that LOAD_HANDLER() is used for branching to labels like system_call_entry, data_access_common, etc. that are currently not copied to real 0 in relocation case. So, we are forced to move the __end_interrupts marker down only to handle space constraint in the short vectors. So, I added the FIXME to remind the scope for improvement in the code. But after thinking over again now, moving the marker down makes us copy an additional 1~2 KB along with the 21~22 KB that we are copying already. So, not much of an improvement to lose sleep over or to add a FIXME, I guess. Your thoughts? Alternatively, how about moving the OOLs handlers that can't be branched with LOAD_HANDLER under __end_interrupts. This way we won't be copying more than a few absolutely needed handlers. STD_RELON_EXCEPTION_HV_OOL(0xe40, emulation_assist) . . STD_RELON_EXCEPTION_HV_OOL(0xf80, hv_facility_unavailable) We can leave __end_handlers marker to indicate code that should be part of the first 64K of kernel image. Thanks Hari Also, FIXME is the reason, why I did not replace __end_handlers with __end_interrupts in the comment earlier. + * the out-of-line handlers, to make sure we also copy OOL handlers + * to real adress 0x100 when running a relocatable kernel. This helps It doesn't "help" it's 100% required. Yep. Will change the wording. Thanks for the review! - Hari + * in cases where interrupt vectors are not long enough (like 0x4f00, + * 0x4f20, etc.) to branch out to OOL handlers with LOAD_HANDLER(). + */ +.align7 +.globl__end_interrupts +__end_interrupts: + #if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) /* * Data area reserved for FWNMI option. 
cheers ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [v2] ppc64/book3s: fix branching to out of line handlers in relocation kernel
On 03/30/2016 04:47 PM, Michael Ellerman wrote: On Wed, 2016-03-30 at 13:14 +0530, Hari Bathini wrote: Alternatively, how about moving the OOLs handlers that can't be branched with LOAD_HANDLER under __end_interrupts. This way we won't be copying more than a few absolutely needed handlers. STD_RELON_EXCEPTION_HV_OOL(0xe40, emulation_assist) . . STD_RELON_EXCEPTION_HV_OOL(0xf80, hv_facility_unavailable) We can leave __end_handlers marker to indicate code that should be part of the first 64K of kernel image. That might work. But I suspect you will run into issues with ".org backwards", ie. running out of space in head_64.S But try it and let me know if it works. It worked. Doing some sanity testing. Will post v3 soon with this approach. I think we also need to write a script or little C program which looks at the vmlinux and checks that nothing below __end_whatever does a direct branch. So that we don't break it again in future. Yep. That would make life easy.. Let me see if I can do something about it. Thanks Hari ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v3] ppc64/book3s: fix branching to out of line handlers in relocation kernel
Some of the interrupt vectors on 64-bit POWER server processors are only 32 bytes long (8 instructions), which is not enough for the full first-level interrupt handler. For these we need to branch to an out-of-line (OOL) handler. But when we are running a relocatable kernel, interrupt vectors till __end_interrupts marker are copied down to real address 0x100. So, branching to labels (read OOL handlers) outside this section should be handled differently (see LOAD_HANDLER()), considering relocatable kernel, which would need at least 4 instructions. However, branching from interrupt vector means that we corrupt the CFAR (come-from address register) on POWER7 and later processors as mentioned in commit 1707dd16. So, EXCEPTION_PROLOG_0 (6 instructions) that contains the part up to the point where the CFAR is saved in the PACA should be part of the short interrupt vectors before we branch out to OOL handlers. But as mentioned already, there are interrupt vectors on 64-bit POWER server processors that are only 32 bytes long (like vectors 0x4f00, 0x4f20, etc.), which cannot accommodate the above two cases at the same time owing to space constraint. Currently, in these interrupt vectors, we simply branch out to OOL handlers, without using LOAD_HANDLER(), which leaves us vulnerable when running a relocatable kernel (e.g. kdump case). While this has been the case for some time now and kdump is used widely, we were fortunate not to see any problems so far, for three reasons: 1. In almost all cases, production kernel (relocatable) is used for kdump as well, which would mean that crashed kernel's OOL handler would be at the same place where we end up branching to, from short interrupt vector of kdump kernel. 2. Also, OOL handler was unlikely the reason for crash in almost all the kdump scenarios, which meant we had a sane OOL handler from crashed kernel that we branched to. 3. 
On most 64-bit POWER server processors, page size is large enough that marking interrupt vector code as executable (see commit 429d2e83) leads to marking OOL handler code from crashed kernel, that sits right below interrupt vector code from kdump kernel, as executable as well. Let us fix this undependable code path by moving these OOL handlers below __end_interrupts marker to make sure we also copy these handlers to real address 0x100 when running a relocatable kernel. Because the interrupt vectors branching to these OOL handlers are not long enough to use LOAD_HANDLER() for branching as discussed above. This fix has been tested successfully in kdump scenario, on a lpar with 4K page size by using different default/production kernel and kdump kernel. Signed-off-by: Hari Bathini Signed-off-by: Mahesh Salgaonkar --- changes from v2: 2. Move the OOL handlers before __end_interrupts marker instead of moving the __end_interrupts marker 3. Leave __end_handlers marker as is. arch/powerpc/kernel/exceptions-64s.S | 29 +++-- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 7716ceb..9ac3a38 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -953,6 +953,25 @@ hv_facility_unavailable_relon_trampoline: #endif STD_RELON_EXCEPTION_PSERIES(0x5700, 0x1700, altivec_assist) + /* +* Out-Of-Line handlers for relocation-on interrupt vectors +* +* We need these OOL handlers to be below __end_interrupts +* marker to enusre we also copy these OOL handlers along +* with the interrupt vectors to real address 0x100 when +* running a relocatable kernel. Because the interrupt +* vectors branching to these OOL handlers are not long +* enough to use LOAD_HANDLER() for branching. 
+*/ + STD_RELON_EXCEPTION_HV_OOL(0xe40, emulation_assist) + MASKABLE_RELON_EXCEPTION_HV_OOL(0xe80, h_doorbell) + + STD_RELON_EXCEPTION_PSERIES_OOL(0xf00, performance_monitor) + STD_RELON_EXCEPTION_PSERIES_OOL(0xf20, altivec_unavailable) + STD_RELON_EXCEPTION_PSERIES_OOL(0xf40, vsx_unavailable) + STD_RELON_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable) + STD_RELON_EXCEPTION_HV_OOL(0xf80, hv_facility_unavailable) + /* Other future vectors */ .align 7 .globl __end_interrupts @@ -1234,16 +1253,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX) .globl __end_handlers __end_handlers: - /* Equivalents to the above handlers for relocation-on interrupt vectors */ - STD_RELON_EXCEPTION_HV_OOL(0xe40, emulation_assist) - MASKABLE_RELON_EXCEPTION_HV_OOL(0xe80, h_doorbell) - - STD_RELON_EXCEPTION_PSERIES_OOL(0xf00, performance_monitor) - STD_RELON_EXCEPTION_PSERIES_OOL(0xf20, altivec_unavailable) - STD_RELON_EXCEPTION_PSERIES_OOL(0xf40, vsx_unavai
Re: [PATCH v3] ppc64/book3s: fix branching to out of line handlers in relocation kernel
On 04/01/2016 11:44 AM, Michael Ellerman wrote: On Wed, 2016-03-30 at 23:49 +0530, Hari Bathini wrote: Some of the interrupt vectors on 64-bit POWER server processors are only 32 bytes long (8 instructions), which is not enough for the full ... Let us fix this undependable code path by moving these OOL handlers below __end_interrupts marker to make sure we also copy these handlers to real address 0x100 when running a relocatable kernel. Because the interrupt vectors branching to these OOL handlers are not long enough to use LOAD_HANDLER() for branching as discussed above. ... changes from v2: 2. Move the OOL handlers before __end_interrupts marker instead of moving the __end_interrupts marker 3. Leave __end_handlers marker as is. Hi Hari, Thanks for trying this. In the end I've decided it's not a good option. If you build an allmodconfig, and turn on CONFIG_RELOCATABLE, and then look at the disassembly, you see this: c000000000006ffc: 48 00 29 04 b c000000000009900 <.ret_from_except> c000000000007000 <__end_handlers>: At 0x7000 we have the FWNMI area, which is fixed and can't move. As you see above we end up with only 4 bytes of space between the end of the handlers and the FWNMI area. So any tiny change that adds two more instructions prior to 0x7000 will then fail to build. Hi Michael, I agree. But the OOL handlers that are moved up in v3 were below 0x7000 earlier as well and moving them below __end_interrupts marker shouldn't make any difference in terms of space consumption at least in comparison between v2 & v3. So, I guess picking either v2 or v3 doesn't change this for better. Also, there is code between __end_interrupts and __end_handlers that is not location dependent as long as it is within 64K (0x10000) that can be moved above 0x8000, if need be. For these reasons, I feel v3 is better going forward as it keeps __start_interrupts to __end_interrupts code compact and leaves alone the code that doesn't need to be copied to real 0. Am I missing something here? 
Thanks Hari None of that's your fault, it's just the nature of the code in there, it's very space constrained. For now I'll take your v2, but I'll edit the comment and drop the removal of __end_handlers. cheers ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v3] ppc64/book3s: fix branching to out of line handlers in relocation kernel
On 04/01/2016 04:07 PM, Michael Ellerman wrote: On Fri, 2016-04-01 at 12:23 +0530, Hari Bathini wrote: On 04/01/2016 11:44 AM, Michael Ellerman wrote: On Wed, 2016-03-30 at 23:49 +0530, Hari Bathini wrote: Some of the interrupt vectors on 64-bit POWER server processors are only 32 bytes long (8 instructions), which is not enough for the full ... Let us fix this undependable code path by moving these OOL handlers below __end_interrupts marker to make sure we also copy these handlers to real address 0x100 when running a relocatable kernel. Because the interrupt vectors branching to these OOL handlers are not long enough to use LOAD_HANDLER() for branching as discussed above. ... changes from v2: 2. Move the OOL handlers before __end_interrupts marker instead of moving the __end_interrupts marker 3. Leave __end_handlers marker as is. Hi Hari, Thanks for trying this. In the end I've decided it's not a good option. If you build an allmodconfig, and turn on CONFIG_RELOCATABLE, and then look at the disassembly, you see this: c0006ffc: 48 00 29 04 b c0009900 <.ret_from_except> c0007000 <__end_handlers>: At 0x7000 we have the FWNMI area, which is fixed and can't move. As you see above we end up with only 4 bytes of space between the end of the handlers and the FWNMI area. So any tiny change that adds two more instructions prior to 0x7000 will then fail to build. Hi Michael, I agree. But the OOL handlers that are moved up in v3 were below 0x7000 earlier as well and moving them below __end_interrupts marker shouldn't make any difference in terms of space consumption at least in comparison between v2 & v3. So, I guess picking either v2 or v3 doesn't change this for better. It does make a difference, due to alignment. Prior to your patch we have ~24 bytes free. Hi Michael, Hmmm.. I thought ~24 bytes was not such a difference but with the scenario you mentioned it does sound critical. Actually, this patch came into being for want of another 8~12 bytes. 
So, I should have known better about space constraint. Also, there is code between __end_interrupts and __end_handlers that is not location dependent as long as it is within 64K (0x10000) that can be moved above 0x8000, if need be. That's true, but that sort of change is unlikely to backport well. And we need to backport this fix to everything. That does sound like a maintainer's nightmare. But if you can get that to work I'll consider it. I tried quickly but couldn't get it working, due to problems with the feature else sections being too far away from. Same case. May need some time to get that right. Also, exploring holes between __start_interrupts & __end_interrupts. Will try and get back on this soon. If none of this works, we have v2 anyway. Thanks Hari ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v4 3/3] ppc64/book3s: remove __end_handlers marker
__end_handlers marker was intended to mark the point up to which code gets called from exception prologs. But that hasn't kept pace with code changes. Case in point: slb_miss_realmode is called from exception prolog code but isn't below __end_handlers marker. So, __end_handlers marker is as good as a comment but could be misleading at times if it isn't in sync with the code, as is the case now. So, let us avoid this confusion by having a better comment and removing __end_handlers marker altogether. Signed-off-by: Hari Bathini --- arch/powerpc/kernel/exceptions-64s.S | 13 - 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index c193ebd..80f9fc4 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -764,11 +764,10 @@ kvmppc_skip_Hinterrupt: #endif /* - * Code from here down to __end_handlers is invoked from the - * exception prologs above. Because the prologs assemble the - * addresses of these handlers using the LOAD_HANDLER macro, - * which uses an ori instruction, these handlers must be in - * the first 64k of the kernel image. + * Ensure that any handlers that get invoked from the exception prologs + * above are below the first 64KB (0x10000) of the kernel image because + * the prologs assemble the addresses of these handlers using the + * LOAD_HANDLER macro, which uses an ori instruction. */ /*** Common interrupt handlers ***/ @@ -1243,10 +1242,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX) bl vsx_unavailable_exception b ret_from_except - .align 7 - .globl __end_handlers -__end_handlers: - #if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) /* * Data area reserved for FWNMI option. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v4 2/3] ppc64/book3s: make some room for common interrupt vector code
With the previous patch, we choke out whatever little space is left below 0x7000 (FWNMI hard block) while there is a hole of ~1400 bytes below __end_interrupts marker when CONFIG_CBE_RAS is disabled. Considering CONFIG_CBE_RAS is not enabled by default for BOOK3S, this is not a desirable scenario especially when we have to worry about each additional instruction that goes below 0x7000. Memory region from 0x1800 to 0x4000 is dedicated for common interrupt vector code. Also, we never hit an interrupt below 0x300 when IR=DR=1 implying memory region between 0x4000 to 0x4300 can also be used for common interrupt vector code. So, we can effectively use memory region between 0x1800 to 0x4300 for common interrupt vector code. This patch tries to free up some space below 0x7000 by rearranging the common interrupt vector code. The approach here is to avoid large holes below 0x4300 for any kernel configuration. For this, let us move common interrupt vector code that only gets enabled with CONFIG_CBE_RAS above 0x8000, as it doesn't need to be too close to the call sites and can be branched to with LOAD_HANDLER() as long as it is within the first 64KB (0x10000) of the kernel image. Instead, let's move common interrupt vector code marked h_instr_storage_common, facility_unavailable_common & hv_facility_unavailable_common below 0x4300. This leaves ~250 bytes free below 0x4300 and ~1150 bytes free below 0x7000 - enough space to stop worrying about every additional instruction that goes below 0x7000. This patch assumes at least commit 376af594, part of the patch series that starts with commit 468a3302, is part of the code to avoid messy compilation issues like: relocation truncated to fit: R_PPC64_REL14 against `.text'+1c90 Makefile:864: recipe for target 'vmlinux' failed I tested this patch successfully on ppc64, ppc64le lpars and baremetal environments. Couldn't test it on IBM cell blade though but expecting no problems with this patch in IBM cell blade environment as well. 
If someone can test this patch in cell platform, it would be great. Signed-off-by: Hari Bathini --- arch/powerpc/kernel/exceptions-64s.S | 20 ++-- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index f76b2f3..c193ebd 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -786,6 +786,7 @@ kvmppc_skip_Hinterrupt: STD_EXCEPTION_COMMON(0xb00, trap_0b, unknown_exception) STD_EXCEPTION_COMMON(0xd00, single_step, single_step_exception) STD_EXCEPTION_COMMON(0xe00, trap_0e, unknown_exception) + STD_EXCEPTION_COMMON(0xe20, h_instr_storage, unknown_exception) STD_EXCEPTION_COMMON(0xe40, emulation_assist, emulation_assist_interrupt) STD_EXCEPTION_COMMON_ASYNC(0xe60, hmi_exception, handle_hmi_exception) #ifdef CONFIG_PPC_DOORBELL @@ -794,6 +795,9 @@ kvmppc_skip_Hinterrupt: STD_EXCEPTION_COMMON_ASYNC(0xe80, h_doorbell, unknown_exception) #endif STD_EXCEPTION_COMMON_ASYNC(0xf00, performance_monitor, performance_monitor_exception) + STD_EXCEPTION_COMMON(0xf60, facility_unavailable, facility_unavailable_exception) + STD_EXCEPTION_COMMON(0xf80, hv_facility_unavailable, facility_unavailable_exception) + STD_EXCEPTION_COMMON(0x1300, instruction_breakpoint, instruction_breakpoint_exception) STD_EXCEPTION_COMMON(0x1502, denorm, unknown_exception) #ifdef CONFIG_ALTIVEC @@ -801,11 +805,6 @@ kvmppc_skip_Hinterrupt: #else STD_EXCEPTION_COMMON(0x1700, altivec_assist, unknown_exception) #endif -#ifdef CONFIG_CBE_RAS - STD_EXCEPTION_COMMON(0x1200, cbe_system_error, cbe_system_error_exception) - STD_EXCEPTION_COMMON(0x1600, cbe_maintenance, cbe_maintenance_exception) - STD_EXCEPTION_COMMON(0x1800, cbe_thermal, cbe_thermal_exception) -#endif /* CONFIG_CBE_RAS */ /* * Relocation-on interrupts: A subset of the interrupts can be delivered @@ -1029,8 +1028,6 @@ instruction_access_common: li r5,0x400 b do_hash_page/* Try to handle as hpte fault */ - 
STD_EXCEPTION_COMMON(0xe20, h_instr_storage, unknown_exception) - /* * Here is the common SLB miss user that is used when going to virtual * mode for SLB misses, that is currently not used @@ -1246,9 +1243,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX) bl vsx_unavailable_exception b ret_from_except - STD_EXCEPTION_COMMON(0xf60, facility_unavailable, facility_unavailable_exception) - STD_EXCEPTION_COMMON(0xf80, hv_facility_unavailable, facility_unavailable_exception) - .align 7 .globl __end_handlers __end_handlers: @@ -1268,6 +1262,12 @@ fwnmi_data_area: . = 0x8000 #endif /* defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) */ +#ifdef CONFIG_CBE_RAS + STD_EXCEPTION_CO
[PATCH v4 1/3] ppc64/book3s: fix branching to out of line handlers in relocation kernel
Some of the interrupt vectors on 64-bit POWER server processors are only 32 bytes long (8 instructions), which is not enough for the full first-level interrupt handler. For these we need to branch to an out-of-line (OOL) handler. But when we are running a relocatable kernel, interrupt vectors till __end_interrupts marker are copied down to real address 0x100. So, branching to labels (read OOL handlers) outside this section should be handled differently (see LOAD_HANDLER()), considering relocatable kernel, which would need at least 4 instructions. However, branching from interrupt vector means that we corrupt the CFAR (come-from address register) on POWER7 and later processors as mentioned in commit 1707dd16. So, EXCEPTION_PROLOG_0 (6 instructions) that contains the part up to the point where the CFAR is saved in the PACA should be part of the short interrupt vectors before we branch out to OOL handlers. But as mentioned already, there are interrupt vectors on 64-bit POWER server processors that are only 32 bytes long (like vectors 0x4f00, 0x4f20, etc.), which cannot accommodate the above two cases at the same time owing to space constraint. Currently, in these interrupt vectors, we simply branch out to OOL handlers, without using LOAD_HANDLER(), which leaves us vulnerable when running a relocatable kernel (e.g. kdump case). While this has been the case for some time now and kdump is used widely, we were fortunate not to see any problems so far, for three reasons: 1. In almost all cases, production kernel (relocatable) is used for kdump as well, which would mean that crashed kernel's OOL handler would be at the same place where we end up branching to, from short interrupt vector of kdump kernel. 2. Also, OOL handler was unlikely the reason for crash in almost all the kdump scenarios, which meant we had a sane OOL handler from crashed kernel that we branched to. 3. 
On most 64-bit POWER server processors, page size is large enough that marking interrupt vector code as executable (see commit 429d2e83) leads to marking OOL handler code from crashed kernel, that sits right below interrupt vector code from kdump kernel, as executable as well. Let us fix this undependable code path by moving these OOL handlers below __end_interrupts marker to make sure we also copy these handlers to real address 0x100 when running a relocatable kernel. Because the interrupt vectors branching to these OOL handlers are not long enough to use LOAD_HANDLER() for branching as discussed above. This fix has been tested successfully in kdump scenario, on a lpar with 4K page size by using different default/production kernel and kdump kernel. Signed-off-by: Hari Bathini Signed-off-by: Mahesh Salgaonkar --- Michael, I did test this patchset in different scenarios. But if you feel the change is too radical, we could go with version2. But I thought this was worth a shot. changes from v3: 1. No changes in this patch except for a spellcheck 2. A new patch that tries to free up space below 0x7000 (2/3) 3. A new patch to remove __end_handlers marker (3/3) arch/powerpc/kernel/exceptions-64s.S | 29 +++-- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 7716ceb..f76b2f3 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -953,6 +953,25 @@ hv_facility_unavailable_relon_trampoline: #endif STD_RELON_EXCEPTION_PSERIES(0x5700, 0x1700, altivec_assist) + /* +* Out-Of-Line handlers for relocation-on interrupt vectors +* +* We need these OOL handlers to be below __end_interrupts +* marker to ensure we also copy these OOL handlers along +* with the interrupt vectors to real address 0x100 when +* running a relocatable kernel. 
Because the interrupt +* vectors branching to these OOL handlers are not long +* enough to use LOAD_HANDLER() for branching. +*/ + STD_RELON_EXCEPTION_HV_OOL(0xe40, emulation_assist) + MASKABLE_RELON_EXCEPTION_HV_OOL(0xe80, h_doorbell) + + STD_RELON_EXCEPTION_PSERIES_OOL(0xf00, performance_monitor) + STD_RELON_EXCEPTION_PSERIES_OOL(0xf20, altivec_unavailable) + STD_RELON_EXCEPTION_PSERIES_OOL(0xf40, vsx_unavailable) + STD_RELON_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable) + STD_RELON_EXCEPTION_HV_OOL(0xf80, hv_facility_unavailable) + /* Other future vectors */ .align 7 .globl __end_interrupts @@ -1234,16 +1253,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX) .globl __end_handlers __end_handlers: - /* Equivalents to the above handlers for relocation-on interrupt vectors */ - STD_RELON_EXCEPTION_HV_OOL(0xe40, emulation_assist) - MASKABLE_RELON_EXCEPTION_HV_OOL(0xe80, h_doo
Re: [v4, 2/3] ppc64/book3s: make some room for common interrupt vector code
On 04/15/2016 06:29 PM, Michael Ellerman wrote: On Fri, 2016-04-15 at 21:06 +1000, Michael Ellerman wrote: Hi Hari, Thanks for persisting with this. On Thu, 2016-07-04 at 21:58:50 UTC, Hari Bathini wrote: With the previous patch, we choke out whatever little space is left below 0x7000 (FWNMI hard block) while there is a hole of ~1400 bytes below __end_interrupts marker when CONFIG_CBE_RAS is disabled. Considering CONFIG_CBE_RAS is not enabled by default for BOOK3S, this is not a desirable scenario especially when we have to worry about each additional instruction that goes below 0x7000. Memory region from 0x1800 to 0x4000 is dedicated for common interrupt vector code. Also, we never hit an interrupt below 0x300 when IR=DR=1 implying memory region between 0x4000 to 0x4300 can also be used for common interrupt vector code. So, we can effectively use memory region between 0x1800 to 0x4300 for common interrupt vector code. On Power9 the system-call-vectored instruction will use the region at 0x3000, so moving code into that space is not a good long term plan. I'll take your v2 and put it in next next week. I'll add this fixes line, which I think is correct: Fixes: c1fb6816fb1b ("powerpc: Add relocation on exception vector handlers") Yeah. Thanks! cheers ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH] powerpc: on crash, kexec'ed kernel needs all CPUs are online
On 10/16/2015 12:30 AM, Laurent Vivier wrote: On kexec, all secondary offline CPUs are onlined before starting the new kernel, this is not done in the case of kdump. If kdump is configured and a kernel crash occurs whereas some secondaries CPUs are offline (SMT=off), the new kernel is not able to start them and displays some "Processor X is stuck.". Starting with POWER8, subcore logic relies on all threads of core being booted. So, on startup kernel tries to start all threads, and asks OPAL (or RTAS) to start all CPUs (including threads). If a CPU has been offlined by the previous kernel, it has not been returned to OPAL, and thus OPAL cannot restart it: this CPU has been lost... Signed-off-by: Laurent Vivier Hi Laurent, Sorry for jumping too late into this. Are you seeing this issue even with the below patches: pseries: http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=c1caae3de46a072d0855729aed6e793e536a4a55 opal/powernv: https://github.com/open-power/skiboot/commit/9ee56b5 Thanks Hari --- arch/powerpc/kernel/crash.c | 20 1 file changed, 20 insertions(+) diff --git a/arch/powerpc/kernel/crash.c b/arch/powerpc/kernel/crash.c index 51dbace..3ca9452 100644 --- a/arch/powerpc/kernel/crash.c +++ b/arch/powerpc/kernel/crash.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -299,11 +300,30 @@ int crash_shutdown_unregister(crash_shutdown_t handler) } EXPORT_SYMBOL(crash_shutdown_unregister); +/* + * The next kernel will try to start all secondary CPUs and if + * there are not online it will fail to start them. 
+ * + */ +static void wake_offline_cpus(void) +{ + int cpu = 0; + + for_each_present_cpu(cpu) { + if (!cpu_online(cpu)) { + pr_info("kexec: Waking offline cpu %d.\n", cpu); + cpu_up(cpu); + } + } +} + void default_machine_crash_shutdown(struct pt_regs *regs) { unsigned int i; int (*old_handler)(struct pt_regs *regs); + wake_offline_cpus(); + /* * This function is only called after the system * has panicked or is otherwise in a critical state. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH] powerpc: on crash, kexec'ed kernel needs all CPUs are online
On 11/05/2015 07:02 AM, David Gibson wrote: On Wed, 4 Nov 2015 14:54:51 +0100 Laurent Vivier wrote: On 04/11/2015 13:34, Hari Bathini wrote: On 10/16/2015 12:30 AM, Laurent Vivier wrote: On kexec, all secondary offline CPUs are onlined before starting the new kernel, this is not done in the case of kdump. If kdump is configured and a kernel crash occurs whereas some secondaries CPUs are offline (SMT=off), the new kernel is not able to start them and displays some "Processor X is stuck.". Starting with POWER8, subcore logic relies on all threads of core being booted. So, on startup kernel tries to start all threads, and asks OPAL (or RTAS) to start all CPUs (including threads). If a CPU has been offlined by the previous kernel, it has not been returned to OPAL, and thus OPAL cannot restart it: this CPU has been lost... Signed-off-by: Laurent Vivier Hi Laurent, Hi Hari, Sorry for jumping too late into this. better late than never :) Are you seeing this issue even with the below patches: pseries: http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=c1caae3de46a072d0855729aed6e793e536a4a55 Unfortunately, this is unlikely to be relevant - this fixes a failure while setting up the kexec. The problem we see occurs once we've booted the second kernel and it's attempting to bring up secondary CPUs. opal/powernv: https://github.com/open-power/skiboot/commit/9ee56b5 Very interesting. Is there a way to have a firmware with the fix ? From Laurent's analysis of the crash, I don't think this will be relevant either, but I'm not sure. It would be very interesting to know which (if any) released firmwares include this patch so we can test it. Hi Laurent/David, I am not so sure on this. While I get back on this, can you confirm you are seeing the issue in both PowerVM (pseries) and baremetal (powernv). What is the kernel version where the issue is seen for PowerVM and/or baremetal. 
Also, for baremetal, can you mention the OPAL version on which the issue is reproducible? If a bug has been raised for this, I would be happy to be pointed to it, to get more information on this. Thanks Hari ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 00/18] Add FADump support on PowerNV platform
On 27/02/19 9:07 AM, Daniel Axtens wrote: Hi Hari, Hi Daniel, Firmware-Assisted Dump (FADump) is currently supported only on pseries platform. This patch series adds support for powernv platform too. The first and third patches refactor the FADump code to make use of common code across multiple platforms. The fifth patch adds basic FADump support for powernv platform. Patches seven & eight honour reserved-ranges DT node while reserving/releasing memory used by FADump. The next patch processes CPU state data provided by firmware to create and append core notes to the ELF core file. The tenth patch adds support for preserving crash data for subsequent boots (useful in cases like petitboot). Patch twelve provides support to export opalcore. This is to make debugging of failures in OPAL code easier. The subsequent patch ensures vmcore processing is skipped when only OPAL core is exported by f/w. The next patch provides option to release the kernel memory used to export opalcore. Patch seventeen adds backup area (an area populated before crash and used in the capture kernel to setup vmcore file robustly) support on PowerNV platform. The remaining patches update Firmware-Assisted Dump documentation appropriately. Note that the quantum of increase in robustness due to patch seventeen may not be worth breaking backward compatibility for older kernel versions. Would like to hear thoughts from others on it. The patch series is tested with the latest firmware plus the below skiboot changes for MPIPL support: https://patchwork.ozlabs.org/project/skiboot/list/?series=78497 ("MPIPL support") If I want to test this, is there some userspace tooling that will extract a fadump from a rebooted system and allow me to examine it as I would with a kdump (e.g. with crash)? I did look at Documentation/powerpc/firmware-assisted-dump.txt but it seems to only cover the kernel layer. These patches export two dump files: /proc/vmcore (kernel), /proc/opalcore (OPAL). 
If you are only interested in kernel dump, then passing fadump=on to the kernel and enabling/starting kdump-tools/kdump service, shipped with distro, would ensure dump is captured to /var/crash dir and rebooted but please be aware that the script would not copy /proc/opalcore to disk yet. Need to update the scripts once these changes make it upstream. Thanks Hari
Re: [PATCH 00/18] Add FADump support on PowerNV platform
Hi Nick, On 27/02/19 9:48 AM, Nicholas Piggin wrote: Hari Bathini's on February 22, 2019 3:35 am: Firmware-Assisted Dump (FADump) is currently supported only on pseries platform. This patch series adds support for powernv platform too. The first and third patches refactor the FADump code to make use of common code across multiple platforms. The fifth patch adds basic FADump support for powernv platform. Patches seven & eight honour reserved-ranges DT node while reserving/releasing memory used by FADump. The next patch processes CPU state data provided by firmware to create and append core notes to the ELF core file. The tenth patch adds support for preserving crash data for subsequent boots (useful in cases like petitboot). Patch twelve provides support to export opalcore. This is to make debugging of failures in OPAL code easier. The subsequent patch ensures vmcore processing is skipped when only OPAL core is exported by f/w. The next patch provides option to release the kernel memory used to export opalcore. Patch seventeen adds backup area (an area populated before crash and used in the capture kernel to setup vmcore file robustly) support on PowerNV platform. The remaining patches update Firmware-Assisted Dump documentation appropriately. Note that the quantum of increase in robustness due to patch seventeen may not be worth breaking backward compatibility for older kernel versions. Would like to hear thoughts from others on it. 
The patch series is tested with the latest firmware plus the below skiboot changes for MPIPL support: https://patchwork.ozlabs.org/project/skiboot/list/?series=78497 ("MPIPL support") --- Hari Bathini (18): powerpc/fadump: move internal fadump code to a new file powerpc/fadump: Improve fadump documentation pseries/fadump: move out platform specific support from generic code powerpc/fadump: use FADump instead of fadump for how it is pronounced powerpc/fadump: enable fadump support on OPAL based POWER platform powerpc/fadump: Update documentation about OPAL platform support powerpc/fadump: consider reserved ranges while reserving memory powerpc/fadump: consider reserved ranges while releasing memory powernv/fadump: process architected register state data provided by firmware powernv/fadump: add support to preserve crash data on FADUMP disabled kernel powerpc/fadump: update documentation about CONFIG_PRESERVE_FA_DUMP powerpc/powernv: export /proc/opalcore for analysing opal crashes powernv/fadump: Skip processing /proc/vmcore when only OPAL core exists powernv/opalcore: provide an option to invalidate /proc/opalcore file powernv/fadump: consider f/w load area powernv/fadump: update documentation about option to release opalcore powernv/fadump: use backup area to map PIR to logical CPUs The need to map firmware identifiers like PIR to Linux numbering comes up in a few places, OPAL msglog, pdbg debugger, etc. I wonder if we could have Linux register its logical CPU numbers with OPAL after it boots. Would that help with your usage? The logical to PIR map of crashing kernel is needed in the capture kernel (the kernel booted after crash to save the dump) that processes the register data provided by f/w. Not sure if the logical to PIR map would be guaranteed to be the same for both the crashing kernel and capture kernel. Actually, I don't see any value-add in using the logical to PIR map in processing the register data provided by f/w. 
pSeries isn't doing that and has been reliable. Intention was to get inputs from others on whether it is worth it.. powerpc/fadump: Update documentation about backup area support Documentation/powerpc/firmware-assisted-dump.txt | 208 ++-- arch/powerpc/Kconfig | 23 arch/powerpc/include/asm/fadump.h| 190 --- arch/powerpc/include/asm/opal-api.h | 58 + arch/powerpc/include/asm/opal.h |1 arch/powerpc/kernel/Makefile |6 arch/powerpc/kernel/fadump.c | 1199 -- arch/powerpc/kernel/fadump_internal.c| 297 + arch/powerpc/kernel/fadump_internal.h| 250 + I don't have much knowledge of fadump code, so I'll nitpick instead :P Why are you calling it fadump_internal, what's internal about it? You have the framework for the ops table etc here, which makes the platform code have to #include "../kernel/fadump_internal.h", and suggests it's not so internal. Seems like it would be fine just to go in include/asm/fadump.h and kernel fadump.c? Intention was to use that file to put common code used by platform specific code on both pSeries & PowerNV. How about fadump_common instead of fadump_internal to put that in perspective? arch/powerpc/kernel/prom.
[PATCH v2 00/16] Add FADump support on PowerNV platform
Firmware-Assisted Dump (FADump) is currently supported only on pseries platform. This patch series adds support for powernv platform too. The first and third patches refactor the FADump code to make use of common code across multiple platforms. The fifth patch adds basic FADump support for powernv platform. Patches seven & eight honour reserved-ranges DT node while reserving/releasing memory used by FADump. The next patch processes CPU state data provided by firmware to create and append core notes to the ELF core file. The tenth patch adds support for preserving crash data for subsequent boots (useful in cases like petitboot). Patch twelve provides support to export opalcore. This is to make debugging of failures in OPAL code easier. The subsequent patch ensures vmcore processing is skipped when only OPAL core is exported by f/w. The next patch provides option to release the kernel memory used to export opalcore. The remaining patches update Firmware-Assisted Dump documentation appropriately. The patch series is tested with the latest firmware plus the below skiboot changes for MPIPL support: https://patchwork.ozlabs.org/project/skiboot/list/?series=102588 ("MPIPL support") Changes in v2: * Rebased to latest upstream kernel version. * Updated according to latest OPAL changes. * Dropped patch seventeen from previous version as the quantum of increase in robustness due to it doesn't seem worth breaking backward compatibility for older kernel versions. 
--- Hari Bathini (16): powerpc/fadump: move internal fadump code to a new file powerpc/fadump: Improve fadump documentation pseries/fadump: move out platform specific support from generic code powerpc/fadump: use FADump instead of fadump for how it is pronounced powerpc/fadump: enable fadump support on OPAL based POWER platform powerpc/fadump: Update documentation about OPAL platform support powerpc/fadump: consider reserved ranges while reserving memory powerpc/fadump: consider reserved ranges while releasing memory powernv/fadump: process architected register state data provided by firmware powernv/fadump: add support to preserve crash data on FADUMP disabled kernel powerpc/fadump: update documentation about CONFIG_PRESERVE_FA_DUMP powerpc/powernv: export /proc/opalcore for analysing opal crashes powernv/fadump: Skip processing /proc/vmcore when only OPAL core exists powernv/opalcore: provide an option to invalidate /proc/opalcore file powernv/fadump: consider f/w load area powernv/fadump: update documentation about option to release opalcore Documentation/powerpc/firmware-assisted-dump.txt | 193 ++-- arch/powerpc/Kconfig | 23 arch/powerpc/include/asm/fadump.h| 190 arch/powerpc/include/asm/opal-api.h | 58 + arch/powerpc/include/asm/opal.h |1 arch/powerpc/kernel/Makefile |6 arch/powerpc/kernel/fadump-common.c | 205 arch/powerpc/kernel/fadump-common.h | 222 arch/powerpc/kernel/fadump.c | 1163 -- arch/powerpc/kernel/prom.c |4 arch/powerpc/platforms/powernv/Makefile |3 arch/powerpc/platforms/powernv/opal-call.c |1 arch/powerpc/platforms/powernv/opal-core.c | 602 +++ arch/powerpc/platforms/powernv/opal-fadump.c | 562 +++ arch/powerpc/platforms/powernv/opal-fadump.h | 116 ++ arch/powerpc/platforms/pseries/Makefile |1 arch/powerpc/platforms/pseries/rtas-fadump.c | 534 ++ arch/powerpc/platforms/pseries/rtas-fadump.h | 96 ++ 18 files changed, 2998 insertions(+), 982 deletions(-) create mode 100644 arch/powerpc/kernel/fadump-common.c create mode 100644 
arch/powerpc/kernel/fadump-common.h create mode 100644 arch/powerpc/platforms/powernv/opal-core.c create mode 100644 arch/powerpc/platforms/powernv/opal-fadump.c create mode 100644 arch/powerpc/platforms/powernv/opal-fadump.h create mode 100644 arch/powerpc/platforms/pseries/rtas-fadump.c create mode 100644 arch/powerpc/platforms/pseries/rtas-fadump.h
[PATCH v2 01/16] powerpc/fadump: move internal fadump code to a new file
Refactoring fadump code means internal fadump code is referenced from different places. For ease, move internal code to a new file. Signed-off-by: Hari Bathini --- Changes in v2: * Using fadump-common.* instead of fadump_internal.* arch/powerpc/include/asm/fadump.h | 112 arch/powerpc/kernel/Makefile|2 arch/powerpc/kernel/fadump-common.c | 184 + arch/powerpc/kernel/fadump-common.h | 126 +++ arch/powerpc/kernel/fadump.c| 194 ++- 5 files changed, 324 insertions(+), 294 deletions(-) create mode 100644 arch/powerpc/kernel/fadump-common.c create mode 100644 arch/powerpc/kernel/fadump-common.h diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h index 188776b..028a8ef 100644 --- a/arch/powerpc/include/asm/fadump.h +++ b/arch/powerpc/include/asm/fadump.h @@ -24,34 +24,6 @@ #ifdef CONFIG_FA_DUMP -/* - * The RMA region will be saved for later dumping when kernel crashes. - * RMA is Real Mode Area, the first block of logical memory address owned - * by logical partition, containing the storage that may be accessed with - * translate off. - */ -#define RMA_START 0x0 -#define RMA_END(ppc64_rma_size) - -/* - * On some Power systems where RMO is 128MB, it still requires minimum of - * 256MB for kernel to boot successfully. When kdump infrastructure is - * configured to save vmcore over network, we run into OOM issue while - * loading modules related to network setup. Hence we need aditional 64M - * of memory to avoid OOM issue. - */ -#define MIN_BOOT_MEM (((RMA_END < (0x1UL << 28)) ? (0x1UL << 28) : RMA_END) \ - + (0x1UL << 26)) - -/* The upper limit percentage for user specified boot memory size (25%) */ -#define MAX_BOOT_MEM_RATIO 4 - -#define memblock_num_regions(memblock_type)(memblock.memblock_type.cnt) - -/* Alignement per CMA requirement. 
*/ -#define FADUMP_CMA_ALIGNMENT (PAGE_SIZE << \ - max_t(unsigned long, MAX_ORDER - 1, pageblock_order)) - /* Firmware provided dump sections */ #define FADUMP_CPU_STATE_DATA 0x0001 #define FADUMP_HPTE_REGION 0x0002 @@ -60,18 +32,9 @@ /* Dump request flag */ #define FADUMP_REQUEST_FLAG0x0001 -/* FAD commands */ -#define FADUMP_REGISTER1 -#define FADUMP_UNREGISTER 2 -#define FADUMP_INVALIDATE 3 - /* Dump status flag */ #define FADUMP_ERROR_FLAG 0x2000 -#define FADUMP_CPU_ID_MASK ((1UL << 32) - 1) - -#define CPU_UNKNOWN(~((u32)0)) - /* Utility macros */ #define SKIP_TO_NEXT_CPU(reg_entry)\ ({ \ @@ -125,59 +88,8 @@ struct fadump_mem_struct { struct fadump_section rmr_region; }; -/* Firmware-assisted dump configuration details. */ -struct fw_dump { - unsigned long cpu_state_data_size; - unsigned long hpte_region_size; - unsigned long boot_memory_size; - unsigned long reserve_dump_area_start; - unsigned long reserve_dump_area_size; - /* cmd line option during boot */ - unsigned long reserve_bootvar; - - unsigned long fadumphdr_addr; - unsigned long cpu_notes_buf; - unsigned long cpu_notes_buf_size; - - int ibm_configure_kernel_dump; - - unsigned long fadump_enabled:1; - unsigned long fadump_supported:1; - unsigned long dump_active:1; - unsigned long dump_registered:1; - unsigned long nocma:1; -}; - -/* - * Copy the ascii values for first 8 characters from a string into u64 - * variable at their respective indexes. - * e.g. - * The string "FADMPINF" will be converted into 0x4641444d50494e46 - */ -static inline u64 str_to_u64(const char *str) -{ - u64 val = 0; - int i; - - for (i = 0; i < sizeof(val); i++) - val = (*str) ? (val << 8) | *str++ : val << 8; - return val; -} -#define STR_TO_HEX(x) str_to_u64(x) -#define REG_ID(x) str_to_u64(x) - -#define FADUMP_CRASH_INFO_MAGICSTR_TO_HEX("FADMPINF") #define REGSAVE_AREA_MAGIC STR_TO_HEX("REGSAVE") -/* The firmware-assisted dump format. 
- * - * The register save area is an area in the partition's memory used to preserve - * the register contents (CPU state data) for the active CPUs during a firmware - * assisted dump. The dump format contains register save area header followed - * by register entries. Each list of registers for a CPU starts with - * "CPUSTRT" and ends with "CPUEND". - */ - /* Register save area header. */ struct fadump_reg_save_area_header { __be64 magic_number; @@ -185,29 +97,9 @@ struct fad
[PATCH v2 02/16] powerpc/fadump: Improve fadump documentation
The figures depicting FADump's (Firmware-Assisted Dump) memory layout are missing some finer details like different memory regions and what they represent. Improve the documentation by updating those details. Signed-off-by: Hari Bathini --- Documentation/powerpc/firmware-assisted-dump.txt | 65 -- 1 file changed, 35 insertions(+), 30 deletions(-) diff --git a/Documentation/powerpc/firmware-assisted-dump.txt b/Documentation/powerpc/firmware-assisted-dump.txt index 18c5fee..059993b 100644 --- a/Documentation/powerpc/firmware-assisted-dump.txt +++ b/Documentation/powerpc/firmware-assisted-dump.txt @@ -74,8 +74,9 @@ as follows: there is crash data available from a previous boot. During the early boot OS will reserve rest of the memory above boot memory size effectively booting with restricted memory - size. This will make sure that the second kernel will not - touch any of the dump memory area. + size. This will make sure that this kernel (also, referred + to as second kernel or capture kernel) will not touch any + of the dump memory area. -- User-space tools will read /proc/vmcore to obtain the contents of memory, which holds the previous crashed kernel dump in ELF @@ -125,48 +126,52 @@ space memory except the user pages that were present in CMA region. 
o Memory Reservation during first kernel - Low memory Top of memory - 0 boot memory size | - | ||<--Reserved dump area -->| | - V V| Permanent Reservation | V - +---+--/ /---+---++---++--+ - | ||CPU|HPTE| DUMP |ELF | | - +---+--/ /---+---++---++--+ -| ^ -| | -\ / - --- - Boot memory content gets transferred to - reserved area by firmware at the time of - crash + Low memoryTop of memory + 0 boot memory size |<--Reserved dump area --->| | + | || Permanent Reservation | | + V V| (Preserve area)| V + +---+--/ /---+---+++---++--+ + | ||CPU|HPTE| DUMP |HDR|ELF | | + +---+--/ /---+---+++---++--+ +| ^ ^ +| | | +\ / | + --- FADump Header + Boot memory content gets transferred (meta area) + to reserved area by firmware at the + time of crash + Fig. 1 + o Memory Reservation during second kernel after crash - Low memoryTop of memory - 0 boot memory size | - | |<- Reserved dump area --- -->| - V V V - +---+--/ /---+---++---++--+ - | ||CPU|HPTE| DUMP |ELF | | - +---+--/ /---+---++---++--+ + Low memoryTop of memory + 0 boot memory size| + | |<- Reserved dump area --->| + V V|< Preserve area ->| V + +---+--/ /---+---+++---++--+ + | ||CPU|HPTE| DUMP |HDR|ELF | | + +---+--/ /---+---+++---++--+ | | V V Used by second/proc/vmcore kernel to boot Fig. 2 -Currently the dump will be copied from /proc/vmcore to a -a new file upon user intervention. The dump data available through -/proc/vmcore will be in ELF format. Hence the existing kdump -infrastructure (kdump scripts) to save the dump works fine with -minor modifications. +Currently the dump will be copied from /proc/vmcore to a new file upon +user intervention. The dump data available through /proc/vmcore will be +in ELF format. Hence the existing kdump infrastructure (kdump scripts) +to save the dump works fine with minor modifications. KDump scripts on +major Distro releases have already been modified to work seemlessly (no +user intervention in s
[PATCH v2 03/16] pseries/fadump: move out platform specific support from generic code
Introduce callbacks for platform specific operations like register, unregister, invalidate & such, and move pseries specific code into platform code. Signed-off-by: Hari Bathini --- Changes in v2: * pSeries specific fadump code files are named rtas-fadump.* instead of pseries_fadump.* arch/powerpc/include/asm/fadump.h| 75 arch/powerpc/kernel/fadump-common.h | 39 ++ arch/powerpc/kernel/fadump.c | 501 ++-- arch/powerpc/platforms/pseries/Makefile |1 arch/powerpc/platforms/pseries/rtas-fadump.c | 538 ++ arch/powerpc/platforms/pseries/rtas-fadump.h | 96 + 6 files changed, 711 insertions(+), 539 deletions(-) create mode 100644 arch/powerpc/platforms/pseries/rtas-fadump.c create mode 100644 arch/powerpc/platforms/pseries/rtas-fadump.h diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h index 028a8ef..d27cde7 100644 --- a/arch/powerpc/include/asm/fadump.h +++ b/arch/powerpc/include/asm/fadump.h @@ -24,79 +24,8 @@ #ifdef CONFIG_FA_DUMP -/* Firmware provided dump sections */ -#define FADUMP_CPU_STATE_DATA 0x0001 -#define FADUMP_HPTE_REGION 0x0002 -#define FADUMP_REAL_MODE_REGION0x0011 - -/* Dump request flag */ -#define FADUMP_REQUEST_FLAG0x0001 - -/* Dump status flag */ -#define FADUMP_ERROR_FLAG 0x2000 - -/* Utility macros */ -#define SKIP_TO_NEXT_CPU(reg_entry)\ -({ \ - while (be64_to_cpu(reg_entry->reg_id) != REG_ID("CPUEND")) \ - reg_entry++;\ - reg_entry++;\ -}) - extern int crashing_cpu; -/* Kernel Dump section info */ -struct fadump_section { - __be32 request_flag; - __be16 source_data_type; - __be16 error_flags; - __be64 source_address; - __be64 source_len; - __be64 bytes_dumped; - __be64 destination_address; -}; - -/* ibm,configure-kernel-dump header. */ -struct fadump_section_header { - __be32 dump_format_version; - __be16 dump_num_sections; - __be16 dump_status_flag; - __be32 offset_first_dump_section; - - /* Fields for disk dump option. 
*/ - __be32 dd_block_size; - __be64 dd_block_offset; - __be64 dd_num_blocks; - __be32 dd_offset_disk_path; - - /* Maximum time allowed to prevent an automatic dump-reboot. */ - __be32 max_time_auto; -}; - -/* - * Firmware Assisted dump memory structure. This structure is required for - * registering future kernel dump with power firmware through rtas call. - * - * No disk dump option. Hence disk dump path string section is not included. - */ -struct fadump_mem_struct { - struct fadump_section_headerheader; - - /* Kernel dump sections */ - struct fadump_section cpu_state_data; - struct fadump_section hpte_region; - struct fadump_section rmr_region; -}; - -#define REGSAVE_AREA_MAGIC STR_TO_HEX("REGSAVE") - -/* Register save area header. */ -struct fadump_reg_save_area_header { - __be64 magic_number; - __be32 version; - __be32 num_cpu_offset; -}; - extern int is_fadump_memory_area(u64 addr, ulong size); extern int early_init_dt_scan_fw_dump(unsigned long node, const char *uname, int depth, void *data); @@ -111,5 +40,5 @@ extern void fadump_cleanup(void); static inline int is_fadump_active(void) { return 0; } static inline int should_fadump_crash(void) { return 0; } static inline void crash_fadump(struct pt_regs *regs, const char *str) { } -#endif -#endif +#endif /* !CONFIG_FA_DUMP */ +#endif /* __PPC64_FA_DUMP_H__ */ diff --git a/arch/powerpc/kernel/fadump-common.h b/arch/powerpc/kernel/fadump-common.h index 8ccd96d..f926145 100644 --- a/arch/powerpc/kernel/fadump-common.h +++ b/arch/powerpc/kernel/fadump-common.h @@ -47,6 +47,12 @@ #define FADUMP_UNREGISTER 2 #define FADUMP_INVALIDATE 3 +/* Firmware-Assited Dump platforms */ +enum fadump_platform_type { + FADUMP_PLATFORM_UNKNOWN = 0, + FADUMP_PLATFORM_PSERIES, +}; + #define FADUMP_CPU_ID_MASK ((1UL << 32) - 1) #define CPU_UNKNOWN(~((u32)0)) @@ -91,6 +97,9 @@ struct fad_crash_memory_ranges { unsigned long long size; }; +/* Platform specific callback functions */ +struct fadump_ops; + /* Firmware-assisted dump 
configuration details. */ struct fw_dump { unsigned long cpu_state_data_size; @@ -98,6 +107,8 @@ struct fw_dump { unsigned long boot_memory_size; unsigned long reserve_dump_area_start; unsigned long reserve_dump_area_size; + unsigned long meta_
[PATCH v2 04/16] powerpc/fadump: use FADump instead of fadump for how it is pronounced
Signed-off-by: Hari Bathini --- Documentation/powerpc/firmware-assisted-dump.txt | 56 +++--- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/Documentation/powerpc/firmware-assisted-dump.txt b/Documentation/powerpc/firmware-assisted-dump.txt index 059993b..62e75ef 100644 --- a/Documentation/powerpc/firmware-assisted-dump.txt +++ b/Documentation/powerpc/firmware-assisted-dump.txt @@ -8,18 +8,18 @@ a crashed system, and to do so from a fully-reset system, and to minimize the total elapsed time until the system is back in production use. -- Firmware assisted dump (fadump) infrastructure is intended to replace +- Firmware-Assisted Dump (FADump) infrastructure is intended to replace the existing phyp assisted dump. - Fadump uses the same firmware interfaces and memory reservation model as phyp assisted dump. -- Unlike phyp dump, fadump exports the memory dump through /proc/vmcore +- Unlike phyp dump, FADump exports the memory dump through /proc/vmcore in the ELF format in the same way as kdump. This helps us reuse the kdump infrastructure for dump capture and filtering. - Unlike phyp dump, userspace tool does not need to refer any sysfs interface while reading /proc/vmcore. -- Unlike phyp dump, fadump allows user to release all the memory reserved +- Unlike phyp dump, FADump allows user to release all the memory reserved for dump, with a single operation of echo 1 > /sys/kernel/fadump_release_mem. -- Once enabled through kernel boot parameter, fadump can be +- Once enabled through kernel boot parameter, FADump can be started/stopped through /sys/kernel/fadump_registered interface (see sysfs files section below) and can be easily integrated with kdump service start/stop init scripts. @@ -33,7 +33,7 @@ dump offers several strong, practical advantages: in a clean, consistent state. -- Once the dump is copied out, the memory that held the dump is immediately available to the running kernel. 
And therefore, - unlike kdump, fadump doesn't need a 2nd reboot to get back + unlike kdump, FADump doesn't need a 2nd reboot to get back the system to the production configuration. The above can only be accomplished by coordination with, @@ -61,7 +61,7 @@ as follows: boot successfully. For syntax of crashkernel= parameter, refer to Documentation/kdump/kdump.txt. If any offset is provided in crashkernel= parameter, it will be ignored - as fadump uses a predefined offset to reserve memory + as FADump uses a predefined offset to reserve memory for boot memory dump preservation in case of a crash. -- After the low memory (boot memory) area has been saved, the @@ -120,7 +120,7 @@ blocking this significant chunk of memory from production kernel. Hence, the implementation uses the Linux kernel's Contiguous Memory Allocator (CMA) for memory reservation if CMA is configured for kernel. With CMA reservation this memory will be available for applications to -use it, while kernel is prevented from using it. With this fadump will +use it, while kernel is prevented from using it. With this FADump will still be able to capture all of the kernel memory and most of the user space memory except the user pages that were present in CMA region. @@ -170,14 +170,14 @@ KDump, as dump mechanism. The tools to examine the dump will be same as the ones used for kdump. -How to enable firmware-assisted dump (fadump): +How to enable firmware-assisted dump (FADump): - 1. Set config option CONFIG_FA_DUMP=y and build kernel. -2. Boot into linux kernel with 'fadump=on' kernel cmdline option. - By default, fadump reserved memory will be initialized as CMA area. - Alternatively, user can boot linux kernel with 'fadump=nocma' to - prevent fadump to use CMA. +2. Boot into linux kernel with 'fadump=on' kernel cmdline option. + By default, FADump reserved memory will be initialized as CMA area. + Alternatively, user can boot linux kernel with 'fadump=nocma' to + prevent FADump to use CMA. 3. 
Optionally, user can also set 'crashkernel=' kernel cmdline to specify size of the memory to reserve for boot memory dump preservation. @@ -190,7 +190,7 @@ NOTE: 1. 'fadump_reserve_mem=' parameter has been deprecated. Instead option is set at kernel cmdline. 3. if user wants to capture all of user space memory and ok with reserved memory not available to production system, then - 'fadump=nocma' kernel parameter can be used to fallback to + 'fadump=nocma' kernel parameter can be used to fallback to old behaviour. Sysfs/debugfs files: @@ -203,29 +203,29 @@ Here is the list of files under kernel sysfs: /sys/kernel/fadump_enabled -This is used to display the fadump status. -0 = fadump is disa
[PATCH v2 05/16] powerpc/fadump: enable fadump support on OPAL based POWER platform
From: Hari Bathini Firmware-assisted dump support is enabled for OPAL based POWER platforms in P9 firmware. Make the corresponding updates in kernel to enable fadump support for such platforms. Signed-off-by: Hari Bathini --- Changes in v2: * Updated API number for FADump according to recent OPAL changes arch/powerpc/Kconfig |5 arch/powerpc/include/asm/opal-api.h | 35 ++ arch/powerpc/include/asm/opal.h |1 arch/powerpc/kernel/fadump-common.c | 27 ++ arch/powerpc/kernel/fadump-common.h | 44 ++- arch/powerpc/kernel/fadump.c | 259 ++ arch/powerpc/platforms/powernv/Makefile |1 arch/powerpc/platforms/powernv/opal-call.c |1 arch/powerpc/platforms/powernv/opal-fadump.c | 375 ++ arch/powerpc/platforms/powernv/opal-fadump.h | 40 +++ arch/powerpc/platforms/pseries/rtas-fadump.c | 18 - 11 files changed, 716 insertions(+), 90 deletions(-) create mode 100644 arch/powerpc/platforms/powernv/opal-fadump.c create mode 100644 arch/powerpc/platforms/powernv/opal-fadump.h diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 2d0be82..2366a84 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -556,7 +556,7 @@ config CRASH_DUMP config FA_DUMP bool "Firmware-assisted dump" - depends on PPC64 && PPC_RTAS + depends on PPC64 && (PPC_RTAS || PPC_POWERNV) select CRASH_CORE select CRASH_DUMP help @@ -567,7 +567,8 @@ config FA_DUMP is meant to be a kdump replacement offering robustness and speed not possible without system firmware assistance. - If unsure, say "N" + If unsure, say "y". Only special kernels like petitboot may + need to say "N" here. 
config IRQ_ALL_CPUS bool "Distribute interrupts on all CPUs by default" diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 870fb7b..75471c2 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -210,7 +210,8 @@ #define OPAL_PCI_GET_PBCQ_TUNNEL_BAR 164 #define OPAL_PCI_SET_PBCQ_TUNNEL_BAR 165 #defineOPAL_NX_COPROC_INIT 167 -#define OPAL_LAST 167 +#define OPAL_CONFIGURE_FADUMP 173 +#define OPAL_LAST 173 #define QUIESCE_HOLD 1 /* Spin all calls at entry */ #define QUIESCE_REJECT 2 /* Fail all calls with OPAL_BUSY */ @@ -972,6 +973,37 @@ struct opal_sg_list { }; /* + * Firmware-Assisted Dump (FADump) + */ + +/* The maximum number of dump sections supported by OPAL */ +#define OPAL_FADUMP_NR_SECTIONS64 + +/* Kernel Dump section info */ +struct opal_fadump_section { + u8 src_type; + u8 reserved[7]; + __be64 src_addr; + __be64 src_size; + __be64 dest_addr; + __be64 dest_size; +}; + +/* + * FADump memory structure for registering dump support with + * POWER f/w through opal call. 
+ */ +struct opal_fadump_mem_struct { + + __be16 section_size; /*sizeof(struct fadump_section) */ + __be16 section_count; /* number of sections */ + __be32 crashing_cpu; /* Thread on which OPAL crashed */ + __be64 reserved; + + struct opal_fadump_section section[OPAL_FADUMP_NR_SECTIONS]; +}; + +/* * Dump region ID range usable by the OS */ #define OPAL_DUMP_REGION_HOST_START0x80 @@ -1051,6 +1083,7 @@ enum { OPAL_REBOOT_NORMAL = 0, OPAL_REBOOT_PLATFORM_ERROR = 1, OPAL_REBOOT_FULL_IPL= 2, + OPAL_REBOOT_OS_ERROR= 3, }; /* Argument to OPAL_PCI_TCE_KILL */ diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index a55b01c..2123b3f 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -43,6 +43,7 @@ int64_t opal_npu_spa_clear_cache(uint64_t phb_id, uint32_t bdfn, uint64_t PE_handle); int64_t opal_npu_tl_set(uint64_t phb_id, uint32_t bdfn, long cap, uint64_t rate_phys, uint32_t size); +int64_t opal_configure_fadump(uint64_t command, void *data, uint64_t data_size); int64_t opal_console_write(int64_t term_number, __be64 *length, const uint8_t *buffer); int64_t opal_console_read(int64_t term_number, __be64 *length, diff --git a/arch/powerpc/kernel/fadump-common.c b/arch/powerpc/kernel/fadump-common.c index 0182886..514bbb5 100644 --- a/arch/powerpc/kernel/fadump-common.c +++ b/arch/powerpc/kernel/fadump-common.c @@ -10,6 +10,9 @@ * 2 of the License, or (at your option) any later version. */ +#undef DEBUG +#define pr_fmt(fmt) "f
[PATCH v2 06/16] powerpc/fadump: Update documentation about OPAL platform support
With FADump support now available on both pseries and OPAL platforms, update FADump documentation with these details. Signed-off-by: Hari Bathini --- Documentation/powerpc/firmware-assisted-dump.txt | 90 -- 1 file changed, 51 insertions(+), 39 deletions(-) diff --git a/Documentation/powerpc/firmware-assisted-dump.txt b/Documentation/powerpc/firmware-assisted-dump.txt index 62e75ef..844a229 100644 --- a/Documentation/powerpc/firmware-assisted-dump.txt +++ b/Documentation/powerpc/firmware-assisted-dump.txt @@ -70,7 +70,8 @@ as follows: normal. -- The freshly booted kernel will notice that there is a new - node (ibm,dump-kernel) in the device tree, indicating that + node (ibm,dump-kernel on PSeries or ibm,opal/dump/result-table + on OPAL platform) in the device tree, indicating that there is crash data available from a previous boot. During the early boot OS will reserve rest of the memory above boot memory size effectively booting with restricted memory @@ -93,7 +94,9 @@ as follows: Please note that the firmware-assisted dump feature is only available on Power6 and above systems with recent -firmware versions. +firmware versions on PSeries (PowerVM) platform and Power9 +and above systems with recent firmware versions on PowerNV +(OPAL) platform. Implementation details: -- @@ -108,57 +111,66 @@ that are run. If there is dump data, then the /sys/kernel/fadump_release_mem file is created, and the reserved memory is held. -If there is no waiting dump data, then only the memory required -to hold CPU state, HPTE region, boot memory dump and elfcore -header, is usually reserved at an offset greater than boot memory -size (see Fig. 1). This area is *not* released: this region will -be kept permanently reserved, so that it can act as a receptacle -for a copy of the boot memory content in addition to CPU state -and HPTE region, in the case a crash does occur. 
Since this reserved -memory area is used only after the system crash, there is no point in -blocking this significant chunk of memory from production kernel. -Hence, the implementation uses the Linux kernel's Contiguous Memory -Allocator (CMA) for memory reservation if CMA is configured for kernel. -With CMA reservation this memory will be available for applications to -use it, while kernel is prevented from using it. With this FADump will -still be able to capture all of the kernel memory and most of the user -space memory except the user pages that were present in CMA region. +If there is no waiting dump data, then only the memory required to +hold CPU state, HPTE region, boot memory dump, FADump header and +elfcore header, is usually reserved at an offset greater than boot +memory size (see Fig. 1). This area is *not* released: this region +will be kept permanently reserved, so that it can act as a receptacle +for a copy of the boot memory content in addition to CPU state and +HPTE region, in the case a crash does occur. + +Since this reserved memory area is used only after the system crash, +there is no point in blocking this significant chunk of memory from +production kernel. Hence, the implementation uses the Linux kernel's +Contiguous Memory Allocator (CMA) for memory reservation if CMA is +configured for kernel. With CMA reservation this memory will be +available for applications to use it, while kernel is prevented from +using it. With this FADump will still be able to capture all of the +kernel memory and most of the user space memory except the user pages +that were present in CMA region. 
o Memory Reservation during first kernel - Low memoryTop of memory - 0 boot memory size |<--Reserved dump area --->| | - | || Permanent Reservation | | - V V| (Preserve area)| V - +---+--/ /---+---+++---++--+ - | ||CPU|HPTE| DUMP |HDR|ELF | | - +---+--/ /---+---+++---++--+ -| ^ ^ -| | | -\ / | - --- FADump Header - Boot memory content gets transferred (meta area) - to reserved area by firmware at the - time of crash - + Low memory Top of memory + 0 boot memory size|<--- Reserved dump area --->| | + | | |Permanent Reservatio| | + V V | (Preserve area) | V + +---+/ /---+---++---+-+-+---+ + | | |///|/
[PATCH v2 07/16] powerpc/fadump: consider reserved ranges while reserving memory
Commit 0962e8004e97 ("powerpc/prom: Scan reserved-ranges node for memory reservations") enabled support to parse reserved-ranges DT node and reserve kernel memory falling in these ranges for F/W purposes. Ensure memory in these ranges is not overlapped with memory reserved for FADump. Also, use a smaller offset, instead of the size of the memory to be reserved, by which to skip memory before making another attempt at reserving memory, after the previous attempt to reserve memory for FADump failed due to memory holes and/or reserved ranges, to reduce the likelihood of memory reservation failure. Signed-off-by: Hari Bathini --- arch/powerpc/kernel/fadump-common.h | 11 +++ arch/powerpc/kernel/fadump.c| 137 ++- 2 files changed, 145 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/fadump-common.h b/arch/powerpc/kernel/fadump-common.h index 8ad98db..ff764d4 100644 --- a/arch/powerpc/kernel/fadump-common.h +++ b/arch/powerpc/kernel/fadump-common.h @@ -101,6 +101,17 @@ struct fadump_memory_range { unsigned long long size; }; +/* + * Amount of memory (1024MB) to skip before making another attempt at + * reserving memory (after the previous attempt to reserve memory for + * FADump failed due to memory holes and/or reserved ranges) to reduce + * the likelihood of memory reservation failure. + */ +#define OFFSET_SIZE0x4000U + +/* Maximum no. of reserved ranges supported for processing. */ +#define MAX_RESERVED_RANGES128 + /* Maximum no. 
of real memory regions supported by the kernel */ #define MAX_REAL_MEM_REGIONS 8 diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 913ab6e..39b6670 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -53,6 +53,9 @@ int crash_memory_ranges_size; int crash_mem_ranges; int max_crash_mem_ranges; +struct fadump_memory_range reserved_ranges[MAX_RESERVED_RANGES]; +int reserved_ranges_cnt; + #ifdef CONFIG_CMA static struct cma *fadump_cma; @@ -116,12 +119,116 @@ int __init fadump_cma_init(void) static int __init fadump_cma_init(void) { return 1; } #endif /* CONFIG_CMA */ +/* + * Sort the reserved ranges in-place and merge adjacent ranges + * to minimize the reserved ranges count. + */ +static void __init sort_and_merge_reserved_ranges(void) +{ + unsigned long long base, size; + struct fadump_memory_range tmp_range; + int i, j, idx; + + if (!reserved_ranges_cnt) + return; + + /* Sort the reserved ranges */ + for (i = 0; i < reserved_ranges_cnt; i++) { + idx = i; + for (j = i + 1; j < reserved_ranges_cnt; j++) { + if (reserved_ranges[idx].base > reserved_ranges[j].base) + idx = j; + } + if (idx != i) { + tmp_range = reserved_ranges[idx]; + reserved_ranges[idx] = reserved_ranges[i]; + reserved_ranges[i] = tmp_range; + } + } + + /* Merge adjacent reserved ranges */ + idx = 0; + for (i = 1; i < reserved_ranges_cnt; i++) { + base = reserved_ranges[i-1].base; + size = reserved_ranges[i-1].size; + if (reserved_ranges[i].base == (base + size)) + reserved_ranges[idx].size += reserved_ranges[i].size; + else { + idx++; + if (i == idx) + continue; + + reserved_ranges[idx] = reserved_ranges[i]; + } + } + reserved_ranges_cnt = idx + 1; +} + +static int __init add_reserved_range(unsigned long base, +unsigned long size) +{ + int i; + + if (reserved_ranges_cnt == MAX_RESERVED_RANGES) { + /* Compact reserved ranges and try again. 
*/ + sort_and_merge_reserved_ranges(); + if (reserved_ranges_cnt == MAX_RESERVED_RANGES) + return 0; + } + + i = reserved_ranges_cnt++; + reserved_ranges[i].base = base; + reserved_ranges[i].size = size; + return 1; +} + +/* + * Scan reserved-ranges to consider them while reserving/releasing + * memory for FADump. + */ +static void __init early_init_dt_scan_reserved_ranges(unsigned long node) +{ + int len, ret; + unsigned long i; + const __be32 *prop; + + /* reserved-ranges already scanned */ + if (reserved_ranges_cnt != 0) + return; + + prop = of_get_flat_dt_prop(node, "reserved-ranges", &len); + + if (!prop) + return; + + /* +* Each reserved range is an (address,size) pair, 2 cells each, +* totalling 4 cells per range. +*/ + for (i =
[PATCH v2 08/16] powerpc/fadump: consider reserved ranges while releasing memory
Commit 0962e8004e97 ("powerpc/prom: Scan reserved-ranges node for memory reservations") enabled support to parse 'reserved-ranges' DT node to reserve kernel memory falling in these ranges for firmware purposes. Along with the preserved area memory, also ensure memory in reserved ranges is not overlapped with memory released by capture kernel after saving vmcore. Also, fix the off-by-one error in fadump_release_reserved_area function while releasing memory. Signed-off-by: Hari Bathini --- arch/powerpc/kernel/fadump.c | 59 +- 1 file changed, 41 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 39b6670..fd06571 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -123,7 +123,7 @@ static int __init fadump_cma_init(void) { return 1; } * Sort the reserved ranges in-place and merge adjacent ranges * to minimize the reserved ranges count. */ -static void __init sort_and_merge_reserved_ranges(void) +static void sort_and_merge_reserved_ranges(void) { unsigned long long base, size; struct fadump_memory_range tmp_range; @@ -164,8 +164,7 @@ static void __init sort_and_merge_reserved_ranges(void) reserved_ranges_cnt = idx + 1; } -static int __init add_reserved_range(unsigned long base, -unsigned long size) +static int add_reserved_range(unsigned long base, unsigned long size) { int i; @@ -1126,33 +1125,57 @@ static void fadump_release_reserved_area(unsigned long start, unsigned long end) if (tend == end_pfn) break; - start_pfn = tend + 1; + start_pfn = tend; } } } /* - * Release the memory that was reserved in early boot to preserve the memory - * contents. The released memory will be available for general use. + * Release the memory that was reserved during early boot to preserve the + * crash'ed kernel's memory contents except reserved dump area (permanent + * reservation) and reserved ranges used by F/W. The released memory will + * be available for general use. 
*/ static void fadump_release_memory(unsigned long begin, unsigned long end) { + int i; unsigned long ra_start, ra_end; - - ra_start = fw_dump.reserve_dump_area_start; - ra_end = ra_start + fw_dump.reserve_dump_area_size; + unsigned long tstart; /* -* exclude the dump reserve area. Will reuse it for next -* fadump registration. +* Add memory to permanently preserve to reserved ranges list +* and exclude all these ranges while releasing memory. */ - if (begin < ra_end && end > ra_start) { - if (begin < ra_start) - fadump_release_reserved_area(begin, ra_start); - if (end > ra_end) - fadump_release_reserved_area(ra_end, end); - } else - fadump_release_reserved_area(begin, end); + i = add_reserved_range(fw_dump.reserve_dump_area_start, + fw_dump.reserve_dump_area_size); + if (i == 0) { + /* +* Reached the MAX reserved ranges count. To ensure reserved +* dump area is excluded (as it will be reused for next +* FADump registration), ignore the last reserved range and +* add reserved dump area instead. +*/ + reserved_ranges_cnt--; + add_reserved_range(fw_dump.reserve_dump_area_start, + fw_dump.reserve_dump_area_size); + } + sort_and_merge_reserved_ranges(); + + tstart = begin; + for (i = 0; i < reserved_ranges_cnt; i++) { + ra_start = reserved_ranges[i].base; + ra_end = ra_start + reserved_ranges[i].size; + + if (tstart >= ra_end) + continue; + + if (tstart < ra_start) + fadump_release_reserved_area(tstart, ra_start); + tstart = ra_end; + } + + if (tstart < end) + fadump_release_reserved_area(tstart, end); } static void fadump_invalidate_release_mem(void)
[PATCH v2 09/16] powernv/fadump: process architected register state data provided by firmware
From: Hari Bathini Firmware provides architected register state data at the time of crash. Process this data and build CPU notes to append to ELF core. Signed-off-by: Hari Bathini Signed-off-by: Vasant Hegde --- Changes in v2: * Updated reg type values according to recent OPAL changes arch/powerpc/include/asm/opal-api.h | 23 +++ arch/powerpc/kernel/fadump-common.h |3 arch/powerpc/platforms/powernv/opal-fadump.c | 187 -- arch/powerpc/platforms/powernv/opal-fadump.h |4 + 4 files changed, 206 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 75471c2..91f2735 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -976,6 +976,29 @@ struct opal_sg_list { * Firmware-Assisted Dump (FADump) */ +/* FADump thread header for register entries */ +struct opal_fadump_thread_hdr { + __be32 pir; + /* 0x00 - 0x0F - The corresponding stop state of the core */ + u8 core_state; + u8 reserved[3]; + + __be32 offset; /* Offset to Register Entries array */ + __be32 ecnt; /* Number of entries */ + __be32 esize; /* Alloc size of each array entry in bytes */ + __be32 eactsz; /* Actual size of each array entry in bytes */ +} __packed; + +#define OPAL_REG_TYPE_GPR 0x01 +#define OPAL_REG_TYPE_SPR 0x02 + +/* FADump register entry. */ +struct opal_fadump_reg_entry { + __be32 reg_type; + __be32 reg_num; + __be64 reg_val; +}; + /* The maximum number of dump sections supported by OPAL */ #define OPAL_FADUMP_NR_SECTIONS64 diff --git a/arch/powerpc/kernel/fadump-common.h b/arch/powerpc/kernel/fadump-common.h index ff764d4..8d47382 100644 --- a/arch/powerpc/kernel/fadump-common.h +++ b/arch/powerpc/kernel/fadump-common.h @@ -117,6 +117,9 @@ struct fadump_memory_range { /* Firmware-assisted dump configuration details. 
*/ struct fw_dump { + unsigned long cpu_state_destination_addr; + unsigned long cpu_state_data_version; + unsigned long cpu_state_entry_size; unsigned long cpu_state_data_size; unsigned long hpte_region_size; unsigned long boot_memory_size; diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c index da8480d..853f663 100644 --- a/arch/powerpc/platforms/powernv/opal-fadump.c +++ b/arch/powerpc/platforms/powernv/opal-fadump.c @@ -94,6 +94,12 @@ static void update_fadump_config(struct fw_dump *fadump_conf, last_end = base + size; j++; + } else if (fdm->section[i].src_type == + OPAL_FADUMP_CPU_STATE_DATA) { + fadump_conf->cpu_state_destination_addr = + be64_to_cpu(fdm->section[i].dest_addr); + fadump_conf->cpu_state_data_size = + be64_to_cpu(fdm->section[i].dest_size); } } fadump_conf->rmr_regions_cnt = j; @@ -199,6 +205,75 @@ static int opal_invalidate_fadump(struct fw_dump *fadump_conf) return 0; } +static inline void fadump_set_regval_regnum(struct pt_regs *regs, u32 reg_type, + u32 reg_num, u64 reg_val) +{ + if (reg_type == OPAL_REG_TYPE_GPR) { + if (reg_num < 32) + regs->gpr[reg_num] = reg_val; + return; + } + + switch (reg_num) { + case 2000: + regs->nip = reg_val; + break; + case 2001: + regs->msr = reg_val; + break; + case 9: + regs->ctr = reg_val; + break; + case 8: + regs->link = reg_val; + break; + case 1: + regs->xer = reg_val; + break; + case 2002: + regs->ccr = reg_val; + break; + case 19: + regs->dar = reg_val; + break; + case 18: + regs->dsisr = reg_val; + break; + } +} + +static inline void fadump_read_registers(char *bufp, unsigned int regs_cnt, +unsigned int reg_entry_size, +struct pt_regs *regs) +{ + int i; + struct opal_fadump_reg_entry *reg_entry; + + memset(regs, 0, sizeof(struct pt_regs)); + + for (i = 0; i < regs_cnt; i++, bufp += reg_entry_size) { + reg_entry = (struct opal_fadump_reg_entry *)bufp; + fadump_set_regval_regnum(regs, +
[PATCH v2 10/16] powernv/fadump: add support to preserve crash data on FADUMP disabled kernel
Add a new kernel config option, CONFIG_PRESERVE_FA_DUMP that ensures that crash data, from previously crash'ed kernel, is preserved. This helps in cases where FADump is not enabled but the subsequent memory preserving kernel boot is likely to process this crash data. One typical usecase for this config option is petitboot kernel. Signed-off-by: Hari Bathini --- arch/powerpc/Kconfig |9 + arch/powerpc/include/asm/fadump.h|9 +++-- arch/powerpc/kernel/Makefile |6 +++ arch/powerpc/kernel/fadump-common.h |8 arch/powerpc/kernel/fadump.c | 47 +++--- arch/powerpc/kernel/prom.c |4 +- arch/powerpc/platforms/powernv/Makefile |1 + arch/powerpc/platforms/powernv/opal-fadump.c | 37 +++- 8 files changed, 106 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 2366a84..ac3259e 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -570,6 +570,15 @@ config FA_DUMP If unsure, say "y". Only special kernels like petitboot may need to say "N" here. +config PRESERVE_FA_DUMP + bool "Preserve Firmware-assisted dump" + depends on PPC64 && PPC_POWERNV && !FA_DUMP + help + On a kernel with FA_DUMP disabled, this option helps to preserve + crash data from a previously crash'ed kernel. Useful when the next + memory preserving kernel boot would process this crash data. + Petitboot kernel is the typical usecase for this option. 
+ config IRQ_ALL_CPUS bool "Distribute interrupts on all CPUs by default" depends on SMP diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h index d27cde7..d09b77b 100644 --- a/arch/powerpc/include/asm/fadump.h +++ b/arch/powerpc/include/asm/fadump.h @@ -27,9 +27,6 @@ extern int crashing_cpu; extern int is_fadump_memory_area(u64 addr, ulong size); -extern int early_init_dt_scan_fw_dump(unsigned long node, const char *uname, - int depth, void *data); -extern int fadump_reserve_mem(void); extern int setup_fadump(void); extern int is_fadump_active(void); extern int should_fadump_crash(void); @@ -41,4 +38,10 @@ static inline int is_fadump_active(void) { return 0; } static inline int should_fadump_crash(void) { return 0; } static inline void crash_fadump(struct pt_regs *regs, const char *str) { } #endif /* !CONFIG_FA_DUMP */ + +#if defined(CONFIG_FA_DUMP) || defined(CONFIG_PRESERVE_FA_DUMP) +extern int early_init_dt_scan_fw_dump(unsigned long node, const char *uname, + int depth, void *data); +extern int fadump_reserve_mem(void); +#endif #endif /* __PPC64_FA_DUMP_H__ */ diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index fbecfba..42c24f8 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -65,7 +65,11 @@ obj-$(CONFIG_EEH) += eeh.o eeh_pe.o eeh_dev.o eeh_cache.o \ eeh_driver.o eeh_event.o eeh_sysfs.o obj-$(CONFIG_GENERIC_TBSYNC) += smp-tbsync.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o -obj-$(CONFIG_FA_DUMP) += fadump.o fadump-common.o +ifeq ($(CONFIG_FA_DUMP),y) +obj-y += fadump.o fadump-common.o +else +obj-$(CONFIG_PRESERVE_FA_DUMP) += fadump.o +endif ifdef CONFIG_PPC32 obj-$(CONFIG_E500) += idle_e500.o endif diff --git a/arch/powerpc/kernel/fadump-common.h b/arch/powerpc/kernel/fadump-common.h index 8d47382..1bd3aeb 100644 --- a/arch/powerpc/kernel/fadump-common.h +++ b/arch/powerpc/kernel/fadump-common.h @@ -13,6 +13,7 @@ #ifndef __PPC64_FA_DUMP_INTERNAL_H__ #define 
__PPC64_FA_DUMP_INTERNAL_H__ +#ifndef CONFIG_PRESERVE_FA_DUMP /* * The RMA region will be saved for later dumping when kernel crashes. * RMA is Real Mode Area, the first block of logical memory address owned @@ -88,6 +89,7 @@ struct fadump_crash_info_header { /* Platform specific callback functions */ struct fadump_ops; +#endif /* !CONFIG_PRESERVE_FA_DUMP */ /* Firmware-Assited Dump platforms */ enum fadump_platform_type { @@ -157,9 +159,12 @@ struct fw_dump { unsigned long nocma:1; enum fadump_platform_type fadump_platform; +#ifndef CONFIG_PRESERVE_FA_DUMP struct fadump_ops *ops; +#endif }; +#ifndef CONFIG_PRESERVE_FA_DUMP struct fadump_ops { ulong (*init_fadump_mem_struct)(struct fw_dump *fadump_config); int (*register_fadump)(struct fw_dump *fadump_config); @@ -181,8 +186,9 @@ u32 *fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs); void fadump_update_elfcore_header(struct fw_dump *fadump_config, char *bufp); int is_boot_memory_area_contiguous(struct fw_dump *fadump_conf); int is_res
[PATCH v2 11/16] powerpc/fadump: update documentation about CONFIG_PRESERVE_FA_DUMP
Kernel config option CONFIG_PRESERVE_FA_DUMP is introduced to ensure crash data, from previously crash'ed kernel, is preserved. Update documentation with these details. Signed-off-by: Hari Bathini --- Documentation/powerpc/firmware-assisted-dump.txt |9 + 1 file changed, 9 insertions(+) diff --git a/Documentation/powerpc/firmware-assisted-dump.txt b/Documentation/powerpc/firmware-assisted-dump.txt index 844a229..fa35593 100644 --- a/Documentation/powerpc/firmware-assisted-dump.txt +++ b/Documentation/powerpc/firmware-assisted-dump.txt @@ -98,6 +98,15 @@ firmware versions on PSeries (PowerVM) platform and Power9 and above systems with recent firmware versions on PowerNV (OPAL) platform. +On OPAL based machines, system first boots into an intermittent +kernel (referred to as petitboot kernel) before booting into the +capture kernel. This kernel would have minimal kernel and/or +userspace support to process crash data. Such kernel needs to +preserve previously crash'ed kernel's memory for the subsequent +capture kernel boot to process this crash data. Kernel config +option CONFIG_PRESERVE_FA_DUMP has to be enabled on such kernel +to ensure that crash data is preserved to process later. + Implementation details: --
[PATCH v2 12/16] powerpc/powernv: export /proc/opalcore for analysing opal crashes
From: Hari Bathini Export /proc/opalcore file to analyze opal crashes. Since opalcore can be generated independent of CONFIG_FA_DUMP support in kernel, add this support under a new kernel config option CONFIG_OPAL_CORE. Also, avoid code duplication by moving common code used for processing the register state data to export /proc/vmcore and/or /proc/opalcore file(s). Signed-off-by: Hari Bathini --- arch/powerpc/Kconfig |9 arch/powerpc/platforms/powernv/Makefile |1 arch/powerpc/platforms/powernv/opal-core.c | 563 ++ arch/powerpc/platforms/powernv/opal-fadump.c | 94 +--- arch/powerpc/platforms/powernv/opal-fadump.h | 72 +++ 5 files changed, 669 insertions(+), 70 deletions(-) create mode 100644 arch/powerpc/platforms/powernv/opal-core.c diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index ac3259e..2c76203 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -579,6 +579,15 @@ config PRESERVE_FA_DUMP memory preserving kernel boot would process this crash data. Petitboot kernel is the typical usecase for this option. +config OPAL_CORE + bool "Export OPAL memory as /proc/opalcore" + depends on PPC64 && PPC_POWERNV + help + This option uses the MPIPL support in firmware to provide + an ELF core of OPAL memory after a crash. The ELF core is + exported as /proc/opalcore file which is helpful in debugging + opal crashes using GDB. 
+ config IRQ_ALL_CPUS bool "Distribute interrupts on all CPUs by default" depends on SMP diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index b4a8022..e659afd 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -8,6 +8,7 @@ obj-y += opal-kmsg.o opal-powercap.o opal-psr.o opal-sensor-groups.o obj-$(CONFIG_SMP) += smp.o subcore.o subcore-asm.o obj-$(CONFIG_FA_DUMP) += opal-fadump.o obj-$(CONFIG_PRESERVE_FA_DUMP) += opal-fadump.o +obj-$(CONFIG_OPAL_CORE)+= opal-core.o obj-$(CONFIG_PCI) += pci.o pci-ioda.o npu-dma.o pci-ioda-tce.o obj-$(CONFIG_CXL_BASE) += pci-cxl.o obj-$(CONFIG_EEH) += eeh-powernv.o diff --git a/arch/powerpc/platforms/powernv/opal-core.c b/arch/powerpc/platforms/powernv/opal-core.c new file mode 100644 index 000..8bf687d --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-core.c @@ -0,0 +1,563 @@ +/* + * Interface for exporting the OPAL ELF core. + * Heavily inspired from fs/proc/vmcore.c + * + * Copyright 2018-2019, IBM Corp. + * Author: Hari Bathini + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#undef DEBUG +#define pr_fmt(fmt) "opalcore: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "../../kernel/fadump-common.h" +#include "opal-fadump.h" + +#define MAX_PT_LOAD_CNT8 + +/* NT_AUXV note related info */ +#define AUXV_CNT 1 +#define AUXV_DESC_SZ (((2 * AUXV_CNT) + 1) * sizeof(Elf64_Off)) + +struct opalcore_config { + unsigned intnum_cpus; + /* PIR value of crashing CPU */ + unsigned intcrashing_cpu; + + /* CPU state data info from F/W */ + unsigned long cpu_state_destination_addr; + unsigned long cpu_state_data_size; + unsigned long cpu_state_entry_size; + + /* OPAL memory to be exported as PT_LOAD segments */ + unsigned long ptload_addr[MAX_PT_LOAD_CNT]; + unsigned long ptload_size[MAX_PT_LOAD_CNT]; + unsigned long ptload_cnt; + + /* Pointer to the first PT_LOAD in the ELF core file */ + Elf64_Phdr *ptload_phdr; + + /* Total size of opalcore file. */ + size_t opalcore_size; + + struct proc_dir_entry *proc_opalcore; + + /* Buffer for all the ELF core headers and the PT_NOTE */ + size_t opalcorebuf_sz; + char*opalcorebuf; + + /* NT_AUXV buffer */ + charauxv_buf[AUXV_DESC_SZ]; +}; + +struct opalcore { + struct list_head list; + unsigned long long paddr; + unsigned long long size; + loff_t offset; +}; + +static LIST_HEAD(opalcore_list); +static struct opalcore_config *oc_conf; +static const struct opal_fadump_mem_struct *fdm_active; + +/* + * Set crashing CPU's signal to SIGUSR1. if the kernel is triggered + * by kernel, SIGTERM otherwise. + */ +bool kernel_initiated; + +static struct opalcore * __init get_new
[PATCH v2 13/16] powernv/fadump: Skip processing /proc/vmcore when only OPAL core exists
If OPAL crashes when the kernel is not registered for FADump, F/W still exports OPAL core through result-table DT node. Make sure '/proc/vmcore' processing is skipped as only data relevant to OPAL core is exported in such scenario. Signed-off-by: Hari Bathini --- arch/powerpc/platforms/powernv/opal-fadump.c | 12 1 file changed, 12 insertions(+) diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c index 65db21a..f530df0 100644 --- a/arch/powerpc/platforms/powernv/opal-fadump.c +++ b/arch/powerpc/platforms/powernv/opal-fadump.c @@ -108,6 +108,18 @@ static void update_fadump_config(struct fw_dump *fadump_conf, be64_to_cpu(fdm->section[i].dest_size); } } + + /* +* If dump is active and no kernel memory region is found in +* result-table, it means OPAL crashed on system with MPIPL +* support and the kernel was not registered for FADump at the +* time of crash. Skip processing /proc/vmcore in that case. +*/ + if (j == 0) { + fadump_conf->dump_active = 0; + return; + } + fadump_conf->rmr_regions_cnt = j; pr_debug("Real memory regions count: %lu\n", fadump_conf->rmr_regions_cnt);
[PATCH v2 14/16] powernv/opalcore: provide an option to invalidate /proc/opalcore file
Writing '1' to /sys/kernel/fadump_release_opalcore would release the memory held by kernel in exporting /proc/opalcore file. Signed-off-by: Hari Bathini --- arch/powerpc/platforms/powernv/opal-core.c | 39 1 file changed, 39 insertions(+) diff --git a/arch/powerpc/platforms/powernv/opal-core.c b/arch/powerpc/platforms/powernv/opal-core.c index 8bf687d..5503b8b 100644 --- a/arch/powerpc/platforms/powernv/opal-core.c +++ b/arch/powerpc/platforms/powernv/opal-core.c @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include #include #include @@ -532,6 +534,36 @@ static void opalcore_cleanup(void) } __exitcall(opalcore_cleanup); +static ssize_t fadump_release_opalcore_store(struct kobject *kobj, +struct kobj_attribute *attr, +const char *buf, size_t count) +{ + int input = -1; + + if (kstrtoint(buf, 0, &input)) + return -EINVAL; + + if (input == 1) { + if (oc_conf == NULL) { + pr_err("'/proc/opalcore' file does not exist!\n"); + return -EPERM; + } + + /* +* Take away '/proc/opalcore' and release all memory +* used for exporting this file. +*/ + opalcore_cleanup(); + } else + return -EINVAL; + + return count; +} + +static struct kobj_attribute opalcore_rel_attr = __ATTR(fadump_release_opalcore, + 0200, NULL, + fadump_release_opalcore_store); + /* Init function for opalcore module. */ static int __init opalcore_init(void) { @@ -558,6 +590,13 @@ static int __init opalcore_init(void) &proc_opalcore_operations); if (oc_conf->proc_opalcore) proc_set_size(oc_conf->proc_opalcore, oc_conf->opalcore_size); + + rc = sysfs_create_file(kernel_kobj, &opalcore_rel_attr.attr); + if (rc) { + pr_warn("unable to create sysfs file fadump_release_opalcore (%d)\n", + rc); + } + return 0; } fs_initcall(opalcore_init);
[PATCH v2 15/16] powernv/fadump: consider f/w load area
OPAL loads kernel & initrd at 512MB offset (256MB size), also exported as ibm,opal/dump/fw-load-area. So, if boot memory size of FADump is less than 768MB, kernel memory to be exported as '/proc/vmcore' would be overwritten by f/w while loading kernel & initrd. To avoid such a scenario, enforce a minimum boot memory size of 768MB on OPAL platform. Also, skip using FADump if a newer F/W version loads kernel & initrd above 768MB. Signed-off-by: Hari Bathini --- arch/powerpc/kernel/fadump-common.h | 15 +-- arch/powerpc/kernel/fadump.c |8 arch/powerpc/platforms/powernv/opal-fadump.c | 23 +++ 3 files changed, 44 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/fadump-common.h b/arch/powerpc/kernel/fadump-common.h index 1bd3aeb..f59fdc7 100644 --- a/arch/powerpc/kernel/fadump-common.h +++ b/arch/powerpc/kernel/fadump-common.h @@ -24,14 +24,25 @@ #define RMA_END(ppc64_rma_size) /* + * With kernel & initrd loaded at 512MB (with 256MB size), enforce a minimum + * boot memory size of 768MB to ensure f/w loading kernel and initrd doesn't + * mess with crash'ed kernel's memory during MPIPL. + */ +#define OPAL_MIN_BOOT_MEM (0x30000000UL) + +/* * On some Power systems where RMO is 128MB, it still requires minimum of * 256MB for kernel to boot successfully. When kdump infrastructure is * configured to save vmcore over network, we run into OOM issue while * loading modules related to network setup. Hence we need additional 64M * of memory to avoid OOM issue. */ -#define MIN_BOOT_MEM (((RMA_END < (0x1UL << 28)) ? (0x1UL << 28) : RMA_END) \ - + (0x1UL << 26)) +#define PSERIES_MIN_BOOT_MEM (((RMA_END < (0x1UL << 28)) ? (0x1UL << 28) : \ +RMA_END) + (0x1UL << 26)) + +#define MIN_BOOT_MEM ((fw_dump.fadump_platform ==\ +FADUMP_PLATFORM_POWERNV) ? 
OPAL_MIN_BOOT_MEM : \ +PSERIES_MIN_BOOT_MEM) /* The upper limit percentage for user specified boot memory size (25%) */ #define MAX_BOOT_MEM_RATIO 4 diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index ba26169..3c3adc2 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -582,6 +582,14 @@ int __init fadump_reserve_mem(void) ALIGN(fw_dump.boot_memory_size, FADUMP_CMA_ALIGNMENT); #endif + + if ((fw_dump.fadump_platform == FADUMP_PLATFORM_POWERNV) && + (fw_dump.boot_memory_size < OPAL_MIN_BOOT_MEM)) { + pr_err("Can't enable fadump with boot memory size (0x%lx) less than 0x%lx\n", + fw_dump.boot_memory_size, OPAL_MIN_BOOT_MEM); + goto error_out; + } + fw_dump.rmr_source_len = fw_dump.boot_memory_size; if (!fadump_get_rmr_regions()) { pr_err("Too many holes in boot memory area to enable fadump\n"); diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c index f530df0..0a22257 100644 --- a/arch/powerpc/platforms/powernv/opal-fadump.c +++ b/arch/powerpc/platforms/powernv/opal-fadump.c @@ -528,6 +528,29 @@ int __init opal_dt_scan_fadump(struct fw_dump *fadump_conf, ulong node) fadump_conf->cpu_state_entry_size = of_read_number(prop, 1); } + } else { + int i, len; + + prop = of_get_flat_dt_prop(dn, "fw-load-area", &len); + if (prop) { + /* +* Each f/w load area is an (address,size) pair, +* 2 cells each, totalling 4 cells per range. +*/ + for (i = 0; i < len / (sizeof(*prop) * 4); i++) { + u64 base, end; + + base = of_read_number(prop + (i * 4) + 0, 2); + end = base; + end += of_read_number(prop + (i * 4) + 2, 2); + if (end > OPAL_MIN_BOOT_MEM) { + pr_err("F/W load area: 0x%llx-0x%llx\n", + base, end); + pr_err("F/W version not supported!\n"); + return 1; + } + } + } } fadump_conf->ops= &opal_fadump_ops;
[PATCH v2 16/16] powernv/fadump: update documentation about option to release opalcore
With /proc/opalcore support available on OPAL based machines and an option to release memory used by kernel in exporting /proc/opalcore, update FADump documentation with these details. Signed-off-by: Hari Bathini --- Documentation/powerpc/firmware-assisted-dump.txt | 19 +++ 1 file changed, 19 insertions(+) diff --git a/Documentation/powerpc/firmware-assisted-dump.txt b/Documentation/powerpc/firmware-assisted-dump.txt index fa35593..6411449 100644 --- a/Documentation/powerpc/firmware-assisted-dump.txt +++ b/Documentation/powerpc/firmware-assisted-dump.txt @@ -107,6 +107,16 @@ capture kernel boot to process this crash data. Kernel config option CONFIG_PRESERVE_FA_DUMP has to be enabled on such kernel to ensure that crash data is preserved to process later. +-- On OPAL based machines (PowerNV), if the kernel is built with + CONFIG_OPAL_CORE=y, OPAL memory at the time of crash is also + exported as /proc/opalcore file. This procfs file is helpful + in debugging OPAL crashes with GDB. The kernel memory used + for exporting this procfs file can be released by echo'ing + '1' to /sys/kernel/fadump_release_opalcore node. + + e.g. + # echo 1 > /sys/kernel/fadump_release_opalcore + Implementation details: -- @@ -260,6 +270,15 @@ Here is the list of files under kernel sysfs: enhanced to use this interface to release the memory reserved for dump and continue without 2nd reboot. + /sys/kernel/fadump_release_opalcore + +This file is available only on OPAL based machines when FADump is +active during capture kernel. This is used to release the memory +used by the kernel to export /proc/opalcore file. To release this +memory, echo '1' to it: + +echo 1 > /sys/kernel/fadump_release_opalcore + Here is the list of files under powerpc debugfs: (Assuming debugfs is mounted on /sys/kernel/debug directory.)
Re: [PATCH 2/2] powerpc/pseries: update device tree before ejecting hotplug uevents
On 11/02/20 8:29 AM, Pingfan Liu wrote: > A bug is observed on pseries by taking the following steps on rhel: > -1. drmgr -c mem -r -q 5 > -2. echo c > /proc/sysrq-trigger > > And then, the failure looks like: > kdump: saving to /sysroot//var/crash/127.0.0.1-2020-01-16-02:06:14/ > kdump: saving vmcore-dmesg.txt > kdump: saving vmcore-dmesg.txt complete > kdump: saving vmcore > Checking for memory holes : [ 0.0 %] / > Checking for memory holes : [100.0 %] | > Excluding unnecessary pages : [100.0 %] \ > Copying data : [ 0.3 %] - > eta: 38s[ 44.337636] hash-mmu: mm: Hashing failure ! > EA=0x7fffba40 access=0x8004 current=makedumpfile > [ 44.337663] hash-mmu: trap=0x300 vsid=0x13a109c ssize=1 base psize=2 > psize 2 pte=0xc0005504 > [ 44.337677] hash-mmu: mm: Hashing failure ! EA=0x7fffba40 > access=0x8004 current=makedumpfile > [ 44.337692] hash-mmu: trap=0x300 vsid=0x13a109c ssize=1 base psize=2 > psize 2 pte=0xc0005504 > [ 44.337708] makedumpfile[469]: unhandled signal 7 at 7fffba40 nip > 7fffbbc4d7fc lr 00011356ca3c code 2 > [ 44.338548] Core dump to |/bin/false pipe failed > /lib/kdump-lib-initramfs.sh: line 98: 469 Bus error > $CORE_COLLECTOR /proc/vmcore > $_mp/$KDUMP_PATH/$HOST_IP-$DATEDIR/vmcore-incomplete > kdump: saving vmcore failed > > * Root cause * > After analyzing, it turns out that in the current implementation, > when hot-removing lmb, the KOBJ_REMOVE event ejects before the dt updating as > the code __remove_memory() comes before drmem_update_dt(). > > From a viewpoint of listener and publisher, the publisher notifies the > listener before data is ready. This introduces a problem where udev > launches kexec-tools (due to KOBJ_REMOVE) and loads a stale dt before > updating. And in capture kernel, makedumpfile will access the memory based > on the stale dt info, and hit a SIGBUS error due to an un-existed lmb. > > * Fix * > In order to fix this issue, update dt before __remove_memory(), and > accordingly the same rule in hot-add path. 
> > This will introduce extra dt updating payload for each involved lmb when > hotplug. > But it should be fine since drmem_update_dt() is memory based operation and > hotplug is not a hot path. > > Signed-off-by: Pingfan Liu > Cc: Michael Ellerman > Cc: Benjamin Herrenschmidt > Cc: Paul Mackerras > Cc: Hari Bathini > To: linuxppc-dev@lists.ozlabs.org > Cc: ke...@lists.infradead.org KDump fails to capture vmcore as we end up looking at a stale elfcore hdr with udev event happening before DT update. Resolved with these patches. For the series: Tested-by: Hari Bathini
[PATCH 1/2] powerpc/fadump: use static allocation for reserved memory ranges
At times, memory ranges have to be looked up during early boot, when kernel couldn't be initialized for dynamic memory allocation. In fact, reserved-ranges look up is needed during FADump memory reservation. Without accounting for reserved-ranges in reserving memory for FADump, MPIPL boot fails with memory corruption issues. So, extend memory ranges handling to support static allocation and populate reserved memory ranges during early boot. Fixes: dda9dbfeeb7a ("powerpc/fadump: consider reserved ranges while releasing memory") Cc: sta...@vger.kernel.org # v5.4+ Signed-off-by: Hari Bathini --- arch/powerpc/include/asm/fadump-internal.h |4 + arch/powerpc/kernel/fadump.c | 77 2 files changed, 48 insertions(+), 33 deletions(-) diff --git a/arch/powerpc/include/asm/fadump-internal.h b/arch/powerpc/include/asm/fadump-internal.h index c814a2b..8d61c8f 100644 --- a/arch/powerpc/include/asm/fadump-internal.h +++ b/arch/powerpc/include/asm/fadump-internal.h @@ -64,12 +64,14 @@ struct fadump_memory_range { }; /* fadump memory ranges info */ +#define RNG_NAME_SZ16 struct fadump_mrange_info { - charname[16]; + charname[RNG_NAME_SZ]; struct fadump_memory_range *mem_ranges; u32 mem_ranges_sz; u32 mem_range_cnt; u32 max_mem_ranges; + boolis_static; }; /* Platform specific callback functions */ diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index ff0114a..7fcf4a8f 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -38,8 +38,17 @@ static void __init fadump_reserve_crash_area(u64 base); #ifndef CONFIG_PRESERVE_FA_DUMP static DEFINE_MUTEX(fadump_mutex); -struct fadump_mrange_info crash_mrange_info = { "crash", NULL, 0, 0, 0 }; -struct fadump_mrange_info reserved_mrange_info = { "reserved", NULL, 0, 0, 0 }; +struct fadump_mrange_info crash_mrange_info = { "crash", NULL, 0, 0, 0, false }; + +#define RESERVED_RNGS_SZ 16384 /* 16K - 128 entries */ +#define RESERVED_RNGS_CNT (RESERVED_RNGS_SZ / \ +sizeof(struct fadump_memory_range)) 
+static struct fadump_memory_range rngs[RESERVED_RNGS_CNT]; +struct fadump_mrange_info reserved_mrange_info = { "reserved", rngs, + RESERVED_RNGS_SZ, 0, + RESERVED_RNGS_CNT, true }; + +static void __init early_init_dt_scan_reserved_ranges(unsigned long node); #ifdef CONFIG_CMA static struct cma *fadump_cma; @@ -108,6 +117,11 @@ static int __init fadump_cma_init(void) { return 1; } int __init early_init_dt_scan_fw_dump(unsigned long node, const char *uname, int depth, void *data) { + if (depth == 0) { + early_init_dt_scan_reserved_ranges(node); + return 0; + } + if (depth != 1) return 0; @@ -726,10 +740,14 @@ void fadump_free_cpu_notes_buf(void) static void fadump_free_mem_ranges(struct fadump_mrange_info *mrange_info) { + if (mrange_info->is_static) { + mrange_info->mem_range_cnt = 0; + return; + } + kfree(mrange_info->mem_ranges); - mrange_info->mem_ranges = NULL; - mrange_info->mem_ranges_sz = 0; - mrange_info->max_mem_ranges = 0; + memset((void *)((u64)mrange_info + RNG_NAME_SZ), 0, + (sizeof(struct fadump_mrange_info) - RNG_NAME_SZ)); } /* @@ -786,6 +804,12 @@ static inline int fadump_add_mem_range(struct fadump_mrange_info *mrange_info, if (mrange_info->mem_range_cnt == mrange_info->max_mem_ranges) { int ret; + if (mrange_info->is_static) { + pr_err("Reached array size limit for %s memory ranges\n", + mrange_info->name); + return -ENOSPC; + } + ret = fadump_alloc_mem_ranges(mrange_info); if (ret) return ret; @@ -1202,20 +1226,19 @@ static void sort_and_merge_mem_ranges(struct fadump_mrange_info *mrange_info) * Scan reserved-ranges to consider them while reserving/releasing * memory for FADump. */ -static inline int fadump_scan_reserved_mem_ranges(void) +static void __init early_init_dt_scan_reserved_ranges(unsigned long node) { - struct device_node *root; const __be32 *prop; int len, ret = -1; unsigned long i; - root = of_find_node_by_path("/"); - if (!root) - retu
[PATCH 2/2] powerpc/fadump: consider reserved ranges while reserving memory
Commit 0962e8004e97 ("powerpc/prom: Scan reserved-ranges node for memory reservations") enabled support to parse reserved-ranges DT node and reserve kernel memory falling in these ranges for F/W purposes. Memory reserved for FADump should not overlap with these ranges as it could corrupt memory meant for F/W or crash'ed kernel memory to be exported as vmcore. But since commit 579ca1a27675 ("powerpc/fadump: make use of memblock's bottom up allocation mode"), memblock_find_in_range() is being used to find the appropriate area to reserve memory for FADump, which can't account for reserved-ranges as these ranges are reserved only after FADump memory reservation. With reserved-ranges now being populated during early boot, look out for these memory ranges while reserving memory for FADump. Without this change, MPIPL on PowerNV systems aborts with hostboot failure, when memory reserved for FADump is less than 4096MB. Fixes: 579ca1a27675 ("powerpc/fadump: make use of memblock's bottom up allocation mode") Cc: sta...@vger.kernel.org # v5.4+ Signed-off-by: Hari Bathini --- arch/powerpc/kernel/fadump.c | 76 -- 1 file changed, 66 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 7fcf4a8f..ab83be9 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -443,10 +443,70 @@ static int __init fadump_get_boot_mem_regions(void) return ret; } +/* + * Returns true, if the given range overlaps with reserved memory ranges + * starting at idx. Also, updates idx to index of overlapping memory range + * with the given memory range. + * False, otherwise. 
+ */ +static bool overlaps_reserved_ranges(u64 base, u64 end, int *idx) +{ + bool ret = false; + int i; + + for (i = *idx; i < reserved_mrange_info.mem_range_cnt; i++) { + u64 rbase = reserved_mrange_info.mem_ranges[i].base; + u64 rend = rbase + reserved_mrange_info.mem_ranges[i].size; + + if (end <= rbase) + break; + + if ((end > rbase) && (base < rend)) { + *idx = i; + ret = true; + break; + } + } + + return ret; +} + +/* + * Locate a suitable memory area to reserve memory for FADump. While at it, + * lookup reserved-ranges & avoid overlap with them, as they are used by F/W. + */ +static u64 __init fadump_locate_reserve_mem(u64 base, u64 size) +{ + struct fadump_memory_range *mrngs; + phys_addr_t mstart, mend; + int idx = 0; + u64 i; + + mrngs = reserved_mrange_info.mem_ranges; + for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, + &mstart, &mend, NULL) { + pr_debug("%llu) mstart: %llx, mend: %llx, base: %llx\n", +i, mstart, mend, base); + + if (mstart > base) + base = PAGE_ALIGN(mstart); + + while ((mend > base) && ((mend - base) >= size)) { + if (!overlaps_reserved_ranges(base, base + size, &idx)) + goto out; + + base = mrngs[idx].base + mrngs[idx].size; + base = PAGE_ALIGN(base); + } + } + +out: + return base; +} + int __init fadump_reserve_mem(void) { - u64 base, size, mem_boundary, bootmem_min, align = PAGE_SIZE; - bool is_memblock_bottom_up = memblock_bottom_up(); + u64 base, size, mem_boundary, bootmem_min; int ret = 1; if (!fw_dump.fadump_enabled) @@ -467,9 +527,9 @@ int __init fadump_reserve_mem(void) PAGE_ALIGN(fadump_calculate_reserve_size()); #ifdef CONFIG_CMA if (!fw_dump.nocma) { - align = FADUMP_CMA_ALIGNMENT; fw_dump.boot_memory_size = - ALIGN(fw_dump.boot_memory_size, align); + ALIGN(fw_dump.boot_memory_size, + FADUMP_CMA_ALIGNMENT); } #endif @@ -537,13 +597,9 @@ int __init fadump_reserve_mem(void) * Reserve memory at an offset closer to bottom of the RAM to * minimize the impact of memory hot-remove operation. 
*/ - memblock_set_bottom_up(true); - base = memblock_find_in_range(base, mem_boundary, size, align); - - /* Restore the previous allocation mode */ - memblock_set_bottom_up(is_memblock_bottom_up); + base = fadump_locate_reserve_mem(base, size); - if (!base) { + if (base > (mem_boundary - size)) { pr_err("Failed to find memory chunk for reservation!\n"); goto error_out; }
Re: [PATCHv3] powerpc/crashkernel: take "mem=" option into account
Hello Pingfan, Thanks for the patch.. On 19/02/20 7:48 PM, Pingfan Liu wrote: > 'mem=" option is an easy way to put high pressure on memory during some > test. Hence after applying the memory limit, instead of total mem, the > actual usable memory should be considered when reserving mem for > crashkernel. Otherwise the boot up may experience OOM issue. > > E.g. it would reserve 4G prior to the change and 512M afterward, if passing > crashkernel="2G-4G:384M,4G-16G:512M,16G-64G:1G,64G-128G:2G,128G-:4G", and > mem=5G on a 256G machine. > > This issue is powerpc specific because it puts higher priority on fadump > and kdump reservation than on "mem=". Referring the following code: > if (fadump_reserve_mem() == 0) > reserve_crashkernel(); > ... > /* Ensure that total memory size is page-aligned. */ > limit = ALIGN(memory_limit ?: memblock_phys_mem_size(), PAGE_SIZE); > memblock_enforce_memory_limit(limit); > > While on other arches, the effect of "mem=" takes a higher priority and pass > through memblock_phys_mem_size() before calling reserve_crashkernel(). > > Signed-off-by: Pingfan Liu > To: linuxppc-dev@lists.ozlabs.org > Cc: Hari Bathini > Cc: Michael Ellerman > Cc: ke...@lists.infradead.org > --- > v2 -> v3: improve commit log > arch/powerpc/kernel/machine_kexec.c | 7 --- > 1 file changed, 4 insertions(+), 3 deletions(-) > > diff --git a/arch/powerpc/kernel/machine_kexec.c > b/arch/powerpc/kernel/machine_kexec.c > index c4ed328..eec96dc 100644 > --- a/arch/powerpc/kernel/machine_kexec.c > +++ b/arch/powerpc/kernel/machine_kexec.c > @@ -114,11 +114,12 @@ void machine_kexec(struct kimage *image) > > void __init reserve_crashkernel(void) > { > - unsigned long long crash_size, crash_base; > + unsigned long long crash_size, crash_base, total_mem_sz; > int ret; > > + total_mem_sz = memory_limit ? 
memory_limit : memblock_phys_mem_size(); > /* use common parsing */ > - ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), > + ret = parse_crashkernel(boot_command_line, total_mem_sz, > &crash_size, &crash_base); > if (ret == 0 && crash_size > 0) { > crashk_res.start = crash_base; memory_limit is adjusted after this with the below snippet: /* Crash kernel trumps memory limit */ if (memory_limit && memory_limit <= crashk_res.end) { memory_limit = crashk_res.end + 1; printk("Adjusted memory limit for crashkernel, now 0x%llx\n", memory_limit); } So, either the above snippet must be dropped or the print below should use an updated total_mem_sz based on adjusted memory_limit. I would prefer the latter.. > @@ -185,7 +186,7 @@ void __init reserve_crashkernel(void) > "for crashkernel (System RAM: %ldMB)\n", > (unsigned long)(crash_size >> 20), > (unsigned long)(crashk_res.start >> 20), > - (unsigned long)(memblock_phys_mem_size() >> 20)); > + (unsigned long)(total_mem_sz >> 20)); > > if (!memblock_is_region_memory(crashk_res.start, crash_size) || > memblock_reserve(crashk_res.start, crash_size)) { > -- - Hari
Re: [PATCHv4] powerpc/crashkernel: take "mem=" option into account
On 01/04/20 7:30 PM, Pingfan Liu wrote: > 'mem=" option is an easy way to put high pressure on memory during some > test. Hence after applying the memory limit, instead of total mem, the > actual usable memory should be considered when reserving mem for > crashkernel. Otherwise the boot up may experience OOM issue. > > E.g. it would reserve 4G prior to the change and 512M afterward, if passing > crashkernel="2G-4G:384M,4G-16G:512M,16G-64G:1G,64G-128G:2G,128G-:4G", and > mem=5G on a 256G machine. > > This issue is powerpc specific because it puts higher priority on fadump > and kdump reservation than on "mem=". Referring the following code: > if (fadump_reserve_mem() == 0) > reserve_crashkernel(); > ... > /* Ensure that total memory size is page-aligned. */ > limit = ALIGN(memory_limit ?: memblock_phys_mem_size(), PAGE_SIZE); > memblock_enforce_memory_limit(limit); > > While on other arches, the effect of "mem=" takes a higher priority and pass > through memblock_phys_mem_size() before calling reserve_crashkernel(). >> Signed-off-by: Pingfan Liu > To: linuxppc-dev@lists.ozlabs.org > Cc: Hari Bathini > Cc: Michael Ellerman > Cc: ke...@lists.infradead.org > --- > v3 -> v4: fix total_mem_sz based on adjusted memory_limit Thanks for the update. Reviewed-by: Hari Bathini
Re: [PATCH v2 3/4] Documentation/ABI: mark /sys/kernel/fadump_* sysfs files deprecated
On 18/10/19 6:35 PM, Sourabh Jain wrote: > The /sys/kernel/fadump_* sysfs files are replicated under [...] > +Note: The following FADump sysfs files are deprecated. > + > +Deprecated Alternative > + > --- > +/sys/kernel/fadump_enabled /sys/kernel/fadump/fadump_enabled > +/sys/kernel/fadump_registered/sys/kernel/fadump/fadump_registered > +/sys/kernel/fadump_release_mem > /sys/kernel/fadump/fadump_release_mem /sys/kernel/fadump/* looks tidier than /sys/kernel/fadump/fadump_*. I mean, /sys/kernel/fadump/fadump_enabled => /sys/kernel/fadump/enabled and such.. - Hari
Re: [PATCH] powerpc/fadump: Remove duplicate message.
Michal, thanks for looking into this. On 23/10/19 11:26 PM, Michal Suchanek wrote: > There is duplicate message about lack of support by firmware in > fadump_reserve_mem and setup_fadump. Due to different capitalization it > is clear that the one in setup_fadump is shown on boot. Remove the > duplicate that is not shown. Actually, the message in fadump_reserve_mem() is logged. fadump_reserve_mem() executes first and sets fw_dump.fadump_enabled to `0`, if fadump is not supported. So, the other message in setup_fadump() doesn't get logged anymore with recent changes. The right thing to do would be to remove the similar message in setup_fadump() instead. - Hari
Re: [PATCH v2 3/4] Documentation/ABI: mark /sys/kernel/fadump_* sysfs files deprecated
On 05/11/19 2:24 PM, Sourabh Jain wrote: > > > On 10/21/19 1:11 PM, Hari Bathini wrote: >> >> >> On 18/10/19 6:35 PM, Sourabh Jain wrote: >>> The /sys/kernel/fadump_* sysfs files are replicated under >> >> [...] >> >>> +Note: The following FADump sysfs files are deprecated. >>> + >>> +Deprecated Alternative >>> + >>> --- >>> +/sys/kernel/fadump_enabled /sys/kernel/fadump/fadump_enabled >>> +/sys/kernel/fadump_registered >>> /sys/kernel/fadump/fadump_registered >>> +/sys/kernel/fadump_release_mem >>> /sys/kernel/fadump/fadump_release_mem >> >> /sys/kernel/fadump/* looks tidy instead of /sys/kernel/fadump/fadump_* >> I mean, /sys/kernel/fadump/fadump_enabled => /sys/kernel/fadump/enabled and >> such.. > > > > Could you please confirm whether you want to address the sysfs file path > differently or > actually changing the sysfs file name from fadump_enabled to enabled. I meant, given the path "/sys/kernel/fadump/", the prefix fadump_ is redundant. If there is no convention requiring us to retain the same file names, I suggest dropping the fadump_ prefix and just calling them enabled, registered, etc.. - Hari
Re: [PATCH v3] powerpc/fadump: when fadump is supported register the fadump sysfs files.
On 07/11/19 10:17 PM, Michal Suchanek wrote: > Currently it is not possible to distinguish the case when fadump is > supported by firmware and disabled in kernel and completely unsupported > using the kernel sysfs interface. User can investigate the devicetree > but it is more reasonable to provide sysfs files in case we get some > fadumpv2 in the future. > > With this patch sysfs files are available whenever fadump is supported > by firmware. > > There is duplicate message about lack of support by firmware in > fadump_reserve_mem and setup_fadump. Remove the duplicate message in > setup_fadump. Thanks for doing this, Michal. Exporting the node will be helpful in finding if FADump is supported, given FADump is now supported on two different platforms... Reviewed-by: Hari Bathini > > Signed-off-by: Michal Suchanek > --- > v2: move the sysfs initialization earlier to avoid condition nesting > v3: remove duplicate message > --- > arch/powerpc/kernel/fadump.c | 15 ++- > 1 file changed, 6 insertions(+), 9 deletions(-) > > diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c > index ed59855430b9..ff0114aeba9b 100644 > --- a/arch/powerpc/kernel/fadump.c > +++ b/arch/powerpc/kernel/fadump.c > @@ -1466,16 +1466,15 @@ static void fadump_init_files(void) > */ > int __init setup_fadump(void) > { > - if (!fw_dump.fadump_enabled) > - return 0; > - > - if (!fw_dump.fadump_supported) { > - printk(KERN_ERR "Firmware-assisted dump is not supported on" > - " this hardware\n"); > + if (!fw_dump.fadump_supported) > return 0; > - } > > + fadump_init_files(); > fadump_show_config(); > + > + if (!fw_dump.fadump_enabled) > + return 1; > + > /* >* If dump data is available then see if it is valid and prepare for >* saving it to the disk. > @@ -1492,8 +1491,6 @@ int __init setup_fadump(void) > else if (fw_dump.reserve_dump_area_size) > fw_dump.ops->fadump_init_mem_struct(&fw_dump); > > - fadump_init_files(); > - > return 1; > } > subsys_initcall(setup_fadump); > -- - Hari
[PATCH v2 1/2] powerpc/fadump: use static allocation for reserved memory ranges
At times, memory ranges have to be looked up during early boot, when kernel couldn't be initialized for dynamic memory allocation. In fact, reserved-ranges look up is needed during FADump memory reservation. Without accounting for reserved-ranges in reserving memory for FADump, MPIPL boot fails with memory corruption issues. So, extend memory ranges handling to support static allocation and populate reserved memory ranges during early boot. Fixes: dda9dbfeeb7a ("powerpc/fadump: consider reserved ranges while releasing memory") Cc: sta...@vger.kernel.org Signed-off-by: Hari Bathini --- arch/powerpc/include/asm/fadump-internal.h |4 + arch/powerpc/kernel/fadump.c | 77 2 files changed, 48 insertions(+), 33 deletions(-) diff --git a/arch/powerpc/include/asm/fadump-internal.h b/arch/powerpc/include/asm/fadump-internal.h index c814a2b..8d61c8f 100644 --- a/arch/powerpc/include/asm/fadump-internal.h +++ b/arch/powerpc/include/asm/fadump-internal.h @@ -64,12 +64,14 @@ struct fadump_memory_range { }; /* fadump memory ranges info */ +#define RNG_NAME_SZ16 struct fadump_mrange_info { - charname[16]; + charname[RNG_NAME_SZ]; struct fadump_memory_range *mem_ranges; u32 mem_ranges_sz; u32 mem_range_cnt; u32 max_mem_ranges; + boolis_static; }; /* Platform specific callback functions */ diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 59e60a9..679277b 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -40,8 +40,17 @@ struct kobject *fadump_kobj; #ifndef CONFIG_PRESERVE_FA_DUMP static DEFINE_MUTEX(fadump_mutex); -struct fadump_mrange_info crash_mrange_info = { "crash", NULL, 0, 0, 0 }; -struct fadump_mrange_info reserved_mrange_info = { "reserved", NULL, 0, 0, 0 }; +struct fadump_mrange_info crash_mrange_info = { "crash", NULL, 0, 0, 0, false }; + +#define RESERVED_RNGS_SZ 16384 /* 16K - 128 entries */ +#define RESERVED_RNGS_CNT (RESERVED_RNGS_SZ / \ +sizeof(struct fadump_memory_range)) +static struct fadump_memory_range 
rngs[RESERVED_RNGS_CNT]; +struct fadump_mrange_info reserved_mrange_info = { "reserved", rngs, + RESERVED_RNGS_SZ, 0, + RESERVED_RNGS_CNT, true }; + +static void __init early_init_dt_scan_reserved_ranges(unsigned long node); #ifdef CONFIG_CMA static struct cma *fadump_cma; @@ -110,6 +119,11 @@ static int __init fadump_cma_init(void) { return 1; } int __init early_init_dt_scan_fw_dump(unsigned long node, const char *uname, int depth, void *data) { + if (depth == 0) { + early_init_dt_scan_reserved_ranges(node); + return 0; + } + if (depth != 1) return 0; @@ -728,10 +742,14 @@ void fadump_free_cpu_notes_buf(void) static void fadump_free_mem_ranges(struct fadump_mrange_info *mrange_info) { + if (mrange_info->is_static) { + mrange_info->mem_range_cnt = 0; + return; + } + kfree(mrange_info->mem_ranges); - mrange_info->mem_ranges = NULL; - mrange_info->mem_ranges_sz = 0; - mrange_info->max_mem_ranges = 0; + memset((void *)((u64)mrange_info + RNG_NAME_SZ), 0, + (sizeof(struct fadump_mrange_info) - RNG_NAME_SZ)); } /* @@ -788,6 +806,12 @@ static inline int fadump_add_mem_range(struct fadump_mrange_info *mrange_info, if (mrange_info->mem_range_cnt == mrange_info->max_mem_ranges) { int ret; + if (mrange_info->is_static) { + pr_err("Reached array size limit for %s memory ranges\n", + mrange_info->name); + return -ENOSPC; + } + ret = fadump_alloc_mem_ranges(mrange_info); if (ret) return ret; @@ -1204,20 +1228,19 @@ static void sort_and_merge_mem_ranges(struct fadump_mrange_info *mrange_info) * Scan reserved-ranges to consider them while reserving/releasing * memory for FADump. */ -static inline int fadump_scan_reserved_mem_ranges(void) +static void __init early_init_dt_scan_reserved_ranges(unsigned long node) { - struct device_node *root; const __be32 *prop; int len, ret = -1; unsigned long i; - root = of_find_node_by_path("/"); - if (!root) - return ret; + /* reserved-ranges alrea
[PATCH v2 2/2] powerpc/fadump: consider reserved ranges while reserving memory
Commit 0962e8004e97 ("powerpc/prom: Scan reserved-ranges node for memory reservations") enabled support to parse reserved-ranges DT node and reserve kernel memory falling in these ranges for F/W purposes. Memory reserved for FADump should not overlap with these ranges as it could corrupt memory meant for F/W or crash'ed kernel memory to be exported as vmcore. But since commit 579ca1a27675 ("powerpc/fadump: make use of memblock's bottom up allocation mode"), memblock_find_in_range() is being used to find the appropriate area to reserve memory for FADump, which can't account for reserved-ranges as these ranges are reserved only after FADump memory reservation. With reserved-ranges now being populated during early boot, look out for these memory ranges while reserving memory for FADump. Without this change, MPIPL on PowerNV systems aborts with hostboot failure, when memory reserved for FADump is less than 4096MB. Fixes: 579ca1a27675 ("powerpc/fadump: make use of memblock's bottom up allocation mode") Cc: sta...@vger.kernel.org Signed-off-by: Hari Bathini --- Changes in v2: * Add an out parameter 'found' for fadump_locate_reserve_mem() and set it to "true" when a suitable memory area is located. arch/powerpc/kernel/fadump.c | 81 +- 1 file changed, 71 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 679277b..0ffe69c 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -445,10 +445,73 @@ static int __init fadump_get_boot_mem_regions(void) return ret; } +/* + * Returns true, if the given range overlaps with reserved memory ranges + * starting at idx. Also, updates idx to index of overlapping memory range + * with the given memory range. + * False, otherwise. 
+ */ +static bool overlaps_reserved_ranges(u64 base, u64 end, int *idx) +{ + bool ret = false; + int i; + + for (i = *idx; i < reserved_mrange_info.mem_range_cnt; i++) { + u64 rbase = reserved_mrange_info.mem_ranges[i].base; + u64 rend = rbase + reserved_mrange_info.mem_ranges[i].size; + + if (end <= rbase) + break; + + if ((end > rbase) && (base < rend)) { + *idx = i; + ret = true; + break; + } + } + + return ret; +} + +/* + * Locate a suitable memory area to reserve memory for FADump. While at it, + * lookup reserved-ranges & avoid overlap with them, as they are used by F/W. + */ +static u64 __init fadump_locate_reserve_mem(u64 base, u64 size, bool *found) +{ + struct fadump_memory_range *mrngs; + phys_addr_t mstart, mend; + int idx = 0; + u64 i; + + *found = false; + mrngs = reserved_mrange_info.mem_ranges; + for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, + &mstart, &mend, NULL) { + pr_debug("%llu) mstart: %llx, mend: %llx, base: %llx\n", +i, mstart, mend, base); + + if (mstart > base) + base = PAGE_ALIGN(mstart); + + while ((mend > base) && ((mend - base) >= size)) { + if (!overlaps_reserved_ranges(base, base+size, &idx)) { + *found = true; + goto out; + } + + base = mrngs[idx].base + mrngs[idx].size; + base = PAGE_ALIGN(base); + } + } + +out: + return base; +} + int __init fadump_reserve_mem(void) { - u64 base, size, mem_boundary, bootmem_min, align = PAGE_SIZE; - bool is_memblock_bottom_up = memblock_bottom_up(); + u64 base, size, mem_boundary, bootmem_min; int ret = 1; if (!fw_dump.fadump_enabled) @@ -469,9 +532,9 @@ int __init fadump_reserve_mem(void) PAGE_ALIGN(fadump_calculate_reserve_size()); #ifdef CONFIG_CMA if (!fw_dump.nocma) { - align = FADUMP_CMA_ALIGNMENT; fw_dump.boot_memory_size = - ALIGN(fw_dump.boot_memory_size, align); + ALIGN(fw_dump.boot_memory_size, + FADUMP_CMA_ALIGNMENT); } #endif @@ -535,17 +598,15 @@ int __init fadump_reserve_mem(void) pr_debug("Reserve dump area start address: 0x%lx\n", fw_dump.reserve_dump_area_start); } 
else { + bool found = false; + /* * Reserve memory at an offset closer to bottom of the RAM to * minimize the impact of memory hot-remove oper
Re: [PATCH 2/2] powerpc/fadump: consider reserved ranges while reserving memory
On 20/04/20 10:50 AM, Mahesh J Salgaonkar wrote: > On 2020-03-11 01:57:10 Wed, Hari Bathini wrote: >> Commit 0962e8004e97 ("powerpc/prom: Scan reserved-ranges node for >> memory reservations") enabled support to parse reserved-ranges DT >> node and reserve kernel memory falling in these ranges for F/W >> purposes. Memory reserved for FADump should not overlap with these >> ranges as it could corrupt memory meant for F/W or crash'ed kernel >> memory to be exported as vmcore. >> >> But since commit 579ca1a27675 ("powerpc/fadump: make use of memblock's >> bottom up allocation mode"), memblock_find_in_range() is being used to >> find the appropriate area to reserve memory for FADump, which can't >> account for reserved-ranges as these ranges are reserved only after >> FADump memory reservation. >> >> With reserved-ranges now being populated during early boot, look out >> for these memory ranges while reserving memory for FADump. Without >> this change, MPIPL on PowerNV systems aborts with hostboot failure, >> when memory reserved for FADump is less than 4096MB. >> >> Fixes: 579ca1a27675 ("powerpc/fadump: make use of memblock's bottom up >> allocation mode") >> Cc: sta...@vger.kernel.org # v5.4+ >> Signed-off-by: Hari Bathini >> --- >> arch/powerpc/kernel/fadump.c | 76 >> -- >> 1 file changed, 66 insertions(+), 10 deletions(-) >> >> diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c >> index 7fcf4a8f..ab83be9 100644 >> --- a/arch/powerpc/kernel/fadump.c >> +++ b/arch/powerpc/kernel/fadump.c >> @@ -443,10 +443,70 @@ static int __init fadump_get_boot_mem_regions(void) >> return ret; >> } >> >> +/* >> + * Returns true, if the given range overlaps with reserved memory ranges >> + * starting at idx. Also, updates idx to index of overlapping memory range >> + * with the given memory range. >> + * False, otherwise. 
>> + */ >> +static bool overlaps_reserved_ranges(u64 base, u64 end, int *idx) >> +{ >> +bool ret = false; >> +int i; >> + >> +for (i = *idx; i < reserved_mrange_info.mem_range_cnt; i++) { >> +u64 rbase = reserved_mrange_info.mem_ranges[i].base; >> +u64 rend = rbase + reserved_mrange_info.mem_ranges[i].size; >> + >> +if (end <= rbase) >> +break; >> + >> +if ((end > rbase) && (base < rend)) { >> +*idx = i; >> +ret = true; >> +break; >> +} >> +} >> + >> +return ret; >> +} >> + >> +/* >> + * Locate a suitable memory area to reserve memory for FADump. While at it, >> + * lookup reserved-ranges & avoid overlap with them, as they are used by >> F/W. >> + */ >> +static u64 __init fadump_locate_reserve_mem(u64 base, u64 size) >> +{ >> +struct fadump_memory_range *mrngs; >> +phys_addr_t mstart, mend; >> +int idx = 0; >> +u64 i; >> + >> +mrngs = reserved_mrange_info.mem_ranges; >> +for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, >> +&mstart, &mend, NULL) { >> +pr_debug("%llu) mstart: %llx, mend: %llx, base: %llx\n", >> + i, mstart, mend, base); >> + >> +if (mstart > base) >> +base = PAGE_ALIGN(mstart); >> + >> +while ((mend > base) && ((mend - base) >= size)) { >> +if (!overlaps_reserved_ranges(base, base + size, &idx)) >> +goto out; >> + >> +base = mrngs[idx].base + mrngs[idx].size; >> +base = PAGE_ALIGN(base); > > What happens when all the memory ranges found to be overlapped with > reserved ranges ? Shouldn't this function return NULL ? Looks like in > that case this function returns the last set base address which is > either still overlapped or not big enough in size. Thanks for the review, Mahesh. I overlooked that corner case. Just posted v2 fixing it. - Hari
[PATCH v2 1/2] powerpc/fadump: use static allocation for reserved memory ranges
At times, memory ranges have to be looked up during early boot, when kernel couldn't be initialized for dynamic memory allocation. In fact, reserved-ranges look up is needed during FADump memory reservation. Without accounting for reserved-ranges in reserving memory for FADump, MPIPL boot fails with memory corruption issues. So, extend memory ranges handling to support static allocation and populate reserved memory ranges during early boot. Fixes: dda9dbfeeb7a ("powerpc/fadump: consider reserved ranges while releasing memory") Cc: sta...@vger.kernel.org Signed-off-by: Hari Bathini Reviewed-by: Mahesh Salgaonkar --- arch/powerpc/include/asm/fadump-internal.h |4 + arch/powerpc/kernel/fadump.c | 77 2 files changed, 48 insertions(+), 33 deletions(-) diff --git a/arch/powerpc/include/asm/fadump-internal.h b/arch/powerpc/include/asm/fadump-internal.h index c814a2b..8d61c8f 100644 --- a/arch/powerpc/include/asm/fadump-internal.h +++ b/arch/powerpc/include/asm/fadump-internal.h @@ -64,12 +64,14 @@ struct fadump_memory_range { }; /* fadump memory ranges info */ +#define RNG_NAME_SZ16 struct fadump_mrange_info { - charname[16]; + charname[RNG_NAME_SZ]; struct fadump_memory_range *mem_ranges; u32 mem_ranges_sz; u32 mem_range_cnt; u32 max_mem_ranges; + boolis_static; }; /* Platform specific callback functions */ diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 59e60a9..679277b 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -40,8 +40,17 @@ struct kobject *fadump_kobj; #ifndef CONFIG_PRESERVE_FA_DUMP static DEFINE_MUTEX(fadump_mutex); -struct fadump_mrange_info crash_mrange_info = { "crash", NULL, 0, 0, 0 }; -struct fadump_mrange_info reserved_mrange_info = { "reserved", NULL, 0, 0, 0 }; +struct fadump_mrange_info crash_mrange_info = { "crash", NULL, 0, 0, 0, false }; + +#define RESERVED_RNGS_SZ 16384 /* 16K - 128 entries */ +#define RESERVED_RNGS_CNT (RESERVED_RNGS_SZ / \ +sizeof(struct fadump_memory_range)) +static 
struct fadump_memory_range rngs[RESERVED_RNGS_CNT]; +struct fadump_mrange_info reserved_mrange_info = { "reserved", rngs, + RESERVED_RNGS_SZ, 0, + RESERVED_RNGS_CNT, true }; + +static void __init early_init_dt_scan_reserved_ranges(unsigned long node); #ifdef CONFIG_CMA static struct cma *fadump_cma; @@ -110,6 +119,11 @@ static int __init fadump_cma_init(void) { return 1; } int __init early_init_dt_scan_fw_dump(unsigned long node, const char *uname, int depth, void *data) { + if (depth == 0) { + early_init_dt_scan_reserved_ranges(node); + return 0; + } + if (depth != 1) return 0; @@ -728,10 +742,14 @@ void fadump_free_cpu_notes_buf(void) static void fadump_free_mem_ranges(struct fadump_mrange_info *mrange_info) { + if (mrange_info->is_static) { + mrange_info->mem_range_cnt = 0; + return; + } + kfree(mrange_info->mem_ranges); - mrange_info->mem_ranges = NULL; - mrange_info->mem_ranges_sz = 0; - mrange_info->max_mem_ranges = 0; + memset((void *)((u64)mrange_info + RNG_NAME_SZ), 0, + (sizeof(struct fadump_mrange_info) - RNG_NAME_SZ)); } /* @@ -788,6 +806,12 @@ static inline int fadump_add_mem_range(struct fadump_mrange_info *mrange_info, if (mrange_info->mem_range_cnt == mrange_info->max_mem_ranges) { int ret; + if (mrange_info->is_static) { + pr_err("Reached array size limit for %s memory ranges\n", + mrange_info->name); + return -ENOSPC; + } + ret = fadump_alloc_mem_ranges(mrange_info); if (ret) return ret; @@ -1204,20 +1228,19 @@ static void sort_and_merge_mem_ranges(struct fadump_mrange_info *mrange_info) * Scan reserved-ranges to consider them while reserving/releasing * memory for FADump. */ -static inline int fadump_scan_reserved_mem_ranges(void) +static void __init early_init_dt_scan_reserved_ranges(unsigned long node) { - struct device_node *root; const __be32 *prop; int len, ret = -1; unsigned long i; - root = of_find_node_by_path("/"); - if (!root) - retu
[PATCH v2 2/2] powerpc/fadump: consider reserved ranges while reserving memory
Commit 0962e8004e97 ("powerpc/prom: Scan reserved-ranges node for memory reservations") enabled support to parse reserved-ranges DT node and reserve kernel memory falling in these ranges for F/W purposes. Memory reserved for FADump should not overlap with these ranges as it could corrupt memory meant for F/W or crash'ed kernel memory to be exported as vmcore. But since commit 579ca1a27675 ("powerpc/fadump: make use of memblock's bottom up allocation mode"), memblock_find_in_range() is being used to find the appropriate area to reserve memory for FADump, which can't account for reserved-ranges as these ranges are reserved only after FADump memory reservation. With reserved-ranges now being populated during early boot, look out for these memory ranges while reserving memory for FADump. Without this change, MPIPL on PowerNV systems aborts with hostboot failure, when memory reserved for FADump is less than 4096MB. Fixes: 579ca1a27675 ("powerpc/fadump: make use of memblock's bottom up allocation mode") Cc: sta...@vger.kernel.org Signed-off-by: Hari Bathini --- Changes in v2: * Add an out parameter 'found' for fadump_locate_reserve_mem() and set it to "true" when a suitable memory area is located. arch/powerpc/kernel/fadump.c | 81 +- 1 file changed, 71 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 679277b..0ffe69c 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -445,10 +445,73 @@ static int __init fadump_get_boot_mem_regions(void) return ret; } +/* + * Returns true, if the given range overlaps with reserved memory ranges + * starting at idx. Also, updates idx to index of overlapping memory range + * with the given memory range. + * False, otherwise. 
+ */ +static bool overlaps_reserved_ranges(u64 base, u64 end, int *idx) +{ + bool ret = false; + int i; + + for (i = *idx; i < reserved_mrange_info.mem_range_cnt; i++) { + u64 rbase = reserved_mrange_info.mem_ranges[i].base; + u64 rend = rbase + reserved_mrange_info.mem_ranges[i].size; + + if (end <= rbase) + break; + + if ((end > rbase) && (base < rend)) { + *idx = i; + ret = true; + break; + } + } + + return ret; +} + +/* + * Locate a suitable memory area to reserve memory for FADump. While at it, + * lookup reserved-ranges & avoid overlap with them, as they are used by F/W. + */ +static u64 __init fadump_locate_reserve_mem(u64 base, u64 size, bool *found) +{ + struct fadump_memory_range *mrngs; + phys_addr_t mstart, mend; + int idx = 0; + u64 i; + + *found = false; + mrngs = reserved_mrange_info.mem_ranges; + for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, + &mstart, &mend, NULL) { + pr_debug("%llu) mstart: %llx, mend: %llx, base: %llx\n", +i, mstart, mend, base); + + if (mstart > base) + base = PAGE_ALIGN(mstart); + + while ((mend > base) && ((mend - base) >= size)) { + if (!overlaps_reserved_ranges(base, base+size, &idx)) { + *found = true; + goto out; + } + + base = mrngs[idx].base + mrngs[idx].size; + base = PAGE_ALIGN(base); + } + } + +out: + return base; +} + int __init fadump_reserve_mem(void) { - u64 base, size, mem_boundary, bootmem_min, align = PAGE_SIZE; - bool is_memblock_bottom_up = memblock_bottom_up(); + u64 base, size, mem_boundary, bootmem_min; int ret = 1; if (!fw_dump.fadump_enabled) @@ -469,9 +532,9 @@ int __init fadump_reserve_mem(void) PAGE_ALIGN(fadump_calculate_reserve_size()); #ifdef CONFIG_CMA if (!fw_dump.nocma) { - align = FADUMP_CMA_ALIGNMENT; fw_dump.boot_memory_size = - ALIGN(fw_dump.boot_memory_size, align); + ALIGN(fw_dump.boot_memory_size, + FADUMP_CMA_ALIGNMENT); } #endif @@ -535,17 +598,15 @@ int __init fadump_reserve_mem(void) pr_debug("Reserve dump area start address: 0x%lx\n", fw_dump.reserve_dump_area_start); } 
else { + bool found = false; + /* * Reserve memory at an offset closer to bottom of the RAM to * minimize the impact of memory hot-remove oper
[PATCH v3 1/2] powerpc/fadump: use static allocation for reserved memory ranges
At times, memory ranges have to be looked up during early boot, when kernel couldn't be initialized for dynamic memory allocation. In fact, reserved-ranges look up is needed during FADump memory reservation. Without accounting for reserved-ranges in reserving memory for FADump, MPIPL boot fails with memory corruption issues. So, extend memory ranges handling to support static allocation and populate reserved memory ranges during early boot. Fixes: dda9dbfeeb7a ("powerpc/fadump: consider reserved ranges while releasing memory") Cc: sta...@vger.kernel.org Signed-off-by: Hari Bathini Reviewed-by: Mahesh Salgaonkar --- Changes in v3: * No code change. Added Mahesh's 'Reviewed-by' tag. arch/powerpc/include/asm/fadump-internal.h |4 + arch/powerpc/kernel/fadump.c | 77 2 files changed, 48 insertions(+), 33 deletions(-) diff --git a/arch/powerpc/include/asm/fadump-internal.h b/arch/powerpc/include/asm/fadump-internal.h index c814a2b..8d61c8f 100644 --- a/arch/powerpc/include/asm/fadump-internal.h +++ b/arch/powerpc/include/asm/fadump-internal.h @@ -64,12 +64,14 @@ struct fadump_memory_range { }; /* fadump memory ranges info */ +#define RNG_NAME_SZ16 struct fadump_mrange_info { - charname[16]; + charname[RNG_NAME_SZ]; struct fadump_memory_range *mem_ranges; u32 mem_ranges_sz; u32 mem_range_cnt; u32 max_mem_ranges; + boolis_static; }; /* Platform specific callback functions */ diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 59e60a9..679277b 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -40,8 +40,17 @@ struct kobject *fadump_kobj; #ifndef CONFIG_PRESERVE_FA_DUMP static DEFINE_MUTEX(fadump_mutex); -struct fadump_mrange_info crash_mrange_info = { "crash", NULL, 0, 0, 0 }; -struct fadump_mrange_info reserved_mrange_info = { "reserved", NULL, 0, 0, 0 }; +struct fadump_mrange_info crash_mrange_info = { "crash", NULL, 0, 0, 0, false }; + +#define RESERVED_RNGS_SZ 16384 /* 16K - 128 entries */ +#define RESERVED_RNGS_CNT 
(RESERVED_RNGS_SZ / \ +sizeof(struct fadump_memory_range)) +static struct fadump_memory_range rngs[RESERVED_RNGS_CNT]; +struct fadump_mrange_info reserved_mrange_info = { "reserved", rngs, + RESERVED_RNGS_SZ, 0, + RESERVED_RNGS_CNT, true }; + +static void __init early_init_dt_scan_reserved_ranges(unsigned long node); #ifdef CONFIG_CMA static struct cma *fadump_cma; @@ -110,6 +119,11 @@ static int __init fadump_cma_init(void) { return 1; } int __init early_init_dt_scan_fw_dump(unsigned long node, const char *uname, int depth, void *data) { + if (depth == 0) { + early_init_dt_scan_reserved_ranges(node); + return 0; + } + if (depth != 1) return 0; @@ -728,10 +742,14 @@ void fadump_free_cpu_notes_buf(void) static void fadump_free_mem_ranges(struct fadump_mrange_info *mrange_info) { + if (mrange_info->is_static) { + mrange_info->mem_range_cnt = 0; + return; + } + kfree(mrange_info->mem_ranges); - mrange_info->mem_ranges = NULL; - mrange_info->mem_ranges_sz = 0; - mrange_info->max_mem_ranges = 0; + memset((void *)((u64)mrange_info + RNG_NAME_SZ), 0, + (sizeof(struct fadump_mrange_info) - RNG_NAME_SZ)); } /* @@ -788,6 +806,12 @@ static inline int fadump_add_mem_range(struct fadump_mrange_info *mrange_info, if (mrange_info->mem_range_cnt == mrange_info->max_mem_ranges) { int ret; + if (mrange_info->is_static) { + pr_err("Reached array size limit for %s memory ranges\n", + mrange_info->name); + return -ENOSPC; + } + ret = fadump_alloc_mem_ranges(mrange_info); if (ret) return ret; @@ -1204,20 +1228,19 @@ static void sort_and_merge_mem_ranges(struct fadump_mrange_info *mrange_info) * Scan reserved-ranges to consider them while reserving/releasing * memory for FADump. */ -static inline int fadump_scan_reserved_mem_ranges(void) +static void __init early_init_dt_scan_reserved_ranges(unsigned long node) { - struct device_node *root; const __be32 *prop; int len, ret = -1; unsigned long i; - root
[PATCH v3 2/2] powerpc/fadump: consider reserved ranges while reserving memory
Commit 0962e8004e97 ("powerpc/prom: Scan reserved-ranges node for memory reservations") enabled support to parse reserved-ranges DT node and reserve kernel memory falling in these ranges for F/W purposes. Memory reserved for FADump should not overlap with these ranges as it could corrupt memory meant for F/W or crash'ed kernel memory to be exported as vmcore. But since commit 579ca1a27675 ("powerpc/fadump: make use of memblock's bottom up allocation mode"), memblock_find_in_range() is being used to find the appropriate area to reserve memory for FADump, which can't account for reserved-ranges as these ranges are reserved only after FADump memory reservation. With reserved-ranges now being populated during early boot, look out for these memory ranges while reserving memory for FADump. Without this change, MPIPL on PowerNV systems aborts with hostboot failure, when memory reserved for FADump is less than 4096MB. Fixes: 579ca1a27675 ("powerpc/fadump: make use of memblock's bottom up allocation mode") Cc: sta...@vger.kernel.org Signed-off-by: Hari Bathini Reviewed-by: Mahesh Salgaonkar --- Changes in v3: * Updated fadump_locate_reserve_mem() to use return '0' instead of an out parameter as suggested by Mahesh and added his 'Reviewed-by' tag with that change. arch/powerpc/kernel/fadump.c | 76 +- 1 file changed, 67 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 679277b..63aac8b 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -445,10 +445,72 @@ static int __init fadump_get_boot_mem_regions(void) return ret; } +/* + * Returns true, if the given range overlaps with reserved memory ranges + * starting at idx. Also, updates idx to index of overlapping memory range + * with the given memory range. + * False, otherwise. 
+ */ +static bool overlaps_reserved_ranges(u64 base, u64 end, int *idx) +{ + bool ret = false; + int i; + + for (i = *idx; i < reserved_mrange_info.mem_range_cnt; i++) { + u64 rbase = reserved_mrange_info.mem_ranges[i].base; + u64 rend = rbase + reserved_mrange_info.mem_ranges[i].size; + + if (end <= rbase) + break; + + if ((end > rbase) && (base < rend)) { + *idx = i; + ret = true; + break; + } + } + + return ret; +} + +/* + * Locate a suitable memory area to reserve memory for FADump. While at it, + * lookup reserved-ranges & avoid overlap with them, as they are used by F/W. + */ +static u64 __init fadump_locate_reserve_mem(u64 base, u64 size) +{ + struct fadump_memory_range *mrngs; + phys_addr_t mstart, mend; + int idx = 0; + u64 i, ret = 0; + + mrngs = reserved_mrange_info.mem_ranges; + for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, + &mstart, &mend, NULL) { + pr_debug("%llu) mstart: %llx, mend: %llx, base: %llx\n", +i, mstart, mend, base); + + if (mstart > base) + base = PAGE_ALIGN(mstart); + + while ((mend > base) && ((mend - base) >= size)) { + if (!overlaps_reserved_ranges(base, base+size, &idx)) { + ret = base; + goto out; + } + + base = mrngs[idx].base + mrngs[idx].size; + base = PAGE_ALIGN(base); + } + } + +out: + return ret; +} + int __init fadump_reserve_mem(void) { - u64 base, size, mem_boundary, bootmem_min, align = PAGE_SIZE; - bool is_memblock_bottom_up = memblock_bottom_up(); + u64 base, size, mem_boundary, bootmem_min; int ret = 1; if (!fw_dump.fadump_enabled) @@ -469,9 +531,9 @@ int __init fadump_reserve_mem(void) PAGE_ALIGN(fadump_calculate_reserve_size()); #ifdef CONFIG_CMA if (!fw_dump.nocma) { - align = FADUMP_CMA_ALIGNMENT; fw_dump.boot_memory_size = - ALIGN(fw_dump.boot_memory_size, align); + ALIGN(fw_dump.boot_memory_size, + FADUMP_CMA_ALIGNMENT); } #endif @@ -539,11 +601,7 @@ int __init fadump_reserve_mem(void) * Reserve memory at an offset closer to bottom of the RAM to * minimize the impact of memory hot-remove operation. 
*/ - memblock_set_bottom_up(true); - base = memblock_find_in_range(base, mem_boundary, size, align); - -
Re: [PATCH v2 2/2] powerpc/fadump: merge adjacent memory ranges to reduce PT_LOAD segments
On Wednesday 08 August 2018 02:38 PM, Mahesh Jagannath Salgaonkar wrote: On 08/07/2018 02:12 AM, Hari Bathini wrote: With dynamic memory allocation support for crash memory ranges array, there is no hard limit on the no. of crash memory ranges kernel could export, but program headers count could overflow in the /proc/vmcore ELF file while exporting each memory range as PT_LOAD segment. Reduce the likelihood of a such scenario, by folding adjacent crash memory ranges which minimizes the total number of PT_LOAD segments. Signed-off-by: Hari Bathini --- arch/powerpc/kernel/fadump.c | 45 ++ 1 file changed, 36 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 2ec5704..cd0c555 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -908,22 +908,41 @@ static int allocate_crash_memory_ranges(void) static inline int fadump_add_crash_memory(unsigned long long base, unsigned long long end) { + u64 start, size; + bool is_adjacent = false; + if (base == end) return 0; - if (crash_mem_ranges == max_crash_mem_ranges) { - int ret; + /* +* Fold adjacent memory ranges to bring down the memory ranges/ +* PT_LOAD segments count. 
+*/ + if (crash_mem_ranges) { + start = crash_memory_ranges[crash_mem_ranges-1].base; + size = crash_memory_ranges[crash_mem_ranges-1].size; - ret = allocate_crash_memory_ranges(); - if (ret) - return ret; + if ((start + size) == base) + is_adjacent = true; + } + if (!is_adjacent) { + /* resize the array on reaching the limit */ + if (crash_mem_ranges == max_crash_mem_ranges) { + int ret; + + ret = allocate_crash_memory_ranges(); + if (ret) + return ret; + } + + start = base; + crash_memory_ranges[crash_mem_ranges].base = start; + crash_mem_ranges++; } + crash_memory_ranges[crash_mem_ranges-1].size = (end - start); pr_debug("crash_memory_range[%d] [%#016llx-%#016llx], %#llx bytes\n", - crash_mem_ranges, base, end - 1, (end - base)); - crash_memory_ranges[crash_mem_ranges].base = base; - crash_memory_ranges[crash_mem_ranges].size = end - base; - crash_mem_ranges++; + (crash_mem_ranges - 1), start, end - 1, (end - start)); return 0; } @@ -999,6 +1018,14 @@ static int fadump_setup_crash_memory_ranges(void) pr_debug("Setup crash memory ranges.\n"); crash_mem_ranges = 0; + + /* allocate memory for crash memory ranges for the first time */ + if (!max_crash_mem_ranges) { + ret = allocate_crash_memory_ranges(); + if (ret) + return ret; + } + I see that the check for (!is_adjacent) in first hunk already handles the first time allocation. Do we need this ? Right. This hunk in fadump_setup_crash_memory_ranges() is unnecessary. Can be dropped. Also, I missed out on adding "#include ". Though it compiles fine with upstream kernel, will add and post v3 just to be safe.. Rest looks fine to me. Reviewed-by: Mahesh Salgaonkar Thanks for the review - Hari
[PATCH v3 1/2] powerpc/fadump: handle crash memory ranges array index overflow
Crash memory ranges is an array of memory ranges of the crashing kernel to be exported as a dump via /proc/vmcore file. The size of the array is set based on INIT_MEMBLOCK_REGIONS, which works alright in most cases where memblock memory regions count is less than INIT_MEMBLOCK_REGIONS value. But this count can grow beyond INIT_MEMBLOCK_REGIONS value since commit 142b45a72e22 ("memblock: Add array resizing support"). On large memory systems with a few DLPAR operations, the memblock memory regions count could be larger than INIT_MEMBLOCK_REGIONS value. On such systems, registering fadump results in crash or other system failures like below: task: c7f39a290010 ti: cb738000 task.ti: cb738000 NIP: c0047df4 LR: c00f9e58 CTR: c010f180 REGS: cb73b570 TRAP: 0300 Tainted: G L X (4.4.140+) MSR: 80009033 CR: 22004484 XER: 2000 CFAR: c0008500 DAR: 07a45000 DSISR: 4000 SOFTE: 0 GPR00: c00f9e58 cb73b7f0 c0f09a00 001a GPR04: c7f3bf774c90 0004 c0eb9a00 0800 GPR08: 0804 07a45000 c0fa9a00 c7ffb169ca20 GPR12: 22004482 cfa12c00 c7f3a0ea97a8 GPR16: c7f3a0ea9a50 cb73bd60 0118 0001fe80 GPR20: 0118 c0b8c980 00d0 GPR24: 07ffb0b1 c7ffb169c980 c0b8c980 GPR28: 0004 c7ffb169c980 001a c7ffb169c980 NIP [c0047df4] smp_send_reschedule+0x24/0x80 LR [c00f9e58] resched_curr+0x138/0x160 Call Trace: [cb73b7f0] [c00f9e58] resched_curr+0x138/0x160 (unreliable) [cb73b820] [c00fb538] check_preempt_curr+0xc8/0xf0 [cb73b850] [c00fb598] ttwu_do_wakeup+0x38/0x150 [cb73b890] [c00fc9c4] try_to_wake_up+0x224/0x4d0 [cb73b900] [c011ef34] __wake_up_common+0x94/0x100 [cb73b960] [c034a78c] ep_poll_callback+0xac/0x1c0 [cb73b9b0] [c011ef34] __wake_up_common+0x94/0x100 [cb73ba10] [c011f810] __wake_up_sync_key+0x70/0xa0 [cb73ba60] [c067c3e8] sock_def_readable+0x58/0xa0 [cb73ba90] [c07848ac] unix_stream_sendmsg+0x2dc/0x4c0 [cb73bb70] [c0675a38] sock_sendmsg+0x68/0xa0 [cb73bba0] [c067673c] ___sys_sendmsg+0x2cc/0x2e0 [cb73bd30] [c0677dbc] __sys_sendmsg+0x5c/0xc0 [cb73bdd0] [c06789bc] SyS_socketcall+0x36c/0x3f0 [cb73be30] 
[c0009488] system_call+0x3c/0x100 Instruction dump: 4e800020 6000 6042 3c4c00ec 38421c30 7c0802a6 f8010010 6000 3d42000a e92ab420 2fa9 4dde0020 2fa9 419e0044 7c0802a6 ---[ end trace a6d1dd4bab5f8253 ]--- as array index overflow is not checked for while setting up crash memory ranges causing memory corruption. To resolve this issue, dynamically allocate memory for crash memory ranges and resize it incrementally, in units of pagesize, on hitting array size limit. Fixes: 2df173d9e85d ("fadump: Initialize elfcore header and add PT_LOAD program headers.") Cc: sta...@vger.kernel.org Cc: Mahesh Salgaonkar Signed-off-by: Hari Bathini Reviewed-by: Mahesh Salgaonkar --- Changes in v3: * Included for krelloc() arch/powerpc/include/asm/fadump.h |4 +- arch/powerpc/kernel/fadump.c | 92 +++-- 2 files changed, 80 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h index 5a23010..3abc738 100644 --- a/arch/powerpc/include/asm/fadump.h +++ b/arch/powerpc/include/asm/fadump.h @@ -195,8 +195,8 @@ struct fadump_crash_info_header { struct cpumask online_mask; }; -/* Crash memory ranges */ -#define INIT_CRASHMEM_RANGES (INIT_MEMBLOCK_REGIONS + 2) +/* Crash memory ranges size unit (pagesize) */ +#define CRASHMEM_RANGES_ALLOC_SIZE PAGE_SIZE struct fad_crash_memory_ranges { unsigned long long base; diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 07e8396..9f80a78 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -47,8 +48,10 @@ static struct fadump_mem_struct fdm; static const struct fadump_mem_struct *fdm_active; static DEFINE_MUTEX(fadump_mutex); -struct fad_crash_memory_ranges crash_memory_ranges[INIT_CRASHMEM_RANGES]; +struct fad_crash_memory_ranges *crash_memory_ranges; +int crash_memory_ranges_size; int crash_mem_ranges; +int max_crash_mem_ranges; /* Scan the Firmware Assisted dump 
configuration details. */ int __init early_init_dt_scan_fw_dump(unsigned long node, @@ -868,22 +871,67 @@ static int __init process
[PATCH v3 2/2] powerpc/fadump: merge adjacent memory ranges to reduce PT_LOAD segments
With dynamic memory allocation support for crash memory ranges array, there is no hard limit on the no. of crash memory ranges kernel could export, but program headers count could overflow in the /proc/vmcore ELF file while exporting each memory range as PT_LOAD segment. Reduce the likelihood of a such scenario, by folding adjacent crash memory ranges which minimizes the total number of PT_LOAD segments. Signed-off-by: Hari Bathini Reviewed-by: Mahesh Salgaonkar --- Changes in v3: * Dropped unnecessary memory allocation hunk in fadump_setup_crash_memory_ranges() arch/powerpc/kernel/fadump.c | 37 - 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 9f80a78..5436600c 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -909,22 +909,41 @@ static int allocate_crash_memory_ranges(void) static inline int fadump_add_crash_memory(unsigned long long base, unsigned long long end) { + u64 start, size; + bool is_adjacent = false; + if (base == end) return 0; - if (crash_mem_ranges == max_crash_mem_ranges) { - int ret; + /* +* Fold adjacent memory ranges to bring down the memory ranges/ +* PT_LOAD segments count. 
+*/ + if (crash_mem_ranges) { + start = crash_memory_ranges[crash_mem_ranges-1].base; + size = crash_memory_ranges[crash_mem_ranges-1].size; - ret = allocate_crash_memory_ranges(); - if (ret) - return ret; + if ((start + size) == base) + is_adjacent = true; + } + if (!is_adjacent) { + /* resize the array on reaching the limit */ + if (crash_mem_ranges == max_crash_mem_ranges) { + int ret; + + ret = allocate_crash_memory_ranges(); + if (ret) + return ret; + } + + start = base; + crash_memory_ranges[crash_mem_ranges].base = start; + crash_mem_ranges++; } + crash_memory_ranges[crash_mem_ranges-1].size = (end - start); pr_debug("crash_memory_range[%d] [%#016llx-%#016llx], %#llx bytes\n", - crash_mem_ranges, base, end - 1, (end - base)); - crash_memory_ranges[crash_mem_ranges].base = base; - crash_memory_ranges[crash_mem_ranges].size = end - base; - crash_mem_ranges++; + (crash_mem_ranges - 1), start, end - 1, (end - start)); return 0; }
[PATCH] powerpc/fadump: cleanup crash memory ranges support
Commit 1bd6a1c4b80a ("powerpc/fadump: handle crash memory ranges array index overflow") changed crash memory ranges to a dynamic array that is reallocated on-demand with krealloc(). The relevant header for this call was not included. The kernel compiles though. But be cautious and add the header anyway. Also, memory allocation logic in fadump_add_crash_memory() takes care of memory allocation for crash memory ranges in all scenarios. Drop unnecessary memory allocation in fadump_setup_crash_memory_ranges(). Fixes: 1bd6a1c4b80a ("powerpc/fadump: handle crash memory ranges array index overflow") Cc: Mahesh Salgaonkar Signed-off-by: Hari Bathini --- * Actually posted a V3 with these changes but V2 made it! - https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=59839 arch/powerpc/kernel/fadump.c |8 +--- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 986ec47..a711d22 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -1019,13 +1020,6 @@ static int fadump_setup_crash_memory_ranges(void) pr_debug("Setup crash memory ranges.\n"); crash_mem_ranges = 0; - /* allocate memory for crash memory ranges for the first time */ - if (!max_crash_mem_ranges) { - ret = allocate_crash_memory_ranges(); - if (ret) - return ret; - } - /* * add the first memory chunk (RMA_START through boot_memory_size) as * a separate memory chunk. The reason is, at the time crash firmware
[PATCH] powerpc/fadump: re-register firmware-assisted dump if already registered
Firmware-Assisted Dump (FADump) needs to be registered again after any memory hot add/remove operation to update the crash memory ranges. But currently, the kernel returns '-EEXIST' if we try to register without unregistering it first. This could expose the system to racing issues while unregistering and registering FADump from userspace during udev events. Spare the userspace of this and let it be taken care of in the kernel space for a simpler interface. Since this change, running 'echo 1 > /sys/kernel/fadump_registered' would result in re-registering (unregistering and registering) FADump, if it was already registered. Signed-off-by: Hari Bathini --- arch/powerpc/kernel/fadump.c |4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index a711d22..761b28b 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -1444,8 +1444,8 @@ static ssize_t fadump_register_store(struct kobject *kobj, break; case 1: if (fw_dump.dump_registered == 1) { - ret = -EEXIST; - goto unlock_out; + /* Un-register Firmware-assisted dump */ + fadump_unregister_dump(&fdm); } /* Register Firmware-assisted dump */ ret = register_fadump();
Re: [PATCH] powerpc/fadump: re-register firmware-assisted dump if already registered
On Friday 14 September 2018 07:58 PM, Petr Tesarik wrote: On Fri, 14 Sep 2018 19:36:02 +0530 Hari Bathini wrote: Firmware-Assisted Dump (FADump) needs to be registered again after any memory hot add/remove operation to update the crash memory ranges. But currently, the kernel returns '-EEXIST' if we try to register without unregistering it first. This could expose the system to racing issues while unregistering and registering FADump from userspace during udev events. Spare the userspace of this and let it be taken care of in the kernel space for a simpler interface. Since this change, running 'echo 1 > /sys/kernel/fadump_registered' would result in re-registering (unregistering and registering) FADump, if it was already registered. Great improvement to the API! Any suggestions what should be done in a client which tries to be compatible with kernels before this change and after this change? If `echo 1 > /sys/kernel/fadump_registered` fails, check for the output of `cat /sys/kernel/fadump_registered` and if it is still `1`, that indicates old kernel and we are already registered. Treat it as success if being registered is what we care about or unregister/register (if re-register is the intention). Hope that helps. Thanks Hari
Re: [PATCH] powerpc/numa: Skip onlining a offline node in kdump path
el) boots properly. Unlike regular kernels, which mark all available nodes as online, kdump kernel only marks just enough nodes as online and marks the rest as offline at boot. However kdump kernel boots with all available CPUs. With Commit 2ea626306810 ("powerpc/topology: Get topology for shared processors at boot"), all CPUs are onlined on their respective nodes at boot time. try_online_node() tries to online the offline nodes but fails as all needed subsystems are not yet initialized. As part of the fix, detect and skip early onlining of an offline node. Fixes: 2ea626306810 ("powerpc/topology: Get topology for shared processors at boot") Reported-by: Pavithra Prakash Signed-off-by: Srikar Dronamraju Tested-by: Hari Bathini --- arch/powerpc/mm/numa.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index e94148a1d7e4..d88139acdfe6 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -1217,9 +1217,10 @@ int find_and_online_cpu_nid(int cpu) * Need to ensure that NODE_DATA is initialized for a node from * available memory (see memblock_alloc_try_nid). If unable to * init the node, then default to nearest node that has memory -* installed. +* installed. Skip onlining a node if the subsystems are not +* yet initialized. */ - if (try_online_node(new_nid)) + if (!topology_inited || try_online_node(new_nid)) new_nid = first_online_node; #else /*
[PATCH v3 00/16] Add FADump support on PowerNV platform
Firmware-Assisted Dump (FADump) is currently supported only on pseries platform. This patch series adds support for powernv platform too. The first and third patches refactor the FADump code to make use of common code across multiple platforms. The fifth patch adds basic FADump support for powernv platform. Patches seven & eight honour reserved-ranges DT node while reserving/releasing memory used by FADump. The next patch processes CPU state data provided by firmware to create and append core notes to the ELF core file. The tenth patch adds support for preserving crash data for subsequent boots (useful in cases like petitboot). Patch twelve provides support to export opalcore. This is to make debugging of failures in OPAL code easier. The subsequent patch ensures vmcore processing is skipped when only OPAL core is exported by f/w. The next patch provides option to release the kernel memory used to export opalcore. The remaining patches update Firmware-Assisted Dump documentation appropriately. The patch series is tested with the latest firmware plus the below skiboot changes for MPIPL support: https://patchwork.ozlabs.org/project/skiboot/list/?series=114104 ("MPIPL support") Changes in v3: * Rebased to latest upstream kernel version. * Updated according to latest OPAL changes. * Using metadata tags instead of structs between kernel & OPAL. 
* Exporting OPAL core as /sys/firmware/opal/core (not /proc/opalcore) --- Hari Bathini (16): powerpc/fadump: move internal fadump code to a new file powerpc/fadump: Improve fadump documentation pseries/fadump: move out platform specific support from generic code powerpc/fadump: use FADump instead of fadump for how it is pronounced powerpc/fadump: enable fadump support on OPAL based POWER platform powerpc/fadump: Update documentation about OPAL platform support powerpc/fadump: consider reserved ranges while reserving memory powerpc/fadump: consider reserved ranges while releasing memory powernv/fadump: process architected register state data provided by firmware powernv/fadump: add support to preserve crash data on FADUMP disabled kernel powerpc/fadump: update documentation about CONFIG_PRESERVE_FA_DUMP powerpc/powernv: export /sys/firmware/opal/core for analysing opal crashes powernv/fadump: Skip processing /proc/vmcore when only OPAL core exists powernv/opalcore: provide an option to invalidate /sys/firmware/opal/core file powernv/fadump: consider f/w load area powernv/fadump: update documentation about option to release opalcore Documentation/powerpc/firmware-assisted-dump.txt | 193 ++-- arch/powerpc/Kconfig | 23 arch/powerpc/include/asm/fadump.h| 190 arch/powerpc/include/asm/opal-api.h | 89 ++ arch/powerpc/include/asm/opal.h |4 arch/powerpc/kernel/Makefile |6 arch/powerpc/kernel/fadump-common.c | 196 arch/powerpc/kernel/fadump-common.h | 203 arch/powerpc/kernel/fadump.c | 1183 +- arch/powerpc/kernel/prom.c |4 arch/powerpc/platforms/powernv/Makefile |3 arch/powerpc/platforms/powernv/opal-call.c |2 arch/powerpc/platforms/powernv/opal-core.c | 634 arch/powerpc/platforms/powernv/opal-fadump.c | 661 arch/powerpc/platforms/powernv/opal-fadump.h | 117 ++ arch/powerpc/platforms/pseries/Makefile |1 arch/powerpc/platforms/pseries/rtas-fadump.c | 557 ++ arch/powerpc/platforms/pseries/rtas-fadump.h | 106 ++ 18 files changed, 3187 insertions(+), 985 deletions(-) create mode 
100644 arch/powerpc/kernel/fadump-common.c create mode 100644 arch/powerpc/kernel/fadump-common.h create mode 100644 arch/powerpc/platforms/powernv/opal-core.c create mode 100644 arch/powerpc/platforms/powernv/opal-fadump.c create mode 100644 arch/powerpc/platforms/powernv/opal-fadump.h create mode 100644 arch/powerpc/platforms/pseries/rtas-fadump.c create mode 100644 arch/powerpc/platforms/pseries/rtas-fadump.h
[PATCH v3 01/16] powerpc/fadump: move internal fadump code to a new file
Refactoring fadump code means internal fadump code is referenced from different places. For ease, move internal code to a new file. Signed-off-by: Hari Bathini --- arch/powerpc/include/asm/fadump.h | 112 arch/powerpc/kernel/Makefile|2 arch/powerpc/kernel/fadump-common.c | 184 + arch/powerpc/kernel/fadump-common.h | 126 +++ arch/powerpc/kernel/fadump.c| 194 ++- 5 files changed, 324 insertions(+), 294 deletions(-) create mode 100644 arch/powerpc/kernel/fadump-common.c create mode 100644 arch/powerpc/kernel/fadump-common.h diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h index 17d9b6a..a2d2533 100644 --- a/arch/powerpc/include/asm/fadump.h +++ b/arch/powerpc/include/asm/fadump.h @@ -11,34 +11,6 @@ #ifdef CONFIG_FA_DUMP -/* - * The RMA region will be saved for later dumping when kernel crashes. - * RMA is Real Mode Area, the first block of logical memory address owned - * by logical partition, containing the storage that may be accessed with - * translate off. - */ -#define RMA_START 0x0 -#define RMA_END(ppc64_rma_size) - -/* - * On some Power systems where RMO is 128MB, it still requires minimum of - * 256MB for kernel to boot successfully. When kdump infrastructure is - * configured to save vmcore over network, we run into OOM issue while - * loading modules related to network setup. Hence we need aditional 64M - * of memory to avoid OOM issue. - */ -#define MIN_BOOT_MEM (((RMA_END < (0x1UL << 28)) ? (0x1UL << 28) : RMA_END) \ - + (0x1UL << 26)) - -/* The upper limit percentage for user specified boot memory size (25%) */ -#define MAX_BOOT_MEM_RATIO 4 - -#define memblock_num_regions(memblock_type)(memblock.memblock_type.cnt) - -/* Alignement per CMA requirement. 
*/ -#define FADUMP_CMA_ALIGNMENT (PAGE_SIZE << \ - max_t(unsigned long, MAX_ORDER - 1, pageblock_order)) - /* Firmware provided dump sections */ #define FADUMP_CPU_STATE_DATA 0x0001 #define FADUMP_HPTE_REGION 0x0002 @@ -47,18 +19,9 @@ /* Dump request flag */ #define FADUMP_REQUEST_FLAG0x0001 -/* FAD commands */ -#define FADUMP_REGISTER1 -#define FADUMP_UNREGISTER 2 -#define FADUMP_INVALIDATE 3 - /* Dump status flag */ #define FADUMP_ERROR_FLAG 0x2000 -#define FADUMP_CPU_ID_MASK ((1UL << 32) - 1) - -#define CPU_UNKNOWN(~((u32)0)) - /* Utility macros */ #define SKIP_TO_NEXT_CPU(reg_entry)\ ({ \ @@ -112,59 +75,8 @@ struct fadump_mem_struct { struct fadump_section rmr_region; }; -/* Firmware-assisted dump configuration details. */ -struct fw_dump { - unsigned long cpu_state_data_size; - unsigned long hpte_region_size; - unsigned long boot_memory_size; - unsigned long reserve_dump_area_start; - unsigned long reserve_dump_area_size; - /* cmd line option during boot */ - unsigned long reserve_bootvar; - - unsigned long fadumphdr_addr; - unsigned long cpu_notes_buf; - unsigned long cpu_notes_buf_size; - - int ibm_configure_kernel_dump; - - unsigned long fadump_enabled:1; - unsigned long fadump_supported:1; - unsigned long dump_active:1; - unsigned long dump_registered:1; - unsigned long nocma:1; -}; - -/* - * Copy the ascii values for first 8 characters from a string into u64 - * variable at their respective indexes. - * e.g. - * The string "FADMPINF" will be converted into 0x4641444d50494e46 - */ -static inline u64 str_to_u64(const char *str) -{ - u64 val = 0; - int i; - - for (i = 0; i < sizeof(val); i++) - val = (*str) ? (val << 8) | *str++ : val << 8; - return val; -} -#define STR_TO_HEX(x) str_to_u64(x) -#define REG_ID(x) str_to_u64(x) - -#define FADUMP_CRASH_INFO_MAGICSTR_TO_HEX("FADMPINF") #define REGSAVE_AREA_MAGIC STR_TO_HEX("REGSAVE") -/* The firmware-assisted dump format. 
- * - * The register save area is an area in the partition's memory used to preserve - * the register contents (CPU state data) for the active CPUs during a firmware - * assisted dump. The dump format contains register save area header followed - * by register entries. Each list of registers for a CPU starts with - * "CPUSTRT" and ends with "CPUEND". - */ - /* Register save area header. */ struct fadump_reg_save_area_header { __be64 magic_number; @@ -172,29 +84,9 @@ struct fadump_reg_save_area_header { __be32 num_cpu_offset; }; -/* Register
[PATCH v3 02/16] powerpc/fadump: Improve fadump documentation
The figures depicting FADump's (Firmware-Assisted Dump) memory layout are missing some finer details like different memory regions and what they represent. Improve the documentation by updating those details. Signed-off-by: Hari Bathini --- Documentation/powerpc/firmware-assisted-dump.txt | 65 -- 1 file changed, 35 insertions(+), 30 deletions(-) diff --git a/Documentation/powerpc/firmware-assisted-dump.txt b/Documentation/powerpc/firmware-assisted-dump.txt index 18c5fee..059993b 100644 --- a/Documentation/powerpc/firmware-assisted-dump.txt +++ b/Documentation/powerpc/firmware-assisted-dump.txt @@ -74,8 +74,9 @@ as follows: there is crash data available from a previous boot. During the early boot OS will reserve rest of the memory above boot memory size effectively booting with restricted memory - size. This will make sure that the second kernel will not - touch any of the dump memory area. + size. This will make sure that this kernel (also, referred + to as second kernel or capture kernel) will not touch any + of the dump memory area. -- User-space tools will read /proc/vmcore to obtain the contents of memory, which holds the previous crashed kernel dump in ELF @@ -125,48 +126,52 @@ space memory except the user pages that were present in CMA region. 
o Memory Reservation during first kernel - Low memory Top of memory - 0 boot memory size | - | ||<--Reserved dump area -->| | - V V| Permanent Reservation | V - +---+--/ /---+---++---++--+ - | ||CPU|HPTE| DUMP |ELF | | - +---+--/ /---+---++---++--+ -| ^ -| | -\ / - --- - Boot memory content gets transferred to - reserved area by firmware at the time of - crash + Low memoryTop of memory + 0 boot memory size |<--Reserved dump area --->| | + | || Permanent Reservation | | + V V| (Preserve area)| V + +---+--/ /---+---+++---++--+ + | ||CPU|HPTE| DUMP |HDR|ELF | | + +---+--/ /---+---+++---++--+ +| ^ ^ +| | | +\ / | + --- FADump Header + Boot memory content gets transferred (meta area) + to reserved area by firmware at the + time of crash + Fig. 1 + o Memory Reservation during second kernel after crash - Low memoryTop of memory - 0 boot memory size | - | |<- Reserved dump area --- -->| - V V V - +---+--/ /---+---++---++--+ - | ||CPU|HPTE| DUMP |ELF | | - +---+--/ /---+---++---++--+ + Low memoryTop of memory + 0 boot memory size| + | |<- Reserved dump area --->| + V V|< Preserve area ->| V + +---+--/ /---+---+++---++--+ + | ||CPU|HPTE| DUMP |HDR|ELF | | + +---+--/ /---+---+++---++--+ | | V V Used by second/proc/vmcore kernel to boot Fig. 2 -Currently the dump will be copied from /proc/vmcore to a -a new file upon user intervention. The dump data available through -/proc/vmcore will be in ELF format. Hence the existing kdump -infrastructure (kdump scripts) to save the dump works fine with -minor modifications. +Currently the dump will be copied from /proc/vmcore to a new file upon +user intervention. The dump data available through /proc/vmcore will be +in ELF format. Hence the existing kdump infrastructure (kdump scripts) +to save the dump works fine with minor modifications. KDump scripts on +major Distro releases have already been modified to work seemlessly (no +user intervention in s
[PATCH v3 03/16] pseries/fadump: move out platform specific support from generic code
Introduce callbacks for platform specific operations like register, unregister, invalidate & such, and move pseries specific code into platform code. Signed-off-by: Hari Bathini --- arch/powerpc/include/asm/fadump.h| 75 arch/powerpc/kernel/fadump-common.h | 38 ++ arch/powerpc/kernel/fadump.c | 500 ++--- arch/powerpc/platforms/pseries/Makefile |1 arch/powerpc/platforms/pseries/rtas-fadump.c | 529 ++ arch/powerpc/platforms/pseries/rtas-fadump.h | 96 + 6 files changed, 700 insertions(+), 539 deletions(-) create mode 100644 arch/powerpc/platforms/pseries/rtas-fadump.c create mode 100644 arch/powerpc/platforms/pseries/rtas-fadump.h diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h index a2d2533..9a7652c 100644 --- a/arch/powerpc/include/asm/fadump.h +++ b/arch/powerpc/include/asm/fadump.h @@ -11,79 +11,8 @@ #ifdef CONFIG_FA_DUMP -/* Firmware provided dump sections */ -#define FADUMP_CPU_STATE_DATA 0x0001 -#define FADUMP_HPTE_REGION 0x0002 -#define FADUMP_REAL_MODE_REGION0x0011 - -/* Dump request flag */ -#define FADUMP_REQUEST_FLAG0x0001 - -/* Dump status flag */ -#define FADUMP_ERROR_FLAG 0x2000 - -/* Utility macros */ -#define SKIP_TO_NEXT_CPU(reg_entry)\ -({ \ - while (be64_to_cpu(reg_entry->reg_id) != REG_ID("CPUEND")) \ - reg_entry++;\ - reg_entry++;\ -}) - extern int crashing_cpu; -/* Kernel Dump section info */ -struct fadump_section { - __be32 request_flag; - __be16 source_data_type; - __be16 error_flags; - __be64 source_address; - __be64 source_len; - __be64 bytes_dumped; - __be64 destination_address; -}; - -/* ibm,configure-kernel-dump header. */ -struct fadump_section_header { - __be32 dump_format_version; - __be16 dump_num_sections; - __be16 dump_status_flag; - __be32 offset_first_dump_section; - - /* Fields for disk dump option. */ - __be32 dd_block_size; - __be64 dd_block_offset; - __be64 dd_num_blocks; - __be32 dd_offset_disk_path; - - /* Maximum time allowed to prevent an automatic dump-reboot. 
*/ - __be32 max_time_auto; -}; - -/* - * Firmware Assisted dump memory structure. This structure is required for - * registering future kernel dump with power firmware through rtas call. - * - * No disk dump option. Hence disk dump path string section is not included. - */ -struct fadump_mem_struct { - struct fadump_section_headerheader; - - /* Kernel dump sections */ - struct fadump_section cpu_state_data; - struct fadump_section hpte_region; - struct fadump_section rmr_region; -}; - -#define REGSAVE_AREA_MAGIC STR_TO_HEX("REGSAVE") - -/* Register save area header. */ -struct fadump_reg_save_area_header { - __be64 magic_number; - __be32 version; - __be32 num_cpu_offset; -}; - extern int is_fadump_memory_area(u64 addr, ulong size); extern int early_init_dt_scan_fw_dump(unsigned long node, const char *uname, int depth, void *data); @@ -99,5 +28,5 @@ static inline int is_fadump_active(void) { return 0; } static inline int should_fadump_crash(void) { return 0; } static inline void crash_fadump(struct pt_regs *regs, const char *str) { } static inline void fadump_cleanup(void) { } -#endif -#endif +#endif /* !CONFIG_FA_DUMP */ +#endif /* __PPC64_FA_DUMP_H__ */ diff --git a/arch/powerpc/kernel/fadump-common.h b/arch/powerpc/kernel/fadump-common.h index 8ccd96d..1eb1397 100644 --- a/arch/powerpc/kernel/fadump-common.h +++ b/arch/powerpc/kernel/fadump-common.h @@ -47,6 +47,12 @@ #define FADUMP_UNREGISTER 2 #define FADUMP_INVALIDATE 3 +/* Firmware-Assited Dump platforms */ +enum fadump_platform_type { + FADUMP_PLATFORM_UNKNOWN = 0, + FADUMP_PLATFORM_PSERIES, +}; + #define FADUMP_CPU_ID_MASK ((1UL << 32) - 1) #define CPU_UNKNOWN(~((u32)0)) @@ -91,6 +97,9 @@ struct fad_crash_memory_ranges { unsigned long long size; }; +/* Platform specific callback functions */ +struct fadump_ops; + /* Firmware-assisted dump configuration details. 
*/ struct fw_dump { unsigned long cpu_state_data_size; @@ -98,6 +107,7 @@ struct fw_dump { unsigned long boot_memory_size; unsigned long reserve_dump_area_start; unsigned long reserve_dump_area_size; + unsigned long preserv_area_start; /* cmd line option during boot */ unsigned long reserve_bootvar; @
[PATCH v3 04/16] powerpc/fadump: use FADump instead of fadump for how it is pronounced
Signed-off-by: Hari Bathini --- Documentation/powerpc/firmware-assisted-dump.txt | 56 +++--- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/Documentation/powerpc/firmware-assisted-dump.txt b/Documentation/powerpc/firmware-assisted-dump.txt index 059993b..62e75ef 100644 --- a/Documentation/powerpc/firmware-assisted-dump.txt +++ b/Documentation/powerpc/firmware-assisted-dump.txt @@ -8,18 +8,18 @@ a crashed system, and to do so from a fully-reset system, and to minimize the total elapsed time until the system is back in production use. -- Firmware assisted dump (fadump) infrastructure is intended to replace +- Firmware-Assisted Dump (FADump) infrastructure is intended to replace the existing phyp assisted dump. - Fadump uses the same firmware interfaces and memory reservation model as phyp assisted dump. -- Unlike phyp dump, fadump exports the memory dump through /proc/vmcore +- Unlike phyp dump, FADump exports the memory dump through /proc/vmcore in the ELF format in the same way as kdump. This helps us reuse the kdump infrastructure for dump capture and filtering. - Unlike phyp dump, userspace tool does not need to refer any sysfs interface while reading /proc/vmcore. -- Unlike phyp dump, fadump allows user to release all the memory reserved +- Unlike phyp dump, FADump allows user to release all the memory reserved for dump, with a single operation of echo 1 > /sys/kernel/fadump_release_mem. -- Once enabled through kernel boot parameter, fadump can be +- Once enabled through kernel boot parameter, FADump can be started/stopped through /sys/kernel/fadump_registered interface (see sysfs files section below) and can be easily integrated with kdump service start/stop init scripts. @@ -33,7 +33,7 @@ dump offers several strong, practical advantages: in a clean, consistent state. -- Once the dump is copied out, the memory that held the dump is immediately available to the running kernel. 
And therefore, - unlike kdump, fadump doesn't need a 2nd reboot to get back + unlike kdump, FADump doesn't need a 2nd reboot to get back the system to the production configuration. The above can only be accomplished by coordination with, @@ -61,7 +61,7 @@ as follows: boot successfully. For syntax of crashkernel= parameter, refer to Documentation/kdump/kdump.txt. If any offset is provided in crashkernel= parameter, it will be ignored - as fadump uses a predefined offset to reserve memory + as FADump uses a predefined offset to reserve memory for boot memory dump preservation in case of a crash. -- After the low memory (boot memory) area has been saved, the @@ -120,7 +120,7 @@ blocking this significant chunk of memory from production kernel. Hence, the implementation uses the Linux kernel's Contiguous Memory Allocator (CMA) for memory reservation if CMA is configured for kernel. With CMA reservation this memory will be available for applications to -use it, while kernel is prevented from using it. With this fadump will +use it, while kernel is prevented from using it. With this FADump will still be able to capture all of the kernel memory and most of the user space memory except the user pages that were present in CMA region. @@ -170,14 +170,14 @@ KDump, as dump mechanism. The tools to examine the dump will be same as the ones used for kdump. -How to enable firmware-assisted dump (fadump): +How to enable firmware-assisted dump (FADump): - 1. Set config option CONFIG_FA_DUMP=y and build kernel. -2. Boot into linux kernel with 'fadump=on' kernel cmdline option. - By default, fadump reserved memory will be initialized as CMA area. - Alternatively, user can boot linux kernel with 'fadump=nocma' to - prevent fadump to use CMA. +2. Boot into linux kernel with 'FADump=on' kernel cmdline option. + By default, FADump reserved memory will be initialized as CMA area. + Alternatively, user can boot linux kernel with 'FADump=nocma' to + prevent FADump to use CMA. 3. 
Optionally, user can also set 'crashkernel=' kernel cmdline to specify size of the memory to reserve for boot memory dump preservation. @@ -190,7 +190,7 @@ NOTE: 1. 'fadump_reserve_mem=' parameter has been deprecated. Instead option is set at kernel cmdline. 3. if user wants to capture all of user space memory and ok with reserved memory not available to production system, then - 'fadump=nocma' kernel parameter can be used to fallback to + 'FADump=nocma' kernel parameter can be used to fallback to old behaviour. Sysfs/debugfs files: @@ -203,29 +203,29 @@ Here is the list of files under kernel sysfs: /sys/kernel/fadump_enabled -This is used to display the fadump status. -0 = fadump is disa
[PATCH v3 05/16] powerpc/fadump: enable fadump support on OPAL based POWER platform
From: Hari Bathini Firmware-assisted dump support is enabled for OPAL based POWER platforms in P9 firmware. Make the corresponding updates in kernel to enable fadump support for such platforms. Signed-off-by: Hari Bathini --- arch/powerpc/Kconfig |5 arch/powerpc/include/asm/opal-api.h | 58 +++ arch/powerpc/include/asm/opal.h |4 arch/powerpc/kernel/fadump-common.c | 18 + arch/powerpc/kernel/fadump-common.h | 46 ++- arch/powerpc/kernel/fadump.c | 277 arch/powerpc/platforms/powernv/Makefile |1 arch/powerpc/platforms/powernv/opal-call.c |2 arch/powerpc/platforms/powernv/opal-fadump.c | 443 ++ arch/powerpc/platforms/powernv/opal-fadump.h | 34 ++ arch/powerpc/platforms/pseries/rtas-fadump.c | 38 ++ 11 files changed, 837 insertions(+), 89 deletions(-) create mode 100644 arch/powerpc/platforms/powernv/opal-fadump.c create mode 100644 arch/powerpc/platforms/powernv/opal-fadump.h diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 8c1c636..f124a9b 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -557,7 +557,7 @@ config CRASH_DUMP config FA_DUMP bool "Firmware-assisted dump" - depends on PPC64 && PPC_RTAS + depends on PPC64 && (PPC_RTAS || PPC_POWERNV) select CRASH_CORE select CRASH_DUMP help @@ -568,7 +568,8 @@ config FA_DUMP is meant to be a kdump replacement offering robustness and speed not possible without system firmware assistance. - If unsure, say "N" + If unsure, say "y". Only special kernels like petitboot may + need to say "N" here. 
config IRQ_ALL_CPUS bool "Distribute interrupts on all CPUs by default" diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 09a8553..1762b1e 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -208,7 +208,9 @@ #define OPAL_HANDLE_HMI2 166 #defineOPAL_NX_COPROC_INIT 167 #define OPAL_XIVE_GET_VP_STATE 170 -#define OPAL_LAST 170 +#define OPAL_MPIPL_UPDATE 173 +#define OPAL_MPIPL_QUERY_TAG 174 +#define OPAL_LAST 174 #define QUIESCE_HOLD 1 /* Spin all calls at entry */ #define QUIESCE_REJECT 2 /* Fail all calls with OPAL_BUSY */ @@ -979,6 +981,59 @@ struct opal_sg_list { }; /* + * Firmware-Assisted Dump (FADump) + */ + +/* The maximum number of dump sections supported by OPAL */ +#define OPAL_FADUMP_NR_SECTIONS64 + +/* Kernel Dump region info */ +struct opal_fadump_region { + __be64 src; + __be64 dest; + __be64 size; +} __attribute__((packed)); + +/* FADump structure format version */ +#define MPIPL_FADUMP_VERSION 0x01 + +/* + * Metadata type. 
Kernel uses this field to identify the + * type of data + */ +#define MPIPL_FADUMP_TYPE_CPU 0x00 +/* OPAL : 0x01 – 0x39 */ +#define MPIPL_FADUMP_TYPE_OPAL 0x01 +/* Firmware/SMF : 0x40 – 0x79 */ +#define MPIPL_FADUMP_TYPE_FW 0x40 +/* Kernel memory region : 0x80 – 0xb9 */ +#define MPIPL_FADUMP_TYPE_KERNEL 0x80 +/* Reserved for future use : 0xc0 – 0xff */ +#define MPIPL_FADUMP_TYPE_RESERVED 0xc0 + +/* OPAL MPIPL FADump metadata */ +struct opal_mpipl_fadump { + u8 type; + u8 version; + u8 reserved[6]; + __be32 crashing_pir; + __be32 cpu_data_version; + __be32 cpu_data_size; + __be32 region_cnt; + + struct opal_fadump_region region[OPAL_FADUMP_NR_SECTIONS]; +} __attribute__((packed)); + +/* MPIPL update operations */ +enum mpipl_ops { + OPAL_MPIPL_REGISTER_TAG = 0, + OPAL_MPIPL_ADD_RANGE= 1, + OPAL_MPIPL_REMOVE_RANGE = 2, + OPAL_MPIPL_REMOVE_ALL = 3, + OPAL_MPIPL_FREE_PRESERVED_MEMORY= 4, +}; + +/* * Dump region ID range usable by the OS */ #define OPAL_DUMP_REGION_HOST_START0x80 @@ -1058,6 +1113,7 @@ enum { OPAL_REBOOT_NORMAL = 0, OPAL_REBOOT_PLATFORM_ERROR = 1, OPAL_REBOOT_FULL_IPL= 2, + OPAL_REBOOT_OS_ERROR= 3, }; /* Argument to OPAL_PCI_TCE_KILL */ diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index 4ed5d57..4c99421 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -39,6 +39,10 @@ int64_t opal_npu_spa_clear_cache(uint64_t phb_id, uint32_t bdfn, uint64_t PE_handle);
[PATCH v3 06/16] powerpc/fadump: Update documentation about OPAL platform support
With FADump support now available on both pseries and OPAL platforms, update FADump documentation with these details. Signed-off-by: Hari Bathini --- Documentation/powerpc/firmware-assisted-dump.txt | 90 -- 1 file changed, 51 insertions(+), 39 deletions(-) diff --git a/Documentation/powerpc/firmware-assisted-dump.txt b/Documentation/powerpc/firmware-assisted-dump.txt index 62e75ef..844a229 100644 --- a/Documentation/powerpc/firmware-assisted-dump.txt +++ b/Documentation/powerpc/firmware-assisted-dump.txt @@ -70,7 +70,8 @@ as follows: normal. -- The freshly booted kernel will notice that there is a new - node (ibm,dump-kernel) in the device tree, indicating that + node (ibm,dump-kernel on PSeries or ibm,opal/dump/result-table + on OPAL platform) in the device tree, indicating that there is crash data available from a previous boot. During the early boot OS will reserve rest of the memory above boot memory size effectively booting with restricted memory @@ -93,7 +94,9 @@ as follows: Please note that the firmware-assisted dump feature is only available on Power6 and above systems with recent -firmware versions. +firmware versions on PSeries (PowerVM) platform and Power9 +and above systems with recent firmware versions on PowerNV +(OPAL) platform. Implementation details: -- @@ -108,57 +111,66 @@ that are run. If there is dump data, then the /sys/kernel/fadump_release_mem file is created, and the reserved memory is held. -If there is no waiting dump data, then only the memory required -to hold CPU state, HPTE region, boot memory dump and elfcore -header, is usually reserved at an offset greater than boot memory -size (see Fig. 1). This area is *not* released: this region will -be kept permanently reserved, so that it can act as a receptacle -for a copy of the boot memory content in addition to CPU state -and HPTE region, in the case a crash does occur. 
Since this reserved -memory area is used only after the system crash, there is no point in -blocking this significant chunk of memory from production kernel. -Hence, the implementation uses the Linux kernel's Contiguous Memory -Allocator (CMA) for memory reservation if CMA is configured for kernel. -With CMA reservation this memory will be available for applications to -use it, while kernel is prevented from using it. With this FADump will -still be able to capture all of the kernel memory and most of the user -space memory except the user pages that were present in CMA region. +If there is no waiting dump data, then only the memory required to +hold CPU state, HPTE region, boot memory dump, FADump header and +elfcore header, is usually reserved at an offset greater than boot +memory size (see Fig. 1). This area is *not* released: this region +will be kept permanently reserved, so that it can act as a receptacle +for a copy of the boot memory content in addition to CPU state and +HPTE region, in the case a crash does occur. + +Since this reserved memory area is used only after the system crash, +there is no point in blocking this significant chunk of memory from +production kernel. Hence, the implementation uses the Linux kernel's +Contiguous Memory Allocator (CMA) for memory reservation if CMA is +configured for kernel. With CMA reservation this memory will be +available for applications to use it, while kernel is prevented from +using it. With this FADump will still be able to capture all of the +kernel memory and most of the user space memory except the user pages +that were present in CMA region. 
o Memory Reservation during first kernel - Low memoryTop of memory - 0 boot memory size |<--Reserved dump area --->| | - | || Permanent Reservation | | - V V| (Preserve area)| V - +---+--/ /---+---+++---++--+ - | ||CPU|HPTE| DUMP |HDR|ELF | | - +---+--/ /---+---+++---++--+ -| ^ ^ -| | | -\ / | - --- FADump Header - Boot memory content gets transferred (meta area) - to reserved area by firmware at the - time of crash - + Low memory Top of memory + 0 boot memory size|<--- Reserved dump area --->| | + | | |Permanent Reservatio| | + V V | (Preserve area) | V + +---+/ /---+---++---+-+-+---+ + | | |///|/
[PATCH v3 07/16] powerpc/fadump: consider reserved ranges while reserving memory
Commit 0962e8004e97 ("powerpc/prom: Scan reserved-ranges node for memory reservations") enabled support to parse reserved-ranges DT node and reserve kernel memory falling in these ranges for F/W purposes. Ensure memory in these ranges is not overlapped with memory reserved for FADump. Also, use a smaller offset, instead of the size of the memory to be reserved, by which to skip memory before making another attempt at reserving memory, after the previous attempt to reserve memory for FADump failed due to memory holes and/or reserved ranges, to reduce the likelihood of memory reservation failure. Signed-off-by: Hari Bathini --- arch/powerpc/kernel/fadump-common.h | 11 +++ arch/powerpc/kernel/fadump.c| 137 ++- 2 files changed, 145 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/fadump-common.h b/arch/powerpc/kernel/fadump-common.h index 36f4d71..555230e 100644 --- a/arch/powerpc/kernel/fadump-common.h +++ b/arch/powerpc/kernel/fadump-common.h @@ -101,6 +101,17 @@ struct fadump_memory_range { unsigned long long size; }; +/* + * Amount of memory (1024MB) to skip before making another attempt at + * reserving memory (after the previous attempt to reserve memory for + * FADump failed due to memory holes and/or reserved ranges) to reduce + * the likelihood of memory reservation failure. + */ +#define OFFSET_SIZE0x4000U + +/* Maximum no. of reserved ranges supported for processing. */ +#define MAX_RESERVED_RANGES128 + /* Maximum no. 
of real memory regions supported by the kernel */ #define MAX_REAL_MEM_REGIONS 8 diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index f2c2d4a..1b3df8b 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -41,6 +41,9 @@ int crash_memory_ranges_size; int crash_mem_ranges; int max_crash_mem_ranges; +struct fadump_memory_range reserved_ranges[MAX_RESERVED_RANGES]; +int reserved_ranges_cnt; + #ifdef CONFIG_CMA static struct cma *fadump_cma; @@ -104,12 +107,116 @@ int __init fadump_cma_init(void) static int __init fadump_cma_init(void) { return 1; } #endif /* CONFIG_CMA */ +/* + * Sort the reserved ranges in-place and merge adjacent ranges + * to minimize the reserved ranges count. + */ +static void __init sort_and_merge_reserved_ranges(void) +{ + unsigned long long base, size; + struct fadump_memory_range tmp_range; + int i, j, idx; + + if (!reserved_ranges_cnt) + return; + + /* Sort the reserved ranges */ + for (i = 0; i < reserved_ranges_cnt; i++) { + idx = i; + for (j = i + 1; j < reserved_ranges_cnt; j++) { + if (reserved_ranges[idx].base > reserved_ranges[j].base) + idx = j; + } + if (idx != i) { + tmp_range = reserved_ranges[idx]; + reserved_ranges[idx] = reserved_ranges[i]; + reserved_ranges[i] = tmp_range; + } + } + + /* Merge adjacent reserved ranges */ + idx = 0; + for (i = 1; i < reserved_ranges_cnt; i++) { + base = reserved_ranges[i-1].base; + size = reserved_ranges[i-1].size; + if (reserved_ranges[i].base == (base + size)) + reserved_ranges[idx].size += reserved_ranges[i].size; + else { + idx++; + if (i == idx) + continue; + + reserved_ranges[idx] = reserved_ranges[i]; + } + } + reserved_ranges_cnt = idx + 1; +} + +static int __init add_reserved_range(unsigned long base, +unsigned long size) +{ + int i; + + if (reserved_ranges_cnt == MAX_RESERVED_RANGES) { + /* Compact reserved ranges and try again. 
*/ + sort_and_merge_reserved_ranges(); + if (reserved_ranges_cnt == MAX_RESERVED_RANGES) + return 0; + } + + i = reserved_ranges_cnt++; + reserved_ranges[i].base = base; + reserved_ranges[i].size = size; + return 1; +} + +/* + * Scan reserved-ranges to consider them while reserving/releasing + * memory for FADump. + */ +static void __init early_init_dt_scan_reserved_ranges(unsigned long node) +{ + int len, ret; + unsigned long i; + const __be32 *prop; + + /* reserved-ranges already scanned */ + if (reserved_ranges_cnt != 0) + return; + + prop = of_get_flat_dt_prop(node, "reserved-ranges", &len); + + if (!prop) + return; + + /* +* Each reserved range is an (address,size) pair, 2 cells each, +* totalling 4 cells per range. +*/ + for (i =
[PATCH v3 08/16] powerpc/fadump: consider reserved ranges while releasing memory
Commit 0962e8004e97 ("powerpc/prom: Scan reserved-ranges node for memory reservations") enabled support to parse 'reserved-ranges' DT node to reserve kernel memory falling in these ranges for firmware purposes. Along with the preserved area memory, also ensure memory in reserved ranges is not overlapped with memory released by the capture kernel after saving vmcore. Also, fix the off-by-one error in fadump_release_reserved_area function while releasing memory. Signed-off-by: Hari Bathini --- arch/powerpc/kernel/fadump.c | 59 +- 1 file changed, 41 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 1b3df8b..ce8c0bf 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -111,7 +111,7 @@ static int __init fadump_cma_init(void) { return 1; } * Sort the reserved ranges in-place and merge adjacent ranges * to minimize the reserved ranges count. */ -static void __init sort_and_merge_reserved_ranges(void) +static void sort_and_merge_reserved_ranges(void) { unsigned long long base, size; struct fadump_memory_range tmp_range; @@ -152,8 +152,7 @@ static void __init sort_and_merge_reserved_ranges(void) reserved_ranges_cnt = idx + 1; } -static int __init add_reserved_range(unsigned long base, -unsigned long size) +static int add_reserved_range(unsigned long base, unsigned long size) { int i; @@ -1127,33 +1126,57 @@ static void fadump_release_reserved_area(unsigned long start, unsigned long end) if (tend == end_pfn) break; - start_pfn = tend + 1; + start_pfn = tend; } } } /* - * Release the memory that was reserved in early boot to preserve the memory - * contents. The released memory will be available for general use. + * Release the memory that was reserved during early boot to preserve the + * crash'ed kernel's memory contents except reserved dump area (permanent + * reservation) and reserved ranges used by F/W. The released memory will + * be available for general use. 
*/ static void fadump_release_memory(unsigned long begin, unsigned long end) { + int i; unsigned long ra_start, ra_end; - - ra_start = fw_dump.reserve_dump_area_start; - ra_end = ra_start + fw_dump.reserve_dump_area_size; + unsigned long tstart; /* -* exclude the dump reserve area. Will reuse it for next -* fadump registration. +* Add memory to permanently preserve to reserved ranges list +* and exclude all these ranges while releasing memory. */ - if (begin < ra_end && end > ra_start) { - if (begin < ra_start) - fadump_release_reserved_area(begin, ra_start); - if (end > ra_end) - fadump_release_reserved_area(ra_end, end); - } else - fadump_release_reserved_area(begin, end); + i = add_reserved_range(fw_dump.reserve_dump_area_start, + fw_dump.reserve_dump_area_size); + if (i == 0) { + /* +* Reached the MAX reserved ranges count. To ensure reserved +* dump area is excluded (as it will be reused for next +* FADump registration), ignore the last reserved range and +* add reserved dump area instead. +*/ + reserved_ranges_cnt--; + add_reserved_range(fw_dump.reserve_dump_area_start, + fw_dump.reserve_dump_area_size); + } + sort_and_merge_reserved_ranges(); + + tstart = begin; + for (i = 0; i < reserved_ranges_cnt; i++) { + ra_start = reserved_ranges[i].base; + ra_end = ra_start + reserved_ranges[i].size; + + if (tstart >= ra_end) + continue; + + if (tstart < ra_start) + fadump_release_reserved_area(tstart, ra_start); + tstart = ra_end; + } + + if (tstart < end) + fadump_release_reserved_area(tstart, end); } static void fadump_invalidate_release_mem(void)
[PATCH v3 09/16] powernv/fadump: process architected register state data provided by firmware
From: Hari Bathini Firmware provides architected register state data at the time of crash. Process this data and build CPU notes to append to ELF core. Signed-off-by: Hari Bathini Signed-off-by: Vasant Hegde --- arch/powerpc/include/asm/opal-api.h | 31 arch/powerpc/kernel/fadump-common.h |3 arch/powerpc/platforms/powernv/opal-fadump.c | 197 -- arch/powerpc/platforms/powernv/opal-fadump.h |2 4 files changed, 221 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 1762b1e..a60b09f 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -984,6 +984,37 @@ struct opal_sg_list { * Firmware-Assisted Dump (FADump) */ +#define CPU_STATE_DATA_VERSION 1 + +/* FADump thread header for register entries */ +struct opal_fadump_thread_hdr { + __be32 pir; + /* 0x00 - 0x0F - The corresponding stop state of the core */ + u8 core_state; + u8 reserved[3]; + + __be32 offset; /* Offset to Register Entries array */ + __be32 ecnt; /* Number of entries */ + __be32 esize; /* Alloc size of each array entry in bytes */ + __be32 eactsz; /* Actual size of each array entry in bytes */ +} __attribute__((packed)); + +/* Register types populated by f/w */ +#define OPAL_REG_TYPE_GPR 0x01 +#define OPAL_REG_TYPE_SPR 0x02 + +/* ID numbers used by f/w while populating certain registers */ +#define REG_ID_NIP 0x7D0 +#define REG_ID_MSR 0x7D1 +#define REG_ID_CCR 0x7D2 + +/* FADump register entry. */ +struct opal_fadump_reg_entry { + __be32 reg_type; + __be32 reg_num; + __be64 reg_val; +} __attribute__((packed)); + /* The maximum number of dump sections supported by OPAL */ #define OPAL_FADUMP_NR_SECTIONS64 diff --git a/arch/powerpc/kernel/fadump-common.h b/arch/powerpc/kernel/fadump-common.h index 555230e..ebebe4d 100644 --- a/arch/powerpc/kernel/fadump-common.h +++ b/arch/powerpc/kernel/fadump-common.h @@ -117,6 +117,9 @@ struct fadump_memory_range { /* Firmware-assisted dump configuration details. 
*/ struct fw_dump { + unsigned long cpu_state_destination_addr; + unsigned long cpu_state_data_version; + unsigned long cpu_state_entry_size; unsigned long cpu_state_data_size; unsigned long hpte_region_size; unsigned long boot_memory_size; diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c index 7e6c46a..ed3c35b 100644 --- a/arch/powerpc/platforms/powernv/opal-fadump.c +++ b/arch/powerpc/platforms/powernv/opal-fadump.c @@ -29,6 +29,7 @@ #include "opal-fadump.h" static const struct opal_fadump_mem_struct *opal_fdm_active; +static const struct opal_mpipl_fadump *opal_cpu_metadata; static struct opal_fadump_mem_struct *opal_fdm; static void opal_set_preserv_area_start(struct fw_dump *fadump_conf) @@ -229,6 +230,75 @@ static int opal_invalidate_fadump(struct fw_dump *fadump_conf) return 0; } +static inline void fadump_set_regval_regnum(struct pt_regs *regs, u32 reg_type, + u32 reg_num, u64 reg_val) +{ + if (reg_type == OPAL_REG_TYPE_GPR) { + if (reg_num < 32) + regs->gpr[reg_num] = reg_val; + return; + } + + switch (reg_num) { + case SPRN_CTR: + regs->ctr = reg_val; + break; + case SPRN_LR: + regs->link = reg_val; + break; + case SPRN_XER: + regs->xer = reg_val; + break; + case SPRN_DAR: + regs->dar = reg_val; + break; + case SPRN_DSISR: + regs->dsisr = reg_val; + break; + case REG_ID_NIP: + regs->nip = reg_val; + break; + case REG_ID_MSR: + regs->msr = reg_val; + break; + case REG_ID_CCR: + regs->ccr = reg_val; + break; + } +} + +static inline void fadump_read_registers(char *bufp, unsigned int regs_cnt, +unsigned int reg_entry_size, +struct pt_regs *regs) +{ + int i; + struct opal_fadump_reg_entry *reg_entry; + + memset(regs, 0, sizeof(struct pt_regs)); + + for (i = 0; i < regs_cnt; i++, bufp += reg_entry_size) { + reg_entry = (struct opal_fadump_reg_entry *)bufp; + fadump_set_regval_regnum(regs, +be32_to_cpu(reg_entry->reg_type), +be32_to_cpu(reg_entry->reg_num), +