[PATCH v2] fadump: fix endianness issues in firmware-assisted dump handling
Firmware-assisted dump (fadump) kernel code is not LE compliant. The below patch tries to fix this issue. Tested this patch with upstream kernel. Did some sanity testing for the LE fadump vmcore generated. Below output shows crash tool successfully opening LE fadump vmcore. # crash vmlinux vmcore crash 7.0.5 Copyright (C) 2002-2014 Red Hat, Inc. Copyright (C) 2004, 2005, 2006, 2010 IBM Corporation Copyright (C) 1999-2006 Hewlett-Packard Co Copyright (C) 2005, 2006, 2011, 2012 Fujitsu Limited Copyright (C) 2006, 2007 VA Linux Systems Japan K.K. Copyright (C) 2005, 2011 NEC Corporation Copyright (C) 1999, 2002, 2007 Silicon Graphics, Inc. Copyright (C) 1999, 2000, 2001, 2002 Mission Critical Linux, Inc. This program is free software, covered by the GNU General Public License, and you are welcome to change it and/or distribute copies of it under certain conditions. Enter help copying to see the conditions. This program has absolutely no warranty. Enter help warranty for details. crash: vmlinux: no .gnu_debuglink section GNU gdb (GDB) 7.6 Copyright (C) 2013 Free Software Foundation, Inc. License GPLv3+: GNU GPL version 3 or later http://gnu.org/licenses/gpl.html This is free software: you are free to change and redistribute it. There is NO WARRANTY, to the extent permitted by law. Type show copying and show warranty for details. This GDB was configured as powerpc64le-unknown-linux-gnu... KERNEL: vmlinux DUMPFILE: vmcore CPUS: 16 DATE: Wed Dec 31 19:00:00 1969 UPTIME: 00:03:28 LOAD AVERAGE: 0.46, 0.86, 0.41 TASKS: 268 NODENAME: linux-dhr2 RELEASE: 3.17.0-rc5-7-default VERSION: #6 SMP Tue Sep 30 01:06:34 EDT 2014 MACHINE: ppc64le (4116 Mhz) MEMORY: 40 GB PANIC: Oops: Kernel access of bad area, sig: 11 [#1] (check log for details) PID: 6223 COMMAND: bash TASK: c009661b2500 [THREAD_INFO: c00967ac] CPU: 2 STATE: TASK_RUNNING (PANIC) crash Changes in v2: 1. Addressed casting related warnings. 2. 
Elaborated on why exceptions should not be changed to big endian during fadump boot. Signed-off-by: Hari Bathini hbath...@linux.vnet.ibm.com --- arch/powerpc/include/asm/fadump.h | 52 --- arch/powerpc/kernel/fadump.c | 114 + arch/powerpc/platforms/pseries/lpar.c | 15 3 files changed, 96 insertions(+), 85 deletions(-) diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h index a677456..493e72f 100644 --- a/arch/powerpc/include/asm/fadump.h +++ b/arch/powerpc/include/asm/fadump.h @@ -70,39 +70,39 @@ #define CPU_UNKNOWN(~((u32)0)) /* Utility macros */ -#define SKIP_TO_NEXT_CPU(reg_entry)\ -({ \ - while (reg_entry-reg_id != REG_ID(CPUEND)) \ - reg_entry++;\ - reg_entry++;\ +#define SKIP_TO_NEXT_CPU(reg_entry)\ +({ \ + while (be64_to_cpu(reg_entry-reg_id) != REG_ID(CPUEND)) \ + reg_entry++;\ + reg_entry++;\ }) /* Kernel Dump section info */ struct fadump_section { - u32 request_flag; - u16 source_data_type; - u16 error_flags; - u64 source_address; - u64 source_len; - u64 bytes_dumped; - u64 destination_address; + __be32 request_flag; + __be16 source_data_type; + __be16 error_flags; + __be64 source_address; + __be64 source_len; + __be64 bytes_dumped; + __be64 destination_address; }; /* ibm,configure-kernel-dump header. */ struct fadump_section_header { - u32 dump_format_version; - u16 dump_num_sections; - u16 dump_status_flag; - u32 offset_first_dump_section; + __be32 dump_format_version; + __be16 dump_num_sections; + __be16 dump_status_flag; + __be32 offset_first_dump_section; /* Fields for disk dump option. */ - u32 dd_block_size; - u64 dd_block_offset; - u64 dd_num_blocks; - u32 dd_offset_disk_path; + __be32 dd_block_size; + __be64 dd_block_offset; + __be64 dd_num_blocks; + __be32 dd_offset_disk_path; /* Maximum time allowed to prevent an automatic dump-reboot. */ - u32
[PATCH] PPC64: Adding symbols in vmcoreinfo to facilitate dump filtering
When CONFIG_SPARSEMEM_VMEMMAP option is used in kernel, makedumpfile fails to filter vmcore dump as it fails to do vmemmap translations. So far dump filtering on ppc64 never had to deal with vmemmap addresses separately as vmemmap regions were mapped in zone normal. But with the inclusion of CONFIG_SPARSEMEM_VMEMMAP config option in kernel, this vmemmap address translation support becomes necessary for dump filtering. For vmemmap address translation, few kernel symbols are needed by dump filtering tool. This patch adds those symbols to vmcoreinfo, which a dump filtering tool can use for filtering the kernel dump. Tested these changes successfully with makedumpfile tool that supports vmemmap to physical address translation outside zone normal. Signed-off-by: Hari Bathini hbath...@linux.vnet.ibm.com --- arch/powerpc/include/asm/pgalloc-64.h |4 arch/powerpc/kernel/machine_kexec.c | 12 2 files changed, 16 insertions(+) diff --git a/arch/powerpc/include/asm/pgalloc-64.h b/arch/powerpc/include/asm/pgalloc-64.h index f65e27b..33e507a 100644 --- a/arch/powerpc/include/asm/pgalloc-64.h +++ b/arch/powerpc/include/asm/pgalloc-64.h @@ -17,6 +17,10 @@ struct vmemmap_backing { unsigned long virt_addr; }; +#ifdef CONFIG_SPARSEMEM_VMEMMAP +extern struct vmemmap_backing *vmemmap_list; +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ + /* * Functions that deal with pagetables that could be at any level of * the table need to be passed an index_size so they know how to diff --git a/arch/powerpc/kernel/machine_kexec.c b/arch/powerpc/kernel/machine_kexec.c index e1ec57e..88a7fb4 100644 --- a/arch/powerpc/kernel/machine_kexec.c +++ b/arch/powerpc/kernel/machine_kexec.c @@ -18,6 +18,7 @@ #include linux/ftrace.h #include asm/machdep.h +#include asm/pgalloc.h #include asm/prom.h #include asm/sections.h @@ -75,6 +76,17 @@ void arch_crash_save_vmcoreinfo(void) #ifndef CONFIG_NEED_MULTIPLE_NODES VMCOREINFO_SYMBOL(contig_page_data); #endif +#if defined(CONFIG_PPC64) defined(CONFIG_SPARSEMEM_VMEMMAP) + 
VMCOREINFO_SYMBOL(vmemmap_list); + VMCOREINFO_SYMBOL(mmu_vmemmap_psize); + VMCOREINFO_SYMBOL(mmu_psize_defs); + VMCOREINFO_STRUCT_SIZE(vmemmap_backing); + VMCOREINFO_OFFSET(vmemmap_backing, list); + VMCOREINFO_OFFSET(vmemmap_backing, phys); + VMCOREINFO_OFFSET(vmemmap_backing, virt_addr); + VMCOREINFO_STRUCT_SIZE(mmu_psize_def); + VMCOREINFO_OFFSET(mmu_psize_def, shift); +#endif } /* ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v2] PPC64: Adding symbols in vmcoreinfo to facilitate dump filtering
When CONFIG_SPARSEMEM_VMEMMAP option is set in kernel, makedumpfile tool fails to filter vmcore dump as it fails to do translations for vmemmap addresses that are mapped outside zone normal. For vmemmap address translation support in this scenario, few kernel symbols are needed by dump filtering tool. This patch adds those symbols to vmcoreinfo, which a dump filtering tool can use for filtering the kernel dump. These changes are tested successfully with makedumpfile tool that supports vmemmap to physical address translation outside zone normal. Changes from v1: Updated patch description and removed #ifdef around extern. Signed-off-by: Hari Bathini hbath...@linux.vnet.ibm.com --- arch/powerpc/include/asm/pgalloc-64.h |2 ++ arch/powerpc/kernel/machine_kexec.c | 12 2 files changed, 14 insertions(+) diff --git a/arch/powerpc/include/asm/pgalloc-64.h b/arch/powerpc/include/asm/pgalloc-64.h index f65e27b..3973e62 100644 --- a/arch/powerpc/include/asm/pgalloc-64.h +++ b/arch/powerpc/include/asm/pgalloc-64.h @@ -17,6 +17,8 @@ struct vmemmap_backing { unsigned long virt_addr; }; +extern struct vmemmap_backing *vmemmap_list; + /* * Functions that deal with pagetables that could be at any level of * the table need to be passed an index_size so they know how to diff --git a/arch/powerpc/kernel/machine_kexec.c b/arch/powerpc/kernel/machine_kexec.c index e1ec57e..88a7fb4 100644 --- a/arch/powerpc/kernel/machine_kexec.c +++ b/arch/powerpc/kernel/machine_kexec.c @@ -18,6 +18,7 @@ #include linux/ftrace.h #include asm/machdep.h +#include asm/pgalloc.h #include asm/prom.h #include asm/sections.h @@ -75,6 +76,17 @@ void arch_crash_save_vmcoreinfo(void) #ifndef CONFIG_NEED_MULTIPLE_NODES VMCOREINFO_SYMBOL(contig_page_data); #endif +#if defined(CONFIG_PPC64) defined(CONFIG_SPARSEMEM_VMEMMAP) + VMCOREINFO_SYMBOL(vmemmap_list); + VMCOREINFO_SYMBOL(mmu_vmemmap_psize); + VMCOREINFO_SYMBOL(mmu_psize_defs); + VMCOREINFO_STRUCT_SIZE(vmemmap_backing); + VMCOREINFO_OFFSET(vmemmap_backing, list); + 
VMCOREINFO_OFFSET(vmemmap_backing, phys); + VMCOREINFO_OFFSET(vmemmap_backing, virt_addr); + VMCOREINFO_STRUCT_SIZE(mmu_psize_def); + VMCOREINFO_OFFSET(mmu_psize_def, shift); +#endif } /* ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 0/2] powerpc/pstore: Add pstore support for nvram partitions
This patch series adds pstore support on powernv platform to read different nvram partitions and write compressed data to oops-log nvram partition. As pseries platform already has pstore support, this series moves most of the common code for pseries and powernv platforms to a common file. Tested the patches successfully on both pseries and powernv platforms. --- Hari Bathini (2): pstore: Add pstore type id for firmware partition pstore: add pstore support on powernv arch/powerpc/include/asm/nvram.h| 50 ++ arch/powerpc/include/asm/rtas.h |2 arch/powerpc/kernel/nvram_64.c | 679 +++ arch/powerpc/platforms/powernv/opal-nvram.c | 10 arch/powerpc/platforms/pseries/nvram.c | 663 -- fs/pstore/inode.c |3 include/linux/pstore.h |1 7 files changed, 749 insertions(+), 659 deletions(-) -- - Hari ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 1/2] pstore: Add pstore type id for firmware partition
This patch adds a pstore type id to be used for opal specific nvram partitions. Signed-off-by: Hari Bathini hbath...@linux.vnet.ibm.com --- fs/pstore/inode.c |3 +++ include/linux/pstore.h |1 + 2 files changed, 4 insertions(+) diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index fafb7a0..e83bb93 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -337,6 +337,9 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, case PSTORE_TYPE_PPC_COMMON: sprintf(name, powerpc-common-%s-%lld, psname, id); break; + case PSTORE_TYPE_PPC_OPAL: + sprintf(name, powerpc-opal-%s-%lld, psname, id); + break; case PSTORE_TYPE_UNKNOWN: sprintf(name, unknown-%s-%lld, psname, id); break; diff --git a/include/linux/pstore.h b/include/linux/pstore.h index ece0c6b..af44980 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -39,6 +39,7 @@ enum pstore_type_id { PSTORE_TYPE_PPC_RTAS= 4, PSTORE_TYPE_PPC_OF = 5, PSTORE_TYPE_PPC_COMMON = 6, + PSTORE_TYPE_PPC_OPAL= 7, PSTORE_TYPE_UNKNOWN = 255 }; ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 2/2] pstore: add pstore support on powernv
This patch extends pstore, a generic interface to platform dependent persistent storage, support for powernv platform to capture certain useful information, during dying moments. Such support is already in place for pseries platform. This patch while adding pstore support for powernv platform, moves common code for pseries and powernv to arch/powerpc/kernel/nvram_64.c file. Signed-off-by: Hari Bathini hbath...@linux.vnet.ibm.com --- arch/powerpc/include/asm/nvram.h| 50 ++ arch/powerpc/include/asm/rtas.h |2 arch/powerpc/kernel/nvram_64.c | 679 +++ arch/powerpc/platforms/powernv/opal-nvram.c | 10 arch/powerpc/platforms/pseries/nvram.c | 663 -- 5 files changed, 745 insertions(+), 659 deletions(-) diff --git a/arch/powerpc/include/asm/nvram.h b/arch/powerpc/include/asm/nvram.h index b0fe0fe..09a518b 100644 --- a/arch/powerpc/include/asm/nvram.h +++ b/arch/powerpc/include/asm/nvram.h @@ -9,12 +9,43 @@ #ifndef _ASM_POWERPC_NVRAM_H #define _ASM_POWERPC_NVRAM_H - +#include linux/types.h #include linux/errno.h #include linux/list.h #include uapi/asm/nvram.h +/* + * Set oops header version to distinguish between old and new format header. + * lnx,oops-log partition max size is 4000, header version 4000 will + * help in identifying new header. 
+ */ +#define OOPS_HDR_VERSION 5000 + +struct err_log_info { + __be32 error_type; + __be32 seq_num; +}; + +struct nvram_os_partition { + const char *name; + int req_size; /* desired size, in bytes */ + int min_size; /* minimum acceptable size (0 means req_size) */ + long size; /* size of data portion (excluding err_log_info) */ + long index; /* offset of data portion of partition */ + bool os_partition; /* partition initialized by OS, not FW */ +}; + +struct oops_log_info { + __be16 version; + __be16 report_length; + __be64 timestamp; +} __attribute__((packed)); + +extern struct nvram_os_partition oops_log_partition; + #ifdef CONFIG_PPC_PSERIES +extern struct nvram_os_partition rtas_log_partition; + extern int nvram_write_error_log(char * buff, int length, unsigned int err_type, unsigned int err_seq); extern int nvram_read_error_log(char * buff, int length, @@ -50,6 +81,23 @@ extern void pmac_xpram_write(int xpaddr, u8 data); /* Synchronize NVRAM */ extern voidnvram_sync(void); +/* Initialize NVRAM OS partition */ +extern int __init nvram_init_os_partition(struct nvram_os_partition *part); + +/* Initialize NVRAM oops partition */ +extern void __init nvram_init_oops_partition(int rtas_partition_exists); + +/* Read a NVRAM partition */ +extern int nvram_read_partition(struct nvram_os_partition *part, char *buff, + int length, unsigned int *err_type, + unsigned int *error_log_cnt); + +/* Write to NVRAM OS partition */ +extern int nvram_write_os_partition(struct nvram_os_partition *part, + char *buff, int length, + unsigned int err_type, + unsigned int error_log_cnt); + /* Determine NVRAM size */ extern ssize_t nvram_get_size(void); diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h index b390f55..a033fe9 100644 --- a/arch/powerpc/include/asm/rtas.h +++ b/arch/powerpc/include/asm/rtas.h @@ -343,6 +343,8 @@ extern int early_init_dt_scan_rtas(unsigned long node, extern void pSeries_log_error(char *buf, unsigned int err_type, int fatal); #ifdef 
CONFIG_PPC_PSERIES +extern unsigned long last_rtas_event; +extern int clobbering_unread_rtas_event(void); extern int pseries_devicetree_update(s32 scope); extern void post_mobility_fixup(void); #endif diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c index 34f7c9b..8c439a3 100644 --- a/arch/powerpc/kernel/nvram_64.c +++ b/arch/powerpc/kernel/nvram_64.c @@ -26,6 +26,9 @@ #include linux/init.h #include linux/slab.h #include linux/spinlock.h +#include linux/kmsg_dump.h +#include linux/pstore.h +#include linux/zlib.h #include asm/uaccess.h #include asm/nvram.h #include asm/rtas.h @@ -54,6 +57,682 @@ struct nvram_partition { static LIST_HEAD(nvram_partitions); +#ifdef CONFIG_PPC_PSERIES +struct nvram_os_partition rtas_log_partition = { + .name = ibm,rtas-log, + .req_size = 2079, + .min_size = 1055, + .index = -1, + .os_partition = true +}; +#endif + +struct nvram_os_partition oops_log_partition = { + .name = lnx,oops-log, + .req_size = 4000, + .min_size = 2000, + .index = -1, + .os_partition = true +}; + +static const char *nvram_os_partitions[] = { +#ifdef CONFIG_PPC_PSERIES + ibm,rtas-log, +#endif + lnx,oops-log, + NULL +}; + +static void oops_to_nvram(struct kmsg_dumper *dumper
Re: [2/2] pstore: add pstore support on powernv
On 12/04/2014 11:07 AM, Michael Ellerman wrote: On Wed, 2014-03-12 at 11:03:15 UTC, Hari Bathini wrote: This patch extends pstore, a generic interface to platform dependent persistent storage, support for powernv platform to capture certain useful information, during dying moments. Such support is already in place for pseries platform. This patch while adding pstore support for powernv platform, moves common code for pseries and powernv to arch/powerpc/kernel/nvram_64.c file. Please move the common code first in a separate patch. Unless there's some reason you absolutely can't do that. Sure, Michael. Let me make the changes as suggested and post the updated patch series. Thanks Hari cheers ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH] powerpc/kdump: skip enabling big endian exception during crash
In LE kernel, we currently have a hack for kexec that resets the exception endian before starting a new kernel as the kernel that is loaded could be a big endian or a little endian kernel. In kdump case, resetting exception endian fails when one or more cpus is disabled. But in case of kdump, we can conveniently ignore resetting endianess as crashkernel is always of same endianess as primary kernel. This patch adds a new inline function to say if this is kdump path. This function is used at places where such a check is needed. Signed-off-by: Hari Bathini hbath...@linux.vnet.ibm.com --- arch/powerpc/include/asm/kexec.h | 10 ++ arch/powerpc/kernel/machine_kexec_64.c |2 +- arch/powerpc/platforms/pseries/lpar.c |7 ++- 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h index 19c36cb..0d96d4d 100644 --- a/arch/powerpc/include/asm/kexec.h +++ b/arch/powerpc/include/asm/kexec.h @@ -86,6 +86,11 @@ extern int overlaps_crashkernel(unsigned long start, unsigned long size); extern void reserve_crashkernel(void); extern void machine_kexec_mask_interrupts(void); +static inline int is_kdump_path(void) +{ + return (crashing_cpu = 0) ? 1 : 0; +} + #else /* !CONFIG_KEXEC */ static inline void crash_kexec_secondary(struct pt_regs *regs) { } @@ -106,6 +111,11 @@ static inline int crash_shutdown_unregister(crash_shutdown_t handler) return 0; } +static inline int is_kdump_path(void) +{ + return 0; +} + #endif /* CONFIG_KEXEC */ #endif /* ! __ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c index 879b3aa..b4fe804 100644 --- a/arch/powerpc/kernel/machine_kexec_64.c +++ b/arch/powerpc/kernel/machine_kexec_64.c @@ -330,7 +330,7 @@ void default_machine_kexec(struct kimage *image) * using debugger IPI. 
*/ - if (crashing_cpu == -1) + if (!is_kdump_path()) kexec_prepare_cpus(); pr_debug(kexec: Starting switchover sequence.\n); diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index f6880d2..be41680 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -43,6 +43,7 @@ #include asm/trace.h #include asm/firmware.h #include asm/plpar_wrappers.h +#include asm/kexec.h #include asm/fadump.h #include pseries.h @@ -257,8 +258,12 @@ static void pSeries_lpar_hptab_clear(void) * * This is also called on boot when a fadump happens. In that case we * must not change the exception endian mode. +* +* This is also called during kdump which doesn't need resetting, as the +* the crashkernel is of same endainess as primary kernel. */ - if (firmware_has_feature(FW_FEATURE_SET_MODE) !is_fadump_active()) { + if (firmware_has_feature(FW_FEATURE_SET_MODE) !is_fadump_active() + !is_kdump_path()) { long rc; rc = pseries_big_endian_exceptions(); ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v2 0/3] powerpc/pstore: Add pstore support for nvram partitions
This patch series adds pstore support on powernv platform to read different nvram partitions and write compressed data to oops-log nvram partition. As pseries platform already has pstore support, this series moves most of the common code for pseries and powernv platforms to a common file. Tested the patches successfully on both pseries and powernv platforms. --- Hari Bathini (3): powerpc/nvram: move generic code for nvram and pstore pstore: Add pstore type id for firmware partition pstore: add pstore support on powernv arch/powerpc/include/asm/nvram.h| 50 ++ arch/powerpc/include/asm/rtas.h |2 arch/powerpc/kernel/nvram_64.c | 681 +++ arch/powerpc/platforms/powernv/opal-nvram.c | 10 arch/powerpc/platforms/pseries/nvram.c | 665 -- fs/pstore/inode.c |3 include/linux/pstore.h |1 7 files changed, 751 insertions(+), 661 deletions(-) -- - Hari ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v2 1/3] powerpc/nvram: move generic code for nvram and pstore
With minor checks, we can move most of the code for nvram under pseries to a common place to be re-used by other powerpc platforms like powernv. This patch moves such common code to arch/powerpc/kernel/nvram_64.c file. Signed-off-by: Hari Bathini hbath...@linux.vnet.ibm.com --- arch/powerpc/include/asm/nvram.h | 50 ++ arch/powerpc/include/asm/rtas.h|2 arch/powerpc/kernel/nvram_64.c | 660 arch/powerpc/platforms/pseries/nvram.c | 665 4 files changed, 716 insertions(+), 661 deletions(-) diff --git a/arch/powerpc/include/asm/nvram.h b/arch/powerpc/include/asm/nvram.h index b0fe0fe..09a518b 100644 --- a/arch/powerpc/include/asm/nvram.h +++ b/arch/powerpc/include/asm/nvram.h @@ -9,12 +9,43 @@ #ifndef _ASM_POWERPC_NVRAM_H #define _ASM_POWERPC_NVRAM_H - +#include linux/types.h #include linux/errno.h #include linux/list.h #include uapi/asm/nvram.h +/* + * Set oops header version to distinguish between old and new format header. + * lnx,oops-log partition max size is 4000, header version 4000 will + * help in identifying new header. 
+ */ +#define OOPS_HDR_VERSION 5000 + +struct err_log_info { + __be32 error_type; + __be32 seq_num; +}; + +struct nvram_os_partition { + const char *name; + int req_size; /* desired size, in bytes */ + int min_size; /* minimum acceptable size (0 means req_size) */ + long size; /* size of data portion (excluding err_log_info) */ + long index; /* offset of data portion of partition */ + bool os_partition; /* partition initialized by OS, not FW */ +}; + +struct oops_log_info { + __be16 version; + __be16 report_length; + __be64 timestamp; +} __attribute__((packed)); + +extern struct nvram_os_partition oops_log_partition; + #ifdef CONFIG_PPC_PSERIES +extern struct nvram_os_partition rtas_log_partition; + extern int nvram_write_error_log(char * buff, int length, unsigned int err_type, unsigned int err_seq); extern int nvram_read_error_log(char * buff, int length, @@ -50,6 +81,23 @@ extern void pmac_xpram_write(int xpaddr, u8 data); /* Synchronize NVRAM */ extern voidnvram_sync(void); +/* Initialize NVRAM OS partition */ +extern int __init nvram_init_os_partition(struct nvram_os_partition *part); + +/* Initialize NVRAM oops partition */ +extern void __init nvram_init_oops_partition(int rtas_partition_exists); + +/* Read a NVRAM partition */ +extern int nvram_read_partition(struct nvram_os_partition *part, char *buff, + int length, unsigned int *err_type, + unsigned int *error_log_cnt); + +/* Write to NVRAM OS partition */ +extern int nvram_write_os_partition(struct nvram_os_partition *part, + char *buff, int length, + unsigned int err_type, + unsigned int error_log_cnt); + /* Determine NVRAM size */ extern ssize_t nvram_get_size(void); diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h index b390f55..a033fe9 100644 --- a/arch/powerpc/include/asm/rtas.h +++ b/arch/powerpc/include/asm/rtas.h @@ -343,6 +343,8 @@ extern int early_init_dt_scan_rtas(unsigned long node, extern void pSeries_log_error(char *buf, unsigned int err_type, int fatal); #ifdef 
CONFIG_PPC_PSERIES +extern unsigned long last_rtas_event; +extern int clobbering_unread_rtas_event(void); extern int pseries_devicetree_update(s32 scope); extern void post_mobility_fixup(void); #endif diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c index 34f7c9b..dbff7f0 100644 --- a/arch/powerpc/kernel/nvram_64.c +++ b/arch/powerpc/kernel/nvram_64.c @@ -26,6 +26,9 @@ #include linux/init.h #include linux/slab.h #include linux/spinlock.h +#include linux/kmsg_dump.h +#include linux/pstore.h +#include linux/zlib.h #include asm/uaccess.h #include asm/nvram.h #include asm/rtas.h @@ -54,6 +57,663 @@ struct nvram_partition { static LIST_HEAD(nvram_partitions); +#ifdef CONFIG_PPC_PSERIES +struct nvram_os_partition rtas_log_partition = { + .name = ibm,rtas-log, + .req_size = 2079, + .min_size = 1055, + .index = -1, + .os_partition = true +}; +#endif + +struct nvram_os_partition oops_log_partition = { + .name = lnx,oops-log, + .req_size = 4000, + .min_size = 2000, + .index = -1, + .os_partition = true +}; + +static const char *nvram_os_partitions[] = { +#ifdef CONFIG_PPC_PSERIES + ibm,rtas-log, +#endif + lnx,oops-log, + NULL +}; + +static void oops_to_nvram(struct kmsg_dumper *dumper, + enum kmsg_dump_reason reason); + +static struct kmsg_dumper nvram_kmsg_dumper = { + .dump = oops_to_nvram +}; + +/* + * For capturing and compressing an oops or panic report... + + * big_oops_buf
[PATCH v2 2/3] pstore: Add pstore type id for firmware partition
This patch adds a pstore type id to be used for opal specific nvram partitions. Signed-off-by: Hari Bathini hbath...@linux.vnet.ibm.com --- fs/pstore/inode.c |3 +++ include/linux/pstore.h |1 + 2 files changed, 4 insertions(+) diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 5041660..8e0c009 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -359,6 +359,9 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, case PSTORE_TYPE_PPC_COMMON: sprintf(name, powerpc-common-%s-%lld, psname, id); break; + case PSTORE_TYPE_PPC_OPAL: + sprintf(name, powerpc-opal-%s-%lld, psname, id); + break; case PSTORE_TYPE_UNKNOWN: sprintf(name, unknown-%s-%lld, psname, id); break; diff --git a/include/linux/pstore.h b/include/linux/pstore.h index ece0c6b..af44980 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -39,6 +39,7 @@ enum pstore_type_id { PSTORE_TYPE_PPC_RTAS= 4, PSTORE_TYPE_PPC_OF = 5, PSTORE_TYPE_PPC_COMMON = 6, + PSTORE_TYPE_PPC_OPAL= 7, PSTORE_TYPE_UNKNOWN = 255 }; ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v2 3/3] pstore: add pstore support on powernv
This patch extends pstore, a generic interface to platform dependent persistent storage, support for powernv platform to capture certain useful information, during dying moments. Such support is already in place for pseries platform. This patch re-uses most of that code. Signed-off-by: Hari Bathini hbath...@linux.vnet.ibm.com --- arch/powerpc/kernel/nvram_64.c | 25 +++-- arch/powerpc/platforms/powernv/opal-nvram.c | 10 ++ 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c index dbff7f0..3afbc91 100644 --- a/arch/powerpc/kernel/nvram_64.c +++ b/arch/powerpc/kernel/nvram_64.c @@ -127,6 +127,14 @@ static size_t oops_data_sz; static struct z_stream_s stream; #ifdef CONFIG_PSTORE +#ifdef CONFIG_PPC_POWERNV +static struct nvram_os_partition skiboot_partition = { + .name = ibm,skiboot, + .index = -1, + .os_partition = false +}; +#endif + #ifdef CONFIG_PPC_PSERIES static struct nvram_os_partition of_config_partition = { .name = of-config, @@ -479,6 +487,16 @@ static ssize_t nvram_pstore_read(u64 *id, enum pstore_type_id *type, time-tv_nsec = 0; break; #endif +#ifdef CONFIG_PPC_POWERNV + case PSTORE_TYPE_PPC_OPAL: + sig = NVRAM_SIG_FW; + part = skiboot_partition; + *type = PSTORE_TYPE_PPC_OPAL; + *id = PSTORE_TYPE_PPC_OPAL; + time-tv_sec = 0; + time-tv_nsec = 0; + break; +#endif default: return 0; } @@ -554,8 +572,11 @@ static int nvram_pstore_init(void) { int rc = 0; - nvram_type_ids[2] = PSTORE_TYPE_PPC_RTAS; - nvram_type_ids[3] = PSTORE_TYPE_PPC_OF; + if (machine_is(pseries)) { + nvram_type_ids[2] = PSTORE_TYPE_PPC_RTAS; + nvram_type_ids[3] = PSTORE_TYPE_PPC_OF; + } else + nvram_type_ids[2] = PSTORE_TYPE_PPC_OPAL; nvram_pstore_info.buf = oops_data; nvram_pstore_info.bufsize = oops_data_sz; diff --git a/arch/powerpc/platforms/powernv/opal-nvram.c b/arch/powerpc/platforms/powernv/opal-nvram.c index f9896fd..9db4398 100644 --- a/arch/powerpc/platforms/powernv/opal-nvram.c +++ 
b/arch/powerpc/platforms/powernv/opal-nvram.c @@ -16,6 +16,7 @@ #include linux/of.h #include asm/opal.h +#include asm/nvram.h #include asm/machdep.h static unsigned int nvram_size; @@ -62,6 +63,15 @@ static ssize_t opal_nvram_write(char *buf, size_t count, loff_t *index) return count; } +static int __init opal_nvram_init_log_partitions(void) +{ + /* Scan nvram for partitions */ + nvram_scan_partitions(); + nvram_init_oops_partition(0); + return 0; +} +machine_arch_initcall(powernv, opal_nvram_init_log_partitions); + void __init opal_nvram_init(void) { struct device_node *np; ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v2 1/3] powerpc/nvram: move generic code for nvram and pstore
On 12/17/2014 05:33 AM, Michael Ellerman wrote: On Tue, 2014-12-16 at 23:35 +0530, Hari Bathini wrote: With minor checks, we can move most of the code for nvram under pseries to a common place to be re-used by other powerpc platforms like powernv. This patch moves such common code to arch/powerpc/kernel/nvram_64.c file. Sharing the code is great. But, you need to keep in mind that it is very common for us to build kernels with both POWERNV=y and PSERIES=y. So you need to make sure you're only using CONFIG_PPC_PSERIES to protect things that are optional on pseries. Not things that we *shouldn't* be doing on powernv. For example the logic in nvram_init_oops_partition() looks like it might do the wrong thing for PSERIES=y POWERNV=y. True. It might do wrong thing when an incorrect value is passed by the caller. But since the caller is platform specific code [pseries_nvram_init_log_partitions() or opal_nvram_init_log_partitions() routine], with appropriate parameter passed, I haven't seen any issues while testing. diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h index b390f55..a033fe9 100644 --- a/arch/powerpc/include/asm/rtas.h +++ b/arch/powerpc/include/asm/rtas.h @@ -343,6 +343,8 @@ extern int early_init_dt_scan_rtas(unsigned long node, extern void pSeries_log_error(char *buf, unsigned int err_type, int fatal); #ifdef CONFIG_PPC_PSERIES +extern unsigned long last_rtas_event; +extern int clobbering_unread_rtas_event(void); You should add an empty version of this for !PSERIES, so you don't have to ifdef all the call sites. Sure. Will update accordingly.. Thanks Hari cheers ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v2] powerpc/kdump: Ignore failure in enabling big endian exception during crash
In LE kernel, we currently have a hack for kexec that resets the exception endian before starting a new kernel as the kernel that is loaded could be a big endian or a little endian kernel. In kdump case, resetting exception endian fails when one or more cpus is disabled. But we can ignore the failure and still go ahead, as in most cases crashkernel will be of same endianess as primary kernel and reseting endianess is not even needed in those cases. This patch adds a new inline function to say if this is kdump path. This function is used at places where such a check is needed. Changes from v1: Instead of skipping, ignore failure in enabling big endian exception during crash Signed-off-by: Hari Bathini hbath...@linux.vnet.ibm.com --- arch/powerpc/include/asm/kexec.h | 10 ++ arch/powerpc/kernel/machine_kexec_64.c |2 +- arch/powerpc/platforms/pseries/lpar.c | 10 +- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h index 19c36cb..0d96d4d 100644 --- a/arch/powerpc/include/asm/kexec.h +++ b/arch/powerpc/include/asm/kexec.h @@ -86,6 +86,11 @@ extern int overlaps_crashkernel(unsigned long start, unsigned long size); extern void reserve_crashkernel(void); extern void machine_kexec_mask_interrupts(void); +static inline int is_kdump_path(void) +{ + return (crashing_cpu = 0) ? 1 : 0; +} + #else /* !CONFIG_KEXEC */ static inline void crash_kexec_secondary(struct pt_regs *regs) { } @@ -106,6 +111,11 @@ static inline int crash_shutdown_unregister(crash_shutdown_t handler) return 0; } +static inline int is_kdump_path(void) +{ + return 0; +} + #endif /* CONFIG_KEXEC */ #endif /* ! 
__ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c index 879b3aa..b4fe804 100644 --- a/arch/powerpc/kernel/machine_kexec_64.c +++ b/arch/powerpc/kernel/machine_kexec_64.c @@ -330,7 +330,7 @@ void default_machine_kexec(struct kimage *image) * using debugger IPI. */ - if (crashing_cpu == -1) + if (!is_kdump_path()) kexec_prepare_cpus(); pr_debug(kexec: Starting switchover sequence.\n); diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 469751d..63214fa 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -43,6 +43,7 @@ #include asm/trace.h #include asm/firmware.h #include asm/plpar_wrappers.h +#include asm/kexec.h #include asm/fadump.h #include pseries.h @@ -257,6 +258,7 @@ static void pSeries_lpar_hptab_clear(void) * * This is also called on boot when a fadump happens. In that case we * must not change the exception endian mode. +* */ if (firmware_has_feature(FW_FEATURE_SET_MODE) !is_fadump_active()) { long rc; @@ -267,8 +269,14 @@ static void pSeries_lpar_hptab_clear(void) * out to the user, but at least this will stop us from * continuing on further and creating an even more * difficult to debug situation. +* +* But if we reaching here after a crash, no point panicking. +* Also, in kdump path, resetting endianess may not be needed +* as the crashkernel most of the times is of same endianess +* as primary kernel. So, let's ignore the failure and try +* kdump'ing anyway. */ - if (rc) + if (rc !is_kdump_path()) panic(Could not enable big endian exceptions); } #endif ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v3 0/3] powerpc/pstore: Add pstore support for nvram partitions
This patch series adds pstore support on powernv platform to read different nvram partitions and write compressed data to oops-log nvram partition. As pseries platform already has pstore support, this series moves most of the common code for pseries and powernv platforms to a common file. Tested the patches successfully on both pseries and powernv platforms. Changes from v2: Added an empty version of clobbering_unread_rtas_event() routine for !PSERIES, to avoid ifdef at the call sites --- Hari Bathini (3): powerpc/nvram: move generic code for nvram and pstore pstore: Add pstore type id for firmware partition pstore: add pstore support on powernv arch/powerpc/include/asm/nvram.h| 50 ++ arch/powerpc/include/asm/rtas.h |4 arch/powerpc/kernel/nvram_64.c | 677 +++ arch/powerpc/platforms/powernv/opal-nvram.c | 10 arch/powerpc/platforms/pseries/nvram.c | 665 --- fs/pstore/inode.c |3 include/linux/pstore.h |1 7 files changed, 749 insertions(+), 661 deletions(-) -- - Hari ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v3 1/3] powerpc/nvram: move generic code for nvram and pstore
With minor checks, we can move most of the code for nvram under pseries to a common place to be re-used by other powerpc platforms like powernv. This patch moves such common code to arch/powerpc/kernel/nvram_64.c file. Signed-off-by: Hari Bathini hbath...@linux.vnet.ibm.com --- arch/powerpc/include/asm/nvram.h | 50 ++ arch/powerpc/include/asm/rtas.h|4 arch/powerpc/kernel/nvram_64.c | 656 arch/powerpc/platforms/pseries/nvram.c | 665 4 files changed, 714 insertions(+), 661 deletions(-) diff --git a/arch/powerpc/include/asm/nvram.h b/arch/powerpc/include/asm/nvram.h index b0fe0fe..09a518b 100644 --- a/arch/powerpc/include/asm/nvram.h +++ b/arch/powerpc/include/asm/nvram.h @@ -9,12 +9,43 @@ #ifndef _ASM_POWERPC_NVRAM_H #define _ASM_POWERPC_NVRAM_H - +#include linux/types.h #include linux/errno.h #include linux/list.h #include uapi/asm/nvram.h +/* + * Set oops header version to distinguish between old and new format header. + * lnx,oops-log partition max size is 4000, header version 4000 will + * help in identifying new header. 
+ */ +#define OOPS_HDR_VERSION 5000 + +struct err_log_info { + __be32 error_type; + __be32 seq_num; +}; + +struct nvram_os_partition { + const char *name; + int req_size; /* desired size, in bytes */ + int min_size; /* minimum acceptable size (0 means req_size) */ + long size; /* size of data portion (excluding err_log_info) */ + long index; /* offset of data portion of partition */ + bool os_partition; /* partition initialized by OS, not FW */ +}; + +struct oops_log_info { + __be16 version; + __be16 report_length; + __be64 timestamp; +} __attribute__((packed)); + +extern struct nvram_os_partition oops_log_partition; + #ifdef CONFIG_PPC_PSERIES +extern struct nvram_os_partition rtas_log_partition; + extern int nvram_write_error_log(char * buff, int length, unsigned int err_type, unsigned int err_seq); extern int nvram_read_error_log(char * buff, int length, @@ -50,6 +81,23 @@ extern void pmac_xpram_write(int xpaddr, u8 data); /* Synchronize NVRAM */ extern voidnvram_sync(void); +/* Initialize NVRAM OS partition */ +extern int __init nvram_init_os_partition(struct nvram_os_partition *part); + +/* Initialize NVRAM oops partition */ +extern void __init nvram_init_oops_partition(int rtas_partition_exists); + +/* Read a NVRAM partition */ +extern int nvram_read_partition(struct nvram_os_partition *part, char *buff, + int length, unsigned int *err_type, + unsigned int *error_log_cnt); + +/* Write to NVRAM OS partition */ +extern int nvram_write_os_partition(struct nvram_os_partition *part, + char *buff, int length, + unsigned int err_type, + unsigned int error_log_cnt); + /* Determine NVRAM size */ extern ssize_t nvram_get_size(void); diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h index b390f55..bcf6693 100644 --- a/arch/powerpc/include/asm/rtas.h +++ b/arch/powerpc/include/asm/rtas.h @@ -343,8 +343,12 @@ extern int early_init_dt_scan_rtas(unsigned long node, extern void pSeries_log_error(char *buf, unsigned int err_type, int fatal); 
#ifdef CONFIG_PPC_PSERIES +extern unsigned long last_rtas_event; +extern int clobbering_unread_rtas_event(void); extern int pseries_devicetree_update(s32 scope); extern void post_mobility_fixup(void); +#else +int clobbering_unread_rtas_event(void) { return 0; } #endif #ifdef CONFIG_PPC_RTAS_DAEMON diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c index 34f7c9b..42e5c6a 100644 --- a/arch/powerpc/kernel/nvram_64.c +++ b/arch/powerpc/kernel/nvram_64.c @@ -26,6 +26,9 @@ #include linux/init.h #include linux/slab.h #include linux/spinlock.h +#include linux/kmsg_dump.h +#include linux/pstore.h +#include linux/zlib.h #include asm/uaccess.h #include asm/nvram.h #include asm/rtas.h @@ -54,6 +57,659 @@ struct nvram_partition { static LIST_HEAD(nvram_partitions); +#ifdef CONFIG_PPC_PSERIES +struct nvram_os_partition rtas_log_partition = { + .name = ibm,rtas-log, + .req_size = 2079, + .min_size = 1055, + .index = -1, + .os_partition = true +}; +#endif + +struct nvram_os_partition oops_log_partition = { + .name = lnx,oops-log, + .req_size = 4000, + .min_size = 2000, + .index = -1, + .os_partition = true +}; + +static const char *nvram_os_partitions[] = { +#ifdef CONFIG_PPC_PSERIES + ibm,rtas-log, +#endif + lnx,oops-log, + NULL +}; + +static void oops_to_nvram(struct kmsg_dumper *dumper, + enum kmsg_dump_reason reason); + +static struct kmsg_dumper nvram_kmsg_dumper = { + .dump
[PATCH v3 2/3] pstore: Add pstore type id for firmware partition
This patch adds a pstore type id to be used for opal specific nvram partitions. Signed-off-by: Hari Bathini hbath...@linux.vnet.ibm.com --- fs/pstore/inode.c |3 +++ include/linux/pstore.h |1 + 2 files changed, 4 insertions(+) diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 5041660..8e0c009 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -359,6 +359,9 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, case PSTORE_TYPE_PPC_COMMON: sprintf(name, powerpc-common-%s-%lld, psname, id); break; + case PSTORE_TYPE_PPC_OPAL: + sprintf(name, powerpc-opal-%s-%lld, psname, id); + break; case PSTORE_TYPE_UNKNOWN: sprintf(name, unknown-%s-%lld, psname, id); break; diff --git a/include/linux/pstore.h b/include/linux/pstore.h index ece0c6b..af44980 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -39,6 +39,7 @@ enum pstore_type_id { PSTORE_TYPE_PPC_RTAS= 4, PSTORE_TYPE_PPC_OF = 5, PSTORE_TYPE_PPC_COMMON = 6, + PSTORE_TYPE_PPC_OPAL= 7, PSTORE_TYPE_UNKNOWN = 255 }; ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v3 3/3] pstore: add pstore support on powernv
This patch extends pstore, a generic interface to platform dependent persistent storage, support for powernv platform to capture certain useful information, during dying moments. Such support is already in place for pseries platform. This patch re-uses most of that code. Signed-off-by: Hari Bathini hbath...@linux.vnet.ibm.com --- arch/powerpc/kernel/nvram_64.c | 25 +++-- arch/powerpc/platforms/powernv/opal-nvram.c | 10 ++ 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c index 42e5c6a..293da88 100644 --- a/arch/powerpc/kernel/nvram_64.c +++ b/arch/powerpc/kernel/nvram_64.c @@ -127,6 +127,14 @@ static size_t oops_data_sz; static struct z_stream_s stream; #ifdef CONFIG_PSTORE +#ifdef CONFIG_PPC_POWERNV +static struct nvram_os_partition skiboot_partition = { + .name = ibm,skiboot, + .index = -1, + .os_partition = false +}; +#endif + #ifdef CONFIG_PPC_PSERIES static struct nvram_os_partition of_config_partition = { .name = of-config, @@ -477,6 +485,16 @@ static ssize_t nvram_pstore_read(u64 *id, enum pstore_type_id *type, time-tv_nsec = 0; break; #endif +#ifdef CONFIG_PPC_POWERNV + case PSTORE_TYPE_PPC_OPAL: + sig = NVRAM_SIG_FW; + part = skiboot_partition; + *type = PSTORE_TYPE_PPC_OPAL; + *id = PSTORE_TYPE_PPC_OPAL; + time-tv_sec = 0; + time-tv_nsec = 0; + break; +#endif default: return 0; } @@ -552,8 +570,11 @@ static int nvram_pstore_init(void) { int rc = 0; - nvram_type_ids[2] = PSTORE_TYPE_PPC_RTAS; - nvram_type_ids[3] = PSTORE_TYPE_PPC_OF; + if (machine_is(pseries)) { + nvram_type_ids[2] = PSTORE_TYPE_PPC_RTAS; + nvram_type_ids[3] = PSTORE_TYPE_PPC_OF; + } else + nvram_type_ids[2] = PSTORE_TYPE_PPC_OPAL; nvram_pstore_info.buf = oops_data; nvram_pstore_info.bufsize = oops_data_sz; diff --git a/arch/powerpc/platforms/powernv/opal-nvram.c b/arch/powerpc/platforms/powernv/opal-nvram.c index f9896fd..9db4398 100644 --- a/arch/powerpc/platforms/powernv/opal-nvram.c +++ 
b/arch/powerpc/platforms/powernv/opal-nvram.c @@ -16,6 +16,7 @@ #include linux/of.h #include asm/opal.h +#include asm/nvram.h #include asm/machdep.h static unsigned int nvram_size; @@ -62,6 +63,15 @@ static ssize_t opal_nvram_write(char *buf, size_t count, loff_t *index) return count; } +static int __init opal_nvram_init_log_partitions(void) +{ + /* Scan nvram for partitions */ + nvram_scan_partitions(); + nvram_init_oops_partition(0); + return 0; +} +machine_arch_initcall(powernv, opal_nvram_init_log_partitions); + void __init opal_nvram_init(void) { struct device_node *np; ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v4 2/3] pstore: Add pstore type id for PPC64 opal nvram partition
This patch adds a new PPC64 partition type to be used for opal specific nvram partition. A new partition type is needed as none of the existing type matches this partition type. Signed-off-by: Hari Bathini hbath...@linux.vnet.ibm.com Cc: Anton Vorontsov an...@enomsg.org Cc: Colin Cross ccr...@android.com Cc: Kees Cook keesc...@chromium.org Cc: Tony Luck tony.l...@intel.com --- fs/pstore/inode.c |3 +++ include/linux/pstore.h |1 + 2 files changed, 4 insertions(+) diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 5041660..8e0c009 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -359,6 +359,9 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, case PSTORE_TYPE_PPC_COMMON: sprintf(name, powerpc-common-%s-%lld, psname, id); break; + case PSTORE_TYPE_PPC_OPAL: + sprintf(name, powerpc-opal-%s-%lld, psname, id); + break; case PSTORE_TYPE_UNKNOWN: sprintf(name, unknown-%s-%lld, psname, id); break; diff --git a/include/linux/pstore.h b/include/linux/pstore.h index ece0c6b..af44980 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -39,6 +39,7 @@ enum pstore_type_id { PSTORE_TYPE_PPC_RTAS= 4, PSTORE_TYPE_PPC_OF = 5, PSTORE_TYPE_PPC_COMMON = 6, + PSTORE_TYPE_PPC_OPAL= 7, PSTORE_TYPE_UNKNOWN = 255 }; ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v4 3/3] pstore: add pstore support on powernv
This patch extends pstore, a generic interface to platform dependent persistent storage, support for powernv platform to capture certain useful information, during dying moments. Such support is already in place for pseries platform. This patch re-uses most of that code. It is a common practice to compile kernels with both CONFIG_PPC_PSERIES=y and CONFIG_PPC_POWERNV=y. The code in nvram_init_oops_partition() routine still works as intended, as the caller is platform specific code which passes the appropriate value for rtas_partition_exists parameter. In all other places, where CONFIG_PPC_PSERIES or CONFIG_PPC_POWERNV flag is used in this patchset, it is to reduce the kernel size in cases where this flag is not set and doesn't have any impact logic wise. Signed-off-by: Hari Bathini hbath...@linux.vnet.ibm.com Cc: Anton Vorontsov an...@enomsg.org Cc: Colin Cross ccr...@android.com Cc: Kees Cook keesc...@chromium.org Cc: Tony Luck tony.l...@intel.com --- arch/powerpc/kernel/nvram_64.c | 25 +++-- arch/powerpc/platforms/powernv/opal-nvram.c | 10 ++ 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c index 42e5c6a..293da88 100644 --- a/arch/powerpc/kernel/nvram_64.c +++ b/arch/powerpc/kernel/nvram_64.c @@ -127,6 +127,14 @@ static size_t oops_data_sz; static struct z_stream_s stream; #ifdef CONFIG_PSTORE +#ifdef CONFIG_PPC_POWERNV +static struct nvram_os_partition skiboot_partition = { + .name = ibm,skiboot, + .index = -1, + .os_partition = false +}; +#endif + #ifdef CONFIG_PPC_PSERIES static struct nvram_os_partition of_config_partition = { .name = of-config, @@ -477,6 +485,16 @@ static ssize_t nvram_pstore_read(u64 *id, enum pstore_type_id *type, time-tv_nsec = 0; break; #endif +#ifdef CONFIG_PPC_POWERNV + case PSTORE_TYPE_PPC_OPAL: + sig = NVRAM_SIG_FW; + part = skiboot_partition; + *type = PSTORE_TYPE_PPC_OPAL; + *id = PSTORE_TYPE_PPC_OPAL; + time-tv_sec = 0; + time-tv_nsec = 0; + break; 
+#endif default: return 0; } @@ -552,8 +570,11 @@ static int nvram_pstore_init(void) { int rc = 0; - nvram_type_ids[2] = PSTORE_TYPE_PPC_RTAS; - nvram_type_ids[3] = PSTORE_TYPE_PPC_OF; + if (machine_is(pseries)) { + nvram_type_ids[2] = PSTORE_TYPE_PPC_RTAS; + nvram_type_ids[3] = PSTORE_TYPE_PPC_OF; + } else + nvram_type_ids[2] = PSTORE_TYPE_PPC_OPAL; nvram_pstore_info.buf = oops_data; nvram_pstore_info.bufsize = oops_data_sz; diff --git a/arch/powerpc/platforms/powernv/opal-nvram.c b/arch/powerpc/platforms/powernv/opal-nvram.c index f9896fd..9db4398 100644 --- a/arch/powerpc/platforms/powernv/opal-nvram.c +++ b/arch/powerpc/platforms/powernv/opal-nvram.c @@ -16,6 +16,7 @@ #include linux/of.h #include asm/opal.h +#include asm/nvram.h #include asm/machdep.h static unsigned int nvram_size; @@ -62,6 +63,15 @@ static ssize_t opal_nvram_write(char *buf, size_t count, loff_t *index) return count; } +static int __init opal_nvram_init_log_partitions(void) +{ + /* Scan nvram for partitions */ + nvram_scan_partitions(); + nvram_init_oops_partition(0); + return 0; +} +machine_arch_initcall(powernv, opal_nvram_init_log_partitions); + void __init opal_nvram_init(void) { struct device_node *np; ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v4 1/3] powerpc/nvram: move generic code for nvram and pstore
With minor checks, we can move most of the code for nvram under pseries to a common place to be re-used by other powerpc platforms like powernv. This patch moves such common code to arch/powerpc/kernel/nvram_64.c file. Signed-off-by: Hari Bathini hbath...@linux.vnet.ibm.com --- arch/powerpc/include/asm/nvram.h | 50 ++ arch/powerpc/include/asm/rtas.h|4 arch/powerpc/kernel/nvram_64.c | 656 arch/powerpc/platforms/pseries/nvram.c | 665 4 files changed, 714 insertions(+), 661 deletions(-) diff --git a/arch/powerpc/include/asm/nvram.h b/arch/powerpc/include/asm/nvram.h index b0fe0fe..09a518b 100644 --- a/arch/powerpc/include/asm/nvram.h +++ b/arch/powerpc/include/asm/nvram.h @@ -9,12 +9,43 @@ #ifndef _ASM_POWERPC_NVRAM_H #define _ASM_POWERPC_NVRAM_H - +#include linux/types.h #include linux/errno.h #include linux/list.h #include uapi/asm/nvram.h +/* + * Set oops header version to distinguish between old and new format header. + * lnx,oops-log partition max size is 4000, header version 4000 will + * help in identifying new header. 
+ */ +#define OOPS_HDR_VERSION 5000 + +struct err_log_info { + __be32 error_type; + __be32 seq_num; +}; + +struct nvram_os_partition { + const char *name; + int req_size; /* desired size, in bytes */ + int min_size; /* minimum acceptable size (0 means req_size) */ + long size; /* size of data portion (excluding err_log_info) */ + long index; /* offset of data portion of partition */ + bool os_partition; /* partition initialized by OS, not FW */ +}; + +struct oops_log_info { + __be16 version; + __be16 report_length; + __be64 timestamp; +} __attribute__((packed)); + +extern struct nvram_os_partition oops_log_partition; + #ifdef CONFIG_PPC_PSERIES +extern struct nvram_os_partition rtas_log_partition; + extern int nvram_write_error_log(char * buff, int length, unsigned int err_type, unsigned int err_seq); extern int nvram_read_error_log(char * buff, int length, @@ -50,6 +81,23 @@ extern void pmac_xpram_write(int xpaddr, u8 data); /* Synchronize NVRAM */ extern voidnvram_sync(void); +/* Initialize NVRAM OS partition */ +extern int __init nvram_init_os_partition(struct nvram_os_partition *part); + +/* Initialize NVRAM oops partition */ +extern void __init nvram_init_oops_partition(int rtas_partition_exists); + +/* Read a NVRAM partition */ +extern int nvram_read_partition(struct nvram_os_partition *part, char *buff, + int length, unsigned int *err_type, + unsigned int *error_log_cnt); + +/* Write to NVRAM OS partition */ +extern int nvram_write_os_partition(struct nvram_os_partition *part, + char *buff, int length, + unsigned int err_type, + unsigned int error_log_cnt); + /* Determine NVRAM size */ extern ssize_t nvram_get_size(void); diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h index b390f55..123d7ff 100644 --- a/arch/powerpc/include/asm/rtas.h +++ b/arch/powerpc/include/asm/rtas.h @@ -343,8 +343,12 @@ extern int early_init_dt_scan_rtas(unsigned long node, extern void pSeries_log_error(char *buf, unsigned int err_type, int fatal); 
#ifdef CONFIG_PPC_PSERIES +extern unsigned long last_rtas_event; +extern int clobbering_unread_rtas_event(void); extern int pseries_devicetree_update(s32 scope); extern void post_mobility_fixup(void); +#else +static inline int clobbering_unread_rtas_event(void) { return 0; } #endif #ifdef CONFIG_PPC_RTAS_DAEMON diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c index 34f7c9b..42e5c6a 100644 --- a/arch/powerpc/kernel/nvram_64.c +++ b/arch/powerpc/kernel/nvram_64.c @@ -26,6 +26,9 @@ #include linux/init.h #include linux/slab.h #include linux/spinlock.h +#include linux/kmsg_dump.h +#include linux/pstore.h +#include linux/zlib.h #include asm/uaccess.h #include asm/nvram.h #include asm/rtas.h @@ -54,6 +57,659 @@ struct nvram_partition { static LIST_HEAD(nvram_partitions); +#ifdef CONFIG_PPC_PSERIES +struct nvram_os_partition rtas_log_partition = { + .name = ibm,rtas-log, + .req_size = 2079, + .min_size = 1055, + .index = -1, + .os_partition = true +}; +#endif + +struct nvram_os_partition oops_log_partition = { + .name = lnx,oops-log, + .req_size = 4000, + .min_size = 2000, + .index = -1, + .os_partition = true +}; + +static const char *nvram_os_partitions[] = { +#ifdef CONFIG_PPC_PSERIES + ibm,rtas-log, +#endif + lnx,oops-log, + NULL +}; + +static void oops_to_nvram(struct kmsg_dumper *dumper, + enum kmsg_dump_reason reason); + +static struct kmsg_dumper nvram_kmsg_dumper
[PATCH v4 0/3] powerpc/pstore: Add pstore support for nvram partitions
This patch series adds pstore support on powernv platform to read different nvram partitions and write compressed data to oops-log nvram partition. As pseries platform already has pstore support, this series moves most of the common code for pseries and powernv platforms to a common file. Tested the patches successfully on both pseries and powernv platforms. Also, tested the patches successfully, on a kernel compiled with both CONFIG_PPC_PSERIES=y CONFIG_PPC_POWERNV=y. Changes from v3: 1. Updated the changelog 2. Resolved compile issues with !CONFIG_PPC_PSERIES --- Hari Bathini (3): powerpc/nvram: move generic code for nvram and pstore pstore: Add pstore type id for PPC64 opal nvram partition pstore: add pstore support on powernv arch/powerpc/include/asm/nvram.h| 50 ++ arch/powerpc/include/asm/rtas.h |4 arch/powerpc/kernel/nvram_64.c | 677 +++ arch/powerpc/platforms/powernv/opal-nvram.c | 10 arch/powerpc/platforms/pseries/nvram.c | 665 --- fs/pstore/inode.c |3 include/linux/pstore.h |1 7 files changed, 749 insertions(+), 661 deletions(-) -- - Hari ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v3 1/3] powerpc/nvram: move generic code for nvram and pstore
On 01/14/2015 10:01 AM, Michael Ellerman wrote: On Wed, 2014-12-24 at 17:28 +0530, Hari Bathini wrote: With minor checks, we can move most of the code for nvram under pseries to a common place to be re-used by other powerpc platforms like powernv. This patch moves such common code to arch/powerpc/kernel/nvram_64.c file. As I said in my reply to the previous version: ... you need to keep in mind that it is very common for us to build kernels with both POWERNV=y and PSERIES=y. So you need to make sure you're only using CONFIG_PPC_PSERIES to protect things that are optional on pseries. Not things that we *shouldn't* be doing on powernv. we could as well do away with the PPC_PSERIES flag in a couple of places in arch/powerpc/kernel/nvram_64.c, but doing that will unnecessarily add few extra variables for !PPC_PSERIES case. Please explain in your commit message how you have dealt with that. Sure. Will update the changelog Also, you broke the build for every config that doesn't have CONFIG_PPC_PSERIES, all 95 of them. 
This is pasemi_defconfig for example: My bad! clobbering_unread_rtas_event should have been static inline when defined under !PPC_PSERIES Thanks Hari LD arch/powerpc/mm/built-in.o arch/powerpc/mm/init_64.o: In function `clobbering_unread_rtas_event': init_64.c:(.opd+0x48): multiple definition of `clobbering_unread_rtas_event' arch/powerpc/mm/mem.o:mem.c:(.opd+0x90): first defined here arch/powerpc/mm/init_64.o: In function `.clobbering_unread_rtas_event': init_64.c:(.text+0x80): multiple definition of `.clobbering_unread_rtas_event' arch/powerpc/mm/mem.o:mem.c:(.text+0x2c0): first defined here CC arch/powerpc/kernel/udbg.o /home/kisskb/slave/src/scripts/Makefile.build:336: recipe for target 'arch/powerpc/mm/built-in.o' failed make[2]: *** [arch/powerpc/mm/built-in.o] Error 1 /home/kisskb/slave/src/Makefile:938: recipe for target 'arch/powerpc/mm' failed make[1]: *** [arch/powerpc/mm] Error 2 make[1]: *** Waiting for unfinished jobs cheers ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v3 1/3] powerpc/nvram: move generic code for nvram and pstore
On 01/15/2015 03:58 AM, Michael Ellerman wrote: On Wed, 2015-01-14 at 23:35 +0530, Hari Bathini wrote: On 01/14/2015 10:01 AM, Michael Ellerman wrote: On Wed, 2014-12-24 at 17:28 +0530, Hari Bathini wrote: With minor checks, we can move most of the code for nvram under pseries to a common place to be re-used by other powerpc platforms like powernv. This patch moves such common code to arch/powerpc/kernel/nvram_64.c file. As I said in my reply to the previous version: ... you need to keep in mind that it is very common for us to build kernels with both POWERNV=y and PSERIES=y. So you need to make sure you're only using CONFIG_PPC_PSERIES to protect things that are optional on pseries. Not things that we *shouldn't* be doing on powernv. we could as well do away with the PPC_PSERIES flag in a couple of places in arch/powerpc/kernel/nvram_64.c, but doing that will unnecessarily add few extra variables for !PPC_PSERIES case. Yep. I'm happy for them to be there, I just want you to explain in the changelog that you've thought about the PSERIES=y POWERNV=y case and why the code makes sense for that configuration. Please explain in your commit message how you have dealt with that. Sure. Will update the changelog Thanks. Also, you broke the build for every config that doesn't have CONFIG_PPC_PSERIES, all 95 of them. This is pasemi_defconfig for example: My bad! clobbering_unread_rtas_event should have been static inline while defining under !PPC_PSERIES Correct. Please make sure you test build at least some of the other configurations in future. I realise it's too time consuming to build all of them, but ideally for every config symbol you use in your patch you need to build a kernel config where that symbol =y and =n (and =m if it's tristate). Sure, Michael. 
I will keep this in mind :) Thanks Hari cheers ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v4 1/3] powerpc/nvram: move generic code for nvram and pstore
On 01/30/2015 10:12 PM, Arnd Bergmann wrote: On Friday 30 January 2015 20:44:00 Hari Bathini wrote: With minor checks, we can move most of the code for nvram under pseries to a common place to be re-used by other powerpc platforms like powernv. This patch moves such common code to arch/powerpc/kernel/nvram_64.c file. Signed-off-by: Hari Bathini hbath...@linux.vnet.ibm.com Can you make this y2038-safe in the process, possibly as a follow-up patch? Arnd, sorry for the delayed response. I will add these changes to this patch-set and re-spin.. Thanks Hari +extern unsigned long last_rtas_event; time64_t + } + oops_hdr-version = cpu_to_be16(OOPS_HDR_VERSION); + oops_hdr-report_length = cpu_to_be16(zipped_len); + oops_hdr-timestamp = cpu_to_be64(get_seconds()); + return 0; ktime_get_real_seconds() +static ssize_t nvram_pstore_read(u64 *id, enum pstore_type_id *type, + int *count, struct timespec *time, char **buf, + bool *compressed, struct pstore_info *psi) This has to remain timespec for now but can later be changed to timespec64 when the API gets changed. + oops_hdr-version = cpu_to_be16(OOPS_HDR_VERSION); + oops_hdr-report_length = cpu_to_be16(text_len); + oops_hdr-timestamp = cpu_to_be64(get_seconds()); ktime_get_real_seconds() Arnd ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v5 2/4] pstore: Add pstore type id for PPC64 opal nvram partition
This patch adds a new PPC64 partition type to be used for opal specific nvram partition. A new partition type is needed as none of the existing type matches this partition type. Signed-off-by: Hari Bathini hbath...@linux.vnet.ibm.com --- fs/pstore/inode.c |3 +++ include/linux/pstore.h |1 + 2 files changed, 4 insertions(+) diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 5041660..8e0c009 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -359,6 +359,9 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, case PSTORE_TYPE_PPC_COMMON: sprintf(name, powerpc-common-%s-%lld, psname, id); break; + case PSTORE_TYPE_PPC_OPAL: + sprintf(name, powerpc-opal-%s-%lld, psname, id); + break; case PSTORE_TYPE_UNKNOWN: sprintf(name, unknown-%s-%lld, psname, id); break; diff --git a/include/linux/pstore.h b/include/linux/pstore.h index ece0c6b..af44980 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -39,6 +39,7 @@ enum pstore_type_id { PSTORE_TYPE_PPC_RTAS= 4, PSTORE_TYPE_PPC_OF = 5, PSTORE_TYPE_PPC_COMMON = 6, + PSTORE_TYPE_PPC_OPAL= 7, PSTORE_TYPE_UNKNOWN = 255 }; ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v5 4/4] powerpc: make timestamp related code y2038-safe
While we are here, let us make timestamp related code y2038-safe. Suggested-by: Arnd Bergmann a...@arndb.de Signed-off-by: Hari Bathini hbath...@linux.vnet.ibm.com --- arch/powerpc/include/asm/rtas.h|3 ++- arch/powerpc/kernel/nvram_64.c |6 +++--- arch/powerpc/platforms/pseries/nvram.c | 10 +- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h index 123d7ff..efa9152 100644 --- a/arch/powerpc/include/asm/rtas.h +++ b/arch/powerpc/include/asm/rtas.h @@ -4,6 +4,7 @@ #include linux/spinlock.h #include asm/page.h +#include linux/time.h /* * Definitions for talking to the RTAS on CHRP machines. @@ -343,7 +344,7 @@ extern int early_init_dt_scan_rtas(unsigned long node, extern void pSeries_log_error(char *buf, unsigned int err_type, int fatal); #ifdef CONFIG_PPC_PSERIES -extern unsigned long last_rtas_event; +extern time64_t last_rtas_event; extern int clobbering_unread_rtas_event(void); extern int pseries_devicetree_update(s32 scope); extern void post_mobility_fixup(void); diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c index 293da88..1e703f8 100644 --- a/arch/powerpc/kernel/nvram_64.c +++ b/arch/powerpc/kernel/nvram_64.c @@ -376,7 +376,7 @@ static int zip_oops(size_t text_len) } oops_hdr-version = cpu_to_be16(OOPS_HDR_VERSION); oops_hdr-report_length = cpu_to_be16(zipped_len); - oops_hdr-timestamp = cpu_to_be64(get_seconds()); + oops_hdr-timestamp = cpu_to_be64(ktime_get_real_seconds()); return 0; } @@ -423,7 +423,7 @@ static int nvram_pstore_write(enum pstore_type_id type, oops_hdr-version = cpu_to_be16(OOPS_HDR_VERSION); oops_hdr-report_length = cpu_to_be16(size); - oops_hdr-timestamp = cpu_to_be64(get_seconds()); + oops_hdr-timestamp = cpu_to_be64(ktime_get_real_seconds()); if (compressed) err_type = ERR_TYPE_KERNEL_PANIC_GZ; @@ -721,7 +721,7 @@ static void oops_to_nvram(struct kmsg_dumper *dumper, err_type = ERR_TYPE_KERNEL_PANIC; oops_hdr-version = 
cpu_to_be16(OOPS_HDR_VERSION); oops_hdr-report_length = cpu_to_be16(text_len); - oops_hdr-timestamp = cpu_to_be64(get_seconds()); + oops_hdr-timestamp = cpu_to_be64(ktime_get_real_seconds()); } (void) nvram_write_os_partition(oops_log_partition, oops_buf, diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c index 97b8fc6..d77713b 100644 --- a/arch/powerpc/platforms/pseries/nvram.c +++ b/arch/powerpc/platforms/pseries/nvram.c @@ -37,10 +37,10 @@ static DEFINE_SPINLOCK(nvram_lock); /* See clobbering_unread_rtas_event() */ #define NVRAM_RTAS_READ_TIMEOUT 5 /* seconds */ -static unsigned long last_unread_rtas_event; /* timestamp */ +static time64_t last_unread_rtas_event;/* timestamp */ #ifdef CONFIG_PSTORE -unsigned long last_rtas_event; +time64_t last_rtas_event; #endif static ssize_t pSeries_nvram_read(char *buf, size_t count, loff_t *index) @@ -145,9 +145,9 @@ int nvram_write_error_log(char * buff, int length, int rc = nvram_write_os_partition(rtas_log_partition, buff, length, err_type, error_log_cnt); if (!rc) { - last_unread_rtas_event = get_seconds(); + last_unread_rtas_event = ktime_get_real_seconds(); #ifdef CONFIG_PSTORE - last_rtas_event = get_seconds(); + last_rtas_event = ktime_get_real_seconds(); #endif } @@ -201,7 +201,7 @@ int clobbering_unread_rtas_event(void) { return (oops_log_partition.index == rtas_log_partition.index last_unread_rtas_event -get_seconds() - last_unread_rtas_event = +ktime_get_real_seconds() - last_unread_rtas_event = NVRAM_RTAS_READ_TIMEOUT); } ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v5 0/4] powerpc/pstore: Add pstore support for nvram partitions
This patch series adds pstore support on powernv platform to read different nvram partitions and write compressed data to oops-log nvram partition. As pseries platform already has pstore support, this series moves most of the common code for pseries and powernv platforms to a common file. Tested the patches successfully on both pseries and powernv platforms. Also, tested the patches successfully, on a kernel compiled with both CONFIG_PPC_PSERIES=y CONFIG_PPC_POWERNV=y. Changes from v4: 1. Added a patch for y2038-safe code changes --- Hari Bathini (4): powerpc/nvram: move generic code for nvram and pstore pstore: Add pstore type id for PPC64 opal nvram partition pstore: add pstore support on powernv powerpc: make timestamp related code y2038-safe arch/powerpc/include/asm/nvram.h| 50 ++ arch/powerpc/include/asm/rtas.h |5 arch/powerpc/kernel/nvram_64.c | 677 +++ arch/powerpc/platforms/powernv/opal-nvram.c | 10 arch/powerpc/platforms/pseries/nvram.c | 673 --- fs/pstore/inode.c |3 include/linux/pstore.h |1 7 files changed, 754 insertions(+), 665 deletions(-) -- -Hari ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v5 1/4] powerpc/nvram: move generic code for nvram and pstore
With minor checks, we can move most of the code for nvram under pseries to a common place to be re-used by other powerpc platforms like powernv. This patch moves such common code to arch/powerpc/kernel/nvram_64.c file. Signed-off-by: Hari Bathini hbath...@linux.vnet.ibm.com --- arch/powerpc/include/asm/nvram.h | 50 ++ arch/powerpc/include/asm/rtas.h|4 arch/powerpc/kernel/nvram_64.c | 656 arch/powerpc/platforms/pseries/nvram.c | 665 4 files changed, 714 insertions(+), 661 deletions(-) diff --git a/arch/powerpc/include/asm/nvram.h b/arch/powerpc/include/asm/nvram.h index b0fe0fe..09a518b 100644 --- a/arch/powerpc/include/asm/nvram.h +++ b/arch/powerpc/include/asm/nvram.h @@ -9,12 +9,43 @@ #ifndef _ASM_POWERPC_NVRAM_H #define _ASM_POWERPC_NVRAM_H - +#include linux/types.h #include linux/errno.h #include linux/list.h #include uapi/asm/nvram.h +/* + * Set oops header version to distinguish between old and new format header. + * lnx,oops-log partition max size is 4000, header version 4000 will + * help in identifying new header. 
+ */ +#define OOPS_HDR_VERSION 5000 + +struct err_log_info { + __be32 error_type; + __be32 seq_num; +}; + +struct nvram_os_partition { + const char *name; + int req_size; /* desired size, in bytes */ + int min_size; /* minimum acceptable size (0 means req_size) */ + long size; /* size of data portion (excluding err_log_info) */ + long index; /* offset of data portion of partition */ + bool os_partition; /* partition initialized by OS, not FW */ +}; + +struct oops_log_info { + __be16 version; + __be16 report_length; + __be64 timestamp; +} __attribute__((packed)); + +extern struct nvram_os_partition oops_log_partition; + #ifdef CONFIG_PPC_PSERIES +extern struct nvram_os_partition rtas_log_partition; + extern int nvram_write_error_log(char * buff, int length, unsigned int err_type, unsigned int err_seq); extern int nvram_read_error_log(char * buff, int length, @@ -50,6 +81,23 @@ extern void pmac_xpram_write(int xpaddr, u8 data); /* Synchronize NVRAM */ extern voidnvram_sync(void); +/* Initialize NVRAM OS partition */ +extern int __init nvram_init_os_partition(struct nvram_os_partition *part); + +/* Initialize NVRAM oops partition */ +extern void __init nvram_init_oops_partition(int rtas_partition_exists); + +/* Read a NVRAM partition */ +extern int nvram_read_partition(struct nvram_os_partition *part, char *buff, + int length, unsigned int *err_type, + unsigned int *error_log_cnt); + +/* Write to NVRAM OS partition */ +extern int nvram_write_os_partition(struct nvram_os_partition *part, + char *buff, int length, + unsigned int err_type, + unsigned int error_log_cnt); + /* Determine NVRAM size */ extern ssize_t nvram_get_size(void); diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h index b390f55..123d7ff 100644 --- a/arch/powerpc/include/asm/rtas.h +++ b/arch/powerpc/include/asm/rtas.h @@ -343,8 +343,12 @@ extern int early_init_dt_scan_rtas(unsigned long node, extern void pSeries_log_error(char *buf, unsigned int err_type, int fatal); 
#ifdef CONFIG_PPC_PSERIES +extern unsigned long last_rtas_event; +extern int clobbering_unread_rtas_event(void); extern int pseries_devicetree_update(s32 scope); extern void post_mobility_fixup(void); +#else +static inline int clobbering_unread_rtas_event(void) { return 0; } #endif #ifdef CONFIG_PPC_RTAS_DAEMON diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c index 34f7c9b..42e5c6a 100644 --- a/arch/powerpc/kernel/nvram_64.c +++ b/arch/powerpc/kernel/nvram_64.c @@ -26,6 +26,9 @@ #include linux/init.h #include linux/slab.h #include linux/spinlock.h +#include linux/kmsg_dump.h +#include linux/pstore.h +#include linux/zlib.h #include asm/uaccess.h #include asm/nvram.h #include asm/rtas.h @@ -54,6 +57,659 @@ struct nvram_partition { static LIST_HEAD(nvram_partitions); +#ifdef CONFIG_PPC_PSERIES +struct nvram_os_partition rtas_log_partition = { + .name = ibm,rtas-log, + .req_size = 2079, + .min_size = 1055, + .index = -1, + .os_partition = true +}; +#endif + +struct nvram_os_partition oops_log_partition = { + .name = lnx,oops-log, + .req_size = 4000, + .min_size = 2000, + .index = -1, + .os_partition = true +}; + +static const char *nvram_os_partitions[] = { +#ifdef CONFIG_PPC_PSERIES + ibm,rtas-log, +#endif + lnx,oops-log, + NULL +}; + +static void oops_to_nvram(struct kmsg_dumper *dumper, + enum kmsg_dump_reason reason); + +static struct kmsg_dumper nvram_kmsg_dumper
[PATCH v5 3/4] pstore: add pstore support on powernv
This patch extends pstore, a generic interface to platform dependent persistent storage, support for powernv platform to capture certain useful information, during dying moments. Such support is already in place for pseries platform. This patch re-uses most of that code. It is a common practice to compile kernels with both CONFIG_PPC_PSERIES=y and CONFIG_PPC_POWERNV=y. The code in nvram_init_oops_partition() routine still works as intended, as the caller is platform specific code which passes the appropriate value for rtas_partition_exists parameter. In all other places, where CONFIG_PPC_PSERIES or CONFIG_PPC_POWERNV flag is used in this patchset, it is to reduce the kernel size in cases where this flag is not set and doesn't have any impact logic wise. Signed-off-by: Hari Bathini hbath...@linux.vnet.ibm.com --- arch/powerpc/kernel/nvram_64.c | 25 +++-- arch/powerpc/platforms/powernv/opal-nvram.c | 10 ++ 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c index 42e5c6a..293da88 100644 --- a/arch/powerpc/kernel/nvram_64.c +++ b/arch/powerpc/kernel/nvram_64.c @@ -127,6 +127,14 @@ static size_t oops_data_sz; static struct z_stream_s stream; #ifdef CONFIG_PSTORE +#ifdef CONFIG_PPC_POWERNV +static struct nvram_os_partition skiboot_partition = { + .name = ibm,skiboot, + .index = -1, + .os_partition = false +}; +#endif + #ifdef CONFIG_PPC_PSERIES static struct nvram_os_partition of_config_partition = { .name = of-config, @@ -477,6 +485,16 @@ static ssize_t nvram_pstore_read(u64 *id, enum pstore_type_id *type, time-tv_nsec = 0; break; #endif +#ifdef CONFIG_PPC_POWERNV + case PSTORE_TYPE_PPC_OPAL: + sig = NVRAM_SIG_FW; + part = skiboot_partition; + *type = PSTORE_TYPE_PPC_OPAL; + *id = PSTORE_TYPE_PPC_OPAL; + time-tv_sec = 0; + time-tv_nsec = 0; + break; +#endif default: return 0; } @@ -552,8 +570,11 @@ static int nvram_pstore_init(void) { int rc = 0; - nvram_type_ids[2] = PSTORE_TYPE_PPC_RTAS; - 
nvram_type_ids[3] = PSTORE_TYPE_PPC_OF; + if (machine_is(pseries)) { + nvram_type_ids[2] = PSTORE_TYPE_PPC_RTAS; + nvram_type_ids[3] = PSTORE_TYPE_PPC_OF; + } else + nvram_type_ids[2] = PSTORE_TYPE_PPC_OPAL; nvram_pstore_info.buf = oops_data; nvram_pstore_info.bufsize = oops_data_sz; diff --git a/arch/powerpc/platforms/powernv/opal-nvram.c b/arch/powerpc/platforms/powernv/opal-nvram.c index f9896fd..9db4398 100644 --- a/arch/powerpc/platforms/powernv/opal-nvram.c +++ b/arch/powerpc/platforms/powernv/opal-nvram.c @@ -16,6 +16,7 @@ #include linux/of.h #include asm/opal.h +#include asm/nvram.h #include asm/machdep.h static unsigned int nvram_size; @@ -62,6 +63,15 @@ static ssize_t opal_nvram_write(char *buf, size_t count, loff_t *index) return count; } +static int __init opal_nvram_init_log_partitions(void) +{ + /* Scan nvram for partitions */ + nvram_scan_partitions(); + nvram_init_oops_partition(0); + return 0; +} +machine_arch_initcall(powernv, opal_nvram_init_log_partitions); + void __init opal_nvram_init(void) { struct device_node *np; ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v5 2/4] pstore: Add pstore type id for PPC64 opal nvram partition
On 02/06/2015 01:06 AM, Hari Bathini wrote: This patch adds a new PPC64 partition type to be used for opal specific nvram partition. A new partition type is needed as none of the existing type matches this partition type. Signed-off-by: Hari Bathini hbath...@linux.vnet.ibm.com This patch series is reviewed by Kees. Reference link: https://lkml.org/lkml/2015/2/5/651 Reviewed-by: Kees Cook keesc...@chromium.org Thanks Hari --- fs/pstore/inode.c |3 +++ include/linux/pstore.h |1 + 2 files changed, 4 insertions(+) diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 5041660..8e0c009 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -359,6 +359,9 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, case PSTORE_TYPE_PPC_COMMON: sprintf(name, powerpc-common-%s-%lld, psname, id); break; + case PSTORE_TYPE_PPC_OPAL: + sprintf(name, powerpc-opal-%s-%lld, psname, id); + break; case PSTORE_TYPE_UNKNOWN: sprintf(name, unknown-%s-%lld, psname, id); break; diff --git a/include/linux/pstore.h b/include/linux/pstore.h index ece0c6b..af44980 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -39,6 +39,7 @@ enum pstore_type_id { PSTORE_TYPE_PPC_RTAS= 4, PSTORE_TYPE_PPC_OF = 5, PSTORE_TYPE_PPC_COMMON = 6, + PSTORE_TYPE_PPC_OPAL= 7, PSTORE_TYPE_UNKNOWN = 255 }; ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH] nvram: print no error message when nvram is not set as pstore backend
Pstore only supports one backend at a time. The preferred pstore backend is set by passing the pstore.backend=name argument to the kernel at boot time. Currently, while trying to register with pstore, nvram throws an error message even when pstore.backend != nvram, which is unnecessary. This patch removes the error message in case pstore.backend != nvram. Signed-off-by: Hari Bathini hbath...@linux.vnet.ibm.com --- arch/powerpc/kernel/nvram_64.c |7 --- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c index 1e703f8..bfdbcab 100644 --- a/arch/powerpc/kernel/nvram_64.c +++ b/arch/powerpc/kernel/nvram_64.c @@ -582,9 +582,10 @@ static int nvram_pstore_init(void) spin_lock_init(nvram_pstore_info.buf_lock); rc = pstore_register(nvram_pstore_info); - if (rc != 0) - pr_err(nvram: pstore_register() failed, defaults to - kmsg_dump; returned %d\n, rc); + if (rc (rc != -EPERM)) + /* Print error only when pstore.backend == nvram */ + pr_err(nvram: pstore_register() failed, returned %d. + Defaults to kmsg_dump\n, rc); return rc; } ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH] powerpc: on crash, kexec'ed kernel needs all CPUs are online
On 11/05/2015 07:02 AM, David Gibson wrote: On Wed, 4 Nov 2015 14:54:51 +0100 Laurent Vivier <lviv...@redhat.com> wrote: On 04/11/2015 13:34, Hari Bathini wrote: On 10/16/2015 12:30 AM, Laurent Vivier wrote: On kexec, all secondary offline CPUs are onlined before starting the new kernel, this is not done in the case of kdump. If kdump is configured and a kernel crash occurs whereas some secondaries CPUs are offline (SMT=off), the new kernel is not able to start them and displays some "Processor X is stuck.". Starting with POWER8, subcore logic relies on all threads of core being booted. So, on startup kernel tries to start all threads, and asks OPAL (or RTAS) to start all CPUs (including threads). If a CPU has been offlined by the previous kernel, it has not been returned to OPAL, and thus OPAL cannot restart it: this CPU has been lost... Signed-off-by: Laurent Vivier<lviv...@redhat.com> Hi Laurent, Hi Hari, Sorry for jumping too late into this. better late than never :) Are you seeing this issue even with the below patches: pseries: http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=c1caae3de46a072d0855729aed6e793e536a4a55 Unfortunately, this is unlikely to be relevant - this fixes a failure while setting up the kexec. The problem we see occurs once we've booted the second kernel and it's attempting to bring up secondary CPUs. opal/powernv: https://github.com/open-power/skiboot/commit/9ee56b5 Very interesting. Is there a way to have a firmware with the fix ? From Laurent's analysis of the crash, I don't think this will be relevant either, but I'm not sure. It would be very interesting to know which (if any) released firmwares include this patch so we can test it. Hi Laurent/David, I am not so sure on this. While I get back on this, can you confirm you are seeing the issue in both PowerVM (pseries) and baremetal (powernv). What is the kernel version where the issue is seen for PowerVM and/or baremetal. 
Also, for baremetal, can you mention the OPAL version on which the issue is reproducible. If a bug is raised for this, I would be happy to be pointed to, to get more information on this. Thanks Hari ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH] powerpc: on crash, kexec'ed kernel needs all CPUs are online
On 10/16/2015 12:30 AM, Laurent Vivier wrote: On kexec, all secondary offline CPUs are onlined before starting the new kernel, this is not done in the case of kdump. If kdump is configured and a kernel crash occurs whereas some secondary CPUs are offline (SMT=off), the new kernel is not able to start them and displays some "Processor X is stuck.". Starting with POWER8, subcore logic relies on all threads of core being booted. So, on startup kernel tries to start all threads, and asks OPAL (or RTAS) to start all CPUs (including threads). If a CPU has been offlined by the previous kernel, it has not been returned to OPAL, and thus OPAL cannot restart it: this CPU has been lost... Signed-off-by: Laurent VivierHi Laurent, Sorry for jumping too late into this. Are you seeing this issue even with the below patches: pseries: http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=c1caae3de46a072d0855729aed6e793e536a4a55 opal/powernv: https://github.com/open-power/skiboot/commit/9ee56b5 Thanks Hari --- arch/powerpc/kernel/crash.c | 20 1 file changed, 20 insertions(+) diff --git a/arch/powerpc/kernel/crash.c b/arch/powerpc/kernel/crash.c index 51dbace..3ca9452 100644 --- a/arch/powerpc/kernel/crash.c +++ b/arch/powerpc/kernel/crash.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -299,11 +300,30 @@ int crash_shutdown_unregister(crash_shutdown_t handler) } EXPORT_SYMBOL(crash_shutdown_unregister); +/* + * The next kernel will try to start all secondary CPUs and if + * they are not online it will fail to start them. 
+ * + */ +static void wake_offline_cpus(void) +{ + int cpu = 0; + + for_each_present_cpu(cpu) { + if (!cpu_online(cpu)) { + pr_info("kexec: Waking offline cpu %d.\n", cpu); + cpu_up(cpu); + } + } +} + void default_machine_crash_shutdown(struct pt_regs *regs) { unsigned int i; int (*old_handler)(struct pt_regs *regs); + wake_offline_cpus(); + /* * This function is only called after the system * has panicked or is otherwise in a critical state. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v2 2/2] powerpc/fadump: parse fadump reserve memory size based on memory range
Currently, memory for fadump can be specified with fadump_reserve_mem=size, where only a fixed size can be specified. Add the below syntax as well, to support conditional reservation based on system memory size: fadump_reserve_mem=:[,:,...] This syntax helps using the same commandline parameter for different system memory sizes. Signed-off-by: Hari Bathini <hbath...@linux.vnet.ibm.com> Reviewed-by: Mahesh J Salgaonkar <mah...@linux.vnet.ibm.com> --- Changes in v2: 1. Changed subject from "[PATCH v2 2/2] powerpc/fadump: add support to parse size based on memory range". 2. Rebased to latest upstream. arch/powerpc/kernel/fadump.c | 64 -- 1 file changed, 55 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 3cb3b02a..e435828 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -193,6 +193,56 @@ static unsigned long init_fadump_mem_struct(struct fadump_mem_struct *fdm, return addr; } +/* + * This function parses command line for fadump_reserve_mem= + * + * Supports the below two syntaxes: + *1. fadump_reserve_mem=size + *2. fadump_reserve_mem=ramsize-range:size[,...] + * + * Sets fw_dump.reserve_bootvar with the memory size + * provided, 0 otherwise + * + * The function returns -EINVAL on failure, 0 otherwise. 
+ */ +static int __init parse_fadump_reserve_mem(void) +{ + char *name = "fadump_reserve_mem="; + char *fadump_cmdline = NULL, *cur; + + fw_dump.reserve_bootvar = 0; + + /* find fadump_reserve_mem and use the last one if there are many */ + cur = strstr(boot_command_line, name); + while (cur) { + fadump_cmdline = cur; + cur = strstr(cur+1, name); + } + + /* when no fadump_reserve_mem= cmdline option is provided */ + if (!fadump_cmdline) + return 0; + + fadump_cmdline += strlen(name); + + /* for fadump_reserve_mem=size cmdline syntax */ + if (!is_param_range_based(fadump_cmdline)) { + fw_dump.reserve_bootvar = memparse(fadump_cmdline, NULL); + return 0; + } + + /* for fadump_reserve_mem=ramsize-range:size[,...] cmdline syntax */ + cur = fadump_cmdline; + fw_dump.reserve_bootvar = parse_mem_range_size("fadump_reserve_mem", + , memblock_phys_mem_size()); + if (cur == fadump_cmdline) { + printk(KERN_INFO "fadump_reserve_mem: Invaild syntax!\n"); + return -EINVAL; + } + + return 0; +} + /** * fadump_calculate_reserve_size(): reserve variable boot area 5% of System RAM * @@ -212,12 +262,17 @@ static inline unsigned long fadump_calculate_reserve_size(void) { unsigned long size; + /* sets fw_dump.reserve_bootvar */ + parse_fadump_reserve_mem(); + /* * Check if the size is specified through fadump_reserve_mem= cmdline * option. If yes, then use that. 
*/ if (fw_dump.reserve_bootvar) return fw_dump.reserve_bootvar; + else + printk(KERN_INFO "fadump: calculating default boot size\n"); /* divide by 20 to get 5% of value */ size = memblock_end_of_DRAM() / 20; @@ -348,15 +403,6 @@ static int __init early_fadump_param(char *p) } early_param("fadump", early_fadump_param); -/* Look for fadump_reserve_mem= cmdline option */ -static int __init early_fadump_reserve_mem(char *p) -{ - if (p) - fw_dump.reserve_bootvar = memparse(p, ); - return 0; -} -early_param("fadump_reserve_mem", early_fadump_reserve_mem); - static void register_fw_dump(struct fadump_mem_struct *fdm) { int rc; ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v2 0/2] powerpc/fadump: support memory range syntax for fadump memory reservation
This patchset adds support to input system memory range based memory size for fadump reservation. The crashkernel parameter already supports such syntax. The first patch refactors the parsing code of crashkernel parameter for reuse. The second patch uses the newly refactored parsing code to reserve memory for fadump based on system memory size. --- Hari Bathini (2): refactor code parsing size based on memory range powerpc/fadump: parse fadump reserve memory size based on memory range arch/powerpc/kernel/fadump.c | 64 include/linux/kernel.h |5 ++ kernel/kexec_core.c | 63 ++-- kernel/params.c | 96 ++ 4 files changed, 161 insertions(+), 67 deletions(-) -- Hari Bathini ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v2 1/2] refactor code parsing size based on memory range
Currently, crashkernel parameter supports the below syntax to parse size based on memory range: crashkernel=:[,:,...] While such parsing is implemented for crashkernel parameter, it applies to other parameters with similar syntax. So, move this code to a more generic place for code reuse. Cc: Eric Biederman <ebied...@xmission.com> Cc: Vivek Goyal <vgo...@redhat.com> Cc: Rusty Russell <ru...@rustcorp.com.au> Cc: ke...@lists.infradead.org Signed-off-by: Hari Bathini <hbath...@linux.vnet.ibm.com> --- changes in v2: 1. Rebased to latest upstream. 2. Marked few more people on cc. include/linux/kernel.h |5 +++ kernel/kexec_core.c| 63 +++- kernel/params.c| 96 3 files changed, 106 insertions(+), 58 deletions(-) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 94aa10f..72f55e5 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -436,6 +436,11 @@ extern char *get_options(const char *str, int nints, int *ints); extern unsigned long long memparse(const char *ptr, char **retptr); extern bool parse_option_str(const char *str, const char *option); +extern bool __init is_param_range_based(const char *cmdline); +extern unsigned long long __init parse_mem_range_size(const char *param, + char **str, + unsigned long long system_ram); + extern int core_kernel_text(unsigned long addr); extern int core_kernel_data(unsigned long addr); extern int __kernel_text_address(unsigned long addr); diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 56b3ed0..d43f5cc 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1083,59 +1083,9 @@ static int __init parse_crashkernel_mem(char *cmdline, char *cur = cmdline, *tmp; /* for each entry of the comma-separated list */ - do { - unsigned long long start, end = ULLONG_MAX, size; - - /* get the start of the range */ - start = memparse(cur, ); - if (cur == tmp) { - pr_warn("crashkernel: Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (*cur != '-') { - pr_warn("crashkernel: '-' 
expected\n"); - return -EINVAL; - } - cur++; - - /* if no ':' is here, than we read the end */ - if (*cur != ':') { - end = memparse(cur, ); - if (cur == tmp) { - pr_warn("crashkernel: Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (end <= start) { - pr_warn("crashkernel: end <= start\n"); - return -EINVAL; - } - } - - if (*cur != ':') { - pr_warn("crashkernel: ':' expected\n"); - return -EINVAL; - } - cur++; - - size = memparse(cur, ); - if (cur == tmp) { - pr_warn("Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (size >= system_ram) { - pr_warn("crashkernel: invalid size\n"); - return -EINVAL; - } - - /* match ? */ - if (system_ram >= start && system_ram < end) { - *crash_size = size; - break; - } - } while (*cur++ == ','); + *crash_size = parse_mem_range_size("crashkernel", , system_ram); + if (cur == cmdline) + return -EINVAL; if (*crash_size > 0) { while (*cur && *cur != ' ' && *cur != '@') @@ -1272,7 +1222,6 @@ static int __init __parse_crashkernel(char *cmdline, const char *name, const char *suffix) { - char*first_colon, *first_space; char*ck_cmdline; BUG_ON(!crash_size || !crash_base); @@ -1290,12 +1239,10 @@ static int __init __parse_crashkernel(char *cmdline, return parse_crashkernel_suffix(ck_cmdline, crash_size, suffix); /* -* if the commandline contains a ':', then that's the extended +* if the parameter is range based, then that's the extended * syntax -- if not, it must be the classic syntax */ - first_col
Re: [v2,1/2] refactor code parsing size based on memory range
On 06/24/2016 10:56 AM, Michael Ellerman wrote: On Wed, 2016-22-06 at 19:25:26 UTC, Hari Bathini wrote: Currently, crashkernel parameter supports the below syntax to parse size based on memory range: crashkernel=:[,:,...] While such parsing is implemented for crashkernel parameter, it applies to other parameters with similar syntax. So, move this code to a more generic place for code reuse. Cc: Eric Biederman <ebied...@xmission.com> Cc: Vivek Goyal <vgo...@redhat.com> Cc: Rusty Russell <ru...@rustcorp.com.au> Cc: ke...@lists.infradead.org Signed-off-by: Hari Bathini <hbath...@linux.vnet.ibm.com> Hari, it's not immediately clear that this makes no change to the logic in the kexec code. Can you reply with a longer change log explaining why the old & new logic is the same for kexec. Hi Michael, Please consider this changelog for this patch: -- crashkernel parameter supports different syntaxes to specify the amount of memory to be reserved for kdump kernel. Below is one of the supported syntaxes that needs parsing to find the memory size to reserve, based on memory range: crashkernel=:[,:,...] While such parsing is implemented for crashkernel parameter, it applies to other parameters, like fadump_reserve_mem, which could use similar syntax. So, to reuse code, moving the code that checks if the parameter syntax is as above and also the code that parses memory size to reserve, for this syntax. While the code is moved to kernel/params.c file, there is no change in logic for crashkernel parameter parsing as the moved code is invoked with function calls at appropriate places. 
-- Thanks Hari diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 94aa10f..72f55e5 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -436,6 +436,11 @@ extern char *get_options(const char *str, int nints, int *ints); extern unsigned long long memparse(const char *ptr, char **retptr); extern bool parse_option_str(const char *str, const char *option); +extern bool __init is_param_range_based(const char *cmdline); +extern unsigned long long __init parse_mem_range_size(const char *param, + char **str, + unsigned long long system_ram); + extern int core_kernel_text(unsigned long addr); extern int core_kernel_data(unsigned long addr); extern int __kernel_text_address(unsigned long addr); diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 56b3ed0..d43f5cc 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1083,59 +1083,9 @@ static int __init parse_crashkernel_mem(char *cmdline, char *cur = cmdline, *tmp; /* for each entry of the comma-separated list */ - do { - unsigned long long start, end = ULLONG_MAX, size; - - /* get the start of the range */ - start = memparse(cur, ); - if (cur == tmp) { - pr_warn("crashkernel: Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (*cur != '-') { - pr_warn("crashkernel: '-' expected\n"); - return -EINVAL; - } - cur++; - - /* if no ':' is here, than we read the end */ - if (*cur != ':') { - end = memparse(cur, ); - if (cur == tmp) { - pr_warn("crashkernel: Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (end <= start) { - pr_warn("crashkernel: end <= start\n"); - return -EINVAL; - } - } - - if (*cur != ':') { - pr_warn("crashkernel: ':' expected\n"); - return -EINVAL; - } - cur++; - - size = memparse(cur, ); - if (cur == tmp) { - pr_warn("Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (size >= system_ram) { - pr_warn("crashkernel: invalid size\n"); - return -EINVAL; - } - - /* match ? 
*/ - if (system_ram >= start && system_ram < end) { - *crash_size = size; - break; - } - } while (*cur++ == ','); + *crash_size = parse_mem_range_size("crashkernel", , system_ram); + if (cur == cmdline) + return -EINVAL; if (*crash_size > 0) {
Re: [v2] ppc64/book3s: fix branching to out of line handlers in relocation kernel
On 03/30/2016 04:47 PM, Michael Ellerman wrote: On Wed, 2016-03-30 at 13:14 +0530, Hari Bathini wrote: Alternatively, how about moving the OOLs handlers that can't be branched with LOAD_HANDLER under __end_interrupts. This way we won't be copying more than a few absolutely needed handlers. STD_RELON_EXCEPTION_HV_OOL(0xe40, emulation_assist) . . STD_RELON_EXCEPTION_HV_OOL(0xf80, hv_facility_unavailable) We can leave __end_handlers marker to indicate code that should be part of the first 64K of kernel image. That might work. But I suspect you will run into issues with ".org backwards", ie. running out of space in head_64.S But try it and let me know if it works. It worked. Doing some sanity testing. Will post v3 soon with this approach. I think we also need to write a script or little C program which looks at the vmlinux and checks that nothing below __end_whatever does a direct branch. So that we don't break it again in future. Yep. That would make life easy.. Let me see if I can do something about it. Thanks Hari ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v3] ppc64/book3s: fix branching to out of line handlers in relocation kernel
Some of the interrupt vectors on 64-bit POWER server processors are only 32 bytes long (8 instructions), which is not enough for the full first-level interrupt handler. For these we need to branch to an out- of-line (OOL) handler. But when we are running a relocatable kernel, interrupt vectors till __end_interrupts marker are copied down to real address 0x100. So, branching to labels (read OOL handlers) outside this section should be handled differently (see LOAD_HANDLER()), considering relocatable kernel, which would need at least 4 instructions. However, branching from interrupt vector means that we corrupt the CFAR (come-from address register) on POWER7 and later processors as mentioned in commit 1707dd16. So, EXCEPTION_PROLOG_0 (6 instructions) that contains the part up to the point where the CFAR is saved in the PACA should be part of the short interrupt vectors before we branch out to OOL handlers. But as mentioned already, there are interrupt vectors on 64-bit POWER server processors that are only 32 bytes long (like vectors 0x4f00, 0x4f20, etc.), which cannot accommodate the above two cases at the same time owing to space constraint. Currently, in these interrupt vectors, we simply branch out to OOL handlers, without using LOAD_HANDLER(), which leaves us vulnerable when running a relocatable kernel (eg. kdump case). While this has been the case for some time now and kdump is used widely, we were fortunate not to see any problems so far, for three reasons: 1. In almost all cases, production kernel (relocatable) is used for kdump as well, which would mean that crashed kernel's OOL handler would be at the same place where we end up branching to, from short interrupt vector of kdump kernel. 2. Also, OOL handler was unlikely the reason for crash in almost all the kdump scenarios, which meant we had a sane OOL handler from crashed kernel that we branched to. 3. 
On most 64-bit POWER server processors, page size is large enough that marking interrupt vector code as executable (see commit 429d2e83) leads to marking OOL handler code from crashed kernel, that sits right below interrupt vector code from kdump kernel, as executable as well. Let us fix this undependable code path by moving these OOL handlers below __end_interrupts marker to make sure we also copy these handlers to real address 0x100 when running a relocatable kernel. Because the interrupt vectors branching to these OOL handlers are not long enough to use LOAD_HANDLER() for branching as discussed above. This fix has been tested successfully in kdump scenario, on a lpar with 4K page size by using different default/production kernel and kdump kernel. Signed-off-by: Hari Bathini <hbath...@linux.vnet.ibm.com> Signed-off-by: Mahesh Salgaonkar <mah...@linux.vnet.ibm.com> --- changes from v2: 2. Move the OOL handlers before __end_interrupts marker instead of moving the __end_interrupts marker 3. Leave __end_handlers marker as is. arch/powerpc/kernel/exceptions-64s.S | 29 +++-- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 7716ceb..9ac3a38 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -953,6 +953,25 @@ hv_facility_unavailable_relon_trampoline: #endif STD_RELON_EXCEPTION_PSERIES(0x5700, 0x1700, altivec_assist) + /* +* Out-Of-Line handlers for relocation-on interrupt vectors +* +* We need these OOL handlers to be below __end_interrupts +* marker to ensure we also copy these OOL handlers along +* with the interrupt vectors to real address 0x100 when +* running a relocatable kernel. Because the interrupt +* vectors branching to these OOL handlers are not long +* enough to use LOAD_HANDLER() for branching. 
+*/ + STD_RELON_EXCEPTION_HV_OOL(0xe40, emulation_assist) + MASKABLE_RELON_EXCEPTION_HV_OOL(0xe80, h_doorbell) + + STD_RELON_EXCEPTION_PSERIES_OOL(0xf00, performance_monitor) + STD_RELON_EXCEPTION_PSERIES_OOL(0xf20, altivec_unavailable) + STD_RELON_EXCEPTION_PSERIES_OOL(0xf40, vsx_unavailable) + STD_RELON_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable) + STD_RELON_EXCEPTION_HV_OOL(0xf80, hv_facility_unavailable) + /* Other future vectors */ .align 7 .globl __end_interrupts @@ -1234,16 +1253,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX) .globl __end_handlers __end_handlers: - /* Equivalents to the above handlers for relocation-on interrupt vectors */ - STD_RELON_EXCEPTION_HV_OOL(0xe40, emulation_assist) - MASKABLE_RELON_EXCEPTION_HV_OOL(0xe80, h_doorbell) - - STD_RELON_EXCEPTION_PSERIES_OOL(0xf00, performance_monitor) - STD_RELON_EXCEPTION_PSERIES_OOL(0xf20, altivec
[PATCH v4 3/3] ppc64/book3s: remove __end_handlers marker
__end_handlers marker was intended to mark down upto code that gets called from exception prologs. But that hasn't kept pace with code changes. Case in point, slb_miss_realmode being called from exception prolog code but isn't below __end_handlers marker. So, __end_handlers marker is as good as a comment but could be misleading at times if it isn't in sync with the code, as is the case now. So, let us avoid this confusion by having a better comment and removing __end_handlers marker altogether. Signed-off-by: Hari Bathini <hbath...@linux.vnet.ibm.com> --- arch/powerpc/kernel/exceptions-64s.S | 13 - 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index c193ebd..80f9fc4 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -764,11 +764,10 @@ kvmppc_skip_Hinterrupt: #endif /* - * Code from here down to __end_handlers is invoked from the - * exception prologs above. Because the prologs assemble the - * addresses of these handlers using the LOAD_HANDLER macro, - * which uses an ori instruction, these handlers must be in - * the first 64k of the kernel image. + * Ensure that any handlers that get invoked from the exception prologs + * above are below the first 64KB (0x1) of the kernel image because + * the prologs assemble the addresses of these handlers using the + * LOAD_HANDLER macro, which uses an ori instruction. */ /*** Common interrupt handlers ***/ @@ -1243,10 +1242,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX) bl vsx_unavailable_exception b ret_from_except - .align 7 - .globl __end_handlers -__end_handlers: - #if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) /* * Data area reserved for FWNMI option. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v4 2/3] ppc64/book3s: make some room for common interrupt vector code
With the previous patch, we choke out whatever little space is left below 0x7000 (FWNMI hard block) while there is a hole of ~1400 bytes below __end_interrupts marker when CONFIG_CBE_RAS is disabled. Considering CONFIG_CBE_RAS is not enabled by default for BOOK3S, this is not a desirable scenario especially when we have to worry about each additional instruction that goes below 0x7000. Memory region from 0x1800 to 0x4000 is dedicated for common interrupt vector code. Also, we never hit an interrupt below 0x300 when IR=DR=1 implying memory region between 0x4000 to 0x4300 can also be used for common interrupt vector code. So, we can effectively use memory region between 0x1800 to 0x4300 for common interrupt vector code. This patch tries to free up some space below 0x7000 by rearranging the common interrupt vector code. The approach here is to avoid large holes below 0x4300 for any kernel configuration. For this, let us move common interrupt vector code that only gets enabled with CONFIG_CBE_RAS above 0x8000, as it doesn't need to be too close to the call sites and can be branched to with LOAD_HANDLER() as long as it is within the first 64KB (0x1) of the kernel image. Instead, lets move common interrupt vector code marked h_instr_storage_common, facility_unavailable_common & hv_facility_unavailable_common below 0x4300. This leaves ~250 bytes free below 0x4300 and ~1150 bytes free below 0x7000 - enough space to stop worrying about every additional instruction that goes below 0x7000. This patch assumes at least commit 376af594, part of the patch series that starts with commit 468a3302, is part of the code to avoid messy compilation issues like: relocation truncated to fit: R_PPC64_REL14 against `.text'+1c90 Makefile:864: recipe for target 'vmlinux' failed I tested this patch successfully on ppc64, ppc64le lpars and baremetal environments. Couldn't test it on IBM cell blade though but expecting no problems with this patch in IBM cell blade environment as well. 
If someone can test this patch in cell platform, it would be great. Signed-off-by: Hari Bathini <hbath...@linux.vnet.ibm.com> --- arch/powerpc/kernel/exceptions-64s.S | 20 ++-- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index f76b2f3..c193ebd 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -786,6 +786,7 @@ kvmppc_skip_Hinterrupt: STD_EXCEPTION_COMMON(0xb00, trap_0b, unknown_exception) STD_EXCEPTION_COMMON(0xd00, single_step, single_step_exception) STD_EXCEPTION_COMMON(0xe00, trap_0e, unknown_exception) + STD_EXCEPTION_COMMON(0xe20, h_instr_storage, unknown_exception) STD_EXCEPTION_COMMON(0xe40, emulation_assist, emulation_assist_interrupt) STD_EXCEPTION_COMMON_ASYNC(0xe60, hmi_exception, handle_hmi_exception) #ifdef CONFIG_PPC_DOORBELL @@ -794,6 +795,9 @@ kvmppc_skip_Hinterrupt: STD_EXCEPTION_COMMON_ASYNC(0xe80, h_doorbell, unknown_exception) #endif STD_EXCEPTION_COMMON_ASYNC(0xf00, performance_monitor, performance_monitor_exception) + STD_EXCEPTION_COMMON(0xf60, facility_unavailable, facility_unavailable_exception) + STD_EXCEPTION_COMMON(0xf80, hv_facility_unavailable, facility_unavailable_exception) + STD_EXCEPTION_COMMON(0x1300, instruction_breakpoint, instruction_breakpoint_exception) STD_EXCEPTION_COMMON(0x1502, denorm, unknown_exception) #ifdef CONFIG_ALTIVEC @@ -801,11 +805,6 @@ kvmppc_skip_Hinterrupt: #else STD_EXCEPTION_COMMON(0x1700, altivec_assist, unknown_exception) #endif -#ifdef CONFIG_CBE_RAS - STD_EXCEPTION_COMMON(0x1200, cbe_system_error, cbe_system_error_exception) - STD_EXCEPTION_COMMON(0x1600, cbe_maintenance, cbe_maintenance_exception) - STD_EXCEPTION_COMMON(0x1800, cbe_thermal, cbe_thermal_exception) -#endif /* CONFIG_CBE_RAS */ /* * Relocation-on interrupts: A subset of the interrupts can be delivered @@ -1029,8 +1028,6 @@ instruction_access_common: li r5,0x400 b do_hash_page/* Try to handle as hpte 
fault */ - STD_EXCEPTION_COMMON(0xe20, h_instr_storage, unknown_exception) - /* * Here is the common SLB miss user that is used when going to virtual * mode for SLB misses, that is currently not used @@ -1246,9 +1243,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX) bl vsx_unavailable_exception b ret_from_except - STD_EXCEPTION_COMMON(0xf60, facility_unavailable, facility_unavailable_exception) - STD_EXCEPTION_COMMON(0xf80, hv_facility_unavailable, facility_unavailable_exception) - .align 7 .globl __end_handlers __end_handlers: @@ -1268,6 +1262,12 @@ fwnmi_data_area: . = 0x8000 #endif /* defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) */ +#ifdef CONFIG_CBE_RAS + STD_EXCEPT
[PATCH v4 1/3] ppc64/book3s: fix branching to out of line handlers in relocation kernel
Some of the interrupt vectors on 64-bit POWER server processors are only 32 bytes long (8 instructions), which is not enough for the full first-level interrupt handler. For these we need to branch to an out- of-line (OOL) handler. But when we are running a relocatable kernel, interrupt vectors till __end_interrupts marker are copied down to real address 0x100. So, branching to labels (read OOL handlers) outside this section should be handled differently (see LOAD_HANDLER()), considering relocatable kernel, which would need atleast 4 instructions. However, branching from interrupt vector means that we corrupt the CFAR (come-from address register) on POWER7 and later processors as mentioned in commit 1707dd16. So, EXCEPTION_PROLOG_0 (6 instructions) that contains the part up to the point where the CFAR is saved in the PACA should be part of the short interrupt vectors before we branch out to OOL handlers. But as mentioned already, there are interrupt vectors on 64-bit POWER server processors that are only 32 bytes long (like vectors 0x4f00, 0x4f20, etc.), which cannot accomodate the above two cases at the same time owing to space constraint. Currently, in these interrupt vectors, we simply branch out to OOL handlers, without using LOAD_HANDLER(), which leaves us vulnerable when running a relocatable kernel (eg. kdump case). While this has been the case for sometime now and kdump is used widely, we were fortunate not to see any problems so far, for three reasons: 1. In almost all cases, production kernel (relocatable) is used for kdump as well, which would mean that crashed kernel's OOL handler would be at the same place where we endup branching to, from short interrupt vector of kdump kernel. 2. Also, OOL handler was unlikely the reason for crash in almost all the kdump scenarios, which meant we had a sane OOL handler from crashed kernel that we branched to. 3. 
On most 64-bit POWER server processors, page size is large enough that marking interrupt vector code as executable (see commit 429d2e83) leads to marking OOL handler code from crashed kernel, that sits right below interrupt vector code from kdump kernel, as executable as well. Let us fix this undependable code path by moving these OOL handlers below __end_interrupts marker to make sure we also copy these handlers to real address 0x100 when running a relocatable kernel. Because the interrupt vectors branching to these OOL handlers are not long enough to use LOAD_HANDLER() for branching as discussed above. This fix has been tested successfully in kdump scenario, on a lpar with 4K page size by using different default/production kernel and kdump kernel. Signed-off-by: Hari Bathini <hbath...@linux.vnet.ibm.com> Signed-off-by: Mahesh Salgaonkar <mah...@linux.vnet.ibm.com> --- Michael, I did test this patchset in different scenarios. But if you feel the change is too radical, we could go with version2. But I thought this was worth a shot. changes from v3: 1. No changes in this patch except for a spellcheck 2. A new patch that tries to free up space below 0x7000 (2/3) 3. A new patch to remove __end_handlers marker (3/3) arch/powerpc/kernel/exceptions-64s.S | 29 +++-- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 7716ceb..f76b2f3 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -953,6 +953,25 @@ hv_facility_unavailable_relon_trampoline: #endif STD_RELON_EXCEPTION_PSERIES(0x5700, 0x1700, altivec_assist) + /* +* Out-Of-Line handlers for relocation-on interrupt vectors +* +* We need these OOL handlers to be below __end_interrupts +* marker to ensure we also copy these OOL handlers along +* with the interrupt vectors to real address 0x100 when +* running a relocatable kernel. 
Because the interrupt +* vectors branching to these OOL handlers are not long +* enough to use LOAD_HANDLER() for branching. +*/ + STD_RELON_EXCEPTION_HV_OOL(0xe40, emulation_assist) + MASKABLE_RELON_EXCEPTION_HV_OOL(0xe80, h_doorbell) + + STD_RELON_EXCEPTION_PSERIES_OOL(0xf00, performance_monitor) + STD_RELON_EXCEPTION_PSERIES_OOL(0xf20, altivec_unavailable) + STD_RELON_EXCEPTION_PSERIES_OOL(0xf40, vsx_unavailable) + STD_RELON_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable) + STD_RELON_EXCEPTION_HV_OOL(0xf80, hv_facility_unavailable) + /* Other future vectors */ .align 7 .globl __end_interrupts @@ -1234,16 +1253,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX) .globl __end_handlers __end_handlers: - /* Equivalents to the above handlers for relocation-on interrupt vectors */ - STD_RELON_EXCEPTION_HV_OOL(0xe40, emul
Re: [PATCH v3] ppc64/book3s: fix branching to out of line handlers in relocation kernel
On 04/01/2016 04:07 PM, Michael Ellerman wrote: On Fri, 2016-04-01 at 12:23 +0530, Hari Bathini wrote: On 04/01/2016 11:44 AM, Michael Ellerman wrote: On Wed, 2016-03-30 at 23:49 +0530, Hari Bathini wrote: Some of the interrupt vectors on 64-bit POWER server processors are only 32 bytes long (8 instructions), which is not enough for the full ... Let us fix this undependable code path by moving these OOL handlers below __end_interrupts marker to make sure we also copy these handlers to real address 0x100 when running a relocatable kernel. Because the interrupt vectors branching to these OOL handlers are not long enough to use LOAD_HANDLER() for branching as discussed above. ... changes from v2: 2. Move the OOL handlers before __end_interrupts marker instead of moving the __end_interrupts marker 3. Leave __end_handlers marker as is. Hi Hari, Thanks for trying this. In the end I've decided it's not a good option. If you build an allmodconfig, and turn on CONFIG_RELOCATABLE, and then look at the disassembly, you see this: c0006ffc: 48 00 29 04 b c0009900 <.ret_from_except> c0007000 <__end_handlers>: At 0x7000 we have the FWNMI area, which is fixed and can't move. As you see above we end up with only 4 bytes of space between the end of the handlers and the FWNMI area. So any tiny change that adds two more instructions prior to 0x7000 will then fail to build. Hi Michael, I agree. But the OOL handlers that are moved up in v3 were below 0x7000 earlier as well and moving them below __end_interrupts marker shouldn't make any difference in terms of space consumption at least in comparison between v2 & v3. So, I guess picking either v2 or v3 doesn't change this for better. It does make a difference, due to alignment. Prior to your patch we have ~24 bytes free. Hi Michael, Hmmm.. I thought ~24 bytes was not such a difference but with the scenario you mentioned it does sound critical. Actually, this patch came into being for want of another 8~12 bytes. 
So, I should have known better about space constraint. Also, there is code between __end_interrupts and __end_handlers that is not location dependent as long as it is within 64K (0x10000) that can be moved above 0x8000, if need be. That's true, but that sort of change is unlikely to backport well. And we need to backport this fix to everything. That does sound like a maintainer's nightmare. But if you can get that to work I'll consider it. I tried quickly but couldn't get it working, due to problems with the feature else sections being too far away from. Same case. May need some time to get that right. Also, exploring holes between __start_interrupts & __end_interrupts. Will try and get back on this soon. If none of this works, we have v2 anyway. Thanks Hari ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: ppc64/book3s: copy interrupts till __end_handlers marker instead of __end_interrupts
On 03/29/2016 03:47 PM, Michael Ellerman wrote: Hi Hari, You win the "Best Change Log of the Year" award. Some comments below ... On Mon, 2016-28-03 at 11:23:22 UTC, Hari Bathini wrote: Some of the interrupt vectors on 64-bit POWER server processors are only 32 bytes long (8 instructions), which is not enough for the full first-level interrupt handler. For these we need to branch to an out- of-line (OOL) handler. But when we are running a relocatable kernel, interrupt vectors till __end_interrupts marker are copied down to real address 0x100. So, branching to labels (read OOL handlers) outside this section should be handled differently (see LOAD_HANDLER()), considering relocatable kernel, which would need atleast 4 instructions. However, branching from interrupt vector means that we corrupt the CFAR (come-from address register) on POWER7 and later processors as mentioned in commit 1707dd16. So, EXCEPTION_PROLOG_0 (6 instructions) that contains the part up to the point where the CFAR is saved in the PACA should be part of the short interrupt vectors before we branch out to OOL handlers. But as mentioned already, there are interrupt vectors on 64-bit POWER server processors that are only 32 bytes long (like vectors 0x4f00, 0x4f20, etc.), which cannot accomodate the above two cases at the same time owing to space constraint. Currently, in these interrupt vectors, we simply branch out to OOL handlers, without using LOAD_HANDLER(), which leaves us vulnerable when running a relocatable kernel (eg. kdump case). While this has been the case for sometime now and kdump is used widely, we were fortunate not to see any problems so far, for three reasons: 1. In almost all cases, production kernel (relocatable) is used for kdump as well, which would mean that crashed kernel's OOL handler would be at the same place where we endup branching to, from short interrupt vector of kdump kernel. 2. 
Also, OOL handler was unlikely the reason for crash in almost all the kdump scenarios, which meant we had a sane OOL handler from crashed kernel that we branched to. 3. On most 64-bit POWER server processors, page size is large enough that marking interrupt vector code as executable (see commit 429d2e83) leads to marking OOL handler code from crashed kernel, that sits right below interrupt vector code from kdump kernel, as executable as well. Let us fix this undependable code path firstly, by moving the __end_handlers marker down past OOL handlers. Secondly, copying interrupt vectors down till __end_handlers marker instead of __end_interrupts, when running a relocatable kernel, to make sure we end up in relocated (kdump) kernel's OOL handler instead of crashed kernel's. Thirdly, by marking all the interrupt vector code that is copied down to real address 0x100 as executable, considering the relocation on exception feature that allows exceptions to be raised in virtual mode (IR=DR=1). This fix has been tested successfully in kdump scenario, on a lpar with 4K page size by using different default/production kernel and kdump kernel. So I think you've missed one important case. My bad! I missed out on considering this case.. In do_final_fixups() we recopy the (now patched) kernel code down to zero. That code uses __end_interrupts as its limit, so I think if you look closely your OOL handlers down at zero will not have had feature fixups applied to them. I think perhaps the better fix is just to move __end_interrupts down (up) to the right location. AFAICS all users of __end_interrupts actually want that address. It would also mean we could remove __end_handlers as unused. True. This sounds less complicated. So can you please check that I'm right about do_final_fixups(), and then try moving __end_interrupts and check that works? Yeah. Testing the patch. Will post it soon. Thanks for the review! 
- Hari ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [v2] ppc64/book3s: fix branching to out of line handlers in relocation kernel
On 03/30/2016 12:44 PM, Hari Bathini wrote: On 03/30/2016 05:55 AM, Michael Ellerman wrote: On Tue, 2016-29-03 at 18:34:37 UTC, Hari Bathini wrote: diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 7716ceb..e598580 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -764,8 +764,8 @@ kvmppc_skip_Hinterrupt: #endif /* - * Code from here down to __end_handlers is invoked from the - * exception prologs above. Because the prologs assemble the + * Code from here down to end of out of line handlers is invoked from + * the exception prologs above. Because the prologs assemble the I think it would be better to just replace __end_handlers with __end_interrupts, that way it's entirely clear what location you're talking about. @@ -953,11 +953,6 @@ hv_facility_unavailable_relon_trampoline: #endif STD_RELON_EXCEPTION_PSERIES(0x5700, 0x1700, altivec_assist) -/* Other future vectors */ -.align7 -.globl__end_interrupts -__end_interrupts: - .align7 system_call_entry: bsystem_call_common @@ -1230,10 +1225,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX) STD_EXCEPTION_COMMON(0xf60, facility_unavailable, facility_unavailable_exception) STD_EXCEPTION_COMMON(0xf80, hv_facility_unavailable, facility_unavailable_exception) -.align7 -.globl__end_handlers -__end_handlers: - Sorry I wasn't clear in my last mail, please do this as a separate cleanup patch after this patch. ok.. @@ -1244,6 +1235,16 @@ __end_handlers: STD_RELON_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable) STD_RELON_EXCEPTION_HV_OOL(0xf80, hv_facility_unavailable) +/* FIXME: For now, let us move the __end_interrupts marker down past Why is it FIXME? In general I don't want to merge code that adds a FIXME unless there is some very good reason. AFAICS this is a permanent solution isn't it? 
Except for a few short interrupt vectors like 0x4f00, 0x4f20, etc., all other vectors defined till __end_interrupts marker ensure that LOAD_HANDLER() is used for branching to labels like system_call_entry, data_access_common, etc. that are currently not copied to real 0 in relocation case. So, we are forced to move the __end_interrupts marker down only to handle space constraint in the short vectors. So, I added the FIXME to remind the scope for improvement in the code. But after thinking over again now, moving the marker down makes us copy an additional 1~2 KB along with the 21~22 KB that we are copying already. So, not much of an improvement to lose sleep over or to add a FIXME, I guess. Your thoughts? Alternatively, how about moving the OOL handlers that can't be branched with LOAD_HANDLER under __end_interrupts. This way we won't be copying more than a few absolutely needed handlers. STD_RELON_EXCEPTION_HV_OOL(0xe40, emulation_assist) . . STD_RELON_EXCEPTION_HV_OOL(0xf80, hv_facility_unavailable) We can leave __end_handlers marker to indicate code that should be part of the first 64K of kernel image. Thanks Hari Also, FIXME is the reason, why I did not replace __end_handlers with __end_interrupts in the comment earlier. + * the out-of-line handlers, to make sure we also copy OOL handlers + * to real address 0x100 when running a relocatable kernel. This helps It doesn't "help" it's 100% required. Yep. Will change the wording. Thanks for the review! - Hari + * in cases where interrupt vectors are not long enough (like 0x4f00, + * 0x4f20, etc.) to branch out to OOL handlers with LOAD_HANDLER(). + */ +.align7 +.globl__end_interrupts +__end_interrupts: + #if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) /* * Data area reserved for FWNMI option. 
cheers ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v2] ppc64/book3s: fix branching to out of line handlers in relocation kernel
Some of the interrupt vectors on 64-bit POWER server processors are only 32 bytes long (8 instructions), which is not enough for the full first-level interrupt handler. For these we need to branch to an out- of-line (OOL) handler. But when we are running a relocatable kernel, interrupt vectors till __end_interrupts marker are copied down to real address 0x100. So, branching to labels (read OOL handlers) outside this section should be handled differently (see LOAD_HANDLER()), considering relocatable kernel, which would need atleast 4 instructions. However, branching from interrupt vector means that we corrupt the CFAR (come-from address register) on POWER7 and later processors as mentioned in commit 1707dd16. So, EXCEPTION_PROLOG_0 (6 instructions) that contains the part up to the point where the CFAR is saved in the PACA should be part of the short interrupt vectors before we branch out to OOL handlers. But as mentioned already, there are interrupt vectors on 64-bit POWER server processors that are only 32 bytes long (like vectors 0x4f00, 0x4f20, etc.), which cannot accomodate the above two cases at the same time owing to space constraint. Currently, in these interrupt vectors, we simply branch out to OOL handlers, without using LOAD_HANDLER(), which leaves us vulnerable when running a relocatable kernel (eg. kdump case). While this has been the case for sometime now and kdump is used widely, we were fortunate not to see any problems so far, for three reasons: 1. In almost all cases, production kernel (relocatable) is used for kdump as well, which would mean that crashed kernel's OOL handler would be at the same place where we endup branching to, from short interrupt vector of kdump kernel. 2. Also, OOL handler was unlikely the reason for crash in almost all the kdump scenarios, which meant we had a sane OOL handler from crashed kernel that we branched to. 3. 
On most 64-bit POWER server processors, page size is large enough that marking interrupt vector code as executable (see commit 429d2e83) leads to marking OOL handler code from crashed kernel, that sits right below interrupt vector code from kdump kernel, as executable as well. Let us fix this undependable code path by moving the __end_interrupts marker down past OOL handlers to make sure that we also copy OOL handlers to real address 0x100 when running a relocatable kernel. This helps in cases discussed above, where interrupt vectors are not long enough to branch out to OOL handlers with LOAD_HANDLER(). While we are here, let us remove the virtually insignificant __end_handlers marker. This fix has been tested successfully in kdump scenario, on a lpar with 4K page size by using different default/production kernel and kdump kernel. Signed-off-by: Hari Bathini <hbath...@linux.vnet.ibm.com> Signed-off-by: Mahesh Salgaonkar <mah...@linux.vnet.ibm.com> --- changes from v1: 1. Changed the subject from "copy interrupts till __end_handlers marker instead of __end_interrupts" to a more generic one 2. Used __end_interrupts marker instead of __end_handlers to make the fix less complicated. 3. Removed unused __end_handlers marker. arch/powerpc/kernel/exceptions-64s.S | 23 --- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 7716ceb..e598580 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -764,8 +764,8 @@ kvmppc_skip_Hinterrupt: #endif /* - * Code from here down to __end_handlers is invoked from the - * exception prologs above. Because the prologs assemble the + * Code from here down to end of out of line handlers is invoked from + * the exception prologs above. 
Because the prologs assemble the * addresses of these handlers using the LOAD_HANDLER macro, * which uses an ori instruction, these handlers must be in * the first 64k of the kernel image. @@ -953,11 +953,6 @@ hv_facility_unavailable_relon_trampoline: #endif STD_RELON_EXCEPTION_PSERIES(0x5700, 0x1700, altivec_assist) - /* Other future vectors */ - .align 7 - .globl __end_interrupts -__end_interrupts: - .align 7 system_call_entry: b system_call_common @@ -1230,10 +1225,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX) STD_EXCEPTION_COMMON(0xf60, facility_unavailable, facility_unavailable_exception) STD_EXCEPTION_COMMON(0xf80, hv_facility_unavailable, facility_unavailable_exception) - .align 7 - .globl __end_handlers -__end_handlers: - /* Equivalents to the above handlers for relocation-on interrupt vectors */ STD_RELON_EXCEPTION_HV_OOL(0xe40, emulation_assist) MASKABLE_RELON_EXCEPTION_HV_OOL(0xe80, h_doorbell) @@ -1244,6 +1235,16 @@ __end_handlers: STD_RELON_EXCEPTION_PSERIES_OOL(0xf60
Re: [PATCH v3] ppc64/book3s: fix branching to out of line handlers in relocation kernel
On 04/01/2016 11:44 AM, Michael Ellerman wrote: On Wed, 2016-03-30 at 23:49 +0530, Hari Bathini wrote: Some of the interrupt vectors on 64-bit POWER server processors are only 32 bytes long (8 instructions), which is not enough for the full ... Let us fix this undependable code path by moving these OOL handlers below __end_interrupts marker to make sure we also copy these handlers to real address 0x100 when running a relocatable kernel. Because the interrupt vectors branching to these OOL handlers are not long enough to use LOAD_HANDLER() for branching as discussed above. ... changes from v2: 2. Move the OOL handlers before __end_interrupts marker instead of moving the __end_interrupts marker 3. Leave __end_handlers marker as is. Hi Hari, Thanks for trying this. In the end I've decided it's not a good option. If you build an allmodconfig, and turn on CONFIG_RELOCATABLE, and then look at the disassembly, you see this: c0006ffc: 48 00 29 04 b c0009900 <.ret_from_except> c0007000 <__end_handlers>: At 0x7000 we have the FWNMI area, which is fixed and can't move. As you see above we end up with only 4 bytes of space between the end of the handlers and the FWNMI area. So any tiny change that adds two more instructions prior to 0x7000 will then fail to build. Hi Michael, I agree. But the OOL handlers that are moved up in v3 were below 0x7000 earlier as well and moving them below __end_interrupts marker shouldn't make any difference in terms of space consumption at least in comparison between v2 & v3. So, I guess picking either v2 or v3 doesn't change this for better. Also, there is code between __end_interrupts and __end_handlers that is not location dependent as long as it is within 64K (0x1) that can be moved above 0x8000, if need be. For these reasons, I feel v3 is better going forward as it keeps __start_interrupts to __end_interrupts code compact and leaves alone the code that doesn't need to be copied to real 0. Am I missing something here? 
Thanks Hari None of that's your fault, it's just the nature of the code in there, it's very space constrained. For now I'll take your v2, but I'll edit the comment and drop the removal of __end_handlers. cheers ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH] ppc64/book3s: copy interrupts till __end_handlers marker instead of __end_interrupts
Some of the interrupt vectors on 64-bit POWER server processors are only 32 bytes long (8 instructions), which is not enough for the full first-level interrupt handler. For these we need to branch to an out- of-line (OOL) handler. But when we are running a relocatable kernel, interrupt vectors till __end_interrupts marker are copied down to real address 0x100. So, branching to labels (read OOL handlers) outside this section should be handled differently (see LOAD_HANDLER()), considering relocatable kernel, which would need atleast 4 instructions. However, branching from interrupt vector means that we corrupt the CFAR (come-from address register) on POWER7 and later processors as mentioned in commit 1707dd16. So, EXCEPTION_PROLOG_0 (6 instructions) that contains the part up to the point where the CFAR is saved in the PACA should be part of the short interrupt vectors before we branch out to OOL handlers. But as mentioned already, there are interrupt vectors on 64-bit POWER server processors that are only 32 bytes long (like vectors 0x4f00, 0x4f20, etc.), which cannot accomodate the above two cases at the same time owing to space constraint. Currently, in these interrupt vectors, we simply branch out to OOL handlers, without using LOAD_HANDLER(), which leaves us vulnerable when running a relocatable kernel (eg. kdump case). While this has been the case for sometime now and kdump is used widely, we were fortunate not to see any problems so far, for three reasons: 1. In almost all cases, production kernel (relocatable) is used for kdump as well, which would mean that crashed kernel's OOL handler would be at the same place where we endup branching to, from short interrupt vector of kdump kernel. 2. Also, OOL handler was unlikely the reason for crash in almost all the kdump scenarios, which meant we had a sane OOL handler from crashed kernel that we branched to. 3. 
On most 64-bit POWER server processors, page size is large enough that marking interrupt vector code as executable (see commit 429d2e83) leads to marking OOL handler code from crashed kernel, that sits right below interrupt vector code from kdump kernel, as executable as well. Let us fix this undependable code path firstly, by moving down __end_handlers marker down past OOL handlers. Secondly, copying interrupt vectors down till __end_handlers marker instead of __end_interrupts, when running a relocatable kernel, to make sure we endup in relocated (kdump) kernel's OOL handler instead of crashed kernel's. Thirdly, by marking all the interrupt vector code that is copied down to real address 0x100 as executable, considering the relocation on exception feature that allows exceptions to be raised in virtual mode (IR=DR=1). This fix has been tested successfully in kdump scenario, on a lpar with 4K page size by using different default/production kernel and kdump kernel. Signed-off-by: Hari Bathini <hbath...@linux.vnet.ibm.com> Signed-off-by: Mahesh Salgaonkar <mah...@linux.vnet.ibm.com> --- arch/powerpc/include/asm/sections.h |3 ++- arch/powerpc/kernel/exceptions-64s.S |8 arch/powerpc/kernel/head_64.S|2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/sections.h b/arch/powerpc/include/asm/sections.h index abf5866..b4139a5 100644 --- a/arch/powerpc/include/asm/sections.h +++ b/arch/powerpc/include/asm/sections.h @@ -10,6 +10,7 @@ extern char __start_interrupts[]; extern char __end_interrupts[]; +extern char __end_handlers[]; extern char __prom_init_toc_start[]; extern char __prom_init_toc_end[]; @@ -39,7 +40,7 @@ static inline int overlaps_interrupt_vector_text(unsigned long start, { unsigned long real_start, real_end; real_start = __start_interrupts - _stext; - real_end = __end_interrupts - _stext; + real_end = __end_handlers - _stext; return start < (unsigned long)__va(real_end) && (unsigned long)__va(real_start) < end; diff --git 
a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 7716ceb..98e2ce5 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1230,10 +1230,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX) STD_EXCEPTION_COMMON(0xf60, facility_unavailable, facility_unavailable_exception) STD_EXCEPTION_COMMON(0xf80, hv_facility_unavailable, facility_unavailable_exception) - .align 7 - .globl __end_handlers -__end_handlers: - /* Equivalents to the above handlers for relocation-on interrupt vectors */ STD_RELON_EXCEPTION_HV_OOL(0xe40, emulation_assist) MASKABLE_RELON_EXCEPTION_HV_OOL(0xe80, h_doorbell) @@ -1244,6 +1240,10 @@ __end_handlers: STD_RELON_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable) STD_RELON_EXCEPTION_HV_OOL(0xf80, hv_facility_unavailable) +
Re: [v2] ppc64/book3s: fix branching to out of line handlers in relocation kernel
On 03/30/2016 05:55 AM, Michael Ellerman wrote: On Tue, 2016-29-03 at 18:34:37 UTC, Hari Bathini wrote: diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 7716ceb..e598580 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -764,8 +764,8 @@ kvmppc_skip_Hinterrupt: #endif /* - * Code from here down to __end_handlers is invoked from the - * exception prologs above. Because the prologs assemble the + * Code from here down to end of out of line handlers is invoked from + * the exception prologs above. Because the prologs assemble the I think it would be better to just replace __end_handlers with __end_interrupts, that way it's entirely clear what location you're talking about. @@ -953,11 +953,6 @@ hv_facility_unavailable_relon_trampoline: #endif STD_RELON_EXCEPTION_PSERIES(0x5700, 0x1700, altivec_assist) - /* Other future vectors */ - .align 7 - .globl __end_interrupts -__end_interrupts: - .align 7 system_call_entry: b system_call_common @@ -1230,10 +1225,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX) STD_EXCEPTION_COMMON(0xf60, facility_unavailable, facility_unavailable_exception) STD_EXCEPTION_COMMON(0xf80, hv_facility_unavailable, facility_unavailable_exception) - .align 7 - .globl __end_handlers -__end_handlers: - Sorry I wasn't clear in my last mail, please do this as a separate cleanup patch after this patch. ok.. @@ -1244,6 +1235,16 @@ __end_handlers: STD_RELON_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable) STD_RELON_EXCEPTION_HV_OOL(0xf80, hv_facility_unavailable) + /* FIXME: For now, let us move the __end_interrupts marker down past Why is it FIXME? In general I don't want to merge code that adds a FIXME unless there is some very good reason. AFAICS this is a permanent solution isn't it? 
Except for a few short interrupt vectors like 0x4f00, 04f20, etc., all other vectors defined till __end_interrupts marker ensure that LOAD_HANDLER() is used for branching to labels like system_call_entry, data_access_common, etc. that are currently not copied to real 0 in relocation case. So, we are forced to move the __end_interrupts marker down only to handle space constraint in the short vectors. So, I added the FIXME to remind the scope for improvement in the code. But after thinking over again now, moving the marker down makes us copy an additional 1~2 KB along with the 21~22 KB that we are copying already. So, not much of an improvement to lose sleep over or to add a FIXME, I guess. Your thoughts? Also, FIXME is the reason, why I did not replace __end_handlers with __end_interrupts in the comment earlier. +* the out-of-line handlers, to make sure we also copy OOL handlers +* to real adress 0x100 when running a relocatable kernel. This helps It doesn't "help" it's 100% required. Yep. Will change the wording. Thanks for the review! - Hari +* in cases where interrupt vectors are not long enough (like 0x4f00, +* 0x4f20, etc.) to branch out to OOL handlers with LOAD_HANDLER(). +*/ + .align 7 + .globl __end_interrupts +__end_interrupts: + #if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) /* * Data area reserved for FWNMI option. cheers ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v2 2/2] powerpc/fadump: add support to parse size based on memory range
Currently, memory for fadump can be specified with fadump_reserve_mem=size, where only a fixed size can be specified. Add the below syntax as well, to support conditional reservation based on system memory size: fadump_reserve_mem=:[,:,...] This syntax helps using the same commandline parameter for different system memory sizes. Signed-off-by: Hari Bathini <hbath...@linux.vnet.ibm.com> --- Changes from v1: 1. Changed subject from "powerpc/fadump: add support to specify memory range based size" 2. Reused crashkernel parsing code that was moved to kernel/params.c (see patch 1/2) arch/powerpc/kernel/fadump.c | 64 -- 1 file changed, 55 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index d0af58b..a868281 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -193,6 +193,56 @@ static unsigned long init_fadump_mem_struct(struct fadump_mem_struct *fdm, return addr; } +/* + * This function parses command line for fadump_reserve_mem= + * + * Supports the below two syntaxes: + *1. fadump_reserve_mem=size + *2. fadump_reserve_mem=ramsize-range:size[,...] + * + * Sets fw_dump.reserve_bootvar with the memory size + * provided, 0 otherwise + * + * The function returns -EINVAL on failure, 0 otherwise. 
+ */ +static int __init parse_fadump_reserve_mem(void) +{ + char *name = "fadump_reserve_mem="; + char *fadump_cmdline = NULL, *cur; + + fw_dump.reserve_bootvar = 0; + + /* find fadump_reserve_mem and use the last one if there are many */ + cur = strstr(boot_command_line, name); + while (cur) { + fadump_cmdline = cur; + cur = strstr(cur+1, name); + } + + /* when no fadump_reserve_mem= cmdline option is provided */ + if (!fadump_cmdline) + return 0; + + fadump_cmdline += strlen(name); + + /* for fadump_reserve_mem=size cmdline syntax */ + if (!is_param_range_based(fadump_cmdline)) { + fw_dump.reserve_bootvar = memparse(fadump_cmdline, NULL); + return 0; + } + + /* for fadump_reserve_mem=ramsize-range:size[,...] cmdline syntax */ + cur = fadump_cmdline; + fw_dump.reserve_bootvar = parse_mem_range_size("fadump_reserve_mem", + , memblock_phys_mem_size()); + if (cur == fadump_cmdline) { + printk(KERN_INFO "fadump_reserve_mem: Invaild syntax!\n"); + return -EINVAL; + } + + return 0; +} + /** * fadump_calculate_reserve_size(): reserve variable boot area 5% of System RAM * @@ -212,12 +262,17 @@ static inline unsigned long fadump_calculate_reserve_size(void) { unsigned long size; + /* sets fw_dump.reserve_bootvar */ + parse_fadump_reserve_mem(); + /* * Check if the size is specified through fadump_reserve_mem= cmdline * option. If yes, then use that. 
*/ if (fw_dump.reserve_bootvar) return fw_dump.reserve_bootvar; + else + printk(KERN_INFO "fadump: calculating default boot size\n"); /* divide by 20 to get 5% of value */ size = memblock_end_of_DRAM() / 20; @@ -352,15 +407,6 @@ static int __init early_fadump_param(char *p) } early_param("fadump", early_fadump_param); -/* Look for fadump_reserve_mem= cmdline option */ -static int __init early_fadump_reserve_mem(char *p) -{ - if (p) - fw_dump.reserve_bootvar = memparse(p, ); - return 0; -} -early_param("fadump_reserve_mem", early_fadump_reserve_mem); - static void register_fw_dump(struct fadump_mem_struct *fdm) { int rc; ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 1/2] Refactor code parsing size based on memory range
Currently, crashkernel parameter supports the below syntax to parse size based on memory range: crashkernel=:[,:,...] While such parsing is implemented for crashkernel parameter, it applies to other parameters with similar syntax. So, move this code to a more generic place for code reuse. Signed-off-by: Hari Bathini <hbath...@linux.vnet.ibm.com> --- While this patch in itself has nothing to do with powerpc, the powerpc patch (2/2) depends on this patch.. include/linux/kernel.h |5 +++ kernel/kexec_core.c| 63 +++- kernel/params.c| 96 3 files changed, 106 insertions(+), 58 deletions(-) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 2f7775e..e755ed1 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -429,6 +429,11 @@ extern char *get_options(const char *str, int nints, int *ints); extern unsigned long long memparse(const char *ptr, char **retptr); extern bool parse_option_str(const char *str, const char *option); +extern bool __init is_param_range_based(const char *cmdline); +extern unsigned long long __init parse_mem_range_size(const char *param, + char **str, + unsigned long long system_ram); + extern int core_kernel_text(unsigned long addr); extern int core_kernel_data(unsigned long addr); extern int __kernel_text_address(unsigned long addr); diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 1391d3e..71e92b2 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1084,59 +1084,9 @@ static int __init parse_crashkernel_mem(char *cmdline, char *cur = cmdline, *tmp; /* for each entry of the comma-separated list */ - do { - unsigned long long start, end = ULLONG_MAX, size; - - /* get the start of the range */ - start = memparse(cur, ); - if (cur == tmp) { - pr_warn("crashkernel: Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (*cur != '-') { - pr_warn("crashkernel: '-' expected\n"); - return -EINVAL; - } - cur++; - - /* if no ':' is here, than we read the end */ - if (*cur != ':') { - end = 
memparse(cur, ); - if (cur == tmp) { - pr_warn("crashkernel: Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (end <= start) { - pr_warn("crashkernel: end <= start\n"); - return -EINVAL; - } - } - - if (*cur != ':') { - pr_warn("crashkernel: ':' expected\n"); - return -EINVAL; - } - cur++; - - size = memparse(cur, ); - if (cur == tmp) { - pr_warn("Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (size >= system_ram) { - pr_warn("crashkernel: invalid size\n"); - return -EINVAL; - } - - /* match ? */ - if (system_ram >= start && system_ram < end) { - *crash_size = size; - break; - } - } while (*cur++ == ','); + *crash_size = parse_mem_range_size("crashkernel", , system_ram); + if (cur == cmdline) + return -EINVAL; if (*crash_size > 0) { while (*cur && *cur != ' ' && *cur != '@') @@ -1273,7 +1223,6 @@ static int __init __parse_crashkernel(char *cmdline, const char *name, const char *suffix) { - char*first_colon, *first_space; char*ck_cmdline; BUG_ON(!crash_size || !crash_base); @@ -1291,12 +1240,10 @@ static int __init __parse_crashkernel(char *cmdline, return parse_crashkernel_suffix(ck_cmdline, crash_size, suffix); /* -* if the commandline contains a ':', then that's the extended +* if the parameter is range based, then that's the extended * syntax -- if not, it must be the classic syntax */ - first_colon = strchr(ck_cmdline, ':'); - first_space = strchr(ck_cmdline, ' '); - if (first_colon && (!first_space || first
[PATCH 2/3] powerpc/fadump: add support to specify memory range based size
Currently, memory for fadump can be specified with fadump_reserve_mem=size, where only a fixed size can be specified. This patch tries to extend this syntax to support conditional reservation based on memory size, with the below syntax: fadump_reserve_mem=:[,:,...] This syntax helps using the same commandline parameter for different system memory sizes. Signed-off-by: Hari Bathini <hbath...@linux.vnet.ibm.com> --- arch/powerpc/kernel/fadump.c | 127 +++--- 1 file changed, 118 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index d0af58b..a7fef3e 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -193,6 +193,121 @@ static unsigned long init_fadump_mem_struct(struct fadump_mem_struct *fdm, return addr; } +#define FADUMP_MEM_CMDLINE_PREFIX "fadump_reserve_mem=" + +static __init char *get_last_fadump_reserve_mem(void) +{ + char *p = boot_command_line, *fadump_cmdline = NULL; + + /* find fadump_reserve_mem and use the last one if there are more */ + p = strstr(p, FADUMP_MEM_CMDLINE_PREFIX); + while (p) { + fadump_cmdline = p; + p = strstr(p+1, FADUMP_MEM_CMDLINE_PREFIX); + } + + return fadump_cmdline; +} + +#define parse_fadump_print(fmt, arg...) \ + printk(KERN_INFO "fadump_reserve_mem: " fmt, ##arg) + +/* + * This function parses command line for fadump_reserve_mem= + * + * Supports the below two syntaxes: + *1. fadump_reserve_mem=size + *2. fadump_reserve_mem=ramsize-range:size[,...] + * + * Sets fw_dump.reserve_bootvar with the memory size + * provided, 0 otherwise + * + * The function returns -EINVAL on failure, 0 otherwise. 
+ */ +static int __init parse_fadump_reserve_mem(void) +{ + char *cur, *tmp; + char *first_colon, *first_space; + char *fadump_cmdline; + unsigned long long system_ram; + + fw_dump.reserve_bootvar = 0; + fadump_cmdline = get_last_fadump_reserve_mem(); + + /* when no fadump_reserve_mem= cmdline option is provided */ + if (!fadump_cmdline) + return 0; + + first_colon = strchr(fadump_cmdline, ':'); + first_space = strchr(fadump_cmdline, ' '); + cur = fadump_cmdline + strlen(FADUMP_MEM_CMDLINE_PREFIX); + + /* for fadump_reserve_mem=size cmdline syntax */ + if (!first_colon || (first_space && (first_colon > first_space))) { + fw_dump.reserve_bootvar = memparse(cur, ); + return 0; + } + + /* for fadump_reserve_mem=ramsize-range:size[,...] cmdline syntax */ + system_ram = memblock_phys_mem_size(); + /* for each entry of the comma-separated list */ + do { + unsigned long long start, end = ULLONG_MAX, size; + + /* get the start of the range */ + start = memparse(cur, ); + if (cur == tmp) { + parse_fadump_print("Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (*cur != '-') { + parse_fadump_print("'-' expected\n"); + return -EINVAL; + } + cur++; + + /* if no ':' is here, than we read the end */ + if (*cur != ':') { + end = memparse(cur, ); + if (cur == tmp) { + parse_fadump_print("Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (end <= start) { + parse_fadump_print("end <= start\n"); + return -EINVAL; + } + } + + if (*cur != ':') { + parse_fadump_print("':' expected\n"); + return -EINVAL; + } + cur++; + + size = memparse(cur, ); + if (cur == tmp) { + parse_fadump_print("Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (size >= system_ram) { + parse_fadump_print("invalid size\n"); + return -EINVAL; + } + + /* match ? 
*/ + if (system_ram >= start && system_ram < end) { + fw_dump.reserve_bootvar = size; + break; + } + } while (*cur++ == ','); + + return 0; +} + /** * fadump_calculate_reserve_size(): reserve variable boot area 5% of System RAM * @@ -212,6 +327,9 @@ static
[PATCH 1/3] powerpc/fadump: set an upper limit for the default memory reserved for fadump
When boot memory size for fadump is not specified, memory is reserved for fadump based on system RAM size. As the system RAM size increases, the memory reserved for fadump increases as well. This patch sets an upper limit on the memory reserved for fadump, to avoid reserving excess memory. Signed-off-by: Hari Bathini <hbath...@linux.vnet.ibm.com> --- arch/powerpc/include/asm/fadump.h |6 ++ arch/powerpc/kernel/fadump.c |4 2 files changed, 10 insertions(+) diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h index b4407d0..2c3cb32 100644 --- a/arch/powerpc/include/asm/fadump.h +++ b/arch/powerpc/include/asm/fadump.h @@ -43,6 +43,12 @@ #define MIN_BOOT_MEM (((RMA_END < (0x1UL << 28)) ? (0x1UL << 28) : RMA_END) \ + (0x1UL << 26)) +/* + * Maximum memory needed for fadump to boot up successfully. Use this as + * an upper limit for fadump so we don't endup reserving excess memory. + */ +#define MAX_BOOT_MEM (0x1UL << 32) + #define memblock_num_regions(memblock_type)(memblock.memblock_type.cnt) #ifndef ELF_CORE_EFLAGS diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 3cb3b02a..d0af58b 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -225,6 +225,10 @@ static inline unsigned long fadump_calculate_reserve_size(void) /* round it down in multiples of 256 */ size = size & ~0x0FFFUL; + /* Set an upper limit on the memory to be reserved */ + if (size > MAX_BOOT_MEM) + size = MAX_BOOT_MEM; + /* Truncate to memory_limit. We don't want to over reserve the memory.*/ if (memory_limit && size > memory_limit) size = memory_limit; ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 3/3] powerpc/fadump: add support for fadump_nr_cpus= parameter
Kernel parameter 'nr_cpus' can be used to limit the maximum number of processors that an SMP kernel could support. This patch extends this to fadump by introducing 'fadump_nr_cpus' parameter that can help in booting fadump kernel on a lower memory footprint. Suggested-by: Mahesh Salgaonkar <mah...@linux.vnet.ibm.com> Signed-off-by: Hari Bathini <hbath...@linux.vnet.ibm.com> --- arch/powerpc/kernel/fadump.c | 22 ++ 1 file changed, 22 insertions(+) diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index a7fef3e..c75783c 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -470,6 +470,28 @@ static int __init early_fadump_param(char *p) } early_param("fadump", early_fadump_param); +/* Look for fadump_nr_cpus= cmdline option. */ +static int __init early_fadump_nrcpus(char *p) +{ + int nr_cpus; + + /* +* fadump_nr_cpus parameter is only applicable on a +* fadump active kernel. This is to reduce memory +* needed to boot a fadump active kernel. +* So, check if we are booting after crash. +*/ + if (!is_fadump_active()) + return 0; + + get_option(, _cpus); + if (nr_cpus > 0 && nr_cpus < nr_cpu_ids) + nr_cpu_ids = nr_cpus; + + return 0; +} +early_param("fadump_nr_cpus", early_fadump_nrcpus); + static void register_fw_dump(struct fadump_mem_struct *fdm) { int rc; ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [v4, 2/3] ppc64/book3s: make some room for common interrupt vector code
On 04/15/2016 06:29 PM, Michael Ellerman wrote: On Fri, 2016-04-15 at 21:06 +1000, Michael Ellerman wrote: Hi Hari, Thanks for persisting with this. On Thu, 2016-07-04 at 21:58:50 UTC, Hari Bathini wrote: With the previous patch, we choke out whatever little space is left below 0x7000 (FWNMI hard block) while there is a hole of ~1400 bytes below __end_interrupts marker when CONFIG_CBE_RAS is disabled. Considering CONFIG_CBE_RAS is not enabled by default for BOOK3S, this is not a desirable scenario especially when we have to worry about each additional instruction that goes below 0x7000. Memory region from 0x1800 to 0x4000 is dedicated for common interrupt vector code. Also, we never hit an interrupt below 0x300 when IR=DR=1 implying memory region between 0x4000 to 0x4300 can also be used for common interrupt vector code. So, we can effectively use memory region between 0x1800 to 0x4300 for common interrupt vector code. On Power9 the system-call-vectored instruction will use the region at 0x3000, so moving code into that space is not a good long term plan. I'll take your v2 and put it in next next week. I'll add this fixes line, which I think is correct: Fixes: c1fb6816fb1b ("powerpc: Add relocation on exception vector handlers") Yeah. Thanks! cheers ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [3/3] powerpc/fadump: add support for fadump_nr_cpus= parameter
On 05/07/2016 09:42 AM, Michael Ellerman wrote: On Fri, 2016-06-05 at 11:51:08 UTC, Hari Bathini wrote: Kernel parameter 'nr_cpus' can be used to limit the maximum number of processors that an SMP kernel could support. This patch extends this to fadump by introducing 'fadump_nr_cpus' parameter that can help in booting fadump kernel on a lower memory footprint. Is there really no other way to do this? I really hate adding new, single use only command line parameters. Hmmm.. the only alternative I can think of is enforcing a certain nr_cpu_ids value whenever fadump is active, but that doesn't sound right.. Any suggestions? Thanks Hari cheers ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [2/3] powerpc/fadump: add support to specify memory range based size
On 05/07/2016 09:41 AM, Michael Ellerman wrote: On Fri, 2016-06-05 at 11:50:37 UTC, Hari Bathini wrote: Currently, memory for fadump can be specified with fadump_reserve_mem=size, where only a fixed size can be specified. This patch tries to extend this syntax to support conditional reservation based on memory size, with the below syntax: fadump_reserve_mem=:[,:,...] This syntax helps using the same commandline parameter for different system memory sizes. This is basically using the crashkernel= syntax, right? Yep. It is one of the typical crashkernel= syntaxes. So can we please reuse the crashkernel= parsing code? but crashkernel has a few other variants which don't make sense for fadump. To reuse the crashkernel parsing code for fadump, it needs a little bit of refactoring. Will try to do that and respin.. cheers ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[RESEND][PATCH v2 1/2] kexec: refactor code parsing size based on memory range
crashkernel parameter supports different syntaxes to specify the amount of memory to be reserved for kdump kernel. Below is one of the supported syntaxes that needs parsing to find the memory size to reserve, based on memory range: crashkernel=:[,:,...] While such parsing is implemented for crashkernel parameter, it applies to other parameters, like fadump_reserve_mem=, which could use similar syntax. This patch moves crashkernel's parsing code for above syntax to to kernel/params.c file for reuse. Two functions is_param_range_based() and parse_mem_range_size() are added to kernel/params.c file for this purpose. Any parameter that uses the above syntax can use is_param_range_based() function to validate the syntax and parse_mem_range_size() function to get the parsed memory size. While some code is moved to kernel/params.c file, there is no change functionality wise in parsing the crashkernel parameter. Signed-off-by: Hari Bathini <hbath...@linux.vnet.ibm.com> --- Changes from v1: 1. Updated changelog include/linux/kernel.h |5 +++ kernel/kexec_core.c| 63 +++- kernel/params.c| 96 3 files changed, 106 insertions(+), 58 deletions(-) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index d96a611..2df7ba2 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -435,6 +435,11 @@ extern char *get_options(const char *str, int nints, int *ints); extern unsigned long long memparse(const char *ptr, char **retptr); extern bool parse_option_str(const char *str, const char *option); +extern bool __init is_param_range_based(const char *cmdline); +extern unsigned long long __init parse_mem_range_size(const char *param, + char **str, + unsigned long long system_ram); + extern int core_kernel_text(unsigned long addr); extern int core_kernel_data(unsigned long addr); extern int __kernel_text_address(unsigned long addr); diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 5616755..3a74024 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ 
-1104,59 +1104,9 @@ static int __init parse_crashkernel_mem(char *cmdline, char *cur = cmdline, *tmp; /* for each entry of the comma-separated list */ - do { - unsigned long long start, end = ULLONG_MAX, size; - - /* get the start of the range */ - start = memparse(cur, ); - if (cur == tmp) { - pr_warn("crashkernel: Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (*cur != '-') { - pr_warn("crashkernel: '-' expected\n"); - return -EINVAL; - } - cur++; - - /* if no ':' is here, than we read the end */ - if (*cur != ':') { - end = memparse(cur, ); - if (cur == tmp) { - pr_warn("crashkernel: Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (end <= start) { - pr_warn("crashkernel: end <= start\n"); - return -EINVAL; - } - } - - if (*cur != ':') { - pr_warn("crashkernel: ':' expected\n"); - return -EINVAL; - } - cur++; - - size = memparse(cur, ); - if (cur == tmp) { - pr_warn("Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (size >= system_ram) { - pr_warn("crashkernel: invalid size\n"); - return -EINVAL; - } - - /* match ? */ - if (system_ram >= start && system_ram < end) { - *crash_size = size; - break; - } - } while (*cur++ == ','); + *crash_size = parse_mem_range_size("crashkernel", , system_ram); + if (cur == cmdline) + return -EINVAL; if (*crash_size > 0) { while (*cur && *cur != ' ' && *cur != '@') @@ -1293,7 +1243,6 @@ static int __init __parse_crashkernel(char *cmdline, const char *name, const char *suffix) { - char*first_colon, *first_space; char*ck_cmdline; BUG_ON(!crash_size || !crash_base); @@ -1311,12 +1260,1
[RESEND][PATCH v2 0/2] powerpc/fadump: support memory range syntax for fadump memory reservation
This patchset adds support for specifying the fadump reserved memory size based on the system memory range. The crashkernel parameter already supports such syntax. The first patch refactors the crashkernel parameter parsing code for reuse. The second patch uses the newly refactored parsing code to reserve memory for fadump based on system memory size. --- Hari Bathini (2): kexec: refactor code parsing size based on memory range powerpc/fadump: parse fadump reserve memory size based on memory range arch/powerpc/kernel/fadump.c | 64 include/linux/kernel.h |5 ++ kernel/kexec_core.c | 63 ++-- kernel/params.c | 96 ++ 4 files changed, 161 insertions(+), 67 deletions(-) ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[RESEND][PATCH v2 2/2] powerpc/fadump: parse fadump reserve memory size based on memory range
Currently, memory for fadump can be specified with fadump_reserve_mem=size, where only a fixed size can be specified. Add the below syntax as well, to support conditional reservation based on system memory size: fadump_reserve_mem=:[,:,...] This syntax helps using the same commandline parameter for different system memory sizes. Signed-off-by: Hari Bathini <hbath...@linux.vnet.ibm.com> Reviewed-by: Mahesh J Salgaonkar <mah...@linux.vnet.ibm.com> --- arch/powerpc/kernel/fadump.c | 64 -- 1 file changed, 55 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index b3a6633..4661ae6 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -193,6 +193,56 @@ static unsigned long init_fadump_mem_struct(struct fadump_mem_struct *fdm, return addr; } +/* + * This function parses command line for fadump_reserve_mem= + * + * Supports the below two syntaxes: + *1. fadump_reserve_mem=size + *2. fadump_reserve_mem=ramsize-range:size[,...] + * + * Sets fw_dump.reserve_bootvar with the memory size + * provided, 0 otherwise + * + * The function returns -EINVAL on failure, 0 otherwise. + */ +static int __init parse_fadump_reserve_mem(void) +{ + char *name = "fadump_reserve_mem="; + char *fadump_cmdline = NULL, *cur; + + fw_dump.reserve_bootvar = 0; + + /* find fadump_reserve_mem and use the last one if there are many */ + cur = strstr(boot_command_line, name); + while (cur) { + fadump_cmdline = cur; + cur = strstr(cur+1, name); + } + + /* when no fadump_reserve_mem= cmdline option is provided */ + if (!fadump_cmdline) + return 0; + + fadump_cmdline += strlen(name); + + /* for fadump_reserve_mem=size cmdline syntax */ + if (!is_param_range_based(fadump_cmdline)) { + fw_dump.reserve_bootvar = memparse(fadump_cmdline, NULL); + return 0; + } + + /* for fadump_reserve_mem=ramsize-range:size[,...] 
cmdline syntax */ + cur = fadump_cmdline; + fw_dump.reserve_bootvar = parse_mem_range_size("fadump_reserve_mem", + , memblock_phys_mem_size()); + if (cur == fadump_cmdline) { + printk(KERN_INFO "fadump_reserve_mem: Invaild syntax!\n"); + return -EINVAL; + } + + return 0; +} + /** * fadump_calculate_reserve_size(): reserve variable boot area 5% of System RAM * @@ -212,12 +262,17 @@ static inline unsigned long fadump_calculate_reserve_size(void) { unsigned long size; + /* sets fw_dump.reserve_bootvar */ + parse_fadump_reserve_mem(); + /* * Check if the size is specified through fadump_reserve_mem= cmdline * option. If yes, then use that. */ if (fw_dump.reserve_bootvar) return fw_dump.reserve_bootvar; + else + printk(KERN_INFO "fadump: calculating default boot size\n"); /* divide by 20 to get 5% of value */ size = memblock_end_of_DRAM() / 20; @@ -348,15 +403,6 @@ static int __init early_fadump_param(char *p) } early_param("fadump", early_fadump_param); -/* Look for fadump_reserve_mem= cmdline option */ -static int __init early_fadump_reserve_mem(char *p) -{ - if (p) - fw_dump.reserve_bootvar = memparse(p, ); - return 0; -} -early_param("fadump_reserve_mem", early_fadump_reserve_mem); - static void register_fw_dump(struct fadump_mem_struct *fdm) { int rc; ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v3 1/2] kexec: refactor code parsing size based on memory range
crashkernel parameter supports different syntaxes to specify the amount of memory to be reserved for kdump kernel. Below is one of the supported syntaxes that needs parsing to find the memory size to reserve, based on memory range: crashkernel=:[,:,...] While such parsing is implemented for crashkernel parameter, it applies to other parameters, like fadump_reserve_mem=, which could use similar syntax. This patch moves crashkernel's parsing code for above syntax to to kernel/params.c file for reuse. Two functions is_param_range_based() and parse_mem_range_size() are added to kernel/params.c file for this purpose. Any parameter that uses the above syntax can use is_param_range_based() function to validate the syntax and parse_mem_range_size() function to get the parsed memory size. While some code is moved to kernel/params.c file, there is no change functionality wise in parsing the crashkernel parameter. Signed-off-by: Hari Bathini <hbath...@linux.vnet.ibm.com> --- Changes from v2: 1. Moved the code to lib/cmdline.c instead of kernel/params.c include/linux/kernel.h |5 ++ kernel/kexec_core.c| 63 ++--- lib/cmdline.c | 104 3 files changed, 114 insertions(+), 58 deletions(-) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index d96a611..39ff869 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -435,6 +435,11 @@ extern char *get_options(const char *str, int nints, int *ints); extern unsigned long long memparse(const char *ptr, char **retptr); extern bool parse_option_str(const char *str, const char *option); +extern bool __init is_colon_in_param(const char *cmdline); +extern unsigned long long __init parse_mem_range_size(const char *param, + char **str, + unsigned long long system_ram); + extern int core_kernel_text(unsigned long addr); extern int core_kernel_data(unsigned long addr); extern int __kernel_text_address(unsigned long addr); diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 5616755..152c4c1 100644 --- 
a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1104,59 +1104,9 @@ static int __init parse_crashkernel_mem(char *cmdline, char *cur = cmdline, *tmp; /* for each entry of the comma-separated list */ - do { - unsigned long long start, end = ULLONG_MAX, size; - - /* get the start of the range */ - start = memparse(cur, ); - if (cur == tmp) { - pr_warn("crashkernel: Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (*cur != '-') { - pr_warn("crashkernel: '-' expected\n"); - return -EINVAL; - } - cur++; - - /* if no ':' is here, than we read the end */ - if (*cur != ':') { - end = memparse(cur, ); - if (cur == tmp) { - pr_warn("crashkernel: Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (end <= start) { - pr_warn("crashkernel: end <= start\n"); - return -EINVAL; - } - } - - if (*cur != ':') { - pr_warn("crashkernel: ':' expected\n"); - return -EINVAL; - } - cur++; - - size = memparse(cur, ); - if (cur == tmp) { - pr_warn("Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (size >= system_ram) { - pr_warn("crashkernel: invalid size\n"); - return -EINVAL; - } - - /* match ? */ - if (system_ram >= start && system_ram < end) { - *crash_size = size; - break; - } - } while (*cur++ == ','); + *crash_size = parse_mem_range_size("crashkernel", , system_ram); + if (cur == cmdline) + return -EINVAL; if (*crash_size > 0) { while (*cur && *cur != ' ' && *cur != '@') @@ -1293,7 +1243,6 @@ static int __init __parse_crashkernel(char *cmdline, const char *name, const char *suffix) { - char*first_colon, *first_space; char*ck_cmdline; BUG_ON(!crash_size || !crash_bas
[PATCH v3 0/2] powerpc/fadump: support memory range syntax for fadump memory reservation
This patchset adds support to specify the memory size to reserve for fadump based on the system memory range. The crashkernel parameter already supports such syntax. The first patch refactors the parsing code of crashkernel parameter for reuse. The second patch uses the newly refactored parsing code to reserve memory for fadump based on system memory size. --- Hari Bathini (2): kexec: refactor code parsing size based on memory range powerpc/fadump: parse fadump reserve memory size based on memory range arch/powerpc/kernel/fadump.c | 63 ++--- include/linux/kernel.h |5 ++ kernel/kexec_core.c | 63 ++--- lib/cmdline.c| 104 ++ 4 files changed, 168 insertions(+), 67 deletions(-)
[PATCH v3 2/2] powerpc/fadump: parse fadump reserve memory size based on memory range
When fadump is enabled, by default 5% of system RAM is reserved for fadump kernel. While that works for most cases, it is not good enough for every case. Currently, to override the default value, fadump supports specifying memory to reserve with fadump_reserve_mem=size, where only a fixed size can be specified. This patch adds support to specify memory size to reserve for different memory ranges as below: fadump_reserve_mem=<range1>:<size1>[,<range2>:<size2>,...] Supporting range based input for "fadump_reserve_mem" parameter helps using the same commandline parameter for different system memory sizes. Signed-off-by: Hari Bathini <hbath...@linux.vnet.ibm.com> Reviewed-by: Mahesh J Salgaonkar <mah...@linux.vnet.ibm.com> --- Changes from v2: 1. Updated changelog arch/powerpc/kernel/fadump.c | 63 -- 1 file changed, 54 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index b3a6633..7c01b5b 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -193,6 +193,55 @@ static unsigned long init_fadump_mem_struct(struct fadump_mem_struct *fdm, return addr; } +/* + * This function parses command line for fadump_reserve_mem= + * + * Supports the below two syntaxes: + *1. fadump_reserve_mem=size + *2. fadump_reserve_mem=ramsize-range:size[,...] + * + * Sets fw_dump.reserve_bootvar with the memory size + * provided, 0 otherwise + * + * The function returns -EINVAL on failure, 0 otherwise. 
+ */ +static int __init parse_fadump_reserve_mem(void) +{ + char *name = "fadump_reserve_mem="; + char *fadump_cmdline = NULL, *cur; + + fw_dump.reserve_bootvar = 0; + + /* find fadump_reserve_mem and use the last one if there are many */ + cur = strstr(boot_command_line, name); + while (cur) { + fadump_cmdline = cur; + cur = strstr(cur+1, name); + } + + /* when no fadump_reserve_mem= cmdline option is provided */ + if (!fadump_cmdline) + return 0; + + fadump_cmdline += strlen(name); + + /* for fadump_reserve_mem=size cmdline syntax */ + if (!is_colon_in_param(fadump_cmdline)) { + fw_dump.reserve_bootvar = memparse(fadump_cmdline, NULL); + return 0; + } + + /* for fadump_reserve_mem=ramsize-range:size[,...] cmdline syntax */ + cur = fadump_cmdline; + fw_dump.reserve_bootvar = parse_mem_range_size("fadump_reserve_mem", + , memblock_phys_mem_size()); + if (cur == fadump_cmdline) { + return -EINVAL; + } + + return 0; +} + /** * fadump_calculate_reserve_size(): reserve variable boot area 5% of System RAM * @@ -212,12 +261,17 @@ static inline unsigned long fadump_calculate_reserve_size(void) { unsigned long size; + /* sets fw_dump.reserve_bootvar */ + parse_fadump_reserve_mem(); + /* * Check if the size is specified through fadump_reserve_mem= cmdline * option. If yes, then use that. */ if (fw_dump.reserve_bootvar) return fw_dump.reserve_bootvar; + else + printk(KERN_INFO "fadump: calculating default boot size\n"); /* divide by 20 to get 5% of value */ size = memblock_end_of_DRAM() / 20; @@ -348,15 +402,6 @@ static int __init early_fadump_param(char *p) } early_param("fadump", early_fadump_param); -/* Look for fadump_reserve_mem= cmdline option */ -static int __init early_fadump_reserve_mem(char *p) -{ - if (p) - fw_dump.reserve_bootvar = memparse(p, ); - return 0; -} -early_param("fadump_reserve_mem", early_fadump_reserve_mem); - static void register_fw_dump(struct fadump_mem_struct *fdm) { int rc;
Re: [v2,1/2] refactor code parsing size based on memory range
Ping.. On Friday 24 June 2016 10:45 PM, Hari Bathini wrote: On 06/24/2016 10:56 AM, Michael Ellerman wrote: On Wed, 2016-22-06 at 19:25:26 UTC, Hari Bathini wrote: Currently, crashkernel parameter supports the below syntax to parse size based on memory range: crashkernel=:[,:,...] While such parsing is implemented for crashkernel parameter, it applies to other parameters with similar syntax. So, move this code to a more generic place for code reuse. Cc: Eric Biederman <ebied...@xmission.com> Cc: Vivek Goyal <vgo...@redhat.com> Cc: Rusty Russell <ru...@rustcorp.com.au> Cc: ke...@lists.infradead.org Signed-off-by: Hari Bathini <hbath...@linux.vnet.ibm.com> Hari, it's not immediately clear that this makes no change to the logic in the kexec code. Can you reply with a longer change log explaining why the old & new logic is the same for kexec. Hi Michael, Please consider this changelog for this patch: -- crashkernel parameter supports different syntaxes to specify the amount of memory to be reserved for kdump kernel. Below is one of the supported syntaxes that needs parsing to find the memory size to reserve, based on memory range: crashkernel=:[,:,...] While such parsing is implemented for crashkernel parameter, it applies to other parameters, like fadump_reserve_mem, which could use similar syntax. So, to reuse code, moving the code that checks if the parameter syntax is as above and also the code that parses memory size to reserve, for this syntax. While the code is moved to kernel/params.c file, there is no change in logic for crashkernel parameter parsing as the moved code is invoked with function calls at appropriate places. 
-- Thanks Hari diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 94aa10f..72f55e5 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -436,6 +436,11 @@ extern char *get_options(const char *str, int nints, int *ints); extern unsigned long long memparse(const char *ptr, char **retptr); extern bool parse_option_str(const char *str, const char *option); +extern bool __init is_param_range_based(const char *cmdline); +extern unsigned long long __init parse_mem_range_size(const char *param, + char **str, + unsigned long long system_ram); + extern int core_kernel_text(unsigned long addr); extern int core_kernel_data(unsigned long addr); extern int __kernel_text_address(unsigned long addr); diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 56b3ed0..d43f5cc 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1083,59 +1083,9 @@ static int __init parse_crashkernel_mem(char *cmdline, char *cur = cmdline, *tmp; /* for each entry of the comma-separated list */ -do { -unsigned long long start, end = ULLONG_MAX, size; - -/* get the start of the range */ -start = memparse(cur, ); -if (cur == tmp) { -pr_warn("crashkernel: Memory value expected\n"); -return -EINVAL; -} -cur = tmp; -if (*cur != '-') { -pr_warn("crashkernel: '-' expected\n"); -return -EINVAL; -} -cur++; - -/* if no ':' is here, than we read the end */ -if (*cur != ':') { -end = memparse(cur, ); -if (cur == tmp) { -pr_warn("crashkernel: Memory value expected\n"); -return -EINVAL; -} -cur = tmp; -if (end <= start) { -pr_warn("crashkernel: end <= start\n"); -return -EINVAL; -} -} - -if (*cur != ':') { -pr_warn("crashkernel: ':' expected\n"); -return -EINVAL; -} -cur++; - -size = memparse(cur, ); -if (cur == tmp) { -pr_warn("Memory value expected\n"); -return -EINVAL; -} -cur = tmp; -if (size >= system_ram) { -pr_warn("crashkernel: invalid size\n"); -return -EINVAL; -} - -/* match ? 
*/ -if (system_ram >= start && system_ram < end) { -*crash_size = size; -break; -} -} while (*cur++ == ','); +*crash_size = parse_mem_range_size("crashkernel", , system_ram); +if (cur == cmdline) +return -EINVAL; if (*crash_size > 0) { while (*cur && *cur != ' ' && *cur != '@') @@ -1272,7 +1222,6 @@ static int __init __parse_crashkernel(char *cmdline, const char *name, const char *suffix) { -char*first_colon, *first_space; char*ck_cmdline; BUG_ON(!crash_size || !crash_base); @@ -1290,12 +1239,10 @@ static int __init __parse_crashkernel(char *cmdline,
Re: [v2,1/2] refactor code parsing size based on memory range
On 07/05/2016 10:48 AM, Michael Ellerman wrote: On 06/24/2016 10:56 AM, Michael Ellerman wrote: On Wed, 2016-22-06 at 19:25:26 UTC, Hari Bathini wrote: ... While the code is moved to kernel/params.c file, there is no change in logic for crashkernel parameter parsing as the moved code is invoked with function calls at appropriate places. Hi Michael, Are you sure that's true? Yes. I tested it. The old code would return -EINVAL from parse_crashkernel_mem() for any error, regardless of whether it had already parsed some of the string. eg: diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 56b3ed0..d43f5cc 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1083,59 +1083,9 @@ static int __init parse_crashkernel_mem(char *cmdline, char *cur = cmdline, *tmp; /* for each entry of the comma-separated list */ - do { - unsigned long long start, end = ULLONG_MAX, size; - - /* get the start of the range */ - start = memparse(cur, ); - if (cur == tmp) { - pr_warn("crashkernel: Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (*cur != '-') { - pr_warn("crashkernel: '-' expected\n"); - return -EINVAL; - } - cur++; - - /* if no ':' is here, than we read the end */ - if (*cur != ':') { - end = memparse(cur, ); - if (cur == tmp) { - pr_warn("crashkernel: Memory value expected\n"); - return -EINVAL; - } So eg, if I give it "128M-foo" it will modify cur, and then error out here ^ It does modify cur (local variable) but that would have no bearing on parsing logic as we are returning immediately.. You've changed that to: + *crash_size = parse_mem_range_size("crashkernel", , system_ram); + if (cur == cmdline) + return -EINVAL; Which only returns EINVAL if cur is not modified at all. I think the confusion is with the same local variable cur in parse_crashkernel_mem() & parse_mem_range_size() functions. We modified cur (local variable) in parse_mem_range_size() but the output parameter (char **str) remains unchanged unless we find a match. 
Thanks Hari And looking below: diff --git a/kernel/params.c b/kernel/params.c index a6d6149..84e40ae 100644 --- a/kernel/params.c +++ b/kernel/params.c ... +unsigned long long __init parse_mem_range_size(const char *param, + char **str, + unsigned long long system_ram) +{ + char *cur = *str, *tmp; + unsigned long long mem_size = 0; + + /* for each entry of the comma-separated list */ + do { + unsigned long long start, end = ULLONG_MAX, size; + + /* get the start of the range */ + start = memparse(cur, ); + if (cur == tmp) { + printk(KERN_INFO "%s: Memory value expected\n", param); + return mem_size; + } + cur = tmp; + if (*cur != '-') { + printk(KERN_INFO "%s: '-' expected\n", param); + return mem_size; + } + cur++; + + /* if no ':' is here, than we read the end */ + if (*cur != ':') { + end = memparse(cur, ); + if (cur == tmp) { + printk(KERN_INFO "%s: Memory value expected\n", + param); + return mem_size; If we error out here for example, we have modified cur, so the code above *won't* return EINVAL. Which looks like a behaviour change to me? cheers ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [RESEND][PATCH v2 1/2] kexec: refactor code parsing size based on memory range
Hi Dave Thanks for the review.. On Thursday 04 August 2016 02:56 PM, Dave Young wrote: Hi Hari, On 08/04/16 at 01:03am, Hari Bathini wrote: crashkernel parameter supports different syntaxes to specify the amount of memory to be reserved for kdump kernel. Below is one of the supported syntaxes that needs parsing to find the memory size to reserve, based on memory range: crashkernel=<range1>:<size1>[,<range2>:<size2>,...] While such parsing is implemented for crashkernel parameter, it applies to other parameters, like fadump_reserve_mem=, which could use similar syntax. This patch moves crashkernel's parsing code for above syntax to kernel/params.c file for reuse. Two functions is_param_range_based() and parse_mem_range_size() are added to kernel/params.c file for this purpose. Any parameter that uses the above syntax can use is_param_range_based() function to validate the syntax and parse_mem_range_size() function to get the parsed memory size. While some code is moved to kernel/params.c file, there is no change functionality wise in parsing the crashkernel parameter. Signed-off-by: Hari Bathini <hbath...@linux.vnet.ibm.com> --- Changes from v1: 1. 
Updated changelog include/linux/kernel.h |5 +++ kernel/kexec_core.c| 63 +++- kernel/params.c| 96 3 files changed, 106 insertions(+), 58 deletions(-) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index d96a611..2df7ba2 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -435,6 +435,11 @@ extern char *get_options(const char *str, int nints, int *ints); extern unsigned long long memparse(const char *ptr, char **retptr); extern bool parse_option_str(const char *str, const char *option); +extern bool __init is_param_range_based(const char *cmdline); +extern unsigned long long __init parse_mem_range_size(const char *param, + char **str, + unsigned long long system_ram); + extern int core_kernel_text(unsigned long addr); extern int core_kernel_data(unsigned long addr); extern int __kernel_text_address(unsigned long addr); diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 5616755..3a74024 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1104,59 +1104,9 @@ static int __init parse_crashkernel_mem(char *cmdline, char *cur = cmdline, *tmp; /* for each entry of the comma-separated list */ - do { - unsigned long long start, end = ULLONG_MAX, size; - - /* get the start of the range */ - start = memparse(cur, ); - if (cur == tmp) { - pr_warn("crashkernel: Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (*cur != '-') { - pr_warn("crashkernel: '-' expected\n"); - return -EINVAL; - } - cur++; - - /* if no ':' is here, than we read the end */ - if (*cur != ':') { - end = memparse(cur, ); - if (cur == tmp) { - pr_warn("crashkernel: Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (end <= start) { - pr_warn("crashkernel: end <= start\n"); - return -EINVAL; - } - } - - if (*cur != ':') { - pr_warn("crashkernel: ':' expected\n"); - return -EINVAL; - } - cur++; - - size = memparse(cur, ); - if (cur == tmp) { - pr_warn("Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (size >= 
system_ram) { - pr_warn("crashkernel: invalid size\n"); - return -EINVAL; - } - - /* match ? */ - if (system_ram >= start && system_ram < end) { - *crash_size = size; - break; - } - } while (*cur++ == ','); + *crash_size = parse_mem_range_size("crashkernel", , system_ram); + if (cur == cmdline) + return -EINVAL; if (*crash_size > 0) { while (*cur && *cur != ' ' && *cur != '@') @@ -1293,7 +1243,6 @@ static int __init __parse_crashkernel(char *cmdline, const char *name, const char *suffix) {
Re: [RESEND][PATCH v2 2/2] powerpc/fadump: parse fadump reserve memory size based on memory range
On Thursday 04 August 2016 03:15 PM, Michael Ellerman wrote: Hari Bathini <hbath...@linux.vnet.ibm.com> writes: ... /** * fadump_calculate_reserve_size(): reserve variable boot area 5% of System RAM * @@ -212,12 +262,17 @@ static inline unsigned long fadump_calculate_reserve_size(void) { unsigned long size; + /* sets fw_dump.reserve_bootvar */ + parse_fadump_reserve_mem(); + /* * Check if the size is specified through fadump_reserve_mem= cmdline * option. If yes, then use that. */ if (fw_dump.reserve_bootvar) return fw_dump.reserve_bootvar; + else + printk(KERN_INFO "fadump: calculating default boot size\n"); /* divide by 20 to get 5% of value */ size = memblock_end_of_DRAM() / 20; The code already knows how to reserve 5% based on the size of the machine's memory, as long as no commandline parameter is passed. So why can't we just use that logic? Hi Michael, That is the default value reserved but not a good enough value for every case. It is a bit difficult to come up with a robust formula that works for every case as new kernel changes could make the values obsolete. But it won't be all that difficult to find values that work for different memory ranges for a given kernel version. Passing that as range based input with "fadump_reserve_mem" parameter would work for every memory configuration on a given system, which is what this patch is trying to provide.. Thanks Hari cheers
Re: [RESEND][PATCH v2 2/2] powerpc/fadump: parse fadump reserve memory size based on memory range
On Friday 05 August 2016 12:23 AM, Hari Bathini wrote: On Thursday 04 August 2016 03:15 PM, Michael Ellerman wrote: Hari Bathini <hbath...@linux.vnet.ibm.com> writes: ... /** * fadump_calculate_reserve_size(): reserve variable boot area 5% of System RAM * @@ -212,12 +262,17 @@ static inline unsigned long fadump_calculate_reserve_size(void) { unsigned long size; +/* sets fw_dump.reserve_bootvar */ +parse_fadump_reserve_mem(); + /* * Check if the size is specified through fadump_reserve_mem= cmdline * option. If yes, then use that. */ if (fw_dump.reserve_bootvar) return fw_dump.reserve_bootvar; +else +printk(KERN_INFO "fadump: calculating default boot size\n"); /* divide by 20 to get 5% of value */ size = memblock_end_of_DRAM() / 20; The code already knows how to reserve 5% based on the size of the machine's memory, as long as no commandline parameter is passed. So why can't we just use that logic? Hi Michael, That is the default value reserved but not a good enough value for every case. It is a bit difficult to come up with a robust formula that works for every case as new kernel changes could make the values obsolete. But it won't be all that difficult to find values that work for different memory ranges for a given kernel version. Passing that as range based input with "fadump_reserve_mem" parameter would work for every memory configuration on a given system, which is what this patch is trying to provide.. Hi Michael, You want me to add this to the changelog on respin? Thanks Hari Thanks Hari cheers
Re: [RESEND][PATCH v2 2/2] powerpc/fadump: parse fadump reserve memory size based on memory range
On Monday 08 August 2016 02:26 PM, Michael Ellerman wrote: Hari Bathini <hbath...@linux.vnet.ibm.com> writes: On Friday 05 August 2016 12:23 AM, Hari Bathini wrote: On Thursday 04 August 2016 03:15 PM, Michael Ellerman wrote: The code already knows how to reserve 5% based on the size of the machine's memory, as long as no commandline parameter is passed. So why can't we just use that logic? That is the default value reserved but not a good enough value for every case. It is a bit difficult to come up with a robust formula that works for every case as new kernel changes could make the values obsolete. But it won't be all that difficult to find values that work for different memory ranges for a given kernel version. Passing that as range based input with "fadump_reserve_mem" parameter would work for every memory configuration on a given system, which is what this patch is trying to provide.. You want me to add this to the changelog on respin? Hi Michael, I'm not really convinced. Distros are going to want to specify a fixed set of values for different memory sizes, at least that's what I've seen in the past with kdump. So I don't see why we can't just do that in the kernel with a formula based on memory size, and maybe some other information. Agreed. Such support would be great but this patch is adding support for a new syntax for an existing parameter which should still be good to have? Maybe the formula is more complicated than 5% of RAM, but it shouldn't be *that* much more complicated. Depending on what all kernel versions that need support, this can get ugly? I could be completely wrong though.. Thanks Hari cheers ___ kexec mailing list ke...@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH v2 1/2] fadump: reduce memory consumption for capture kernel
In case of fadump, capture (fadump) kernel boots like a normal kernel. While this has its advantages, the capture kernel would initialize all the components like normal kernel, which may not necessarily be needed for a typical dump capture kernel. So, fadump capture kernel ends up needing more memory than a typical (read kdump) capture kernel to boot. This can be overcome by introducing parameters like fadump_nr_cpus=1, similar to nr_cpus=1 parameter, applicable only when fadump is active. But this approach needs introduction of special parameters applicable only when fadump is active (capture kernel), for every parameter that reduces memory/resource consumption. A better approach would be to pass extra parameters to fadump capture kernel. As firmware leaves the memory contents intact from the time of crash till the new kernel is booted up, parameters to append to capture kernel can be saved in real memory region and retrieved later when the capture kernel is in its early boot process for appending to command line parameters. This patch introduces a new node /sys/kernel/fadump_cmdline_append to specify the parameters to pass to fadump capture kernel, saves them in real memory region and appends these parameters to capture kernel early in its boot process. Signed-off-by: Hari Bathini <hbath...@linux.vnet.ibm.com> --- Changes from v1: * Not changing dump format version to keep compatibility intact. Using start and end markers instead, to check sanity of handover area. * Checking for memory overlap with current kernel before setting up a handover area. 
arch/powerpc/include/asm/fadump.h | 31 +++ arch/powerpc/kernel/fadump.c | 158 + arch/powerpc/kernel/prom.c| 22 + 3 files changed, 210 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h index 0031806..e6b3dc0 100644 --- a/arch/powerpc/include/asm/fadump.h +++ b/arch/powerpc/include/asm/fadump.h @@ -24,6 +24,8 @@ #ifdef CONFIG_FA_DUMP +#include + /* * The RMA region will be saved for later dumping when kernel crashes. * RMA is Real Mode Area, the first block of logical memory address owned @@ -126,6 +128,13 @@ struct fw_dump { /* cmd line option during boot */ unsigned long reserve_bootvar; + /* +* Area to pass info to capture (fadump) kernel. For now, +* we are only passing parameters to append. +*/ + unsigned long handover_area_start; + unsigned long handover_area_size; + unsigned long fadumphdr_addr; unsigned long cpu_notes_buf; unsigned long cpu_notes_buf_size; @@ -159,6 +168,27 @@ static inline u64 str_to_u64(const char *str) #define FADUMP_CRASH_INFO_MAGICSTR_TO_HEX("FADMPINF") #define REGSAVE_AREA_MAGIC STR_TO_HEX("REGSAVE") +/* + * Start address for the area to pass off certain configuration details + * like parameters to append to the commandline for a capture (fadump) kernel. + * Will refer to this area as handover area henceforth. Setting start address + * of handover area to 128MB as this area needs to be accessed in realmode. + */ +#define FADUMP_HANDOVER_AREA_START (1UL << 27) +#define FADUMP_HANDOVER_AREA_SIZE (sizeof(struct fadump_handover_info) \ ++ H_END_MARKER_SIZE) + +#define H_AREA_START_MARKERSTR_TO_HEX("HDRSTART") +#define H_AREA_END_MARKER STR_TO_HEX("HOVEREND") +#define H_END_MARKER_SIZE 8 + +/* config info to be passed to capture kernel */ +struct fadump_handover_info { + u64 start_marker; + u64 size; + charparams[COMMAND_LINE_SIZE/2]; +}; + /* The firmware-assisted dump format. 
* * The register save area is an area in the partition's memory used to preserve @@ -200,6 +230,7 @@ struct fad_crash_memory_ranges { extern int early_init_dt_scan_fw_dump(unsigned long node, const char *uname, int depth, void *data); +extern char *get_fadump_parameters_realmode(void); extern int fadump_reserve_mem(void); extern int setup_fadump(void); extern int is_fadump_active(void); diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 8f0c7c5..eab26e9 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -41,7 +41,6 @@ #include #include #include -#include static struct fw_dump fw_dump; static struct fadump_mem_struct fdm; @@ -51,6 +50,8 @@ static DEFINE_MUTEX(fadump_mutex); struct fad_crash_memory_ranges crash_memory_ranges[INIT_CRASHMEM_RANGES]; int crash_mem_ranges; +extern char _stext[], _end[]; + /* Scan the Firmware Assisted dump configuration details. */ int __init early_init_dt_scan_fw_dump(unsigned long node, const char *uname, int depth, void *data) @@ -74,6 +75,9 @@
[PATCH v2 2/2] fadump: update documentation about introduction of handover area
Update documentation about introduction of handover area that includes configuration details like extra parameters to append to capture kernel. Signed-off-by: Hari Bathini <hbath...@linux.vnet.ibm.com> --- Documentation/powerpc/firmware-assisted-dump.txt | 83 ++ 1 file changed, 53 insertions(+), 30 deletions(-) diff --git a/Documentation/powerpc/firmware-assisted-dump.txt b/Documentation/powerpc/firmware-assisted-dump.txt index 3007bc9..6c6a0e9 100644 --- a/Documentation/powerpc/firmware-assisted-dump.txt +++ b/Documentation/powerpc/firmware-assisted-dump.txt @@ -67,11 +67,17 @@ as follows: -- The freshly booted kernel will notice that there is a new node (ibm,dump-kernel) in the device tree, indicating that - there is crash data available from a previous boot. During - the early boot OS will reserve rest of the memory above - boot memory size effectively booting with restricted memory - size. This will make sure that the second kernel will not - touch any of the dump memory area. + there is crash data available from a previous boot. This + second kernel, where crash data is available from previous + boot, is referred to as capture kernel. The capture kernel, + early during the boot process, looks for a handover area + (see fig. 2), saved by the first kernel to be handed over + to it. This handover area contains certain config info like + extra parameters to append to the capture kernel. The capture + kernel applies this configuration accordingly. Later, reserves + rest of the memory above boot memory size effectively booting + with restricted memory size. This will make sure that the + capture kernel will not touch any of the dump memory area. -- User-space tools will read /proc/vmcore to obtain the contents of memory, which holds the previous crashed kernel dump in ELF @@ -113,15 +119,18 @@ crash does occur. 
o Memory Reservation during first kernel - Low memoryTop of memory - 0 boot memory size | - | | |<--Reserved dump area -->| - V V | Permanent Reservation V - +---+--/ /--+---++---++ - | | |CPU|HPTE| DUMP |ELF | - +---+--/ /--+---++---++ -| ^ -| | + Low memory Top of memory + 0 | + | Handover area | + | | | + | | boot memory size |<-- Reserved dump area-->| + | | | | Permanent Reservation | + V V VV + +--+-++--/ /--+---++---++ + | | || |CPU|HPTE| DUMP |ELF | + +--+-++--/ /--+---++---++ + | ___| ^ + \/ | \ / --- Boot memory content gets transferred to @@ -129,18 +138,21 @@ crash does occur. crash Fig. 1 - o Memory Reservation during second kernel after crash - - Low memoryTop of memory - 0 boot memory size | - | |<- Reserved dump area --- -->| - V V V - +---+--/ /--+---++---++ - | | |CPU|HPTE| DUMP |ELF | - +---+--/ /--+---++---++ -|| -VV - Used by second/proc/vmcore + o Memory Reservation during capture (fadump) kernel after crash + + Low memory Top of memory + 0 | + | Handover area| + | | | + | |boot memory size | + | | |<- Reserved dump area --- -->| + V V V V + +--+-++--/ /--+---++---++ + | | ||
Re: [PATCH v1 1/2] fadump: reduce memory consumption for capture kernel
Hi Mahesh, On Tuesday 31 January 2017 01:05 AM, Mahesh Jagannath Salgaonkar wrote: On 01/30/2017 10:14 PM, Hari Bathini wrote: In case of fadump, capture (fadump) kernel boots like a normal kernel. While this has its advantages, the capture kernel would initialize all the components like normal kernel, which may not necessarily be needed for a typical dump capture kernel. So, fadump capture kernel ends up needing more memory than a typical (read kdump) capture kernel to boot. ... +#define FADUMP_FORMAT_VERSION 0x0002 Why 0x0002 ? Does Phyp now support new version of dump format ? We should be more careful not to break backward compatibility. Dump format version has not changed in Phyp. Undone the change in v2 to keep backward compatibility intact. +static ssize_t fadump_params_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%s\n", + get_fadump_params_buf(__va(fw_dump.handover_area_start))); May be we should show current cmdline + fadump append params. I think it is better to display only append parameters as current cmdline parameters may not be accurate always? Thanks Hari
Re: [PATCH v4 2/5] ia64: reuse append_elf_note() and final_note() functions
On Friday 20 January 2017 11:17 AM, Michael Ellerman wrote: Hari Bathini <hbath...@linux.vnet.ibm.com> writes: Get rid of multiple definitions of append_elf_note() & final_note() functions. Reuse these functions compiled under CONFIG_CRASH_CORE Also, define Elf_Word and use it instead of generic u32 or the more specific Elf64_Word. Signed-off-by: Hari Bathini <hbath...@linux.vnet.ibm.com> --- Changes from v3: * Dropped hard-coded values and used DIV_ROUND_UP(). Changes from v2: * Added a definition for Elf_Word. * Used IA64 version of append_elf_note() and final_note() functions. arch/ia64/kernel/crash.c | 22 -- include/linux/crash_core.h |4 include/linux/elf.h|2 ++ kernel/crash_core.c| 34 ++ kernel/kexec_core.c| 28 5 files changed, 20 insertions(+), 70 deletions(-) Do the powerpc patches later in the series actually depend on this one? Or is this just an unrelated cleanup? As it is I can't merge the series until we at least get an ack on this from the ia64 folks. If you can just split this out as a separate patch that would make it a lot easier to get the rest merged. Hi Michael, append_elf_note() & final_note() functions were defined statically at three different places, arch/powerpc/kernel/fadump.c being one of them. With my changes, I would need to add a fourth static definition if I ignore this cleanup. So, I preferred to clean this up... Let me ping IA64 folks one last time. Will do a respin without the cleanup if I don't get any response from them by end of this week.. Thanks Hari
Re: [PATCH v4 2/5] ia64: reuse append_elf_note() and final_note() functions
On Tuesday 17 January 2017 10:36 PM, Hari Bathini wrote: On Friday 06 January 2017 07:33 AM, Dave Young wrote: On 01/05/17 at 11:01pm, Hari Bathini wrote: Get rid of multiple definitions of append_elf_note() & final_note() functions. Reuse these functions compiled under CONFIG_CRASH_CORE Also, define Elf_Word and use it instead of generic u32 or the more specific Elf64_Word. Signed-off-by: Hari Bathini <hbath...@linux.vnet.ibm.com> --- Changes from v3: * Dropped hard-coded values and used DIV_ROUND_UP(). Changes from v2: * Added a definition for Elf_Word. * Used IA64 version of append_elf_note() and final_note() functions. arch/ia64/kernel/crash.c | 22 -- include/linux/crash_core.h |4 include/linux/elf.h|2 ++ kernel/crash_core.c| 34 ++ kernel/kexec_core.c| 28 5 files changed, 20 insertions(+), 70 deletions(-) diff --git a/arch/ia64/kernel/crash.c b/arch/ia64/kernel/crash.c index 2955f35..75859a0 100644 --- a/arch/ia64/kernel/crash.c +++ b/arch/ia64/kernel/crash.c @@ -27,28 +27,6 @@ static int kdump_freeze_monarch; static int kdump_on_init = 1; static int kdump_on_fatal_mca = 1; -static inline Elf64_Word -*append_elf_note(Elf64_Word *buf, char *name, unsigned type, void *data, -size_t data_len) -{ -struct elf_note *note = (struct elf_note *)buf; -note->n_namesz = strlen(name) + 1; -note->n_descsz = data_len; -note->n_type = type; -buf += (sizeof(*note) + 3)/4; -memcpy(buf, name, note->n_namesz); -buf += (note->n_namesz + 3)/4; -memcpy(buf, data, data_len); -buf += (data_len + 3)/4; -return buf; -} - -static void -final_note(void *buf) -{ -memset(buf, 0, sizeof(struct elf_note)); -} - extern void ia64_dump_cpu_regs(void *); static DEFINE_PER_CPU(struct elf_prstatus, elf_prstatus); diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 18d0f94..541a197 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -55,6 +55,10 @@ extern u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; extern size_t vmcoreinfo_size; extern size_t 
vmcoreinfo_max_size; +Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, + void *data, size_t data_len); +void final_note(Elf_Word *buf); + int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, unsigned long long *crash_base); int parse_crashkernel_high(char *cmdline, unsigned long long system_ram, diff --git a/include/linux/elf.h b/include/linux/elf.h index 20fa8d8..ba069e8 100644 --- a/include/linux/elf.h +++ b/include/linux/elf.h @@ -29,6 +29,7 @@ extern Elf32_Dyn _DYNAMIC []; #define elf_noteelf32_note #define elf_addr_tElf32_Off #define Elf_HalfElf32_Half +#define Elf_WordElf32_Word #else @@ -39,6 +40,7 @@ extern Elf64_Dyn _DYNAMIC []; #define elf_noteelf64_note #define elf_addr_tElf64_Off #define Elf_HalfElf64_Half +#define Elf_WordElf64_Word #endif diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 80b441d..362dace 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -291,32 +291,26 @@ int __init parse_crashkernel_low(char *cmdline, "crashkernel=", suffix_tbl[SUFFIX_LOW]); } -static u32 *append_elf_note(u32 *buf, char *name, unsigned int type, -void *data, size_t data_len) +Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, + void *data, size_t data_len) { -struct elf_note note; - -note.n_namesz = strlen(name) + 1; -note.n_descsz = data_len; -note.n_type = type; -memcpy(buf, , sizeof(note)); -buf += (sizeof(note) + 3)/4; -memcpy(buf, name, note.n_namesz); -buf += (note.n_namesz + 3)/4; -memcpy(buf, data, note.n_descsz); -buf += (note.n_descsz + 3)/4; +struct elf_note *note = (struct elf_note *)buf; + +note->n_namesz = strlen(name) + 1; +note->n_descsz = data_len; +note->n_type = type; +buf += DIV_ROUND_UP(sizeof(*note), sizeof(Elf_Word)); +memcpy(buf, name, note->n_namesz); +buf += DIV_ROUND_UP(note->n_namesz, sizeof(Elf_Word)); +memcpy(buf, data, data_len); +buf += DIV_ROUND_UP(data_len, sizeof(Elf_Word)); return buf; } -static void 
final_note(u32 *buf) +void final_note(Elf_Word *buf) { -struct elf_note note; - -note.n_namesz = 0; -note.n_descsz = 0; -note.n_type = 0; -memcpy(buf, , sizeof(note)); +memset(buf, 0, sizeof(struct elf_note)); } static void update_vmcoreinfo_note(void) diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 2179a16..263d764 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -990,34 +990,6 @@ int crash_shrink_memory(unsigned long new_size) return ret;
[PATCH v1 1/2] fadump: reduce memory consumption for capture kernel
In case of fadump, capture (fadump) kernel boots like a normal kernel. While this has its advantages, the capture kernel would initialize all the components like normal kernel, which may not necessarily be needed for a typical dump capture kernel. So, fadump capture kernel ends up needing more memory than a typical (read kdump) capture kernel to boot. This can be overcome by introducing parameters like fadump_nr_cpus=1, similar to nr_cpus=1 parameter, applicable only when fadump is active. But this approach needs introduction of special parameters applicable only when fadump is active (capture kernel), for every parameter that reduces memory/resource consumption. A better approach would be to pass extra parameters to fadump capture kernel. As firmware leaves the memory contents intact from the time of crash till the new kernel is booted up, parameters to append to capture kernel can be saved in real memory region and retrieved later when the capture kernel is in its early boot process for appending to command line parameters. This patch introduces a new node /sys/kernel/fadump_cmdline_append to specify the parameters to pass to fadump capture kernel, saves them in real memory region and appends these parameters to capture kernel early in its boot process. Signed-off-by: Hari Bathini <hbath...@linux.vnet.ibm.com> --- arch/powerpc/include/asm/fadump.h | 28 arch/powerpc/kernel/fadump.c | 125 - arch/powerpc/kernel/prom.c| 19 ++ 3 files changed, 170 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h index 0031806..484083a 100644 --- a/arch/powerpc/include/asm/fadump.h +++ b/arch/powerpc/include/asm/fadump.h @@ -24,6 +24,8 @@ #ifdef CONFIG_FA_DUMP +#include + /* * The RMA region will be saved for later dumping when kernel crashes. 
* RMA is Real Mode Area, the first block of logical memory address owned @@ -45,6 +47,8 @@ #define memblock_num_regions(memblock_type)(memblock.memblock_type.cnt) +#define FADUMP_FORMAT_VERSION 0x0002 + /* Firmware provided dump sections */ #define FADUMP_CPU_STATE_DATA 0x0001 #define FADUMP_HPTE_REGION 0x0002 @@ -126,6 +130,13 @@ struct fw_dump { /* cmd line option during boot */ unsigned long reserve_bootvar; + /* +* Area to pass info to capture (fadump) kernel. For now, +* we are only passing parameters to append. +*/ + unsigned long handover_area_start; + unsigned long handover_area_size; + unsigned long fadumphdr_addr; unsigned long cpu_notes_buf; unsigned long cpu_notes_buf_size; @@ -159,6 +170,22 @@ static inline u64 str_to_u64(const char *str) #define FADUMP_CRASH_INFO_MAGICSTR_TO_HEX("FADMPINF") #define REGSAVE_AREA_MAGIC STR_TO_HEX("REGSAVE") +/* + * The start address for an area to pass off certain configuration details + * like parameters to append to the commandline for a capture (fadump) kernel. + * Setting it to 128MB as this needs to be accessed in realmode. + */ +#define FADUMP_HANDOVER_AREA_START (1UL << 27) + +#define FADUMP_PARAMS_AREA_MARKER STR_TO_HEX("FADMPCMD") +#define FADUMP_PARAMS_INFO_SIZEsizeof(struct fadump_params_info) + +/* fadump parameters info */ +struct fadump_params_info { + u64 params_area_marker; + charparams[COMMAND_LINE_SIZE/2]; +}; + /* The firmware-assisted dump format. 
* * The register save area is an area in the partition's memory used to preserve @@ -200,6 +227,7 @@ struct fad_crash_memory_ranges { extern int early_init_dt_scan_fw_dump(unsigned long node, const char *uname, int depth, void *data); +extern char *get_fadump_parameters_realmode(void); extern int fadump_reserve_mem(void); extern int setup_fadump(void); extern int is_fadump_active(void); diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 8f0c7c5..bc82d22 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -41,7 +41,6 @@ #include #include #include -#include static struct fw_dump fw_dump; static struct fadump_mem_struct fdm; @@ -74,6 +73,9 @@ int __init early_init_dt_scan_fw_dump(unsigned long node, fw_dump.fadump_supported = 1; fw_dump.ibm_configure_kernel_dump = be32_to_cpu(*token); + fw_dump.handover_area_start = FADUMP_HANDOVER_AREA_START; + fw_dump.handover_area_size = PAGE_ALIGN(FADUMP_PARAMS_INFO_SIZE); + /* * The 'ibm,kernel-dump' rtas node is present only if there is * dump data waiting for us. @@ -147,7 +149,7 @@ static unsigned long init_fadump_mem_struct(struct fadump_mem_struct *fdm, memset(fdm, 0, sizeof(struct fadump_mem_struct)); addr = addr & PAGE_MASK; - fdm->header.dump_form
[PATCH v1 1/2] fadump: reduce memory consumption for capture kernel
In case of fadump, capture (fadump) kernel boots like a normal kernel. While this has its advantages, the capture kernel would initialize all the components like normal kernel, which may not necessarily be needed for a typical dump capture kernel. So, fadump capture kernel ends up needing more memory than a typical (read kdump) capture kernel to boot. This can be overcome by introducing parameters like fadump_nr_cpus=1, similar to nr_cpus=1 parameter, applicable only when fadump is active. But this approach needs introduction of special parameters applicable only when fadump is active (capture kernel), for every parameter that reduces memory/resource consumption. A better approach would be to pass extra parameters to fadump capture kernel. As firmware leaves the memory contents intact from the time of crash till the new kernel is booted up, parameters to append to capture kernel can be saved in real memory region and retrieved later when the capture kernel is in its early boot process for appending to command line parameters. This patch introduces a new node /sys/kernel/fadump_cmdline_append to specify the parameters to pass to fadump capture kernel, saves them in real memory region and appends these parameters to capture kernel early in its boot process. Signed-off-by: Hari Bathini <hbath...@linux.vnet.ibm.com> --- arch/powerpc/include/asm/fadump.h | 28 arch/powerpc/kernel/fadump.c | 125 - arch/powerpc/kernel/prom.c| 19 ++ 3 files changed, 170 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h index 0031806..484083a 100644 --- a/arch/powerpc/include/asm/fadump.h +++ b/arch/powerpc/include/asm/fadump.h @@ -24,6 +24,8 @@ #ifdef CONFIG_FA_DUMP +#include + /* * The RMA region will be saved for later dumping when kernel crashes. 
* RMA is Real Mode Area, the first block of logical memory address owned @@ -45,6 +47,8 @@ #define memblock_num_regions(memblock_type)(memblock.memblock_type.cnt) +#define FADUMP_FORMAT_VERSION 0x0002 + /* Firmware provided dump sections */ #define FADUMP_CPU_STATE_DATA 0x0001 #define FADUMP_HPTE_REGION 0x0002 @@ -126,6 +130,13 @@ struct fw_dump { /* cmd line option during boot */ unsigned long reserve_bootvar; + /* +* Area to pass info to capture (fadump) kernel. For now, +* we are only passing parameters to append. +*/ + unsigned long handover_area_start; + unsigned long handover_area_size; + unsigned long fadumphdr_addr; unsigned long cpu_notes_buf; unsigned long cpu_notes_buf_size; @@ -159,6 +170,22 @@ static inline u64 str_to_u64(const char *str) #define FADUMP_CRASH_INFO_MAGICSTR_TO_HEX("FADMPINF") #define REGSAVE_AREA_MAGIC STR_TO_HEX("REGSAVE") +/* + * The start address for an area to pass off certain configuration details + * like parameters to append to the commandline for a capture (fadump) kernel. + * Setting it to 128MB as this needs to be accessed in realmode. + */ +#define FADUMP_HANDOVER_AREA_START (1UL << 27) + +#define FADUMP_PARAMS_AREA_MARKER STR_TO_HEX("FADMPCMD") +#define FADUMP_PARAMS_INFO_SIZEsizeof(struct fadump_params_info) + +/* fadump parameters info */ +struct fadump_params_info { + u64 params_area_marker; + charparams[COMMAND_LINE_SIZE/2]; +}; + /* The firmware-assisted dump format. 
* * The register save area is an area in the partition's memory used to preserve @@ -200,6 +227,7 @@ struct fad_crash_memory_ranges { extern int early_init_dt_scan_fw_dump(unsigned long node, const char *uname, int depth, void *data); +extern char *get_fadump_parameters_realmode(void); extern int fadump_reserve_mem(void); extern int setup_fadump(void); extern int is_fadump_active(void); diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 8f0c7c5..bc82d22 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -41,7 +41,6 @@ #include #include #include -#include static struct fw_dump fw_dump; static struct fadump_mem_struct fdm; @@ -74,6 +73,9 @@ int __init early_init_dt_scan_fw_dump(unsigned long node, fw_dump.fadump_supported = 1; fw_dump.ibm_configure_kernel_dump = be32_to_cpu(*token); + fw_dump.handover_area_start = FADUMP_HANDOVER_AREA_START; + fw_dump.handover_area_size = PAGE_ALIGN(FADUMP_PARAMS_INFO_SIZE); + /* * The 'ibm,kernel-dump' rtas node is present only if there is * dump data waiting for us. @@ -147,7 +149,7 @@ static unsigned long init_fadump_mem_struct(struct fadump_mem_struct *fdm, memset(fdm, 0, sizeof(struct fadump_mem_struct)); addr = addr & PAGE_MASK; - fdm->header.dump_form
[PATCH v1 2/2] fadump: update documentation about introduction of handover area
Update documentation about introduction of handover area that includes configuration details like extra parameters to append to capture kernel. Signed-off-by: Hari Bathini <hbath...@linux.vnet.ibm.com> --- Documentation/powerpc/firmware-assisted-dump.txt | 83 ++ 1 file changed, 53 insertions(+), 30 deletions(-) diff --git a/Documentation/powerpc/firmware-assisted-dump.txt b/Documentation/powerpc/firmware-assisted-dump.txt index 3007bc9..2da3a3f 100644 --- a/Documentation/powerpc/firmware-assisted-dump.txt +++ b/Documentation/powerpc/firmware-assisted-dump.txt @@ -67,11 +67,17 @@ as follows: -- The freshly booted kernel will notice that there is a new node (ibm,dump-kernel) in the device tree, indicating that - there is crash data available from a previous boot. During - the early boot OS will reserve rest of the memory above - boot memory size effectively booting with restricted memory - size. This will make sure that the second kernel will not - touch any of the dump memory area. + there is crash data available from a previous boot. This + second kernel, where crash data is available from previous + boot, is referred to as capture kernel. The capture kernel, + early during the boot process, looks for a handover area + (see fig. 2), saved by the first kernel to be handed over + to it. This handover area contains certain config info like + extra parameters to append to the capture kernel. The capture + kernel applies this configuration accordingly. Later, reserves + rest of the memory above boot memory size effectively booting + with restricted memory size. This will make sure that the + capture kernel will not touch any of the dump memory area. -- User-space tools will read /proc/vmcore to obtain the contents of memory, which holds the previous crashed kernel dump in ELF @@ -113,15 +119,18 @@ crash does occur. 
o Memory Reservation during first kernel - Low memoryTop of memory - 0 boot memory size | - | | |<--Reserved dump area -->| - V V | Permanent Reservation V - +---+--/ /--+---++---++ - | | |CPU|HPTE| DUMP |ELF | - +---+--/ /--+---++---++ -| ^ -| | + Low memory Top of memory + 0 | + | Handover area | + | | | + | | boot memory size |<-- Reserved dump area-->| + | | | | Permanent Reservation | + V V VV + +--+-++--/ /--+---++---++ + | | || |CPU|HPTE| DUMP |ELF | + +--+-++--/ /--+---++---++ + | ___| ^ + \/ | \ / --- Boot memory content gets transferred to @@ -129,18 +138,21 @@ crash does occur. crash Fig. 1 - o Memory Reservation during second kernel after crash - - Low memoryTop of memory - 0 boot memory size | - | |<- Reserved dump area --- -->| - V V V - +---+--/ /--+---++---++ - | | |CPU|HPTE| DUMP |ELF | - +---+--/ /--+---++---++ -|| -VV - Used by second/proc/vmcore + o Memory Reservation during capture (fadump) kernel after crash + + Low memory Top of memory + 0 | + | Handover area| + | | | + | |boot memory size | + | | |<- Reserved dump area --- -->| + V V V V + +--+-++--/ /--+---++---++ + | | ||
Re: [PATCH v4 2/5] ia64: reuse append_elf_note() and final_note() functions
On Tuesday 24 January 2017 11:53 PM, Tony Luck wrote: On Tue, Jan 24, 2017 at 10:11 AM, Hari Bathini <hbath...@linux.vnet.ibm.com> wrote: Hello IA64 folks, Could you please review this patch..? It looks OK in principle. My lab is in partial disarray at the moment (just got back from a sabbatical) so I can't test build and boot. Have you cross-compiled it (or gotten a success build report from zero-day)? I haven't gotten a success/failure build report from zero-day. Not sure what to make of it. But I did try cross-compiling and it was successful. Should that do? Thanks Hari If you have ... then add an Acked-by: Tony Luck <tony.l...@intel.com> -Tony
[PATCH v2] powerpc/fadump: set an upper limit for boot memory size
By default, 5% of system RAM is reserved for preserving boot memory. Alternatively, a user can specify the amount of memory to reserve. See Documentation/powerpc/firmware-assisted-dump.txt for details. In addition to the memory reserved for preserving boot memory, some more memory is reserved, to save HPTE region, CPU state data and ELF core headers. Memory Reservation during first kernel looks like below: Low memoryTop of memory 0 boot memory size | | | |<--Reserved dump area -->| V V | Permanent Reservation V +---+--/ /--+---++---++ | | |CPU|HPTE| DUMP |ELF | +---+--/ /--+---++---++ | ^ | | \ / --- Boot memory content gets transferred to reserved area by firmware at the time of crash This implicitly means that the sum of the sizes of boot memory, CPU state data, HPTE region, DUMP preserving area and ELF core headers can't be greater than the total memory size. But currently, a user is allowed to specify any value as boot memory size. So, the above rule is violated when a boot memory size around 50% of the total available memory is specified. As the kernel is not handling this currently, it may lead to undefined behavior. Fix it by setting an upper limit for boot memory size to 25% of the total available memory. Also, instead of using memblock_end_of_DRAM(), which doesn't take the holes, if any, in the memory layout into account, use memblock_phys_mem_size() to calculate the percentage of total available memory. Signed-off-by: Hari Bathini <hbath...@linux.vnet.ibm.com> --- This patch is based on top of the patchset to reuse-crashkernel-parameter- for-fadump (http://patchwork.ozlabs.org/patch/711522). Changes from v1: * Using memblock_phys_mem_size() instead of memblock_end_of_DRAM() to get system RAM size. 
arch/powerpc/include/asm/fadump.h |3 +++ arch/powerpc/kernel/fadump.c | 16 +++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h index 60b9108..a3de219 100644 --- a/arch/powerpc/include/asm/fadump.h +++ b/arch/powerpc/include/asm/fadump.h @@ -43,6 +43,9 @@ #define MIN_BOOT_MEM (((RMA_END < (0x1UL << 28)) ? (0x1UL << 28) : RMA_END) \ + (0x1UL << 26)) +/* The upper limit percentage for user specified boot memory size (25%) */ +#define MAX_BOOT_MEM_RATIO 4 + #define memblock_num_regions(memblock_type)(memblock.memblock_type.cnt) /* Firmware provided dump sections */ diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index e013f8f..21d5404 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -221,12 +221,26 @@ static inline unsigned long fadump_calculate_reserve_size(void) ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), , ); if (ret == 0 && size > 0) { + unsigned long max_size; + fw_dump.reserve_bootvar = (unsigned long)size; + + /* +* Adjust if the boot memory size specified is above +* the upper limit. +*/ + max_size = memblock_phys_mem_size() / MAX_BOOT_MEM_RATIO; + if (fw_dump.reserve_bootvar > max_size) { + fw_dump.reserve_bootvar = max_size; + pr_info("Adjusted boot memory size to %luMB\n", + (fw_dump.reserve_bootvar >> 20)); + } + return fw_dump.reserve_bootvar; } /* divide by 20 to get 5% of value */ - size = memblock_end_of_DRAM() / 20; + size = memblock_phys_mem_size() / 20; /* round it down in multiples of 256 */ size = size & ~0x0FFFUL;
Re: [PATCH] powerpc/fadump: set an upper limit for boot memory size
Hi Michael, On Friday 17 February 2017 11:54 AM, Michael Ellerman wrote: Hari Bathini <hbath...@linux.vnet.ibm.com> writes: diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index de7d39a..d5107f4 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -222,6 +222,18 @@ static inline unsigned long fadump_calculate_reserve_size(void) , ); if (ret == 0 && size > 0) { fw_dump.reserve_bootvar = (unsigned long)size; + /* +* Adjust if the boot memory size specified is above +* the upper limit. +*/ + if (fw_dump.reserve_bootvar > + (memblock_end_of_DRAM() / MAX_BOOT_MEM_RATIO)) { Using memblock_end_of_DRAM() doesn't take into account the fact that you might have holes in your memory layout. Possibly on PowerVM that never happens, but I don't think we should write the code to assume that, if possible. I think memblock_phys_mem_size() can fill in.. In the same file, memblock_end_of_DRAM() is also used when nothing is specified through cmdline. Let me also change that and respin.. Thanks Hari
[PATCH] powerpc/fadump: set an upper limit for boot memory size
By default, 5% of system RAM is reserved for preserving boot memory. Alternatively, a user can specify the amount of memory to reserve. See Documentation/powerpc/firmware-assisted-dump.txt for details. In addition to the memory reserved for preserving boot memory, some more memory is reserved, to save HPTE region, CPU state data and ELF core headers. Memory Reservation during first kernel looks like below: Low memoryTop of memory 0 boot memory size | | | |<--Reserved dump area -->| V V | Permanent Reservation V +---+--/ /--+---++---++ | | |CPU|HPTE| DUMP |ELF | +---+--/ /--+---++---++ | ^ | | \ / --- Boot memory content gets transferred to reserved area by firmware at the time of crash The implicit rule here is that the sum of the sizes of boot memory, CPU state data, HPTE region and ELF core headers can't be greater than the total memory size. But currently, a user is allowed to specify any value as boot memory size. So, the above rule is violated when a boot memory size closer to 50% of the total available memory is specified. As the kernel is not handling this currently, it may lead to undefined behavior. Fix it by setting an upper limit for boot memory size to 25% of the total available memory. Signed-off-by: Hari Bathini <hbath...@linux.vnet.ibm.com> --- This patch is based on top of reuse-crashkernel-parameter-for-fadump patchset (https://lists.ozlabs.org/pipermail/linuxppc-dev/2017-January/152724.html) arch/powerpc/include/asm/fadump.h |3 +++ arch/powerpc/kernel/fadump.c | 12 2 files changed, 15 insertions(+) diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h index 60b9108..a3de219 100644 --- a/arch/powerpc/include/asm/fadump.h +++ b/arch/powerpc/include/asm/fadump.h @@ -43,6 +43,9 @@ #define MIN_BOOT_MEM (((RMA_END < (0x1UL << 28)) ? 
(0x1UL << 28) : RMA_END) \ + (0x1UL << 26)) +/* The upper limit percentage for user specified boot memory size (25%) */ +#define MAX_BOOT_MEM_RATIO 4 + #define memblock_num_regions(memblock_type)(memblock.memblock_type.cnt) /* Firmware provided dump sections */ diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index de7d39a..d5107f4 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -222,6 +222,18 @@ static inline unsigned long fadump_calculate_reserve_size(void) , ); if (ret == 0 && size > 0) { fw_dump.reserve_bootvar = (unsigned long)size; + /* +* Adjust if the boot memory size specified is above +* the upper limit. +*/ + if (fw_dump.reserve_bootvar > + (memblock_end_of_DRAM() / MAX_BOOT_MEM_RATIO)) { + fw_dump.reserve_bootvar = (memblock_end_of_DRAM() / + MAX_BOOT_MEM_RATIO); + pr_info("Adjusted boot memory size to %luMB\n", + (fw_dump.reserve_bootvar >> 20)); + } + return fw_dump.reserve_bootvar; }
Re: [PATCH v4 2/5] ia64: reuse append_elf_note() and final_note() functions
On Friday 06 January 2017 07:33 AM, Dave Young wrote: On 01/05/17 at 11:01pm, Hari Bathini wrote: Get rid of multiple definitions of append_elf_note() & final_note() functions. Reuse these functions compiled under CONFIG_CRASH_CORE Also, define Elf_Word and use it instead of generic u32 or the more specific Elf64_Word. Signed-off-by: Hari Bathini <hbath...@linux.vnet.ibm.com> --- Changes from v3: * Dropped hard-coded values and used DIV_ROUND_UP(). Changes from v2: * Added a definition for Elf_Word. * Used IA64 version of append_elf_note() and final_note() functions. arch/ia64/kernel/crash.c | 22 -- include/linux/crash_core.h |4 include/linux/elf.h|2 ++ kernel/crash_core.c| 34 ++ kernel/kexec_core.c| 28 5 files changed, 20 insertions(+), 70 deletions(-) diff --git a/arch/ia64/kernel/crash.c b/arch/ia64/kernel/crash.c index 2955f35..75859a0 100644 --- a/arch/ia64/kernel/crash.c +++ b/arch/ia64/kernel/crash.c @@ -27,28 +27,6 @@ static int kdump_freeze_monarch; static int kdump_on_init = 1; static int kdump_on_fatal_mca = 1; -static inline Elf64_Word -*append_elf_note(Elf64_Word *buf, char *name, unsigned type, void *data, - size_t data_len) -{ - struct elf_note *note = (struct elf_note *)buf; - note->n_namesz = strlen(name) + 1; - note->n_descsz = data_len; - note->n_type = type; - buf += (sizeof(*note) + 3)/4; - memcpy(buf, name, note->n_namesz); - buf += (note->n_namesz + 3)/4; - memcpy(buf, data, data_len); - buf += (data_len + 3)/4; - return buf; -} - -static void -final_note(void *buf) -{ - memset(buf, 0, sizeof(struct elf_note)); -} - extern void ia64_dump_cpu_regs(void *); static DEFINE_PER_CPU(struct elf_prstatus, elf_prstatus); diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 18d0f94..541a197 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -55,6 +55,10 @@ extern u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; extern size_t vmcoreinfo_size; extern size_t vmcoreinfo_max_size; +Elf_Word 
*append_elf_note(Elf_Word *buf, char *name, unsigned int type, + void *data, size_t data_len); +void final_note(Elf_Word *buf); + int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, unsigned long long *crash_base); int parse_crashkernel_high(char *cmdline, unsigned long long system_ram, diff --git a/include/linux/elf.h b/include/linux/elf.h index 20fa8d8..ba069e8 100644 --- a/include/linux/elf.h +++ b/include/linux/elf.h @@ -29,6 +29,7 @@ extern Elf32_Dyn _DYNAMIC []; #define elf_note elf32_note #define elf_addr_tElf32_Off #define Elf_Half Elf32_Half +#define Elf_Word Elf32_Word #else @@ -39,6 +40,7 @@ extern Elf64_Dyn _DYNAMIC []; #define elf_note elf64_note #define elf_addr_tElf64_Off #define Elf_Half Elf64_Half +#define Elf_Word Elf64_Word #endif diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 80b441d..362dace 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -291,32 +291,26 @@ int __init parse_crashkernel_low(char *cmdline, "crashkernel=", suffix_tbl[SUFFIX_LOW]); } -static u32 *append_elf_note(u32 *buf, char *name, unsigned int type, - void *data, size_t data_len) +Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, + void *data, size_t data_len) { - struct elf_note note; - - note.n_namesz = strlen(name) + 1; - note.n_descsz = data_len; - note.n_type = type; - memcpy(buf, , sizeof(note)); - buf += (sizeof(note) + 3)/4; - memcpy(buf, name, note.n_namesz); - buf += (note.n_namesz + 3)/4; - memcpy(buf, data, note.n_descsz); - buf += (note.n_descsz + 3)/4; + struct elf_note *note = (struct elf_note *)buf; + + note->n_namesz = strlen(name) + 1; + note->n_descsz = data_len; + note->n_type = type; + buf += DIV_ROUND_UP(sizeof(*note), sizeof(Elf_Word)); + memcpy(buf, name, note->n_namesz); + buf += DIV_ROUND_UP(note->n_namesz, sizeof(Elf_Word)); + memcpy(buf, data, data_len); + buf += DIV_ROUND_UP(data_len, sizeof(Elf_Word)); return buf; } -static void final_note(u32 
*buf) +void final_note(Elf_Word *buf) { - struct elf_note note; - - note.n_namesz = 0; - note.n_descsz = 0; - note.n_type = 0; - memcpy(buf, , sizeof(note)); + memset(buf, 0, sizeof(struct elf_note)); } static void update_vmcoreinfo_note(void) diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 2179a16..263d764 100644 -
Re: [PATCH v2] powerpc/mm: export current mmu mode info
Hi Michael/Aneesh, Thanks for reviewing the patch.. On Friday 23 September 2016 04:40 PM, Michael Ellerman wrote: Hari Bathini <hbath...@linux.vnet.ibm.com> writes: diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index e2fb408..558987c 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -199,6 +199,21 @@ static inline void mmu_clear_feature(unsigned long feature) extern unsigned int __start___mmu_ftr_fixup, __stop___mmu_ftr_fixup; +/* + * Possible MMU modes + */ +#define MMU_MODE_NONE 0 +#define MMU_MODE_RADIX 1 +#define MMU_MODE_HASH 2 +#define MMU_MODE_HASH32 3 +#define MMU_MODE_NOHASH 4 +#define MMU_MODE_NOHASH32 5 These are already defined in the same file: /* * MMU families */ #define MMU_FTR_HPTE_TABLE ASM_CONST(0x0001) #define MMU_FTR_TYPE_8xxASM_CONST(0x0002) #define MMU_FTR_TYPE_40xASM_CONST(0x0004) #define MMU_FTR_TYPE_44xASM_CONST(0x0008) #define MMU_FTR_TYPE_FSL_E ASM_CONST(0x0010) #define MMU_FTR_TYPE_47xASM_CONST(0x0020) #define MMU_FTR_TYPE_RADIX ASM_CONST(0x0040) And the values for the current CPU are in cur_cpu_spec->mmu_features. I primarily tried to introduce this patch as crash tool doesn't have access to offset info (which is needed to access structure member mmu_features) early in it's initialization process. So if you must export anything, make it that value, and hopefully the rest of the patch goes away. On second thought, as long as we can get the vmemmap start address, for which we have a variable already, we can push finding of MMU type for later. I may need no kernel patch in that case. Working on patches for crash & makedumpfile tools accordingly. Will post a v3 only if that doesn't work out.. Thanks Hari
Re: [PATCH v2] powerpc/mm: export current mmu mode info
On Friday 23 September 2016 10:14 AM, Aneesh Kumar K.V wrote: Hari Bathini <hbath...@linux.vnet.ibm.com> writes: Hi Aneesh, On Thursday 22 September 2016 09:54 PM, Aneesh Kumar K.V wrote: Hari Bathini <hbath...@linux.vnet.ibm.com> writes: The kernel now supports both radix and hash MMU modes. Tools like crash and makedumpfile need to know the current MMU mode the kernel is using, to debug/analyze it. The current MMU mode depends on hardware support and also whether disable_radix cmdline parameter is passed to the kernel. The mmu_features member of cpu_spec structure holds the current MMU mode a cpu is using. But the above mentioned tools need to know the MMU mode early in their init process, when they may not have access to offset info of structure members. A hard-coded offset may help but it won't be robust. IIUC, you walk the linux page table and that should be more or less same Taking the case of crash tool, vmemmap start value is currently hard-coded to 0xf000UL but it changed to 0xc00aUL in case of radix. All of that is already defined as variables in the kernel. You can look at radix__early_init_mmu(). between radix/hash right except few bits. Now what crash will be interested in will be the RPN part of the table which should be same between hash/radix. Though the walk is pretty much the same, the tool still needs to know the right index values and vmemmap start to use, as they are different for radix and hash.. This patch introduces a new global variable, which holds the current MMU mode the kernel is running in and can be accessed by tools early in thier init process, Init process of what ? kernel or crash tool ? tool initialization - crash or makedumpfile.. helping tools to initialize accurately for each MMU mode. This patch also optimizes the radix_enabled() function call. how do you differentiate between the hold linux page table format and the new ? Can you also summarize what crash tool look for in the page table ? 
It needs the index sizes, masked bit values and page flag info to do the page table walk. Since they can be different for hash and radix.. Can you look at radix__early_init_mmu/hash__early_init_mmu and see you can work with the variables defined there ? Did consider that but didn't opt for it for a few reasons: 1. Will still need to know the MMU mode as huge page address translation is not the same for radix & hash. 2. Will have to get all these values from a crashed kernel when I can set them based on MMU mode. Less dependence on the failed kernel, the better.. 3. Stash more variables in vmcoreinfo (for makedumpfile) when one is sufficient to serve the purpose. Thanks Hari
[PATCH] ppc64/book3s: export mmu type info
The kernel now supports both radix and hash MMU modes. Tools like crash and makedumpfile need to know the current MMU mode the kernel is using to debug/analyze the kernel. The current MMU mode depends on H/W support and also whether disable_radix cmdline parameter is passed to the kernel. The mmu_features member of cpu_spec structure holds the current MMU mode a cpu is using. But the above mentioned tools need to know the MMU mode early in their init process when they have no access to offset info of structure members. A hard-coded offset may help but it won't be robust. This patch introduces a new global variable, which holds the current MMU mode the kernel is running in and can be accessed by tools early in their init process, helping tools to initialize accurately for each MMU mode. Signed-off-by: Hari Bathini <hbath...@linux.vnet.ibm.com> --- arch/powerpc/include/asm/book3s/64/mmu.h |5 + arch/powerpc/include/asm/book3s/64/pgtable.h |6 ++ arch/powerpc/kernel/machine_kexec.c |3 +++ arch/powerpc/mm/hash_utils_64.c |2 ++ arch/powerpc/mm/pgtable-radix.c |2 ++ arch/powerpc/mm/pgtable_64.c |6 ++ 6 files changed, 24 insertions(+) diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index 8afb0e0..af68df3 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -30,6 +30,11 @@ extern struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; #ifndef __ASSEMBLY__ /* + * current MMU mode + */ +extern unsigned int current_mmu_mode; + +/* * ISA 3.0 partiton and process table entry format */ struct prtb_entry { diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 263bf39..f7faebd 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -2,6 +2,12 @@ #define _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ /* + * Possible MMU modes + */ +#define HASH_MMU_MODE 0 +#define RADIX_MMU_MODE 1 + +/* * 
Common bits between hash and Radix page table */ #define _PAGE_BIT_SWAP_TYPE0 diff --git a/arch/powerpc/kernel/machine_kexec.c b/arch/powerpc/kernel/machine_kexec.c index 2694d07..4ecc184 100644 --- a/arch/powerpc/kernel/machine_kexec.c +++ b/arch/powerpc/kernel/machine_kexec.c @@ -77,6 +77,9 @@ void arch_crash_save_vmcoreinfo(void) VMCOREINFO_SYMBOL(contig_page_data); #endif #if defined(CONFIG_PPC64) && defined(CONFIG_SPARSEMEM_VMEMMAP) +#ifdef CONFIG_PPC_BOOK3S + VMCOREINFO_SYMBOL(current_mmu_mode); +#endif VMCOREINFO_SYMBOL(vmemmap_list); VMCOREINFO_SYMBOL(mmu_vmemmap_psize); VMCOREINFO_SYMBOL(mmu_psize_defs); diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 0821556..3c7855a 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -886,6 +886,8 @@ void __init hash__early_init_devtree(void) void __init hash__early_init_mmu(void) { + current_mmu_mode = HASH_MMU_MODE; + htab_init_page_sizes(); /* diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index af897d9..98fbc97 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -298,6 +298,8 @@ void __init radix__early_init_mmu(void) { unsigned long lpcr; + current_mmu_mode = RADIX_MMU_MODE; + #ifdef CONFIG_PPC_64K_PAGES /* PAGE_SIZE mappings */ mmu_virtual_psize = MMU_PAGE_64K; diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index f5e8d4e..04319ac 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -63,6 +63,12 @@ #ifdef CONFIG_PPC_BOOK3S_64 /* + * current MMU mode + */ +unsigned int current_mmu_mode; +EXPORT_SYMBOL(current_mmu_mode); + +/* * partition table and process table for ISA 3.0 */ struct prtb_entry *process_tb;
Re: [PATCH v2] powerpc/mm: export current mmu mode info
On Thursday 22 September 2016 09:32 PM, Hari Bathini wrote: The kernel now supports both radix and hash MMU modes. Tools like crash and makedumpfile need to know the current MMU mode the kernel is using, to debug/analyze it. The current MMU mode depends on hardware support and also whether disable_radix cmdline parameter is passed to the kernel. The mmu_features member of cpu_spec structure holds the current MMU mode a cpu is using. But the above mentioned tools need to know the MMU mode early in their init process, when they may not have access to offset info of structure members. A hard-coded offset may help but it won't be robust. This patch introduces a new global variable, which holds the current MMU mode the kernel is running in and can be accessed by tools early in thier init process, helping tools to initialize accurately for each MMU mode. This patch also optimizes the radix_enabled() function call. Signed-off-by: Hari Bathini <hbath...@linux.vnet.ibm.com> --- Changes from v1: * Patch name changed from "ppc64/book3s: export mmu type info" * Optimized radix_enabled() function arch/powerpc/include/asm/mmu.h | 22 +- arch/powerpc/kernel/machine_kexec.c |3 +++ arch/powerpc/mm/hash_utils_64.c |2 ++ arch/powerpc/mm/pgtable-radix.c |2 ++ arch/powerpc/mm/pgtable.c |6 ++ arch/powerpc/mm/tlb_hash32.c|1 + arch/powerpc/mm/tlb_nohash.c|2 ++ 7 files changed, 37 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index e2fb408..558987c 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -199,6 +199,21 @@ static inline void mmu_clear_feature(unsigned long feature) extern unsigned int __start___mmu_ftr_fixup, __stop___mmu_ftr_fixup; +/* + * Possible MMU modes + */ +#define MMU_MODE_NONE 0 +#define MMU_MODE_RADIX 1 +#define MMU_MODE_HASH 2 +#define MMU_MODE_HASH32 3 +#define MMU_MODE_NOHASH 4 +#define MMU_MODE_NOHASH32 5 + +/* + * current MMU mode + */ +extern unsigned int current_mmu_mode 
__read_mostly; + #ifdef CONFIG_PPC64 /* This is our real memory area size on ppc64 server, on embedded, we * make it match the size our of bolted TLB area @@ -218,7 +233,12 @@ static inline void assert_pte_locked(struct mm_struct *mm, unsigned long addr) #ifdef CONFIG_PPC_RADIX_MMU static inline bool radix_enabled(void) { - return mmu_has_feature(MMU_FTR_TYPE_RADIX); + if (current_mmu_mode == MMU_MODE_RADIX) + return true; + else if (current_mmu_mode != MMU_MODE_NONE) + return false; + else + return mmu_has_feature(MMU_FTR_TYPE_RADIX); } static inline bool early_radix_enabled(void) diff --git a/arch/powerpc/kernel/machine_kexec.c b/arch/powerpc/kernel/machine_kexec.c index 2694d07..4ecc184 100644 --- a/arch/powerpc/kernel/machine_kexec.c +++ b/arch/powerpc/kernel/machine_kexec.c @@ -77,6 +77,9 @@ void arch_crash_save_vmcoreinfo(void) VMCOREINFO_SYMBOL(contig_page_data); #endif #if defined(CONFIG_PPC64) && defined(CONFIG_SPARSEMEM_VMEMMAP) +#ifdef CONFIG_PPC_BOOK3S + VMCOREINFO_SYMBOL(current_mmu_mode); Oops! This doesn't have to be under any flag. Let me resend. Thanks Hari
[PATCH v2] powerpc/mm: export current mmu mode info
The kernel now supports both radix and hash MMU modes. Tools like crash and makedumpfile need to know the current MMU mode the kernel is using, to debug/analyze it. The current MMU mode depends on hardware support and also whether disable_radix cmdline parameter is passed to the kernel. The mmu_features member of cpu_spec structure holds the current MMU mode a cpu is using. But the above mentioned tools need to know the MMU mode early in their init process, when they may not have access to offset info of structure members. A hard-coded offset may help but it won't be robust. This patch introduces a new global variable, which holds the current MMU mode the kernel is running in and can be accessed by tools early in thier init process, helping tools to initialize accurately for each MMU mode. This patch also optimizes the radix_enabled() function call. Signed-off-by: Hari Bathini <hbath...@linux.vnet.ibm.com> --- Changes from v1: * Patch name changed from "ppc64/book3s: export mmu type info" * Optimized radix_enabled() function arch/powerpc/include/asm/mmu.h | 22 +- arch/powerpc/kernel/machine_kexec.c |3 +++ arch/powerpc/mm/hash_utils_64.c |2 ++ arch/powerpc/mm/pgtable-radix.c |2 ++ arch/powerpc/mm/pgtable.c |6 ++ arch/powerpc/mm/tlb_hash32.c|1 + arch/powerpc/mm/tlb_nohash.c|2 ++ 7 files changed, 37 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index e2fb408..558987c 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -199,6 +199,21 @@ static inline void mmu_clear_feature(unsigned long feature) extern unsigned int __start___mmu_ftr_fixup, __stop___mmu_ftr_fixup; +/* + * Possible MMU modes + */ +#define MMU_MODE_NONE 0 +#define MMU_MODE_RADIX 1 +#define MMU_MODE_HASH 2 +#define MMU_MODE_HASH32 3 +#define MMU_MODE_NOHASH 4 +#define MMU_MODE_NOHASH32 5 + +/* + * current MMU mode + */ +extern unsigned int current_mmu_mode __read_mostly; + #ifdef CONFIG_PPC64 /* This is our real 
memory area size on ppc64 server, on embedded, we * make it match the size our of bolted TLB area @@ -218,7 +233,12 @@ static inline void assert_pte_locked(struct mm_struct *mm, unsigned long addr) #ifdef CONFIG_PPC_RADIX_MMU static inline bool radix_enabled(void) { - return mmu_has_feature(MMU_FTR_TYPE_RADIX); + if (current_mmu_mode == MMU_MODE_RADIX) + return true; + else if (current_mmu_mode != MMU_MODE_NONE) + return false; + else + return mmu_has_feature(MMU_FTR_TYPE_RADIX); } static inline bool early_radix_enabled(void) diff --git a/arch/powerpc/kernel/machine_kexec.c b/arch/powerpc/kernel/machine_kexec.c index 2694d07..4ecc184 100644 --- a/arch/powerpc/kernel/machine_kexec.c +++ b/arch/powerpc/kernel/machine_kexec.c @@ -77,6 +77,9 @@ void arch_crash_save_vmcoreinfo(void) VMCOREINFO_SYMBOL(contig_page_data); #endif #if defined(CONFIG_PPC64) && defined(CONFIG_SPARSEMEM_VMEMMAP) +#ifdef CONFIG_PPC_BOOK3S + VMCOREINFO_SYMBOL(current_mmu_mode); +#endif VMCOREINFO_SYMBOL(vmemmap_list); VMCOREINFO_SYMBOL(mmu_vmemmap_psize); VMCOREINFO_SYMBOL(mmu_psize_defs); diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 0821556..a566a95 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -886,6 +886,8 @@ void __init hash__early_init_devtree(void) void __init hash__early_init_mmu(void) { + current_mmu_mode = MMU_MODE_HASH; + htab_init_page_sizes(); /* diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index af897d9..4b0ad48 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -298,6 +298,8 @@ void __init radix__early_init_mmu(void) { unsigned long lpcr; + current_mmu_mode = MMU_MODE_RADIX; + #ifdef CONFIG_PPC_64K_PAGES /* PAGE_SIZE mappings */ mmu_virtual_psize = MMU_PAGE_64K; diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index 0b6fb24..4638a00 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -31,6 +31,12 
@@ #include #include +/* + * current MMU mode + */ +unsigned int current_mmu_mode __read_mostly = MMU_MODE_NONE; +EXPORT_SYMBOL(current_mmu_mode); + static inline int is_exec_fault(void) { return current->thread.regs && TRAP(current->thread.regs) == 0x400; diff --git a/arch/powerpc/mm/tlb_hash32.c b/arch/powerpc/mm/tlb_hash32.c index 702d768..0b55425 100644 --- a/arch/powerpc/mm/tlb_hash32.c +++ b/arch/powerpc/mm/tlb_hash32.c @@ -170,4 +170,5 @@ EXPORT_SYMBOL(flush_tlb_range); void __init early_init_mmu(void) { + current_mmu_mode = MMU_MODE_HASH32; } diff --git a/arch/powerpc/mm/tlb_nohash.
[RESEND PATCH v2] powerpc/mm: export current mmu mode info
The kernel now supports both radix and hash MMU modes. Tools like crash and makedumpfile need to know the current MMU mode the kernel is using, to debug/analyze it. The current MMU mode depends on hardware support and also whether disable_radix cmdline parameter is passed to the kernel. The mmu_features member of cpu_spec structure holds the current MMU mode a cpu is using. But the above mentioned tools need to know the MMU mode early in their init process, when they may not have access to offset info of structure members. A hard-coded offset may help but it won't be robust. This patch introduces a new global variable, which holds the current MMU mode the kernel is running in and can be accessed by tools early in their init process, helping tools to initialize accurately for each MMU mode. This patch also optimizes the radix_enabled() function call. Signed-off-by: Hari Bathini <hbath...@linux.vnet.ibm.com> --- Changes from v1: * Patch name changed from "ppc64/book3s: export mmu type info" * Optimized radix_enabled() function * Removed current_mmu_mode vmcoreinfo from under flags arch/powerpc/include/asm/mmu.h | 22 +- arch/powerpc/kernel/machine_kexec.c |1 + arch/powerpc/mm/hash_utils_64.c |2 ++ arch/powerpc/mm/pgtable-radix.c |2 ++ arch/powerpc/mm/pgtable.c |6 ++ arch/powerpc/mm/tlb_hash32.c|1 + arch/powerpc/mm/tlb_nohash.c|2 ++ 7 files changed, 35 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index e2fb408..558987c 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -199,6 +199,21 @@ static inline void mmu_clear_feature(unsigned long feature) extern unsigned int __start___mmu_ftr_fixup, __stop___mmu_ftr_fixup; +/* + * Possible MMU modes + */ +#define MMU_MODE_NONE 0 +#define MMU_MODE_RADIX 1 +#define MMU_MODE_HASH 2 +#define MMU_MODE_HASH32 3 +#define MMU_MODE_NOHASH 4 +#define MMU_MODE_NOHASH32 5 + +/* + * current MMU mode + */ +extern unsigned int current_mmu_mode 
__read_mostly; + #ifdef CONFIG_PPC64 /* This is our real memory area size on ppc64 server, on embedded, we * make it match the size our of bolted TLB area @@ -218,7 +233,12 @@ static inline void assert_pte_locked(struct mm_struct *mm, unsigned long addr) #ifdef CONFIG_PPC_RADIX_MMU static inline bool radix_enabled(void) { - return mmu_has_feature(MMU_FTR_TYPE_RADIX); + if (current_mmu_mode == MMU_MODE_RADIX) + return true; + else if (current_mmu_mode != MMU_MODE_NONE) + return false; + else + return mmu_has_feature(MMU_FTR_TYPE_RADIX); } static inline bool early_radix_enabled(void) diff --git a/arch/powerpc/kernel/machine_kexec.c b/arch/powerpc/kernel/machine_kexec.c index 2694d07..2a32694 100644 --- a/arch/powerpc/kernel/machine_kexec.c +++ b/arch/powerpc/kernel/machine_kexec.c @@ -87,6 +87,7 @@ void arch_crash_save_vmcoreinfo(void) VMCOREINFO_STRUCT_SIZE(mmu_psize_def); VMCOREINFO_OFFSET(mmu_psize_def, shift); #endif + VMCOREINFO_SYMBOL(current_mmu_mode); } /* diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 0821556..a566a95 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -886,6 +886,8 @@ void __init hash__early_init_devtree(void) void __init hash__early_init_mmu(void) { + current_mmu_mode = MMU_MODE_HASH; + htab_init_page_sizes(); /* diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index af897d9..4b0ad48 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -298,6 +298,8 @@ void __init radix__early_init_mmu(void) { unsigned long lpcr; + current_mmu_mode = MMU_MODE_RADIX; + #ifdef CONFIG_PPC_64K_PAGES /* PAGE_SIZE mappings */ mmu_virtual_psize = MMU_PAGE_64K; diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index 0b6fb24..4638a00 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -31,6 +31,12 @@ #include #include +/* + * current MMU mode + */ +unsigned int current_mmu_mode __read_mostly = 
MMU_MODE_NONE; +EXPORT_SYMBOL(current_mmu_mode); + static inline int is_exec_fault(void) { return current->thread.regs && TRAP(current->thread.regs) == 0x400; diff --git a/arch/powerpc/mm/tlb_hash32.c b/arch/powerpc/mm/tlb_hash32.c index 702d768..0b55425 100644 --- a/arch/powerpc/mm/tlb_hash32.c +++ b/arch/powerpc/mm/tlb_hash32.c @@ -170,4 +170,5 @@ EXPORT_SYMBOL(flush_tlb_range); void __init early_init_mmu(void) { + current_mmu_mode = MMU_MODE_HASH32; } diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c index 050badc..74300a7 100644 --- a/arch/powerpc/mm/tlb_nohash.c +++ b/arch/powerpc/mm
Re: [PATCH v2] powerpc/mm: export current mmu mode info
Hi Aneesh, On Thursday 22 September 2016 09:54 PM, Aneesh Kumar K.V wrote: Hari Bathini <hbath...@linux.vnet.ibm.com> writes: The kernel now supports both radix and hash MMU modes. Tools like crash and makedumpfile need to know the current MMU mode the kernel is using, to debug/analyze it. The current MMU mode depends on hardware support and also whether disable_radix cmdline parameter is passed to the kernel. The mmu_features member of cpu_spec structure holds the current MMU mode a cpu is using. But the above mentioned tools need to know the MMU mode early in their init process, when they may not have access to offset info of structure members. A hard-coded offset may help but it won't be robust. IIUC, you walk the linux page table and that should be more or less same Taking the case of crash tool, vmemmap start value is currently hard-coded to 0xf000UL but it changed to 0xc00aUL in case of radix. between radix/hash right except few bits. Now what crash will be interested in will be the RPN part of the table which should be same between hash/radix. Though the walk is pretty much the same, the tool still needs to know the right index values and vmemmap start to use, as they are different for radix and hash.. This patch introduces a new global variable, which holds the current MMU mode the kernel is running in and can be accessed by tools early in thier init process, Init process of what ? kernel or crash tool ? tool initialization - crash or makedumpfile.. helping tools to initialize accurately for each MMU mode. This patch also optimizes the radix_enabled() function call. how do you differentiate between the hold linux page table format and the new ? Can you also summarize what crash tool look for in the page table ? It needs the index sizes, masked bit values and page flag info to do the page table walk. Since they can be different for hash and radix.. 
Signed-off-by: Hari Bathini <hbath...@linux.vnet.ibm.com> --- Changes from v1: * Patch name changed from "ppc64/book3s: export mmu type info" * Optimized radix_enabled() function arch/powerpc/include/asm/mmu.h | 22 +- arch/powerpc/kernel/machine_kexec.c |3 +++ arch/powerpc/mm/hash_utils_64.c |2 ++ arch/powerpc/mm/pgtable-radix.c |2 ++ arch/powerpc/mm/pgtable.c |6 ++ arch/powerpc/mm/tlb_hash32.c|1 + arch/powerpc/mm/tlb_nohash.c|2 ++ 7 files changed, 37 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index e2fb408..558987c 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -199,6 +199,21 @@ static inline void mmu_clear_feature(unsigned long feature) extern unsigned int __start___mmu_ftr_fixup, __stop___mmu_ftr_fixup; +/* + * Possible MMU modes + */ +#define MMU_MODE_NONE 0 +#define MMU_MODE_RADIX 1 +#define MMU_MODE_HASH 2 +#define MMU_MODE_HASH32 3 +#define MMU_MODE_NOHASH 4 +#define MMU_MODE_NOHASH32 5 + +/* + * current MMU mode + */ +extern unsigned int current_mmu_mode __read_mostly; + #ifdef CONFIG_PPC64 /* This is our real memory area size on ppc64 server, on embedded, we * make it match the size our of bolted TLB area @@ -218,7 +233,12 @@ static inline void assert_pte_locked(struct mm_struct *mm, unsigned long addr) #ifdef CONFIG_PPC_RADIX_MMU static inline bool radix_enabled(void) { - return mmu_has_feature(MMU_FTR_TYPE_RADIX); + if (current_mmu_mode == MMU_MODE_RADIX) + return true; + else if (current_mmu_mode != MMU_MODE_NONE) + return false; + else + return mmu_has_feature(MMU_FTR_TYPE_RADIX); } That is not optimization, that makes it slow. We hotpatch mmu_has_feature(). Ugh! I didn't consider that.. 
Thanks Hari static inline bool early_radix_enabled(void) diff --git a/arch/powerpc/kernel/machine_kexec.c b/arch/powerpc/kernel/machine_kexec.c index 2694d07..4ecc184 100644 --- a/arch/powerpc/kernel/machine_kexec.c +++ b/arch/powerpc/kernel/machine_kexec.c @@ -77,6 +77,9 @@ void arch_crash_save_vmcoreinfo(void) VMCOREINFO_SYMBOL(contig_page_data); #endif #if defined(CONFIG_PPC64) && defined(CONFIG_SPARSEMEM_VMEMMAP) +#ifdef CONFIG_PPC_BOOK3S + VMCOREINFO_SYMBOL(current_mmu_mode); +#endif VMCOREINFO_SYMBOL(vmemmap_list); VMCOREINFO_SYMBOL(mmu_vmemmap_psize); VMCOREINFO_SYMBOL(mmu_psize_defs); diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 0821556..a566a95 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -886,6 +886,8 @@ void __init hash__early_init_devtree(void) void __init hash__early_init_mmu(void) { + current_mmu_mode = MMU_MODE_HASH; + htab_init_page_sizes(); /* diff --git a/arch/pow
Re: [PATCH v3 2/2] powerpc/fadump: parse fadump reserve memory size based on memory range
On Thursday 25 August 2016 12:31 PM, Dave Young wrote: On 08/10/16 at 03:35pm, Hari Bathini wrote: When fadump is enabled, by default 5% of system RAM is reserved for fadump kernel. While that works for most cases, it is not good enough for every case. Currently, to override the default value, fadump supports specifying memory to reserve with fadump_reserve_mem=size, where only a fixed size can be specified. This patch adds support to specify memory size to reserve for different memory ranges as below: fadump_reserve_mem=:[,:,...] Hi, Hari Hi Dave, I do not understand why you need introduce the new cmdline param, what's the difference between the "fadump reserved" memory and the memory I am not introducing a new parameter but adding a new syntax for an existing parameter. reserved by "crashkernel="? Can fadump just use crashkernel= to reserve memory? Not all syntaxes supported by crashkernel apply for fadump_reserve_mem. Nonetheless, it is worth considering reuse of crashkernel parameter instead of fadump_reserve_mem. Let me see what I can do about this.. Thanks Hari Thanks Dave Supporting range based input for "fadump_reserve_mem" parameter helps using the same commandline parameter for different system memory sizes. Signed-off-by: Hari Bathini <hbath...@linux.vnet.ibm.com> Reviewed-by: Mahesh J Salgaonkar <mah...@linux.vnet.ibm.com> --- Changes from v2: 1. Updated changelog arch/powerpc/kernel/fadump.c | 63 -- 1 file changed, 54 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index b3a6633..7c01b5b 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -193,6 +193,55 @@ static unsigned long init_fadump_mem_struct(struct fadump_mem_struct *fdm, return addr; } +/* + * This function parses command line for fadump_reserve_mem= + * + * Supports the below two syntaxes: + *1. fadump_reserve_mem=size + *2. fadump_reserve_mem=ramsize-range:size[,...] 
+ * + * Sets fw_dump.reserve_bootvar with the memory size + * provided, 0 otherwise + * + * The function returns -EINVAL on failure, 0 otherwise. + */ +static int __init parse_fadump_reserve_mem(void) +{ + char *name = "fadump_reserve_mem="; + char *fadump_cmdline = NULL, *cur; + + fw_dump.reserve_bootvar = 0; + + /* find fadump_reserve_mem and use the last one if there are many */ + cur = strstr(boot_command_line, name); + while (cur) { + fadump_cmdline = cur; + cur = strstr(cur+1, name); + } + + /* when no fadump_reserve_mem= cmdline option is provided */ + if (!fadump_cmdline) + return 0; + + fadump_cmdline += strlen(name); + + /* for fadump_reserve_mem=size cmdline syntax */ + if (!is_colon_in_param(fadump_cmdline)) { + fw_dump.reserve_bootvar = memparse(fadump_cmdline, NULL); + return 0; + } + + /* for fadump_reserve_mem=ramsize-range:size[,...] cmdline syntax */ + cur = fadump_cmdline; + fw_dump.reserve_bootvar = parse_mem_range_size("fadump_reserve_mem", + , memblock_phys_mem_size()); + if (cur == fadump_cmdline) { + return -EINVAL; + } + + return 0; +} + /** * fadump_calculate_reserve_size(): reserve variable boot area 5% of System RAM * @@ -212,12 +261,17 @@ static inline unsigned long fadump_calculate_reserve_size(void) { unsigned long size; + /* sets fw_dump.reserve_bootvar */ + parse_fadump_reserve_mem(); + /* * Check if the size is specified through fadump_reserve_mem= cmdline * option. If yes, then use that. 
*/ if (fw_dump.reserve_bootvar) return fw_dump.reserve_bootvar; + else + printk(KERN_INFO "fadump: calculating default boot size\n"); /* divide by 20 to get 5% of value */ size = memblock_end_of_DRAM() / 20; @@ -348,15 +402,6 @@ static int __init early_fadump_param(char *p) } early_param("fadump", early_fadump_param); -/* Look for fadump_reserve_mem= cmdline option */ -static int __init early_fadump_reserve_mem(char *p) -{ - if (p) - fw_dump.reserve_bootvar = memparse(p, ); - return 0; -} -early_param("fadump_reserve_mem", early_fadump_reserve_mem); - static void register_fw_dump(struct fadump_mem_struct *fdm) { int rc; ___ kexec mailing list ke...@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH 3/3] powerpc/fadump: update documentation about crashkernel parameter reuse
As we are reusing crashkernel parameter instead of fadump_reserve_mem parameter to specify the memory to reserve for fadump's crash kernel, update the documentation accordingly. Signed-off-by: Hari Bathini <hbath...@linux.vnet.ibm.com> --- Documentation/powerpc/firmware-assisted-dump.txt | 23 ++ 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/Documentation/powerpc/firmware-assisted-dump.txt b/Documentation/powerpc/firmware-assisted-dump.txt index 3007bc9..8394bc8 100644 --- a/Documentation/powerpc/firmware-assisted-dump.txt +++ b/Documentation/powerpc/firmware-assisted-dump.txt @@ -55,10 +55,14 @@ as follows: booted with restricted memory. By default, the boot memory size will be the larger of 5% of system RAM or 256MB. Alternatively, user can also specify boot memory size - through boot parameter 'fadump_reserve_mem=' which will - override the default calculated size. Use this option - if default boot memory size is not sufficient for second - kernel to boot successfully. + through boot parameter 'crashkernel=' which will override + the default calculated size. Use this option if default + boot memory size is not sufficient for second kernel to + boot successfully. For syntax of crashkernel= parameter, + refer to Documentation/kdump/kdump.txt. If any offset is + provided in crashkernel= parameter, it will be ignored + as fadump reserves memory at end of RAM for boot memory + dump preservation in case of a crash. -- After the low memory (boot memory) area has been saved, the firmware will reset PCI and other hardware state. It will @@ -158,13 +162,16 @@ How to enable firmware-assisted dump (fadump): 1. Set config option CONFIG_FA_DUMP=y and build kernel. 2. Boot into linux kernel with 'fadump=on' kernel cmdline option. -3. Optionally, user can also set 'fadump_reserve_mem=' kernel cmdline +3. Optionally, user can also set 'crashkernel=' kernel cmdline to specify size of the memory to reserve for boot memory dump preservation. 
-NOTE: If firmware-assisted dump fails to reserve memory then it will - fallback to existing kdump mechanism if 'crashkernel=' option - is set at kernel cmdline. +NOTE: 1. 'fadump_reserve_mem=' parameter has been deprecated. Instead + use 'crashkernel=' to specify size of the memory to reserve + for boot memory dump preservation. + 2. If firmware-assisted dump fails to reserve memory then it + will fallback to existing kdump mechanism if 'crashkernel=' + option is set at kernel cmdline. Sysfs/debugfs files:
[PATCH 1/3] crash: move crashkernel parsing and vmcore related code under CONFIG_CRASH_CORE
Traditionally, kdump is used to save vmcore in case of a crash. Some architectures like powerpc can save vmcore using architecture specific support instead of kexec/kdump mechanism. Such architecture specific support also needs to reserve memory, to be used by dump capture kernel. crashkernel parameter can be a reused, for memory reservation, by such architecture specific infrastructure. But currently, code related to vmcoreinfo and parsing of crashkernel parameter is built under CONFIG_KEXEC_CORE. This patch introduces CONFIG_CRASH_CORE and moves the above mentioned code under this config, allowing code reuse without dependency on CONFIG_KEXEC. While here, removing the multiple definitions of append_elf_note() and final_note() for one defined under CONFIG_CONFIG_CORE. There is no functional change with this patch. Signed-off-by: Hari Bathini <hbath...@linux.vnet.ibm.com> --- arch/Kconfig |4 arch/ia64/kernel/crash.c | 22 -- arch/powerpc/Kconfig | 10 - arch/powerpc/include/asm/fadump.h |2 arch/powerpc/kernel/crash.c|2 arch/powerpc/kernel/fadump.c | 34 --- arch/powerpc/kernel/setup-common.c |5 include/linux/crash_core.h | 75 ++ include/linux/kexec.h | 63 - kernel/Makefile|1 kernel/crash_core.c| 450 kernel/kexec_core.c| 435 --- 12 files changed, 550 insertions(+), 553 deletions(-) create mode 100644 include/linux/crash_core.h create mode 100644 kernel/crash_core.c diff --git a/arch/Kconfig b/arch/Kconfig index 659bdd0..4ad34b9 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -2,7 +2,11 @@ # General architecture dependent options # +config CRASH_CORE + bool + config KEXEC_CORE + select CRASH_CORE bool config OPROFILE diff --git a/arch/ia64/kernel/crash.c b/arch/ia64/kernel/crash.c index 2955f35..75859a0 100644 --- a/arch/ia64/kernel/crash.c +++ b/arch/ia64/kernel/crash.c @@ -27,28 +27,6 @@ static int kdump_freeze_monarch; static int kdump_on_init = 1; static int kdump_on_fatal_mca = 1; -static inline Elf64_Word -*append_elf_note(Elf64_Word *buf, char *name, unsigned 
type, void *data, - size_t data_len) -{ - struct elf_note *note = (struct elf_note *)buf; - note->n_namesz = strlen(name) + 1; - note->n_descsz = data_len; - note->n_type = type; - buf += (sizeof(*note) + 3)/4; - memcpy(buf, name, note->n_namesz); - buf += (note->n_namesz + 3)/4; - memcpy(buf, data, data_len); - buf += (data_len + 3)/4; - return buf; -} - -static void -final_note(void *buf) -{ - memset(buf, 0, sizeof(struct elf_note)); -} - extern void ia64_dump_cpu_regs(void *); static DEFINE_PER_CPU(struct elf_prstatus, elf_prstatus); diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 65fba4c..644703f 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -479,21 +479,23 @@ config RELOCATABLE load address of the kernel (eg. u-boot/mkimage). config CRASH_DUMP - bool "Build a kdump crash kernel" + bool "Build a dump capture kernel" depends on PPC64 || 6xx || FSL_BOOKE || (44x && !SMP) select RELOCATABLE if (PPC64 && !COMPILE_TEST) || 44x || FSL_BOOKE help - Build a kernel suitable for use as a kdump capture kernel. + Build a kernel suitable for use as a dump capture kernel. The same kernel binary can be used as production kernel and dump capture kernel. config FA_DUMP bool "Firmware-assisted dump" - depends on PPC64 && PPC_RTAS && CRASH_DUMP && KEXEC + depends on PPC64 && PPC_RTAS + select CRASH_CORE + select CRASH_DUMP help A robust mechanism to get reliable kernel crash dump with assistance from firmware. This approach does not use kexec, - instead firmware assists in booting the kdump kernel + instead firmware assists in booting the capture kernel while preserving memory contents. Firmware-assisted dump is meant to be a kdump replacement offering robustness and speed not possible without system firmware assistance. 
diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h index 0031806..60b9108 100644 --- a/arch/powerpc/include/asm/fadump.h +++ b/arch/powerpc/include/asm/fadump.h @@ -73,6 +73,8 @@ reg_entry++;\ }) +extern int crashing_cpu; + /* Kernel Dump section info */ struct fadump_section { __be32 request_flag; diff --git a/arch/powerpc/kernel/crash.c b/arch/powerpc/kernel/crash.c index 47b63de..cbabb5a 100644 --- a/arch/powerpc/kernel/crash.c
[PATCH 2/3] powerpc/fadump: reuse crashkernel parameter for fadump memory reservation
fadump supports specifying memory to reserve for fadump's crash kernel with fadump_reserve_mem kernel parameter. This parameter currently supports passing a fixed memory size, like fadump_reserve_mem=<size> only. This patch aims to add support for other syntaxes like range-based memory size <range1>:<size1>[,<range2>:<size2>,...] which allows using the same parameter to boot the kernel with different system RAM sizes. As crashkernel parameter already supports the above mentioned syntaxes, this patch removes fadump_reserve_mem parameter and reuses crashkernel parameter instead, to specify memory for fadump's crash kernel memory reservation as well. If any offset is provided in crashkernel parameter, it will be ignored in case of fadump, as fadump reserves memory at end of RAM. Advantages using crashkernel parameter instead of fadump_reserve_mem parameter are one less kernel parameter overall, code reuse and support for multiple syntaxes to specify memory. Suggested-by: Dave Young <dyo...@redhat.com> Signed-off-by: Hari Bathini <hbath...@linux.vnet.ibm.com> --- arch/powerpc/kernel/fadump.c | 23 ++- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index db0b339..de7d39a 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -210,14 +210,20 @@ static unsigned long init_fadump_mem_struct(struct fadump_mem_struct *fdm, */ static inline unsigned long fadump_calculate_reserve_size(void) { - unsigned long size; + int ret; + unsigned long long base, size; /* -* Check if the size is specified through fadump_reserve_mem= cmdline -* option. If yes, then use that. +* Check if the size is specified through crashkernel= cmdline +* option. If yes, then use that but ignore base as fadump +* reserves memory at end of RAM. 
*/ - if (fw_dump.reserve_bootvar) + ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), + &size, &base); + if (ret == 0 && size > 0) { + fw_dump.reserve_bootvar = (unsigned long)size; return fw_dump.reserve_bootvar; + } /* divide by 20 to get 5% of value */ size = memblock_end_of_DRAM() / 20; @@ -353,15 +359,6 @@ static int __init early_fadump_param(char *p) } early_param("fadump", early_fadump_param); -/* Look for fadump_reserve_mem= cmdline option */ -static int __init early_fadump_reserve_mem(char *p) -{ - if (p) - fw_dump.reserve_bootvar = memparse(p, &p); - return 0; -} -early_param("fadump_reserve_mem", early_fadump_reserve_mem); - static void register_fw_dump(struct fadump_mem_struct *fdm) { int rc;
[PATCH 0/3] kexec/fadump: remove dependency with CONFIG_KEXEC and reuse crashkernel parameter for fadump
Traditionally, kdump is used to save vmcore in case of a crash. Some architectures like powerpc can save vmcore using architecture specific support instead of kexec/kdump mechanism. Such architecture specific support also needs to reserve memory, to be used by dump capture kernel. crashkernel parameter can be a reused, for memory reservation, by such architecture specific infrastructure. This patchset removes dependency with CONFIG_KEXEC for crashkernel parameter and vmcoreinfo related code as it can be reused without kexec support. Also, crashkernel parameter is reused instead of fadump_reserve_mem to reserve memory for fadump. The first patch moves crashkernel parameter parsing and vmcoreinfo related code under CONFIG_CRASH_CORE instead of CONFIG_KEXEC_CORE to reuse without depending on it. The second patch reuses crashkernel for reserving memory for fadump instead of fadump_reserve_mem. This has the advantage of using all the syntaxes crashkernel supports, for fadump as well. The third patch updates fadump kernel documentation about use of crashkernel parameter. --- Hari Bathini (3): crash: move crashkernel parsing and vmcore related code under CONFIG_CRASH_CORE powerpc/fadump: reuse crashkernel parameter for fadump memory reservation powerpc/fadump: update documentation about crashkernel parameter reuse Documentation/powerpc/firmware-assisted-dump.txt | 23 + arch/Kconfig |4 arch/ia64/kernel/crash.c | 22 - arch/powerpc/Kconfig | 10 arch/powerpc/include/asm/fadump.h|2 arch/powerpc/kernel/crash.c |2 arch/powerpc/kernel/fadump.c | 57 +-- arch/powerpc/kernel/setup-common.c |5 include/linux/crash_core.h | 75 include/linux/kexec.h| 63 --- kernel/Makefile |1 kernel/crash_core.c | 450 ++ kernel/kexec_core.c | 435 - 13 files changed, 575 insertions(+), 574 deletions(-) create mode 100644 include/linux/crash_core.h create mode 100644 kernel/crash_core.c
Re: [PATCH 1/3] crash: move crashkernel parsing and vmcore related code under CONFIG_CRASH_CORE
On Monday 14 November 2016 11:06 AM, Baoquan He wrote: On 11/10/16 at 05:27pm, Hari Bathini wrote: Traditionally, kdump is used to save vmcore in case of a crash. Some architectures like powerpc can save vmcore using architecture specific support instead of kexec/kdump mechanism. Such architecture specific support also needs to reserve memory, to be used by dump capture kernel. crashkernel parameter can be reused, for memory reservation, by such architecture specific infrastructure. But currently, code related to vmcoreinfo and parsing of crashkernel parameter is built under CONFIG_KEXEC_CORE. This patch introduces CONFIG_CRASH_CORE and moves the above mentioned code under this config, allowing code reuse without dependency on CONFIG_KEXEC. While here, removing the multiple definitions of append_elf_note() and final_note() for one defined under CONFIG_CRASH_CORE. There is no functional change with this patch. Can't think of a reason to object. Could it be that do the moving from kexec_core.c to crash_core.c only, then do the arch specific clean up in another patch? Right. Will move arch specific code into a separate patch, on the next version.. Besides there's already a file crash_dump.h, can we reuse that? Did think about it. But as it is meant for dump capture kernel (CONFIG_CRASH_DUMP) and CONFIG_KEXEC_CORE being independent, didn't pursue it.. Thanks Hari
[PATCH v2 0/5] kexec/fadump: remove dependency with CONFIG_KEXEC and reuse crashkernel parameter for fadump
Traditionally, kdump is used to save vmcore in case of a crash. Some architectures like powerpc can save vmcore using architecture specific support instead of kexec/kdump mechanism. Such architecture specific support also needs to reserve memory, to be used by dump capture kernel. crashkernel parameter can be reused, for memory reservation, by such architecture specific infrastructure. This patchset removes dependency with CONFIG_KEXEC for crashkernel parameter and vmcoreinfo related code as it can be reused without kexec support. Also, crashkernel parameter is reused instead of fadump_reserve_mem to reserve memory for fadump. The first patch moves crashkernel parameter parsing and vmcoreinfo related code under CONFIG_CRASH_CORE instead of CONFIG_KEXEC_CORE. The second patch reuses the definitions of append_elf_note() & final_note() functions under CONFIG_CRASH_CORE in IA64 arch code. The third patch removes dependency on CONFIG_KEXEC for firmware-assisted dump (fadump) in powerpc. The next patch reuses crashkernel parameter for reserving memory for fadump, instead of the fadump_reserve_mem parameter. This has the advantage of using all syntaxes crashkernel parameter supports, for fadump as well. The last patch updates fadump kernel documentation about use of crashkernel parameter. Changes from v1: * Moved arch specific code to separate patches. Introduced one patch for IA64 arch and another patch for powerpc in the process. 
--- Hari Bathini (5): crash: move crashkernel parsing and vmcore related code under CONFIG_CRASH_CORE ia64: reuse append_elf_note() and final_note() functions powerpc/fadump: remove dependency with CONFIG_KEXEC powerpc/fadump: reuse crashkernel parameter for fadump memory reservation powerpc/fadump: update documentation about crashkernel parameter reuse Documentation/powerpc/firmware-assisted-dump.txt | 23 + arch/Kconfig |4 arch/ia64/kernel/crash.c | 22 - arch/powerpc/Kconfig | 10 arch/powerpc/include/asm/fadump.h|2 arch/powerpc/kernel/crash.c |2 arch/powerpc/kernel/fadump.c | 57 +-- arch/powerpc/kernel/setup-common.c |5 include/linux/crash_core.h | 75 include/linux/kexec.h| 63 --- kernel/Makefile |1 kernel/crash_core.c | 450 ++ kernel/kexec_core.c | 435 - 13 files changed, 575 insertions(+), 574 deletions(-) create mode 100644 include/linux/crash_core.h create mode 100644 kernel/crash_core.c
[PATCH v2 3/5] powerpc/fadump: remove dependency with CONFIG_KEXEC
Now that crashkernel parameter parsing and vmcoreinfo related code is moved under CONFIG_CRASH_CORE instead of CONFIG_KEXEC_CORE, remove dependency with CONFIG_KEXEC for CONFIG_FA_DUMP. While here, get rid of definitions of fadump_append_elf_note() & fadump_final_note() functions to reuse similar functions compiled under CONFIG_CRASH_CORE. Signed-off-by: Hari Bathini <hbath...@linux.vnet.ibm.com> --- arch/powerpc/Kconfig | 10 ++ arch/powerpc/include/asm/fadump.h |2 ++ arch/powerpc/kernel/crash.c|2 -- arch/powerpc/kernel/fadump.c | 34 +++--- arch/powerpc/kernel/setup-common.c |5 + 5 files changed, 16 insertions(+), 37 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 65fba4c..644703f 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -479,21 +479,23 @@ config RELOCATABLE load address of the kernel (eg. u-boot/mkimage). config CRASH_DUMP - bool "Build a kdump crash kernel" + bool "Build a dump capture kernel" depends on PPC64 || 6xx || FSL_BOOKE || (44x && !SMP) select RELOCATABLE if (PPC64 && !COMPILE_TEST) || 44x || FSL_BOOKE help - Build a kernel suitable for use as a kdump capture kernel. + Build a kernel suitable for use as a dump capture kernel. The same kernel binary can be used as production kernel and dump capture kernel. config FA_DUMP bool "Firmware-assisted dump" - depends on PPC64 && PPC_RTAS && CRASH_DUMP && KEXEC + depends on PPC64 && PPC_RTAS + select CRASH_CORE + select CRASH_DUMP help A robust mechanism to get reliable kernel crash dump with assistance from firmware. This approach does not use kexec, - instead firmware assists in booting the kdump kernel + instead firmware assists in booting the capture kernel while preserving memory contents. Firmware-assisted dump is meant to be a kdump replacement offering robustness and speed not possible without system firmware assistance. 
diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h index 0031806..60b9108 100644 --- a/arch/powerpc/include/asm/fadump.h +++ b/arch/powerpc/include/asm/fadump.h @@ -73,6 +73,8 @@ reg_entry++;\ }) +extern int crashing_cpu; + /* Kernel Dump section info */ struct fadump_section { __be32 request_flag; diff --git a/arch/powerpc/kernel/crash.c b/arch/powerpc/kernel/crash.c index 47b63de..cbabb5a 100644 --- a/arch/powerpc/kernel/crash.c +++ b/arch/powerpc/kernel/crash.c @@ -43,8 +43,6 @@ #define IPI_TIMEOUT1 #define REAL_MODE_TIMEOUT 1 -/* This keeps a track of which one is the crashing cpu. */ -int crashing_cpu = -1; static int time_to_dump; #define CRASH_HANDLER_MAX 3 diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 8f0c7c5..db0b339 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -486,34 +486,6 @@ fadump_read_registers(struct fadump_reg_entry *reg_entry, struct pt_regs *regs) return reg_entry; } -static u32 *fadump_append_elf_note(u32 *buf, char *name, unsigned type, - void *data, size_t data_len) -{ - struct elf_note note; - - note.n_namesz = strlen(name) + 1; - note.n_descsz = data_len; - note.n_type = type; - memcpy(buf, , sizeof(note)); - buf += (sizeof(note) + 3)/4; - memcpy(buf, name, note.n_namesz); - buf += (note.n_namesz + 3)/4; - memcpy(buf, data, note.n_descsz); - buf += (note.n_descsz + 3)/4; - - return buf; -} - -static void fadump_final_note(u32 *buf) -{ - struct elf_note note; - - note.n_namesz = 0; - note.n_descsz = 0; - note.n_type = 0; - memcpy(buf, , sizeof(note)); -} - static u32 *fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs) { struct elf_prstatus prstatus; @@ -524,8 +496,8 @@ static u32 *fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs) * prstatus.pr_pid = */ elf_core_copy_kernel_regs(_reg, regs); - buf = fadump_append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, - , sizeof(prstatus)); + buf = append_elf_note(buf, 
CRASH_CORE_NOTE_NAME, NT_PRSTATUS, + &prstatus, sizeof(prstatus)); return buf; } @@ -666,7 +638,7 @@ static int __init fadump_build_cpu_notes(const struct fadump_mem_struct *fdm) note_buf = fadump_regs_to_elf_notes(note_buf, &regs); } } - fadump_final_note(note_buf); + final_note(note_buf); if (fdh) { pr_debug(&
[PATCH v2 5/5] powerpc/fadump: update documentation about crashkernel parameter reuse
As we are reusing crashkernel parameter instead of fadump_reserve_mem parameter to specify the memory to reserve for fadump's crash kernel, update the documentation accordingly. Signed-off-by: Hari Bathini <hbath...@linux.vnet.ibm.com> --- Documentation/powerpc/firmware-assisted-dump.txt | 23 ++ 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/Documentation/powerpc/firmware-assisted-dump.txt b/Documentation/powerpc/firmware-assisted-dump.txt index 3007bc9..8394bc8 100644 --- a/Documentation/powerpc/firmware-assisted-dump.txt +++ b/Documentation/powerpc/firmware-assisted-dump.txt @@ -55,10 +55,14 @@ as follows: booted with restricted memory. By default, the boot memory size will be the larger of 5% of system RAM or 256MB. Alternatively, user can also specify boot memory size - through boot parameter 'fadump_reserve_mem=' which will - override the default calculated size. Use this option - if default boot memory size is not sufficient for second - kernel to boot successfully. + through boot parameter 'crashkernel=' which will override + the default calculated size. Use this option if default + boot memory size is not sufficient for second kernel to + boot successfully. For syntax of crashkernel= parameter, + refer to Documentation/kdump/kdump.txt. If any offset is + provided in crashkernel= parameter, it will be ignored + as fadump reserves memory at end of RAM for boot memory + dump preservation in case of a crash. -- After the low memory (boot memory) area has been saved, the firmware will reset PCI and other hardware state. It will @@ -158,13 +162,16 @@ How to enable firmware-assisted dump (fadump): 1. Set config option CONFIG_FA_DUMP=y and build kernel. 2. Boot into linux kernel with 'fadump=on' kernel cmdline option. -3. Optionally, user can also set 'fadump_reserve_mem=' kernel cmdline +3. Optionally, user can also set 'crashkernel=' kernel cmdline to specify size of the memory to reserve for boot memory dump preservation. 
-NOTE: If firmware-assisted dump fails to reserve memory then it will - fallback to existing kdump mechanism if 'crashkernel=' option - is set at kernel cmdline. +NOTE: 1. 'fadump_reserve_mem=' parameter has been deprecated. Instead + use 'crashkernel=' to specify size of the memory to reserve + for boot memory dump preservation. + 2. If firmware-assisted dump fails to reserve memory then it + will fallback to existing kdump mechanism if 'crashkernel=' + option is set at kernel cmdline. Sysfs/debugfs files:
[PATCH v2 1/5] crash: move crashkernel parsing and vmcore related code under CONFIG_CRASH_CORE
Traditionally, kdump is used to save vmcore in case of a crash. Some architectures like powerpc can save vmcore using architecture specific support instead of kexec/kdump mechanism. Such architecture specific support also needs to reserve memory, to be used by dump capture kernel. crashkernel parameter can be a reused, for memory reservation, by such architecture specific infrastructure. But currently, code related to vmcoreinfo and parsing of crashkernel parameter is built under CONFIG_KEXEC_CORE. This patch introduces CONFIG_CRASH_CORE and moves the above mentioned code under this config, allowing code reuse without dependency on CONFIG_KEXEC. There is no functional change with this patch. Signed-off-by: Hari Bathini <hbath...@linux.vnet.ibm.com> --- arch/Kconfig |4 include/linux/crash_core.h | 71 +++ include/linux/kexec.h | 63 -- kernel/Makefile|1 kernel/crash_core.c| 450 kernel/kexec_core.c| 407 6 files changed, 530 insertions(+), 466 deletions(-) create mode 100644 include/linux/crash_core.h create mode 100644 kernel/crash_core.c diff --git a/arch/Kconfig b/arch/Kconfig index 659bdd0..4ad34b9 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -2,7 +2,11 @@ # General architecture dependent options # +config CRASH_CORE + bool + config KEXEC_CORE + select CRASH_CORE bool config OPROFILE diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h new file mode 100644 index 000..9a4f4b0 --- /dev/null +++ b/include/linux/crash_core.h @@ -0,0 +1,71 @@ +#ifndef LINUX_CRASH_CORE_H +#define LINUX_CRASH_CORE_H + +#include +#include +#include + +#define CRASH_CORE_NOTE_NAME "CORE" +#define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4) +#define CRASH_CORE_NOTE_NAME_BYTES ALIGN(sizeof(CRASH_CORE_NOTE_NAME), 4) +#define CRASH_CORE_NOTE_DESC_BYTES ALIGN(sizeof(struct elf_prstatus), 4) + +#define CRASH_CORE_NOTE_BYTES ((CRASH_CORE_NOTE_HEAD_BYTES * 2) + \ +CRASH_CORE_NOTE_NAME_BYTES + \ +CRASH_CORE_NOTE_DESC_BYTES) + +#define VMCOREINFO_BYTES (4096) 
+#define VMCOREINFO_NOTE_NAME "VMCOREINFO" +#define VMCOREINFO_NOTE_NAME_BYTES ALIGN(sizeof(VMCOREINFO_NOTE_NAME), 4) +#define VMCOREINFO_NOTE_SIZE ((CRASH_CORE_NOTE_HEAD_BYTES * 2) + \ +VMCOREINFO_NOTE_NAME_BYTES + \ +VMCOREINFO_BYTES) + +typedef u32 note_buf_t[CRASH_CORE_NOTE_BYTES/4]; + +void crash_save_vmcoreinfo(void); +void arch_crash_save_vmcoreinfo(void); +__printf(1, 2) +void vmcoreinfo_append_str(const char *fmt, ...); +phys_addr_t paddr_vmcoreinfo_note(void); + +#define VMCOREINFO_OSRELEASE(value) \ + vmcoreinfo_append_str("OSRELEASE=%s\n", value) +#define VMCOREINFO_PAGESIZE(value) \ + vmcoreinfo_append_str("PAGESIZE=%ld\n", value) +#define VMCOREINFO_SYMBOL(name) \ + vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)) +#define VMCOREINFO_SIZE(name) \ + vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \ + (unsigned long)sizeof(name)) +#define VMCOREINFO_STRUCT_SIZE(name) \ + vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \ + (unsigned long)sizeof(struct name)) +#define VMCOREINFO_OFFSET(name, field) \ + vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \ + (unsigned long)offsetof(struct name, field)) +#define VMCOREINFO_LENGTH(name, value) \ + vmcoreinfo_append_str("LENGTH(%s)=%lu\n", #name, (unsigned long)value) +#define VMCOREINFO_NUMBER(name) \ + vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name) +#define VMCOREINFO_CONFIG(name) \ + vmcoreinfo_append_str("CONFIG_%s=y\n", #name) +#define VMCOREINFO_PAGE_OFFSET(value) \ + vmcoreinfo_append_str("PAGE_OFFSET=%lx\n", (unsigned long)value) +#define VMCOREINFO_VMALLOC_START(value) \ + vmcoreinfo_append_str("VMALLOC_START=%lx\n", (unsigned long)value) +#define VMCOREINFO_VMEMMAP_START(value) \ + vmcoreinfo_append_str("VMEMMAP_START=%lx\n", (unsigned long)value) + +extern u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; +extern size_t vmcoreinfo_size; +extern size_t vmcoreinfo_max_size; + +int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, + unsigned long 
long *crash_size, unsigned long long *crash_base); +int parse_crashkernel_high(char *cmdline, unsigned long long system_ram, + unsigned long long *crash_size, unsigned long long *crash_base); +int parse_crashkernel_low(char *cmdline, unsigned long long system_ram, + unsi