[PATCH v2 5/5] powerpc/vdso: Declare vdso_patches[] as __initdata
vdso_patches[] table is used only at init time. Mark it __initdata. Signed-off-by: Christophe Leroy --- arch/powerpc/kernel/vdso.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index 4ad042995ccc..dfa08a7b4e7c 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -76,7 +76,7 @@ struct vdso_patch_def * Currently, we only change sync_dicache to do nothing on processors * with a coherent icache */ -static struct vdso_patch_def vdso_patches[] = { +static struct vdso_patch_def vdso_patches[] __initdata = { { CPU_FTR_COHERENT_ICACHE, CPU_FTR_COHERENT_ICACHE, "__kernel_sync_dicache", "__kernel_sync_dicache_p5" -- 2.25.0
[PATCH v2 4/5] powerpc/vdso: Declare constant vars as __ro_after_init
To avoid any risk of modification of vital VDSO variables, declare them __ro_after_init. vdso32_kbase and vdso64_kbase could be made 'const', but it would have high impact on all functions using them as the compiler doesn't expect const property to be discarded. Signed-off-by: Christophe Leroy --- arch/powerpc/kernel/vdso.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index fb393266b9cb..4ad042995ccc 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -38,19 +38,19 @@ #define VDSO_ALIGNMENT (1 << 16) extern char vdso32_start, vdso32_end; -static unsigned int vdso32_pages; -static void *vdso32_kbase = &vdso32_start; -unsigned long vdso32_sigtramp; -unsigned long vdso32_rt_sigtramp; +static unsigned int vdso32_pages __ro_after_init; +static void *vdso32_kbase __ro_after_init = &vdso32_start; +unsigned long vdso32_sigtramp __ro_after_init; +unsigned long vdso32_rt_sigtramp __ro_after_init; extern char vdso64_start, vdso64_end; -static void *vdso64_kbase = &vdso64_start; -static unsigned int vdso64_pages; +static void *vdso64_kbase __ro_after_init = &vdso64_start; +static unsigned int vdso64_pages __ro_after_init; #ifdef CONFIG_PPC64 -unsigned long vdso64_rt_sigtramp; +unsigned long vdso64_rt_sigtramp __ro_after_init; #endif /* CONFIG_PPC64 */ -static int vdso_ready; +static int vdso_ready __ro_after_init; /* * The vdso data page (aka. systemcfg for old ppc64 fans) is here. -- 2.25.0
[PATCH v2 3/5] powerpc/vdso: Initialise vdso32_kbase at compile time
Initialise vdso32_kbase at compile time like vdso64_kbase. Signed-off-by: Christophe Leroy --- arch/powerpc/kernel/vdso.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index 8f245e988a8a..fb393266b9cb 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -37,13 +37,12 @@ /* The alignment of the vDSO */ #define VDSO_ALIGNMENT (1 << 16) +extern char vdso32_start, vdso32_end; static unsigned int vdso32_pages; -static void *vdso32_kbase; +static void *vdso32_kbase = &vdso32_start; unsigned long vdso32_sigtramp; unsigned long vdso32_rt_sigtramp; -extern char vdso32_start, vdso32_end; - extern char vdso64_start, vdso64_end; static void *vdso64_kbase = &vdso64_start; static unsigned int vdso64_pages; @@ -689,8 +688,6 @@ static int __init vdso_init(void) */ vdso64_pages = (&vdso64_end - &vdso64_start) >> PAGE_SHIFT; - vdso32_kbase = &vdso32_start; - /* * Calculate the size of the 32 bits vDSO */ -- 2.25.0
[PATCH v2 1/5] powerpc/vdso: Remove DBG()
DBG() is defined as void when DEBUG is not defined, and DEBUG is explicitly undefined. It means there is no other way than modifying source code to get the messages printed. It was most likely useful in the first days of VDSO, but today the only 3 DBG() calls don't deserve a special handling. Just remove them. If one day someone need such messages back, use a standard pr_debug() or equivalent. Signed-off-by: Christophe Leroy --- This is a follow up series, applying on top of the series that switches powerpc VDSO to _install_special_mapping(), rebased on today's powerpc/next-test (dd419a93bd99) v2 removes the modification to arch_setup_additional_pages() to consider when is_32bit_task() returning true when CONFIG_VDSO32 not set, as this should never happen. Signed-off-by: Christophe Leroy --- arch/powerpc/kernel/vdso.c | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index e2568d9ecdff..3ef3fc546ac8 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -31,14 +31,6 @@ #include #include -#undef DEBUG - -#ifdef DEBUG -#define DBG(fmt...) printk(fmt) -#else -#define DBG(fmt...) -#endif - /* Max supported size for symbol names */ #define MAX_SYMNAME 64 @@ -567,9 +559,6 @@ static __init int vdso_fixup_alt_funcs(struct lib32_elfinfo *v32, if (!match) continue; - DBG("replacing %s with %s...\n", patch->gen_name, - patch->fix_name ? "NONE" : patch->fix_name); - /* * Patch the 32 bits and 64 bits symbols. Note that we do not * patch the "." symbol on 64 bits. 
@@ -704,7 +693,6 @@ static int __init vdso_init(void) * Calculate the size of the 64 bits vDSO */ vdso64_pages = (&vdso64_end - &vdso64_start) >> PAGE_SHIFT; - DBG("vdso64_kbase: %p, 0x%x pages\n", vdso64_kbase, vdso64_pages); vdso32_kbase = &vdso32_start; @@ -712,7 +700,6 @@ static int __init vdso_init(void) * Calculate the size of the 32 bits vDSO */ vdso32_pages = (&vdso32_end - &vdso32_start) >> PAGE_SHIFT; - DBG("vdso32_kbase: %p, 0x%x pages\n", vdso32_kbase, vdso32_pages); /* * Setup the syscall map in the vDOS -- 2.25.0
[PATCH v2 2/5] powerpc/vdso: Don't rely on vdso_pages being 0 for failure
If vdso initialisation failed, vdso_ready is not set. Otherwise, vdso_pages is only 0 when it is a 32 bits task and CONFIG_VDSO32 is not selected. As arch_setup_additional_pages() now bails out directly in that case, we don't need to set vdso_pages to 0. Signed-off-by: Christophe Leroy --- arch/powerpc/kernel/vdso.c | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index 3ef3fc546ac8..8f245e988a8a 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -176,11 +176,6 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) current->mm->context.vdso_base = 0; - /* vDSO has a problem and was disabled, just don't "enable" it for the -* process -*/ - if (vdso_pages == 0) - return 0; /* Add a page to the vdso size for the data page */ vdso_pages ++; @@ -710,14 +705,16 @@ static int __init vdso_init(void) * Initialize the vDSO images in memory, that is do necessary * fixups of vDSO symbols, locate trampolines, etc... */ - if (vdso_setup()) - goto setup_failed; + if (vdso_setup()) { + pr_err("vDSO setup failure, not enabled !\n"); + return 0; + } if (IS_ENABLED(CONFIG_VDSO32)) { /* Make sure pages are in the correct state */ pagelist = kcalloc(vdso32_pages + 1, sizeof(struct page *), GFP_KERNEL); if (!pagelist) - goto alloc_failed; + return 0; pagelist[0] = virt_to_page(vdso_data); @@ -730,7 +727,7 @@ static int __init vdso_init(void) if (IS_ENABLED(CONFIG_PPC64)) { pagelist = kcalloc(vdso64_pages + 1, sizeof(struct page *), GFP_KERNEL); if (!pagelist) - goto alloc_failed; + return 0; pagelist[0] = virt_to_page(vdso_data); @@ -743,14 +740,6 @@ static int __init vdso_init(void) smp_wmb(); vdso_ready = 1; - return 0; - -setup_failed: - pr_err("vDSO setup failure, not enabled !\n"); -alloc_failed: - vdso32_pages = 0; - vdso64_pages = 0; - return 0; } arch_initcall(vdso_init); -- 2.25.0
Re: [PATCH v1 4/9] powerpc/vdso: Remove unnecessary ifdefs in vdso_pagelist initialization
Le 28/08/2020 à 07:40, Christophe Leroy a écrit : Le 27/08/2020 à 15:19, Michael Ellerman a écrit : Christophe Leroy writes: On 08/26/2020 02:58 PM, Michael Ellerman wrote: Christophe Leroy writes: diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index daef14a284a3..bbb69832fd46 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -718,16 +710,14 @@ static int __init vdso_init(void) ... - -#ifdef CONFIG_VDSO32 vdso32_kbase = &vdso32_start; /* @@ -735,8 +725,6 @@ static int __init vdso_init(void) */ vdso32_pages = (&vdso32_end - &vdso32_start) >> PAGE_SHIFT; DBG("vdso32_kbase: %p, 0x%x pages\n", vdso32_kbase, vdso32_pages); -#endif This didn't build for ppc64le: /opt/cross/gcc-8.20_binutils-2.32/powerpc64-unknown-linux-gnu/bin/powerpc64-unknown-linux-gnu-ld: arch/powerpc/kernel/vdso.o:(.toc+0x0): undefined reference to `vdso32_end' /opt/cross/gcc-8.20_binutils-2.32/powerpc64-unknown-linux-gnu/bin/powerpc64-unknown-linux-gnu-ld: arch/powerpc/kernel/vdso.o:(.toc+0x8): undefined reference to `vdso32_start' make[1]: *** [/scratch/michael/build/maint/Makefile:1166: vmlinux] Error 1 make: *** [Makefile:185: __sub-make] Error 2 So I just put that ifdef back. The problem is because is_32bit() can still return true even when CONFIG_VDSO32 is not set. Hmm, you're right. My config had CONFIG_COMPAT enabled. But that seems like a bug, if someone enables COMPAT on ppc64le they are almost certainly going to want VDSO32 as well. So I think I'll do a lead up patch as below. Ah yes, and with that then no need to consider the case where is_32bit_task() is true and CONFIG_VDSO32 is not selected. I'll update my leading series accordingly. I meant follow up series. 
Christophe Christophe cheers diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index d4fd109f177e..cf2da1e401ef 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -501,13 +501,12 @@ endmenu config VDSO32 def_bool y - depends on PPC32 || CPU_BIG_ENDIAN + depends on PPC32 || COMPAT help This symbol controls whether we build the 32-bit VDSO. We obviously want to do that if we're building a 32-bit kernel. If we're building - a 64-bit kernel then we only want a 32-bit VDSO if we're building for - big endian. That is because the only little endian configuration we - support is ppc64le which is 64-bit only. + a 64-bit kernel then we only want a 32-bit VDSO if we're also enabling + COMPAT. choice prompt "Endianness selection"
Re: [PATCH v1 4/9] powerpc/vdso: Remove unnecessary ifdefs in vdso_pagelist initialization
Le 27/08/2020 à 15:19, Michael Ellerman a écrit : Christophe Leroy writes: On 08/26/2020 02:58 PM, Michael Ellerman wrote: Christophe Leroy writes: diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index daef14a284a3..bbb69832fd46 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -718,16 +710,14 @@ static int __init vdso_init(void) ... - -#ifdef CONFIG_VDSO32 vdso32_kbase = &vdso32_start; /* @@ -735,8 +725,6 @@ static int __init vdso_init(void) */ vdso32_pages = (&vdso32_end - &vdso32_start) >> PAGE_SHIFT; DBG("vdso32_kbase: %p, 0x%x pages\n", vdso32_kbase, vdso32_pages); -#endif This didn't build for ppc64le: /opt/cross/gcc-8.20_binutils-2.32/powerpc64-unknown-linux-gnu/bin/powerpc64-unknown-linux-gnu-ld: arch/powerpc/kernel/vdso.o:(.toc+0x0): undefined reference to `vdso32_end' /opt/cross/gcc-8.20_binutils-2.32/powerpc64-unknown-linux-gnu/bin/powerpc64-unknown-linux-gnu-ld: arch/powerpc/kernel/vdso.o:(.toc+0x8): undefined reference to `vdso32_start' make[1]: *** [/scratch/michael/build/maint/Makefile:1166: vmlinux] Error 1 make: *** [Makefile:185: __sub-make] Error 2 So I just put that ifdef back. The problem is because is_32bit() can still return true even when CONFIG_VDSO32 is not set. Hmm, you're right. My config had CONFIG_COMPAT enabled. But that seems like a bug, if someone enables COMPAT on ppc64le they are almost certainly going to want VDSO32 as well. So I think I'll do a lead up patch as below. Ah yes, and with that then no need to consider the case where is_32bit_task() is true and CONFIG_VDSO32 is not selected. I'll update my leading series accordingly. 
Christophe cheers diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index d4fd109f177e..cf2da1e401ef 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -501,13 +501,12 @@ endmenu config VDSO32 def_bool y - depends on PPC32 || CPU_BIG_ENDIAN + depends on PPC32 || COMPAT help This symbol controls whether we build the 32-bit VDSO. We obviously want to do that if we're building a 32-bit kernel. If we're building - a 64-bit kernel then we only want a 32-bit VDSO if we're building for - big endian. That is because the only little endian configuration we - support is ppc64le which is 64-bit only. + a 64-bit kernel then we only want a 32-bit VDSO if we're also enabling + COMPAT. choice prompt "Endianness selection"
Re: [PATCH v1 01/10] powerpc/pseries/iommu: Replace hard-coded page shift
On 28/08/2020 01:32, Leonardo Bras wrote: > Hello Alexey, thank you for this feedback! > > On Sat, 2020-08-22 at 19:33 +1000, Alexey Kardashevskiy wrote: >>> +#define TCE_RPN_BITS 52 /* Bits 0-51 represent >>> RPN on TCE */ >> >> Ditch this one and use MAX_PHYSMEM_BITS instead? I am pretty sure this >> is the actual limit. > > I understand this MAX_PHYSMEM_BITS(51) comes from the maximum physical memory > addressable in the machine. IIUC, it means we can access physical address up > to (1ul << MAX_PHYSMEM_BITS). > > This 52 comes from PAPR "Table 9. TCE Definition" which defines bits > 0-51 as the RPN. By looking at code, I understand that it means we may input > any address < (1ul << 52) to TCE. > > In practice, MAX_PHYSMEM_BITS should be enough as of today, because I suppose > we can't ever pass a physical page address over > (1ul << 51), and TCE accepts up to (1ul << 52). > But if we ever increase MAX_PHYSMEM_BITS, it doesn't necessarily means that > TCE_RPN_BITS will also be increased, so I think they are independent values. > > Does it make sense? Please let me know if I am missing something. The underlying hardware is PHB3/4 about which the IODA2 Version 2.4 6Apr2012.pdf spec says: "The number of most significant RPN bits implemented in the TCE is dependent on the max size of System Memory to be supported by the platform". IODA3 is the same on this matter. This is MAX_PHYSMEM_BITS and PHB itself does not have any other limits on top of that. So the only real limit comes from MAX_PHYSMEM_BITS and where TCE_RPN_BITS comes from exactly - I have no idea. 
> >> >> >>> +#define TCE_RPN_MASK(ps) ((1ul << (TCE_RPN_BITS - (ps))) - 1) >>> #define TCE_VALID 0x800 /* TCE valid */ >>> #define TCE_ALLIO 0x400 /* TCE valid for all lpars */ >>> #define TCE_PCI_WRITE 0x2 /* write from PCI >>> allowed */ >>> diff --git a/arch/powerpc/platforms/pseries/iommu.c >>> b/arch/powerpc/platforms/pseries/iommu.c >>> index e4198700ed1a..8fe23b7dff3a 100644 >>> --- a/arch/powerpc/platforms/pseries/iommu.c >>> +++ b/arch/powerpc/platforms/pseries/iommu.c >>> @@ -107,6 +107,9 @@ static int tce_build_pSeries(struct iommu_table *tbl, >>> long index, >>> u64 proto_tce; >>> __be64 *tcep; >>> u64 rpn; >>> + const unsigned long tceshift = tbl->it_page_shift; >>> + const unsigned long pagesize = IOMMU_PAGE_SIZE(tbl); >>> + const u64 rpn_mask = TCE_RPN_MASK(tceshift); >> >> Using IOMMU_PAGE_SIZE macro for the page size and not using >> IOMMU_PAGE_MASK for the mask - this incosistency makes my small brain >> explode :) I understand the history but man... Oh well, ok. >> > > Yeah, it feels kind of weird after two IOMMU related consts. :) > But sure IOMMU_PAGE_MASK() would not be useful here :) > > And this kind of let me thinking: >>> + rpn = __pa(uaddr) >> tceshift; >>> + *tcep = cpu_to_be64(proto_tce | (rpn & rpn_mask) << tceshift); > Why not: > rpn_mask = TCE_RPN_MASK(tceshift) << tceshift; A mask for a page number (but not the address!) hurts my brain, masks are good against addresses but numbers should already have all bits adjusted imho, may be it is just me :-/ > > rpn = __pa(uaddr) & rpn_mask; > *tcep = cpu_to_be64(proto_tce | rpn) > > I am usually afraid of changing stuff like this, but I think it's safe. > >> Good, otherwise. Thanks, > > Thank you for reviewing! > > > -- Alexey
Re: [PATCH v8 2/8] powerpc/vdso: Remove __kernel_datapage_offset and simplify __get_datapage()
Dmitry Safonov <0x7f454...@gmail.com> writes: > Hello, > > On Wed, 26 Aug 2020 at 15:39, Michael Ellerman wrote: >> Christophe Leroy writes: > [..] >> > arch_remap() gets replaced by vdso_remap() >> > >> > For arch_unmap(), I'm wondering how/what other architectures do, because >> > powerpc seems to be the only one to erase the vdso context pointer when >> > unmapping the vdso. >> >> Yeah. The original unmap/remap stuff was added for CRIU, which I thought >> people tested on other architectures (more than powerpc even). >> >> Possibly no one really cares about vdso unmap though, vs just moving the >> vdso. >> >> We added a test for vdso unmap recently because it happened to trigger a >> KAUP failure, and someone actually hit it & reported it. > > You right, CRIU cares much more about moving vDSO. > It's done for each restoree and as on most setups vDSO is premapped and > used by the application - it's actively tested. > Speaking about vDSO unmap - that's concerning only for heterogeneous C/R, > i.e when an application is migrated from a system that uses vDSO to the one > which doesn't - it's much rare scenario. > (for arm it's !CONFIG_VDSO, for x86 it's `vdso=0` boot parameter) Ah OK that explains it. The case we hit of VDSO unmapping was some strange "library OS" thing which had explicitly unmapped the VDSO, so also very rare. > Looking at the code, it seems quite easy to provide/maintain .close() for > vm_special_mapping. A bit harder to add a test from CRIU side > (as glibc won't know on restore that it can't use vdso anymore), > but totally not impossible. > >> Running that test on arm64 segfaults: >> >> # ./sigreturn_vdso >> VDSO is at 0x8191f000-0x8191 (4096 bytes) >> Signal delivered OK with VDSO mapped >> VDSO moved to 0x8191a000-0x8191afff (4096 bytes) >> Signal delivered OK with VDSO moved >> Unmapped VDSO >> Remapped the stack executable >> [ 48.556191] potentially unexpected fatal signal 11. 
>> [ 48.556752] CPU: 0 PID: 140 Comm: sigreturn_vdso Not tainted >> 5.9.0-rc2-00057-g2ac69819ba9e #190 >> [ 48.556990] Hardware name: linux,dummy-virt (DT) >> [ 48.557336] pstate: 60001000 (nZCv daif -PAN -UAO BTYPE=--) >> [ 48.557475] pc : 8191a7bc >> [ 48.557603] lr : 8191a7bc >> [ 48.557697] sp : c13c9e90 >> [ 48.557873] x29: c13cb0e0 x28: >> [ 48.558201] x27: x26: >> [ 48.558337] x25: x24: >> [ 48.558754] x23: x22: >> [ 48.558893] x21: 004009b0 x20: >> [ 48.559046] x19: 00400ff0 x18: >> [ 48.559180] x17: 817da300 x16: 00412010 >> [ 48.559312] x15: x14: 001c >> [ 48.559443] x13: 656c626174756365 x12: 7865206b63617473 >> [ 48.559625] x11: 0003 x10: 0101010101010101 >> [ 48.559828] x9 : 818afda8 x8 : 0081 >> [ 48.559973] x7 : 6174732065687420 x6 : 64657070616d6552 >> [ 48.560115] x5 : 0e0388bd x4 : 0040135d >> [ 48.560270] x3 : x2 : 0001 >> [ 48.560412] x1 : 0003 x0 : 004120b8 >> Segmentation fault >> # >> >> So I think we need to keep the unmap hook. Maybe it should be handled by >> the special_mapping stuff generically. > > I'll cook a patch for vm_special_mapping if you don't mind :-) That would be great, thanks! cheers
[PATCH] powerpc/tools: Remove 90 line limit in checkpatch script
As of commit bdc48fa11e46, scripts/checkpatch.pl now has a default line length warning of 100 characters. The powerpc wrapper script was using a length of 90 instead of 80 in order to make checkpatch less restrictive, but now it's making it more restrictive instead. I think it makes sense to just use the default value now. Signed-off-by: Russell Currey --- arch/powerpc/tools/checkpatch.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/tools/checkpatch.sh b/arch/powerpc/tools/checkpatch.sh index 3ce5c093b19d..91c04802ec31 100755 --- a/arch/powerpc/tools/checkpatch.sh +++ b/arch/powerpc/tools/checkpatch.sh @@ -9,7 +9,6 @@ script_base=$(realpath $(dirname $0)) exec $script_base/../../../scripts/checkpatch.pl \ --subjective \ --no-summary \ - --max-line-length=90 \ --show-types \ --ignore ARCH_INCLUDE_LINUX \ --ignore BIT_MACRO \ -- 2.28.0
Re: [PATCH v1 06/10] powerpc/pseries/iommu: Add ddw_list_add() helper
On 28/08/2020 08:11, Leonardo Bras wrote: > On Mon, 2020-08-24 at 13:46 +1000, Alexey Kardashevskiy wrote: >>> static int find_existing_ddw_windows(void) >>> { >>> int len; >>> @@ -887,18 +905,11 @@ static int find_existing_ddw_windows(void) >>> if (!direct64) >>> continue; >>> >>> - window = kzalloc(sizeof(*window), GFP_KERNEL); >>> - if (!window || len < sizeof(struct dynamic_dma_window_prop)) { >>> + window = ddw_list_add(pdn, direct64); >>> + if (!window || len < sizeof(*direct64)) { >> >> Since you are touching this code, it looks like the "len < >> sizeof(*direct64)" part should go above to "if (!direct64)". > > Sure, makes sense. > It will be fixed for v2. > >> >> >> >>> kfree(window); >>> remove_ddw(pdn, true); >>> - continue; >>> } >>> - >>> - window->device = pdn; >>> - window->prop = direct64; >>> - spin_lock(&direct_window_list_lock); >>> - list_add(&window->list, &direct_window_list); >>> - spin_unlock(&direct_window_list_lock); >>> } >>> >>> return 0; >>> @@ -1261,7 +1272,8 @@ static u64 enable_ddw(struct pci_dev *dev, struct >>> device_node *pdn) >>> dev_dbg(&dev->dev, "created tce table LIOBN 0x%x for %pOF\n", >>> create.liobn, dn); >>> >>> - window = kzalloc(sizeof(*window), GFP_KERNEL); >>> + /* Add new window to existing DDW list */ >> >> The comment seems to duplicate what the ddw_list_add name already suggests. > > Ok, I will remove it then. > >>> + window = ddw_list_add(pdn, ddwprop); >>> if (!window) >>> goto out_clear_window; >>> >>> @@ -1280,16 +1292,14 @@ static u64 enable_ddw(struct pci_dev *dev, struct >>> device_node *pdn) >>> goto out_free_window; >>> } >>> >>> - window->device = pdn; >>> - window->prop = ddwprop; >>> - spin_lock(&direct_window_list_lock); >>> - list_add(&window->list, &direct_window_list); >>> - spin_unlock(&direct_window_list_lock); >> >> I'd leave these 3 lines here and in find_existing_ddw_windows() (which >> would make ddw_list_add -> ddw_prop_alloc). 
In general you want to have >> less stuff to do on the failure path. kmalloc may fail and needs kfree >> but you can safely delay list_add (which cannot fail) and avoid having >> the lock help twice in the same function (one of them is hidden inside >> ddw_list_add). >> Not sure if this change is really needed after all. Thanks, > > I understand this leads to better performance in case anything fails. > Also, I think list_add happening in the end is less error-prone (in > case the list is checked between list_add and a fail). Performance was not in my mind at all. I noticed you remove from a list with a lock help and it was not there before and there is a bunch on labels on the exit path and started looking for list_add() and if you do not double remove from the list. > But what if we put it at the end? > What is the chance of a kzalloc of 4 pointers (struct direct_window) > failing after walk_system_ram_range? This is not about chances really, it is about readability. If let's say kmalloc failed, you just to the error exit label and simply call kfree() on that pointer, kfree will do nothing if it is NULL already, simple. list_del() does not have this simplicity. > Is it not worthy doing that for making enable_ddw() easier to > understand? This is my goal here :) > > Best regards, > Leonardo > -- Alexey
Re: [PATCH v1 04/10] powerpc/kernel/iommu: Add new iommu_table_in_use() helper
On 28/08/2020 04:34, Leonardo Bras wrote: > On Sat, 2020-08-22 at 20:34 +1000, Alexey Kardashevskiy wrote: >>> + >>> + /*ignore reserved bit0*/ >> >> s/ignore reserved bit0/ ignore reserved bit0 / (add spaces) > > Fixed > >>> + if (tbl->it_offset == 0) >>> + p1_start = 1; >>> + >>> + /* Check if reserved memory is valid*/ >> >> A missing space here. > > Fixed > >> >>> + if (tbl->it_reserved_start >= tbl->it_offset && >>> + tbl->it_reserved_start <= (tbl->it_offset + tbl->it_size) && >>> + tbl->it_reserved_end >= tbl->it_offset && >>> + tbl->it_reserved_end <= (tbl->it_offset + tbl->it_size)) { >> >> Uff. What if tbl->it_reserved_end is bigger than tbl->it_offset + >> tbl->it_size? >> >> The reserved area is to preserve MMIO32 so it is for it_offset==0 only >> and the boundaries are checked in the only callsite, and it is unlikely >> to change soon or ever. >> >> Rather that bothering with fixing that, may be just add (did not test): >> >> if (WARN_ON(( >> (tbl->it_reserved_start || tbl->it_reserved_end) && (it_offset != 0)) >> (tbl->it_reserved_start > it_offset && tbl->it_reserved_end < it_offset >> + it_size) && (it_offset == 0)) ) >> return true; >> >> Or simply always look for it_offset..it_reserved_start and >> it_reserved_end..it_offset+it_size and if there is no reserved area, >> initialize it_reserved_start=it_reserved_end=it_offset so the first >> it_offset..it_reserved_start becomes a no-op. > > The problem here is that the values of it_reserved_{start,end} are not > necessarily valid. I mean, on iommu_table_reserve_pages() the values > are stored however they are given (bit reserving is done only if they > are valid). > > Having a it_reserved_{start,end} value outside the valid ranges would > cause find_next_bit() to run over memory outside the bitmap. > Even if the those values are < tbl->it_offset, the resulting > subtraction on unsigned would cause it to become a big value and run > over memory outside the bitmap. > > But I think you are right. 
That is not the place to check if the > reserved values are valid. It should just trust them here. > I intent to change iommu_table_reserve_pages() to only store the > parameters in it_reserved_{start,end} if they are in the range, and or > it_offset in both of them if they are not. > > What do you think? This should work, yes. > > Thanks for the feedback! > Leonardo Bras > > > -- Alexey
Re: [PATCH v1 02/10] powerpc/kernel/iommu: Align size for IOMMU_PAGE_SIZE on iommu_*_coherent()
On 28/08/2020 02:51, Leonardo Bras wrote: > On Sat, 2020-08-22 at 20:07 +1000, Alexey Kardashevskiy wrote: >> >> On 18/08/2020 09:40, Leonardo Bras wrote: >>> Both iommu_alloc_coherent() and iommu_free_coherent() assume that once >>> size is aligned to PAGE_SIZE it will be aligned to IOMMU_PAGE_SIZE. >> >> The only case when it is not aligned is when IOMMU_PAGE_SIZE > PAGE_SIZE >> which is unlikely but not impossible, we could configure the kernel for >> 4K system pages and 64K IOMMU pages I suppose. Do we really want to do >> this here, or simply put WARN_ON(tbl->it_page_shift > PAGE_SHIFT)? > > I think it would be better to keep the code as much generic as possible > regarding page sizes. Then you need to test it. Does 4K guest even boot (it should but I would not bet much on it)? > >> Because if we want the former (==support), then we'll have to align the >> size up to the bigger page size when allocating/zeroing system pages, >> etc. > > This part I don't understand. Why do we need to align everything to the > bigger pagesize? > > I mean, is not that enough that the range [ret, ret + size[ is both > allocated by mm and mapped on a iommu range? > > Suppose a iommu_alloc_coherent() of 16kB on PAGESIZE = 4k and > IOMMU_PAGE_SIZE() == 64k. > Why 4 * cpu_pages mapped by a 64k IOMMU page is not enough? > All the space the user asked for is allocated and mapped for DMA. The user asked to map 16K, the rest - 48K - is used for something else (may be even mapped to another device) but you are making all 64K accessible by the device which only should be able to access 16K. In practice, if this happens, H_PUT_TCE will simply fail. > >> Bigger pages are not the case here as I understand it. > > I did not get this part, what do you mean? Possible IOMMU page sizes are 4K, 64K, 2M, 16M, 256M, 1GB, and the supported set of sizes is different for P8/P9 and type of IO (PHB, NVLink/CAPI). 
> >>> Update those functions to guarantee alignment with requested size >>> using IOMMU_PAGE_ALIGN() before doing iommu_alloc() / iommu_free(). >>> >>> Also, on iommu_range_alloc(), replace ALIGN(n, 1 << tbl->it_page_shift) >>> with IOMMU_PAGE_ALIGN(n, tbl), which seems easier to read. >>> >>> Signed-off-by: Leonardo Bras >>> --- >>> arch/powerpc/kernel/iommu.c | 17 + >>> 1 file changed, 9 insertions(+), 8 deletions(-) >>> >>> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c >>> index 9704f3f76e63..d7086087830f 100644 >>> --- a/arch/powerpc/kernel/iommu.c >>> +++ b/arch/powerpc/kernel/iommu.c >>> @@ -237,10 +237,9 @@ static unsigned long iommu_range_alloc(struct device >>> *dev, >>> } >>> >>> if (dev) >>> - boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, >>> - 1 << tbl->it_page_shift); >>> + boundary_size = IOMMU_PAGE_ALIGN(dma_get_seg_boundary(dev) + 1, >>> tbl); >> >> Run checkpatch.pl, should complain about a long line. > > It's 86 columns long, which is less than the new limit of 100 columns > Linus announced a few weeks ago. 
checkpatch.pl was updated too: > https://www.phoronix.com/scan.php?page=news_item&px=Linux-Kernel-Deprecates-80-Col Yay finally :) Thanks, > >> >> >>> else >>> - boundary_size = ALIGN(1UL << 32, 1 << tbl->it_page_shift); >>> + boundary_size = IOMMU_PAGE_ALIGN(1UL << 32, tbl); >>> /* 4GB boundary for iseries_hv_alloc and iseries_hv_map */ >>> >>> n = iommu_area_alloc(tbl->it_map, limit, start, npages, tbl->it_offset, >>> @@ -858,6 +857,7 @@ void *iommu_alloc_coherent(struct device *dev, struct >>> iommu_table *tbl, >>> unsigned int order; >>> unsigned int nio_pages, io_order; >>> struct page *page; >>> + size_t size_io = size; >>> >>> size = PAGE_ALIGN(size); >>> order = get_order(size); >>> @@ -884,8 +884,9 @@ void *iommu_alloc_coherent(struct device *dev, struct >>> iommu_table *tbl, >>> memset(ret, 0, size); >>> >>> /* Set up tces to cover the allocated range */ >>> - nio_pages = size >> tbl->it_page_shift; >>> - io_order = get_iommu_order(size, tbl); >>> + size_io = IOMMU_PAGE_ALIGN(size_io, tbl); >>> + nio_pages = size_io >> tbl->it_page_shift; >>> + io_order = get_iommu_order(size_io, tbl); >>> mapping = iommu_alloc(dev, tbl, ret, nio_pages, DMA_BIDIRECTIONAL, >>> mask >> tbl->it_page_shift, io_order, 0); >>> if (mapping == DMA_MAPPING_ERROR) { >>> @@ -900,11 +901,11 @@ void iommu_free_coherent(struct iommu_table *tbl, >>> size_t size, >>> void *vaddr, dma_addr_t dma_handle) >>> { >>> if (tbl) { >>> - unsigned int nio_pages; >>> + size_t size_io = IOMMU_PAGE_ALIGN(size, tbl); >>> + unsigned int nio_pages = size_io >> tbl->it_page_shift; >>> >>> - size = PAGE_ALIGN(size); >>> - nio_pages = size >> tbl->it_page_shift; >>> iommu_free(tbl, dma_handle, nio_pages); >>> + >>
Re: [PATCH v1 06/10] powerpc/pseries/iommu: Add ddw_list_add() helper
On Mon, 2020-08-24 at 13:46 +1000, Alexey Kardashevskiy wrote: > > static int find_existing_ddw_windows(void) > > { > > int len; > > @@ -887,18 +905,11 @@ static int find_existing_ddw_windows(void) > > if (!direct64) > > continue; > > > > - window = kzalloc(sizeof(*window), GFP_KERNEL); > > - if (!window || len < sizeof(struct dynamic_dma_window_prop)) { > > + window = ddw_list_add(pdn, direct64); > > + if (!window || len < sizeof(*direct64)) { > > Since you are touching this code, it looks like the "len < > sizeof(*direct64)" part should go above to "if (!direct64)". Sure, makes sense. It will be fixed for v2. > > > > > kfree(window); > > remove_ddw(pdn, true); > > - continue; > > } > > - > > - window->device = pdn; > > - window->prop = direct64; > > - spin_lock(&direct_window_list_lock); > > - list_add(&window->list, &direct_window_list); > > - spin_unlock(&direct_window_list_lock); > > } > > > > return 0; > > @@ -1261,7 +1272,8 @@ static u64 enable_ddw(struct pci_dev *dev, struct > > device_node *pdn) > > dev_dbg(&dev->dev, "created tce table LIOBN 0x%x for %pOF\n", > > create.liobn, dn); > > > > - window = kzalloc(sizeof(*window), GFP_KERNEL); > > + /* Add new window to existing DDW list */ > > The comment seems to duplicate what the ddw_list_add name already suggests. Ok, I will remove it then. > > + window = ddw_list_add(pdn, ddwprop); > > if (!window) > > goto out_clear_window; > > > > @@ -1280,16 +1292,14 @@ static u64 enable_ddw(struct pci_dev *dev, struct > > device_node *pdn) > > goto out_free_window; > > } > > > > - window->device = pdn; > > - window->prop = ddwprop; > > - spin_lock(&direct_window_list_lock); > > - list_add(&window->list, &direct_window_list); > > - spin_unlock(&direct_window_list_lock); > > I'd leave these 3 lines here and in find_existing_ddw_windows() (which > would make ddw_list_add -> ddw_prop_alloc). In general you want to have > less stuff to do on the failure path. 
kmalloc may fail and needs kfree > but you can safely delay list_add (which cannot fail) and avoid having > the lock help twice in the same function (one of them is hidden inside > ddw_list_add). > Not sure if this change is really needed after all. Thanks, I understand this leads to better performance in case anything fails. Also, I think list_add happening in the end is less error-prone (in case the list is checked between list_add and a fail). But what if we put it at the end? What is the chance of a kzalloc of 4 pointers (struct direct_window) failing after walk_system_ram_range? Is it not worthy doing that for making enable_ddw() easier to understand? Best regards, Leonardo
Re: fsl_espi errors on v5.7.15
On 27/08/20 7:12 pm, Nicholas Piggin wrote: > Excerpts from Heiner Kallweit's message of August 26, 2020 4:38 pm: >> On 26.08.2020 08:07, Chris Packham wrote: >>> On 26/08/20 1:48 pm, Chris Packham wrote: On 26/08/20 10:22 am, Chris Packham wrote: > On 25/08/20 7:22 pm, Heiner Kallweit wrote: > > >> I've been staring at spi-fsl-espi.c for while now and I think I've >>> identified a couple of deficiencies that may or may not be related >>> to my >>> issue. >>> >>> First I think the 'Transfer done but SPIE_DON isn't set' message >>> can be >>> generated spuriously. In fsl_espi_irq() we read the ESPI_SPIE >>> register. >>> We also write back to it to clear the current events. We re-read it in >>> fsl_espi_cpu_irq() and complain when SPIE_DON is not set. But we can >>> naturally end up in that situation if we're doing a large read. >>> Consider >>> the messages for reading a block of data from a spi-nor chip >>> >>> tx = READ_OP + ADDR >>> rx = data >>> >>> We setup the transfer and pump out the tx_buf. The first interrupt >>> goes >>> off and ESPI_SPIE has SPIM_DON and SPIM_RXT set. We empty the rx fifo, >>> clear ESPI_SPIE and wait for the next interrupt. The next interrupt >>> fires and this time we have ESPI_SPIE with just SPIM_RXT set. This >>> continues until we've received all the data and we finish with >>> ESPI_SPIE >>> having only SPIM_RXT set. When we re-read it we complain that SPIE_DON >>> isn't set. >>> >>> The other deficiency is that we only get an interrupt when the >>> amount of >>> data in the rx fifo is above FSL_ESPI_RXTHR. If there are fewer than >>> FSL_ESPI_RXTHR left to be received we will never pull them out of >>> the fifo. >>> >> SPIM_DON will trigger an interrupt once the last characters have been >> transferred, and read the remaining characters from the FIFO. > The T2080RM that I have says the following about the DON bit > > "Last character was transmitted. 
The last character was transmitted > and a new command can be written for the next frame." > > That does at least seem to fit with my assertion that it's all about > the TX direction. But the fact that it doesn't happen all the time > throws some doubt on it. > >> I think the reason I'm seeing some variability is because of how fast >>> (or slow) the interrupts get processed and how fast the spi-nor >>> chip can >>> fill the CPUs rx fifo. >>> >> To rule out timing issues at high bus frequencies I initially asked >> for re-testing at lower frequencies. If you e.g. limit the bus to 1 MHz >> or even less, then timing shouldn't be an issue. > Yes I've currently got spi-max-frequency = <100>; in my dts. I > would also expect a slower frequency would fit my "DON is for TX" > narrative. >> Last relevant functional changes have been done almost 4 years ago. >> And yours is the first such report I see. So question is what could >> be so >> special with your setup that it seems you're the only one being >> affected. >> The scenarios you describe are standard, therefore much more people >> should be affected in case of a driver bug. > Agreed. But even on my hardware (which may have a latent issue > despite being in the field for going on 5 years) the issue only > triggers under some fairly specific circumstances. >> You said that kernel config impacts how frequently the issue happens. >> Therefore question is what's the diff in kernel config, and how could >> the differences be related to SPI. > It did seem to be somewhat random. Things like CONFIG_PREEMPT have an > impact but every time I found something that seemed to be having an > impact I've been able to disprove it. I actually think its about how > busy the system is which may or may not affect when we get round to > processing the interrupts. > > I have managed to get the 'Transfer done but SPIE_DON isn't set!' to > occur on the T2080RDB. 
> > I've had to add the following to expose the environment as a mtd > partition > > diff --git a/arch/powerpc/boot/dts/fsl/t208xrdb.dtsi > b/arch/powerpc/boot/dts/fsl/t208xrdb.dtsi > index ff87e67c70da..fbf95fc1fd68 100644 > --- a/arch/powerpc/boot/dts/fsl/t208xrdb.dtsi > +++ b/arch/powerpc/boot/dts/fsl/t208xrdb.dtsi > @@ -116,6 +116,15 @@ flash@0 { > compatible = "micron,n25q512ax3", > "jedec,spi-nor"; > reg = <0>; > spi-max-frequency = <1000>; /* > input clock */ > + > + partition@u-boot { > + reg = <0x 0x0010>; > + la
Re: [PATCH v1 05/10] powerpc/pseries/iommu: Add iommu_pseries_alloc_table() helper
On Mon, 2020-08-24 at 10:38 +1000, Alexey Kardashevskiy wrote: > > On 18/08/2020 09:40, Leonardo Bras wrote: > > Creates a helper to allow allocating a new iommu_table without the need > > to reallocate the iommu_group. > > > > This will be helpful for replacing the iommu_table for the new DMA window, > > after we remove the old one with iommu_tce_table_put(). > > > > Signed-off-by: Leonardo Bras > > --- > > arch/powerpc/platforms/pseries/iommu.c | 25 ++--- > > 1 file changed, 14 insertions(+), 11 deletions(-) > > > > diff --git a/arch/powerpc/platforms/pseries/iommu.c > > b/arch/powerpc/platforms/pseries/iommu.c > > index 8fe23b7dff3a..39617ce0ec83 100644 > > --- a/arch/powerpc/platforms/pseries/iommu.c > > +++ b/arch/powerpc/platforms/pseries/iommu.c > > @@ -53,28 +53,31 @@ enum { > > DDW_EXT_QUERY_OUT_SIZE = 2 > > }; > > > > -static struct iommu_table_group *iommu_pseries_alloc_group(int node) > > +static struct iommu_table *iommu_pseries_alloc_table(int node) > > { > > - struct iommu_table_group *table_group; > > struct iommu_table *tbl; > > > > - table_group = kzalloc_node(sizeof(struct iommu_table_group), GFP_KERNEL, > > - node); > > - if (!table_group) > > - return NULL; > > - > > tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, node); > > if (!tbl) > > - goto free_group; > > + return NULL; > > > > INIT_LIST_HEAD_RCU(&tbl->it_group_list); > > kref_init(&tbl->it_kref); > > + return tbl; > > +} > > > > - table_group->tables[0] = tbl; > > +static struct iommu_table_group *iommu_pseries_alloc_group(int node) > > +{ > > + struct iommu_table_group *table_group; > > + > > + table_group = kzalloc_node(sizeof(*table_group), GFP_KERNEL, node); > > I'd prefer you did not make unrelated changes (sizeof(struct > iommu_table_group) -> sizeof(*table_group)) so the diff stays shorter > and easier to follow. 
You changed sizeof(struct iommu_table_group) but > not sizeof(struct iommu_table) and this confused me enough to spend more > time than this straight forward change deserves. Sorry, I will keep this in mind for future patches. Thank you for the tip! > > Not important in this case though so > > Reviewed-by: Alexey Kardashevskiy Thank you!
Re: [PATCH v8 2/8] powerpc/vdso: Remove __kernel_datapage_offset and simplify __get_datapage()
Hello, On Wed, 26 Aug 2020 at 15:39, Michael Ellerman wrote: > Christophe Leroy writes: [..] > > arch_remap() gets replaced by vdso_remap() > > > > For arch_unmap(), I'm wondering how/what other architectures do, because > > powerpc seems to be the only one to erase the vdso context pointer when > > unmapping the vdso. > > Yeah. The original unmap/remap stuff was added for CRIU, which I thought > people tested on other architectures (more than powerpc even). > > Possibly no one really cares about vdso unmap though, vs just moving the > vdso. > > We added a test for vdso unmap recently because it happened to trigger a > KAUP failure, and someone actually hit it & reported it. You right, CRIU cares much more about moving vDSO. It's done for each restoree and as on most setups vDSO is premapped and used by the application - it's actively tested. Speaking about vDSO unmap - that's concerning only for heterogeneous C/R, i.e when an application is migrated from a system that uses vDSO to the one which doesn't - it's much rare scenario. (for arm it's !CONFIG_VDSO, for x86 it's `vdso=0` boot parameter) Looking at the code, it seems quite easy to provide/maintain .close() for vm_special_mapping. A bit harder to add a test from CRIU side (as glibc won't know on restore that it can't use vdso anymore), but totally not impossible. > Running that test on arm64 segfaults: > > # ./sigreturn_vdso > VDSO is at 0x8191f000-0x8191 (4096 bytes) > Signal delivered OK with VDSO mapped > VDSO moved to 0x8191a000-0x8191afff (4096 bytes) > Signal delivered OK with VDSO moved > Unmapped VDSO > Remapped the stack executable > [ 48.556191] potentially unexpected fatal signal 11. 
> [ 48.556752] CPU: 0 PID: 140 Comm: sigreturn_vdso Not tainted > 5.9.0-rc2-00057-g2ac69819ba9e #190 > [ 48.556990] Hardware name: linux,dummy-virt (DT) > [ 48.557336] pstate: 60001000 (nZCv daif -PAN -UAO BTYPE=--) > [ 48.557475] pc : 8191a7bc > [ 48.557603] lr : 8191a7bc > [ 48.557697] sp : c13c9e90 > [ 48.557873] x29: c13cb0e0 x28: > [ 48.558201] x27: x26: > [ 48.558337] x25: x24: > [ 48.558754] x23: x22: > [ 48.558893] x21: 004009b0 x20: > [ 48.559046] x19: 00400ff0 x18: > [ 48.559180] x17: 817da300 x16: 00412010 > [ 48.559312] x15: x14: 001c > [ 48.559443] x13: 656c626174756365 x12: 7865206b63617473 > [ 48.559625] x11: 0003 x10: 0101010101010101 > [ 48.559828] x9 : 818afda8 x8 : 0081 > [ 48.559973] x7 : 6174732065687420 x6 : 64657070616d6552 > [ 48.560115] x5 : 0e0388bd x4 : 0040135d > [ 48.560270] x3 : x2 : 0001 > [ 48.560412] x1 : 0003 x0 : 004120b8 > Segmentation fault > # > > So I think we need to keep the unmap hook. Maybe it should be handled by > the special_mapping stuff generically. I'll cook a patch for vm_special_mapping if you don't mind :-) Thanks, Dmitry
[powerpc:next-test] BUILD SUCCESS adf3447eddc59967b18faf51dff97c07a5a8220b
tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next-test branch HEAD: adf3447eddc59967b18faf51dff97c07a5a8220b powerpc: Update documentation of ISA versions for Power10 elapsed time: 735m configs tested: 118 configs skipped: 13 The following configs have been built successfully. More configs may be tested in the coming days. arm defconfig arm64allyesconfig arm64 defconfig arm allyesconfig arm allmodconfig riscvnommu_k210_defconfig mips decstation_defconfig arm milbeaut_m10v_defconfig powerpcmpc7448_hpc2_defconfig sh ul2_defconfig mips ip28_defconfig armclps711x_defconfig m68k m5208evb_defconfig arm vf610m4_defconfig m68kmac_defconfig powerpc ps3_defconfig arm rpc_defconfig shshmin_defconfig arm s3c2410_defconfig m68km5272c3_defconfig shmigor_defconfig armmulti_v7_defconfig mips mpc30x_defconfig powerpc ppc40x_defconfig parisc allyesconfig powerpc mpc5200_defconfig arm pxa255-idp_defconfig mips ip22_defconfig arc tb10x_defconfig sparc64 alldefconfig shsh7757lcr_defconfig mips maltaaprp_defconfig arm hackkit_defconfig arm moxart_defconfig parisc alldefconfig armmvebu_v7_defconfig nios2 3c120_defconfig arm h3600_defconfig sparc sparc32_defconfig arm mainstone_defconfig m68k allmodconfig c6x dsk6455_defconfig m68k apollo_defconfig sh sh03_defconfig shedosk7760_defconfig sh sh2007_defconfig s390 zfcpdump_defconfig x86_64 allyesconfig mips pic32mzda_defconfig shsh7763rdp_defconfig arm netwinder_defconfig arm at91_dt_defconfig sh kfr2r09_defconfig arm pcm027_defconfig powerpc ppc6xx_defconfig nios2alldefconfig powerpcamigaone_defconfig powerpc skiroot_defconfig powerpc tqm8xx_defconfig arm aspeed_g4_defconfig ia64 allmodconfig ia64defconfig ia64 allyesconfig m68kdefconfig m68k allyesconfig nios2 defconfig arc allyesconfig nds32 allnoconfig c6x allyesconfig nds32 defconfig nios2allyesconfig cskydefconfig alpha defconfig alphaallyesconfig xtensa allyesconfig h8300allyesconfig arc defconfig sh allmodconfig parisc defconfig s390 allyesconfig 
s390defconfig i386 allyesconfig sparcallyesconfig sparc defconfig i386defconfig mips allyesconfig mips allmodconfig powerpc defconfig powerpc allyesconfig powerpc allmodconfig powerpc allnoconfig x86_64 randconfig-a003-20200827 x86_64 randconfig-a002-20200827 x86_64 randconfig-a001-20200827 x86_64 randconfig-a005-20200827 x86_64 randconfig-a006-20200827 x86_64 randconfig-a004-20200827 i386 randconfig-a002-20200827 i386 randconfig-a004-20200827 i386 randconfig-a003-20200827 i386 randconfig
[powerpc:merge] BUILD SUCCESS 2281d5a219ba2dca30e0e74e49b811ed8bfc6a81
tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git merge branch HEAD: 2281d5a219ba2dca30e0e74e49b811ed8bfc6a81 Automatic merge of 'master', 'next' and 'fixes' (2020-08-27 17:47) elapsed time: 722m configs tested: 115 configs skipped: 14 The following configs have been built successfully. More configs may be tested in the coming days. arm defconfig arm64allyesconfig arm64 defconfig arm allyesconfig arm allmodconfig riscvnommu_k210_defconfig mips decstation_defconfig arm milbeaut_m10v_defconfig powerpcmpc7448_hpc2_defconfig sh ul2_defconfig powerpc ps3_defconfig arm rpc_defconfig shshmin_defconfig arm s3c2410_defconfig m68km5272c3_defconfig shmigor_defconfig armmulti_v7_defconfig sh sdk7780_defconfig arm ezx_defconfig mipse55_defconfig arm pxa910_defconfig shapsh4ad0a_defconfig mips ath25_defconfig mips mpc30x_defconfig powerpc ppc40x_defconfig parisc allyesconfig powerpc mpc5200_defconfig arm pxa255-idp_defconfig mips ip22_defconfig arc tb10x_defconfig sparc64 alldefconfig shsh7757lcr_defconfig mips maltaaprp_defconfig arm hackkit_defconfig arm moxart_defconfig parisc alldefconfig armmvebu_v7_defconfig nios2 3c120_defconfig arm h3600_defconfig sparc sparc32_defconfig arm mainstone_defconfig m68k allmodconfig c6x dsk6455_defconfig s390 zfcpdump_defconfig x86_64 allyesconfig mips pic32mzda_defconfig shsh7763rdp_defconfig arm netwinder_defconfig arm at91_dt_defconfig sh kfr2r09_defconfig arm pcm027_defconfig powerpc ppc6xx_defconfig nios2alldefconfig powerpcamigaone_defconfig powerpc skiroot_defconfig powerpc tqm8xx_defconfig arm aspeed_g4_defconfig ia64 allmodconfig ia64defconfig ia64 allyesconfig m68kdefconfig m68k allyesconfig nios2 defconfig arc allyesconfig nds32 allnoconfig c6x allyesconfig nds32 defconfig nios2allyesconfig cskydefconfig alpha defconfig alphaallyesconfig xtensa allyesconfig h8300allyesconfig arc defconfig sh allmodconfig parisc defconfig s390 allyesconfig s390defconfig i386 allyesconfig sparcallyesconfig sparc 
defconfig i386defconfig mips allyesconfig mips allmodconfig powerpc defconfig powerpc allyesconfig powerpc allmodconfig powerpc allnoconfig x86_64 randconfig-a003-20200827 x86_64 randconfig-a002-20200827 x86_64 randconfig-a001-20200827 x86_64 randconfig-a005-20200827 x86_64 randconfig-a006-20200827 x86_64 randconfig-a004-20200827 i386 randconfig-a002-20200827 i386 randconfig-a004-20200827 i386 randconfig-a003-20200827 i386 randconfig-a005-20200827 i386 randconfig-a006-20200827 i386
[powerpc:fixes-test] BUILD SUCCESS 16d83a540ca4e7f1ebb2b3756869b77451d31414
nfig h8300allyesconfig arc defconfig sh allmodconfig parisc defconfig s390 allyesconfig parisc allyesconfig s390defconfig i386 allyesconfig i386defconfig mips allyesconfig mips allmodconfig powerpc defconfig powerpc allyesconfig powerpc allmodconfig powerpc allnoconfig x86_64 randconfig-a005-20200827 x86_64 randconfig-a006-20200827 x86_64 randconfig-a004-20200827 x86_64 randconfig-a003-20200827 x86_64 randconfig-a002-20200827 x86_64 randconfig-a001-20200827 i386 randconfig-a002-20200827 i386 randconfig-a004-20200827 i386 randconfig-a003-20200827 i386 randconfig-a005-20200827 i386 randconfig-a006-20200827 i386 randconfig-a001-20200827 i386 randconfig-a016-20200827 i386 randconfig-a015-20200827 i386 randconfig-a013-20200827 i386 randconfig-a012-20200827 i386 randconfig-a011-20200827 i386 randconfig-a014-20200827 riscvallyesconfig riscv allnoconfig riscv defconfig riscvallmodconfig x86_64 rhel x86_64rhel-7.6-kselftests x86_64 defconfig x86_64 rhel-8.3 x86_64 kexec --- 0-DAY CI Kernel Test Service, Intel Corporation https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org
Re: [PATCH v1 04/10] powerpc/kernel/iommu: Add new iommu_table_in_use() helper
On Sat, 2020-08-22 at 20:34 +1000, Alexey Kardashevskiy wrote: > > + > > + /*ignore reserved bit0*/ > > s/ignore reserved bit0/ ignore reserved bit0 / (add spaces) Fixed > > + if (tbl->it_offset == 0) > > + p1_start = 1; > > + > > + /* Check if reserved memory is valid*/ > > A missing space here. Fixed > > > + if (tbl->it_reserved_start >= tbl->it_offset && > > + tbl->it_reserved_start <= (tbl->it_offset + tbl->it_size) && > > + tbl->it_reserved_end >= tbl->it_offset && > > + tbl->it_reserved_end <= (tbl->it_offset + tbl->it_size)) { > > Uff. What if tbl->it_reserved_end is bigger than tbl->it_offset + > tbl->it_size? > > The reserved area is to preserve MMIO32 so it is for it_offset==0 only > and the boundaries are checked in the only callsite, and it is unlikely > to change soon or ever. > > Rather that bothering with fixing that, may be just add (did not test): > > if (WARN_ON(( > (tbl->it_reserved_start || tbl->it_reserved_end) && (it_offset != 0)) > (tbl->it_reserved_start > it_offset && tbl->it_reserved_end < it_offset > + it_size) && (it_offset == 0)) ) > return true; > > Or simply always look for it_offset..it_reserved_start and > it_reserved_end..it_offset+it_size and if there is no reserved area, > initialize it_reserved_start=it_reserved_end=it_offset so the first > it_offset..it_reserved_start becomes a no-op. The problem here is that the values of it_reserved_{start,end} are not necessarily valid. I mean, on iommu_table_reserve_pages() the values are stored however they are given (bit reserving is done only if they are valid). Having a it_reserved_{start,end} value outside the valid ranges would cause find_next_bit() to run over memory outside the bitmap. Even if the those values are < tbl->it_offset, the resulting subtraction on unsigned would cause it to become a big value and run over memory outside the bitmap. But I think you are right. That is not the place to check if the reserved values are valid. It should just trust them here. 
I intent to change iommu_table_reserve_pages() to only store the parameters in it_reserved_{start,end} if they are in the range, and or it_offset in both of them if they are not. What do you think? Thanks for the feedback! Leonardo Bras
[PATCH v2] powerpc/32s: Disable VMAP stack when CONFIG_ADB_PMU
low_sleep_handler() can't restore the context from virtual stack because the stack can hardly be accessed with MMU OFF. For now, disable VMAP stack when CONFIG_ADB_PMU is selected. Reported-by: Giuseppe Sacco Fixes: cd08f109e262 ("powerpc/32s: Enable CONFIG_VMAP_STACK") Signed-off-by: Christophe Leroy --- v2: Argh, went too quick. CONFIG_ADB_PMU ==> ADB_PMU --- arch/powerpc/platforms/Kconfig.cputype | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 87737ec86d39..1dc9d3c81872 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -36,7 +36,7 @@ config PPC_BOOK3S_6xx select PPC_HAVE_PMU_SUPPORT select PPC_HAVE_KUEP select PPC_HAVE_KUAP - select HAVE_ARCH_VMAP_STACK + select HAVE_ARCH_VMAP_STACK if !ADB_PMU config PPC_BOOK3S_601 bool "PowerPC 601" -- 2.25.0
Re: kernel since 5.6 do not boot anymore on Apple PowerBook
Le 27/08/2020 à 16:37, Giuseppe Sacco a écrit : Il giorno gio, 27/08/2020 alle 12.39 +0200, Christophe Leroy ha scritto: Hi, Le 27/08/2020 à 10:28, Giuseppe Sacco a écrit : [...] Sorry, I made a mistake. The real problem is down, on the same function, when it calls low_sleep_handler(). This is where the problem probably is. Great, you spotted the problem. I see what it is, it is in low_sleep_handler() in arch/powerpc/platforms/powermac/sleep.S All critical registers are saved on the stack. At restore, they are restored BEFORE re-enabling MMU (because they are needed for that). But when we have VMAP_STACK, the stack can hardly be accessed without the MMU enabled. tophys() doesn't work for virtual stack addresses. Therefore, the low_sleep_handler() has to be reworked for using an area in the linear mem instead of the stack. I am sorry, but I don't know how to fix it. Should I open a bug for tracking this problem? Yes please, at https://github.com/linuxppc/issues/issues In the meantime, I have sent a patch to disable CONFIG_VMAP_STACK when CONFIG_ADB_PMU is selected until this is fixed. Have you tried without CONFIG_ADB_PMU ? Or does it make no sense ? Christophe
[PATCH] powerpc/32s: Disable VMAP stack when CONFIG_ADB_PMU
low_sleep_handler() can't restore the context from virtual stack because the stack can hardly be accessed with MMU OFF. For now, disable VMAP stack when CONFIG_ADB_PMU is selected. Reported-by: Giuseppe Sacco Fixes: cd08f109e262 ("powerpc/32s: Enable CONFIG_VMAP_STACK") Signed-off-by: Christophe Leroy --- arch/powerpc/platforms/Kconfig.cputype | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 87737ec86d39..c12768242c17 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -36,7 +36,7 @@ config PPC_BOOK3S_6xx select PPC_HAVE_PMU_SUPPORT select PPC_HAVE_KUEP select PPC_HAVE_KUAP - select HAVE_ARCH_VMAP_STACK + select HAVE_ARCH_VMAP_STACK if !CONFIG_ADB_PMU config PPC_BOOK3S_601 bool "PowerPC 601" -- 2.25.0
Re: [PATCH 08/10] x86: remove address space overrides using set_fs()
On Thu, Aug 27, 2020 at 8:00 AM Christoph Hellwig wrote: > > SYM_FUNC_START(__get_user_2) > add $1,%_ASM_AX > jc bad_get_user This no longer makes sense, and > - mov PER_CPU_VAR(current_task), %_ASM_DX > - cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX > + LOAD_TASK_SIZE_MAX > + cmp %_ASM_DX,%_ASM_AX This should be LOAD_TASK_SIZE_MAX_MINUS_N(1) cmp %_ASM_DX,%_ASM_AX instead (and then because we no longer modify _ASM_AX, we'd also remove the offset on the access). > SYM_FUNC_START(__put_user_2) > - ENTER > - mov TASK_addr_limit(%_ASM_BX),%_ASM_BX > + LOAD_TASK_SIZE_MAX > sub $1,%_ASM_BX It's even more obvious here. We load a constant and then immediately do a "sub $1" on that value. It's not a huge deal, you don't have to respin the series for this, I just wanted to point it out so that people are aware of it and if I forget somebody else will hopefully remember that "we should fix that too". Linus
Re: [PATCH v3 3/6] Add LKDTM test to hijack a patch mapping (powerpc, x86_64)
Hi "Christopher, Thank you for the patch! Yet something to improve: [auto build test ERROR on powerpc/next] [also build test ERROR on char-misc/char-misc-testing tip/x86/core v5.9-rc2 next-20200827] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch] url: https://github.com/0day-ci/linux/commits/Christopher-M-Riedl/Use-per-CPU-temporary-mappings-for-patching/20200827-161532 base: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next config: parisc-allyesconfig (attached as .config) compiler: hppa-linux-gcc (GCC) 9.3.0 reproduce (this is a W=1 build): wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross # save the attached .config to linux build tree COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=parisc If you fix the issue, kindly add following tag as appropriate Reported-by: kernel test robot All errors (new ones prefixed by >>): >> hppa-linux-ld: drivers/misc/lkdtm/core.o:(.rodata+0x1b4): undefined >> reference to `lkdtm_HIJACK_PATCH' --- 0-DAY CI Kernel Test Service, Intel Corporation https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org .config.gz Description: application/gzip
Re: [PATCH 05/10] lkdtm: disable set_fs-based tests for !CONFIG_SET_FS
On Thu, Aug 27, 2020 at 8:00 AM Christoph Hellwig wrote: > > Once we can't manipulate the address limit, we also can't test what > happens when the manipulation is abused. Just remove these tests entirely. Once set_fs() doesn't exist on x86, the tests no longer make any sense what-so-ever, because test coverage will be basically zero. So don't make the code uglier just to maintain a fiction that something is tested when it isn't really. Linus
Re: [PATCH v1 03/10] powerpc/kernel/iommu: Use largepool as a last resort when !largealloc
On Sat, 2020-08-22 at 20:09 +1000, Alexey Kardashevskiy wrote: > > + goto again; > > + > > A nit: unnecessary new line. I was following the pattern used above. There is a newline after every "goto again" in this 'if'. > Reviewed-by: Alexey Kardashevskiy Thank you!
Re: [PATCH v1 02/10] powerpc/kernel/iommu: Align size for IOMMU_PAGE_SIZE on iommu_*_coherent()
On Sat, 2020-08-22 at 20:07 +1000, Alexey Kardashevskiy wrote: > > On 18/08/2020 09:40, Leonardo Bras wrote: > > Both iommu_alloc_coherent() and iommu_free_coherent() assume that once > > size is aligned to PAGE_SIZE it will be aligned to IOMMU_PAGE_SIZE. > > The only case when it is not aligned is when IOMMU_PAGE_SIZE > PAGE_SIZE > which is unlikely but not impossible, we could configure the kernel for > 4K system pages and 64K IOMMU pages I suppose. Do we really want to do > this here, or simply put WARN_ON(tbl->it_page_shift > PAGE_SHIFT)? I think it would be better to keep the code as much generic as possible regarding page sizes. > Because if we want the former (==support), then we'll have to align the > size up to the bigger page size when allocating/zeroing system pages, > etc. This part I don't understand. Why do we need to align everything to the bigger pagesize? I mean, is not that enough that the range [ret, ret + size[ is both allocated by mm and mapped on a iommu range? Suppose a iommu_alloc_coherent() of 16kB on PAGESIZE = 4k and IOMMU_PAGE_SIZE() == 64k. Why 4 * cpu_pages mapped by a 64k IOMMU page is not enough? All the space the user asked for is allocated and mapped for DMA. > Bigger pages are not the case here as I understand it. I did not get this part, what do you mean? > > Update those functions to guarantee alignment with requested size > > using IOMMU_PAGE_ALIGN() before doing iommu_alloc() / iommu_free(). > > > > Also, on iommu_range_alloc(), replace ALIGN(n, 1 << tbl->it_page_shift) > > with IOMMU_PAGE_ALIGN(n, tbl), which seems easier to read. 
> > > > Signed-off-by: Leonardo Bras > > --- > > arch/powerpc/kernel/iommu.c | 17 + > > 1 file changed, 9 insertions(+), 8 deletions(-) > > > > diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c > > index 9704f3f76e63..d7086087830f 100644 > > --- a/arch/powerpc/kernel/iommu.c > > +++ b/arch/powerpc/kernel/iommu.c > > @@ -237,10 +237,9 @@ static unsigned long iommu_range_alloc(struct device > > *dev, > > } > > > > if (dev) > > - boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, > > - 1 << tbl->it_page_shift); > > + boundary_size = IOMMU_PAGE_ALIGN(dma_get_seg_boundary(dev) + 1, > > tbl); > > Run checkpatch.pl, should complain about a long line. It's 86 columns long, which is less than the new limit of 100 columns Linus announced a few weeks ago. checkpatch.pl was updated too: https://www.phoronix.com/scan.php?page=news_item&px=Linux-Kernel-Deprecates-80-Col > > > > else > > - boundary_size = ALIGN(1UL << 32, 1 << tbl->it_page_shift); > > + boundary_size = IOMMU_PAGE_ALIGN(1UL << 32, tbl); > > /* 4GB boundary for iseries_hv_alloc and iseries_hv_map */ > > > > n = iommu_area_alloc(tbl->it_map, limit, start, npages, tbl->it_offset, > > @@ -858,6 +857,7 @@ void *iommu_alloc_coherent(struct device *dev, struct > > iommu_table *tbl, > > unsigned int order; > > unsigned int nio_pages, io_order; > > struct page *page; > > + size_t size_io = size; > > > > size = PAGE_ALIGN(size); > > order = get_order(size); > > @@ -884,8 +884,9 @@ void *iommu_alloc_coherent(struct device *dev, struct > > iommu_table *tbl, > > memset(ret, 0, size); > > > > /* Set up tces to cover the allocated range */ > > - nio_pages = size >> tbl->it_page_shift; > > - io_order = get_iommu_order(size, tbl); > > + size_io = IOMMU_PAGE_ALIGN(size_io, tbl); > > + nio_pages = size_io >> tbl->it_page_shift; > > + io_order = get_iommu_order(size_io, tbl); > > mapping = iommu_alloc(dev, tbl, ret, nio_pages, DMA_BIDIRECTIONAL, > > mask >> tbl->it_page_shift, io_order, 0); > > if (mapping == 
DMA_MAPPING_ERROR) { > > @@ -900,11 +901,11 @@ void iommu_free_coherent(struct iommu_table *tbl, > > size_t size, > > void *vaddr, dma_addr_t dma_handle) > > { > > if (tbl) { > > - unsigned int nio_pages; > > + size_t size_io = IOMMU_PAGE_ALIGN(size, tbl); > > + unsigned int nio_pages = size_io >> tbl->it_page_shift; > > > > - size = PAGE_ALIGN(size); > > - nio_pages = size >> tbl->it_page_shift; > > iommu_free(tbl, dma_handle, nio_pages); > > + > > Unrelated new line. Will be removed. Thanks! > > > > size = PAGE_ALIGN(size); > > free_pages((unsigned long)vaddr, get_order(size)); > > } > >
Re: [PATCH v2 25/25] powerpc/signal32: Transform save_user_regs() and save_tm_user_regs() in 'unsafe' version
Le 27/08/2020 à 11:07, kernel test robot a écrit : Hi Christophe, I love your patch! Yet something to improve: [auto build test ERROR on powerpc/next] [also build test ERROR on linus/master v5.9-rc2 next-20200827] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch] url: https://github.com/0day-ci/linux/commits/Christophe-Leroy/powerpc-Switch-signal-32-to-using-unsafe_put_user-and-friends/20200819-012411 base: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next config: powerpc64-randconfig-r005-20200827 (attached as .config) compiler: powerpc-linux-gcc (GCC) 9.3.0 reproduce (this is a W=1 build): wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross # save the attached .config to linux build tree COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=powerpc64 If you fix the issue, kindly add following tag as appropriate Reported-by: kernel test robot All errors (new ones prefixed by >>): arch/powerpc/kernel/signal_32.c: In function 'save_user_regs_unsafe': arch/powerpc/kernel/signal_32.c:314:34: error: macro "unsafe_copy_to_user" requires 4 arguments, but only 3 given 314 | ELF_NEVRREG * sizeof(u32)), failed); | ^ In file included from include/linux/uaccess.h:9, from include/linux/sched/task.h:11, from include/linux/sched/signal.h:9, from include/linux/rcuwait.h:6, from include/linux/percpu-rwsem.h:7, from include/linux/fs.h:33, from include/linux/huge_mm.h:8, from include/linux/mm.h:672, from arch/powerpc/kernel/signal_32.c:17: arch/powerpc/include/asm/uaccess.h:605: note: macro "unsafe_copy_to_user" defined here 605 | #define unsafe_copy_to_user(d, s, l, e) \ | arch/powerpc/kernel/signal_32.c:313:3: error: 'unsafe_copy_to_user' undeclared (first use in this function); did you mean 'raw_copy_to_user'? 
313 | unsafe_copy_to_user(&frame->mc_vregs, current->thread.evr, | ^~~ | raw_copy_to_user arch/powerpc/kernel/signal_32.c:313:3: note: each undeclared identifier is reported only once for each function it appears in arch/powerpc/kernel/signal_32.c:314:37: error: 'failed' undeclared (first use in this function) 314 | ELF_NEVRREG * sizeof(u32)), failed); | ^~ arch/powerpc/kernel/signal_32.c:314:35: warning: left-hand operand of comma expression has no effect [-Wunused-value] 314 | ELF_NEVRREG * sizeof(u32)), failed); | ^ arch/powerpc/kernel/signal_32.c:314:43: error: expected ';' before ')' token 314 | ELF_NEVRREG * sizeof(u32)), failed); | ^ | ; arch/powerpc/kernel/signal_32.c:314:43: error: expected statement before ')' token Should be fixed by: diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index f795fe0240a1..123682299d4f 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -311,7 +311,7 @@ static int save_user_regs_unsafe(struct pt_regs *regs, struct mcontext __user *f /* save spe registers */ if (current->thread.used_spe) { unsafe_copy_to_user(&frame->mc_vregs, current->thread.evr, - ELF_NEVRREG * sizeof(u32)), failed); + ELF_NEVRREG * sizeof(u32), failed); /* set MSR_SPE in the saved MSR value to indicate that frame->mc_vregs contains valid data */ msr |= MSR_SPE; --- Christophe
RE: [PATCH 01/10] fs: don't allow kernel reads and writes without iter ops
From: Christoph Hellwig > Sent: 27 August 2020 16:00 > > Don't allow calling ->read or ->write with set_fs as a preparation for > killing off set_fs. All the instances that we use kernel_read/write on > are using the iter ops already. > > If a file has both the regular ->read/->write methods and the iter > variants those could have different semantics for messed up enough > drivers. Also fails the kernel access to them in that case. Is there a real justification for that? For system calls supplying both methods makes sense to avoid the extra code paths for a simple read/write. Any one stupid enough to make them behave differently gets what they deserve. David - Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK Registration No: 1397386 (Wales)
[PATCH v1 6/6] powerpc/vdso: Declare vdso_patches[] as __initdata
vdso_patches[] table is used only at init time. Mark it __initdata. Signed-off-by: Christophe Leroy --- arch/powerpc/kernel/vdso.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index 600df1164a0b..efaaee94f273 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -76,7 +76,7 @@ struct vdso_patch_def * Currently, we only change sync_dicache to do nothing on processors * with a coherent icache */ -static struct vdso_patch_def vdso_patches[] = { +static struct vdso_patch_def vdso_patches[] __initdata = { { CPU_FTR_COHERENT_ICACHE, CPU_FTR_COHERENT_ICACHE, "__kernel_sync_dicache", "__kernel_sync_dicache_p5" -- 2.25.0
[PATCH v1 4/6] powerpc/vdso: Initialise vdso32_kbase at compile time
Initialise vdso32_kbase at compile time like vdso64_kbase. Signed-off-by: Christophe Leroy --- arch/powerpc/kernel/vdso.c | 7 ++- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index c173c70ca7d2..6390a37dacea 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -37,13 +37,12 @@ /* The alignment of the vDSO */ #define VDSO_ALIGNMENT (1 << 16) +extern char vdso32_start, vdso32_end; static unsigned int vdso32_pages; -static void *vdso32_kbase; +static void *vdso32_kbase = &vdso32_start; unsigned long vdso32_sigtramp; unsigned long vdso32_rt_sigtramp; -extern char vdso32_start, vdso32_end; - extern char vdso64_start, vdso64_end; static void *vdso64_kbase = &vdso64_start; static unsigned int vdso64_pages; @@ -691,8 +690,6 @@ static int __init vdso_init(void) */ vdso64_pages = (&vdso64_end - &vdso64_start) >> PAGE_SHIFT; - vdso32_kbase = &vdso32_start; - /* * Calculate the size of the 32 bits vDSO */ -- 2.25.0
[PATCH v1 3/6] powerpc/vdso: Don't rely on vdso_pages being 0 for failure
If vdso initialisation failed, vdso_ready is not set. Otherwise, vdso_pages is only 0 when it is a 32 bits task and CONFIG_VDSO32 is not selected. As arch_setup_additional_pages() now bails out directly in that case, we don't need to set vdso_pages to 0. Signed-off-by: Christophe Leroy --- arch/powerpc/kernel/vdso.c | 23 ++- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index 465150253c31..c173c70ca7d2 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -178,11 +178,6 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) current->mm->context.vdso_base = 0; - /* vDSO has a problem and was disabled, just don't "enable" it for the -* process -*/ - if (vdso_pages == 0) - return 0; /* Add a page to the vdso size for the data page */ vdso_pages ++; @@ -712,14 +707,16 @@ static int __init vdso_init(void) * Initialize the vDSO images in memory, that is do necessary * fixups of vDSO symbols, locate trampolines, etc... */ - if (vdso_setup()) - goto setup_failed; + if (vdso_setup()) { + pr_err("vDSO setup failure, not enabled !\n"); + return 0; + } if (IS_ENABLED(CONFIG_VDSO32)) { /* Make sure pages are in the correct state */ pagelist = kcalloc(vdso32_pages + 1, sizeof(struct page *), GFP_KERNEL); if (!pagelist) - goto alloc_failed; + return 0; pagelist[0] = virt_to_page(vdso_data); @@ -732,7 +729,7 @@ static int __init vdso_init(void) if (IS_ENABLED(CONFIG_PPC64)) { pagelist = kcalloc(vdso64_pages + 1, sizeof(struct page *), GFP_KERNEL); if (!pagelist) - goto alloc_failed; + return 0; pagelist[0] = virt_to_page(vdso_data); @@ -745,14 +742,6 @@ static int __init vdso_init(void) smp_wmb(); vdso_ready = 1; - return 0; - -setup_failed: - pr_err("vDSO setup failure, not enabled !\n"); -alloc_failed: - vdso32_pages = 0; - vdso64_pages = 0; - return 0; } arch_initcall(vdso_init); -- 2.25.0
[PATCH v1 5/6] powerpc/vdso: Declare constant vars as __ro_after_init
To avoid any risk of modification of vital VDSO variables, declare them __ro_after_init. vdso32_kbase and vdso64_kbase could be made 'const', but it would have high impact on all functions using them as the compiler doesn't expect const property to be discarded. Signed-off-by: Christophe Leroy --- arch/powerpc/kernel/vdso.c | 16 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index 6390a37dacea..600df1164a0b 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -38,19 +38,19 @@ #define VDSO_ALIGNMENT (1 << 16) extern char vdso32_start, vdso32_end; -static unsigned int vdso32_pages; -static void *vdso32_kbase = &vdso32_start; -unsigned long vdso32_sigtramp; -unsigned long vdso32_rt_sigtramp; +static unsigned int vdso32_pages __ro_after_init; +static void *vdso32_kbase __ro_after_init = &vdso32_start; +unsigned long vdso32_sigtramp __ro_after_init; +unsigned long vdso32_rt_sigtramp __ro_after_init; extern char vdso64_start, vdso64_end; -static void *vdso64_kbase = &vdso64_start; -static unsigned int vdso64_pages; +static void *vdso64_kbase __ro_after_init = &vdso64_start; +static unsigned int vdso64_pages __ro_after_init; #ifdef CONFIG_PPC64 -unsigned long vdso64_rt_sigtramp; +unsigned long vdso64_rt_sigtramp __ro_after_init; #endif /* CONFIG_PPC64 */ -static int vdso_ready; +static int vdso_ready __ro_after_init; /* * The vdso data page (aka. systemcfg for old ppc64 fans) is here. -- 2.25.0
[PATCH v1 1/6] powerpc/vdso: Remove DBG()
DBG() is defined as void when DEBUG is not defined, and DEBUG is explicitly undefined. It means there is no other way than modifying source code to get the messages printed. It was most likely useful in the first days of VDSO, but today the only 3 DBG() calls don't deserve a special handling. Just remove them. If one day someone need such messages back, use a standard pr_debug() or equivalent. Signed-off-by: Christophe Leroy --- arch/powerpc/kernel/vdso.c | 13 - 1 file changed, 13 deletions(-) diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index e8aaeeae9e9f..a44e8e6a4692 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -31,14 +31,6 @@ #include #include -#undef DEBUG - -#ifdef DEBUG -#define DBG(fmt...) printk(fmt) -#else -#define DBG(fmt...) -#endif - /* Max supported size for symbol names */ #define MAX_SYMNAME64 @@ -567,9 +559,6 @@ static __init int vdso_fixup_alt_funcs(struct lib32_elfinfo *v32, if (!match) continue; - DBG("replacing %s with %s...\n", patch->gen_name, - patch->fix_name ? "NONE" : patch->fix_name); - /* * Patch the 32 bits and 64 bits symbols. Note that we do not * patch the "." symbol on 64 bits. @@ -704,7 +693,6 @@ static int __init vdso_init(void) * Calculate the size of the 64 bits vDSO */ vdso64_pages = (&vdso64_end - &vdso64_start) >> PAGE_SHIFT; - DBG("vdso64_kbase: %p, 0x%x pages\n", vdso64_kbase, vdso64_pages); vdso32_kbase = &vdso32_start; @@ -713,7 +701,6 @@ static int __init vdso_init(void) * Calculate the size of the 32 bits vDSO */ vdso32_pages = (&vdso32_end - &vdso32_start) >> PAGE_SHIFT; - DBG("vdso32_kbase: %p, 0x%x pages\n", vdso32_kbase, vdso32_pages); #endif /* -- 2.25.0
[PATCH v1 2/6] powerpc/vdso: Don't reference vdso32 static functions/vars without CONFIG_VDSO32
When CONFIG_VDSO32 is not selected, just don't reference the static vdso32 variables and functions. This allows the compiler to optimise them out, and allows to drop an #ifdef CONFIG_VDSO32. Signed-off-by: Christophe Leroy --- arch/powerpc/kernel/vdso.c | 14 +++--- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index a44e8e6a4692..465150253c31 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -159,11 +159,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) if (!vdso_ready) return 0; - if (is_32bit_task()) { - vdso_spec = &vdso32_spec; - vdso_pages = vdso32_pages; - vdso_base = VDSO32_MBASE; - } else { + if (!is_32bit_task()) { vdso_spec = &vdso64_spec; vdso_pages = vdso64_pages; /* @@ -172,6 +168,12 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) * and most likely share a SLB entry. */ vdso_base = 0; + } else if (IS_ENABLED(CONFIG_VDSO32)) { + vdso_spec = &vdso32_spec; + vdso_pages = vdso32_pages; + vdso_base = VDSO32_MBASE; + } else { + return 0; } current->mm->context.vdso_base = 0; @@ -696,12 +698,10 @@ static int __init vdso_init(void) vdso32_kbase = &vdso32_start; -#ifdef CONFIG_VDSO32 /* * Calculate the size of the 32 bits vDSO */ vdso32_pages = (&vdso32_end - &vdso32_start) >> PAGE_SHIFT; -#endif /* * Setup the syscall map in the vDOS -- 2.25.0
Re: [PATCH v1 01/10] powerpc/pseries/iommu: Replace hard-coded page shift
Hello Alexey, thank you for this feedback! On Sat, 2020-08-22 at 19:33 +1000, Alexey Kardashevskiy wrote: > > +#define TCE_RPN_BITS 52 /* Bits 0-51 represent > > RPN on TCE */ > > Ditch this one and use MAX_PHYSMEM_BITS instead? I am pretty sure this > is the actual limit. I understand this MAX_PHYSMEM_BITS(51) comes from the maximum physical memory addressable in the machine. IIUC, it means we can access physical address up to (1ul << MAX_PHYSMEM_BITS). This 52 comes from PAPR "Table 9. TCE Definition" which defines bits 0-51 as the RPN. By looking at code, I understand that it means we may input any address < (1ul << 52) to TCE. In practice, MAX_PHYSMEM_BITS should be enough as of today, because I suppose we can't ever pass a physical page address over (1ul << 51), and TCE accepts up to (1ul << 52). But if we ever increase MAX_PHYSMEM_BITS, it doesn't necessarily mean that TCE_RPN_BITS will also be increased, so I think they are independent values. Does it make sense? Please let me know if I am missing something. 
> > > > +#define TCE_RPN_MASK(ps) ((1ul << (TCE_RPN_BITS - (ps))) - 1) > > #define TCE_VALID 0x800 /* TCE valid */ > > #define TCE_ALLIO 0x400 /* TCE valid for all lpars */ > > #define TCE_PCI_WRITE 0x2 /* write from PCI > > allowed */ > > diff --git a/arch/powerpc/platforms/pseries/iommu.c > > b/arch/powerpc/platforms/pseries/iommu.c > > index e4198700ed1a..8fe23b7dff3a 100644 > > --- a/arch/powerpc/platforms/pseries/iommu.c > > +++ b/arch/powerpc/platforms/pseries/iommu.c > > @@ -107,6 +107,9 @@ static int tce_build_pSeries(struct iommu_table *tbl, > > long index, > > u64 proto_tce; > > __be64 *tcep; > > u64 rpn; > > + const unsigned long tceshift = tbl->it_page_shift; > > + const unsigned long pagesize = IOMMU_PAGE_SIZE(tbl); > > + const u64 rpn_mask = TCE_RPN_MASK(tceshift); > > Using IOMMU_PAGE_SIZE macro for the page size and not using > IOMMU_PAGE_MASK for the mask - this incosistency makes my small brain > explode :) I understand the history but man... Oh well, ok. > Yeah, it feels kind of weird after two IOMMU related consts. :) But sure IOMMU_PAGE_MASK() would not be useful here :) And this kind of let me thinking: > > + rpn = __pa(uaddr) >> tceshift; > > + *tcep = cpu_to_be64(proto_tce | (rpn & rpn_mask) << tceshift); Why not: rpn_mask = TCE_RPN_MASK(tceshift) << tceshift; rpn = __pa(uaddr) & rpn_mask; *tcep = cpu_to_be64(proto_tce | rpn) I am usually afraid of changing stuff like this, but I think it's safe. > Good, otherwise. Thanks, Thank you for reviewing!
Re: remove the last set_fs() in common code, and remove it for x86 and powerpc v2
> Diffstat: Actually no diffstat here as David Howells pointed out. Here we go: arch/Kconfig |3 arch/alpha/Kconfig |1 arch/arc/Kconfig |1 arch/arm/Kconfig |1 arch/arm64/Kconfig |1 arch/c6x/Kconfig |1 arch/csky/Kconfig |1 arch/h8300/Kconfig |1 arch/hexagon/Kconfig |1 arch/ia64/Kconfig |1 arch/m68k/Kconfig |1 arch/microblaze/Kconfig|1 arch/mips/Kconfig |1 arch/nds32/Kconfig |1 arch/nios2/Kconfig |1 arch/openrisc/Kconfig |1 arch/parisc/Kconfig|1 arch/powerpc/include/asm/processor.h |7 - arch/powerpc/include/asm/thread_info.h |5 - arch/powerpc/include/asm/uaccess.h | 78 --- arch/powerpc/kernel/signal.c |3 arch/powerpc/lib/sstep.c |6 - arch/riscv/Kconfig |1 arch/s390/Kconfig |1 arch/sh/Kconfig|1 arch/sparc/Kconfig |1 arch/um/Kconfig|1 arch/x86/ia32/ia32_aout.c |1 arch/x86/include/asm/page_32_types.h | 11 ++ arch/x86/include/asm/page_64_types.h | 38 + arch/x86/include/asm/processor.h | 60 --- arch/x86/include/asm/thread_info.h |2 arch/x86/include/asm/uaccess.h | 26 -- arch/x86/kernel/asm-offsets.c |3 arch/x86/lib/getuser.S | 28 --- arch/x86/lib/putuser.S | 21 +++-- arch/xtensa/Kconfig|1 drivers/misc/lkdtm/bugs.c |4 + drivers/misc/lkdtm/usercopy.c |4 + fs/read_write.c| 69 ++--- fs/splice.c| 130 +++-- include/linux/fs.h |2 include/linux/uaccess.h| 18 lib/test_bitmap.c | 10 ++ 44 files changed, 235 insertions(+), 316 deletions(-)
[PATCH 10/10] powerpc: remove address space overrides using set_fs()
Stop providing the possibility to override the address space using set_fs() now that there is no need for that any more. Signed-off-by: Christoph Hellwig --- arch/powerpc/Kconfig | 1 - arch/powerpc/include/asm/processor.h | 7 --- arch/powerpc/include/asm/thread_info.h | 5 +-- arch/powerpc/include/asm/uaccess.h | 62 -- arch/powerpc/kernel/signal.c | 3 -- arch/powerpc/lib/sstep.c | 6 +-- 6 files changed, 22 insertions(+), 62 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 3f09d6fdf89405..1f48bbfb3ce99d 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -249,7 +249,6 @@ config PPC select PCI_SYSCALL if PCI select PPC_DAWR if PPC64 select RTC_LIB - select SET_FS select SPARSE_IRQ select SYSCTL_EXCEPTION_TRACE select THREAD_INFO_IN_TASK diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index ed0d633ab5aa42..f01e4d650c520a 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -83,10 +83,6 @@ struct task_struct; void start_thread(struct pt_regs *regs, unsigned long fdptr, unsigned long sp); void release_thread(struct task_struct *); -typedef struct { - unsigned long seg; -} mm_segment_t; - #define TS_FPR(i) fp_state.fpr[i][TS_FPROFFSET] #define TS_CKFPR(i) ckfp_state.fpr[i][TS_FPROFFSET] @@ -148,7 +144,6 @@ struct thread_struct { unsigned long ksp_vsid; #endif struct pt_regs *regs; /* Pointer to saved register state */ - mm_segment_taddr_limit; /* for get_fs() validation */ #ifdef CONFIG_BOOKE /* BookE base exception scratch space; align on cacheline */ unsigned long normsave[8] cacheline_aligned; @@ -295,7 +290,6 @@ struct thread_struct { #define INIT_THREAD { \ .ksp = INIT_SP, \ .ksp_limit = INIT_SP_LIMIT, \ - .addr_limit = KERNEL_DS, \ .pgdir = swapper_pg_dir, \ .fpexc_mode = MSR_FE0 | MSR_FE1, \ SPEFSCR_INIT \ @@ -303,7 +297,6 @@ struct thread_struct { #else #define INIT_THREAD { \ .ksp = INIT_SP, \ - .addr_limit = KERNEL_DS, \ .fpexc_mode = 0, \ } 
#endif diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index ca6c9702570494..46a210b03d2b80 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -90,7 +90,6 @@ void arch_setup_new_exec(void); #define TIF_SYSCALL_TRACE 0 /* syscall trace active */ #define TIF_SIGPENDING 1 /* signal pending */ #define TIF_NEED_RESCHED 2 /* rescheduling necessary */ -#define TIF_FSCHECK3 /* Check FS is USER_DS on return */ #define TIF_SYSCALL_EMU4 /* syscall emulation active */ #define TIF_RESTORE_TM 5 /* need to restore TM FP/VEC/VSX */ #define TIF_PATCH_PENDING 6 /* pending live patching update */ @@ -130,7 +129,6 @@ void arch_setup_new_exec(void); #define _TIF_SYSCALL_TRACEPOINT(1< #include -/* - * The fs value determines whether argument validity checking should be - * performed or not. If get_fs() == USER_DS, checking is performed, with - * get_fs() == KERNEL_DS, checking is bypassed. - * - * For historical reasons, these macros are grossly misnamed. - * - * The fs/ds values are now the highest legal address in the "segment". - * This simplifies the checking in the routines below. - */ - -#define MAKE_MM_SEG(s) ((mm_segment_t) { (s) }) - -#define KERNEL_DS MAKE_MM_SEG(~0UL) #ifdef __powerpc64__ /* We use TASK_SIZE_USER64 as TASK_SIZE is not constant */ -#define USER_DSMAKE_MM_SEG(TASK_SIZE_USER64 - 1) -#else -#define USER_DSMAKE_MM_SEG(TASK_SIZE - 1) -#endif - -#define get_fs() (current->thread.addr_limit) +#define TASK_SIZE_MAX TASK_SIZE_USER64 -static inline void set_fs(mm_segment_t fs) +static inline bool __access_ok(unsigned long addr, unsigned long size) { - current->thread.addr_limit = fs; - /* On user-mode return check addr_limit (fs) is correct */ - set_thread_flag(TIF_FSCHECK); + if (addr >= TASK_SIZE_MAX) + return false; + /* +* This check is sufficient because there is a large enough gap between +* user addresses and the kernel addresses. 
+*/ + return size <= TASK_SIZE_MAX; } - -#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) -#define user_addr_max()(get_fs().seg) - -#ifdef __powerpc64__ -/* - * This check is sufficient because there is a large enough - * gap between user addresses and the kernel addresses - */ -#define __access_ok(addr, size, segment) \ - (((addr) <= (segment).seg) && ((size) <= (segment).seg)) - #else +#define TASK_SIZE_MAX
[PATCH 08/10] x86: remove address space overrides using set_fs()
Stop providing the possibility to override the address space using set_fs() now that there is no need for that any more. To properly handle the TASK_SIZE_MAX checking for 4 vs 5-level page tables on x86 a new alternative is introduced, which just like the one in entry_64.S has to use the hardcoded virtual address bits to escape the fact that TASK_SIZE_MAX isn't actually a constant when 5-level page tables are enabled. Signed-off-by: Christoph Hellwig Reviewed-by: Kees Cook --- arch/x86/Kconfig | 1 - arch/x86/ia32/ia32_aout.c | 1 - arch/x86/include/asm/processor.h | 11 +-- arch/x86/include/asm/thread_info.h | 2 -- arch/x86/include/asm/uaccess.h | 26 +- arch/x86/kernel/asm-offsets.c | 3 --- arch/x86/lib/getuser.S | 28 ++-- arch/x86/lib/putuser.S | 21 - 8 files changed, 32 insertions(+), 61 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f85c13355732fe..7101ac64bb209d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -237,7 +237,6 @@ config X86 select HAVE_ARCH_KCSAN if X86_64 select X86_FEATURE_NAMESif PROC_FS select PROC_PID_ARCH_STATUS if PROC_FS - select SET_FS imply IMA_SECURE_AND_OR_TRUSTED_BOOTif EFI config INSTRUCTION_DECODER diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index ca8a657edf5977..a09fc37ead9d47 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -239,7 +239,6 @@ static int load_aout_binary(struct linux_binprm *bprm) (regs)->ss = __USER32_DS; regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0; - set_fs(USER_DS); return 0; } diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 1618eeb08361a9..189573d95c3af6 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -482,10 +482,6 @@ extern unsigned int fpu_user_xstate_size; struct perf_event; -typedef struct { - unsigned long seg; -} mm_segment_t; - struct thread_struct { /* Cached TLS descriptors: */ struct desc_struct 
tls_array[GDT_ENTRY_TLS_ENTRIES]; @@ -538,8 +534,6 @@ struct thread_struct { */ unsigned long iopl_emul; - mm_segment_taddr_limit; - unsigned intsig_on_uaccess_err:1; /* Floating point and extended processor state */ @@ -785,15 +779,12 @@ static inline void spin_lock_prefetch(const void *x) #define INIT_THREAD { \ .sp0= TOP_OF_INIT_STACK, \ .sysenter_cs= __KERNEL_CS,\ - .addr_limit = KERNEL_DS, \ } #define KSTK_ESP(task) (task_pt_regs(task)->sp) #else -#define INIT_THREAD { \ - .addr_limit = KERNEL_DS,\ -} +#define INIT_THREAD { } extern unsigned long KSTK_ESP(struct task_struct *task); diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 267701ae3d86dd..44733a4bfc4294 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -102,7 +102,6 @@ struct thread_info { #define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */ #define TIF_ADDR32 29 /* 32-bit address space on 64 bits */ #define TIF_X3230 /* 32-bit native x86-64 binary */ -#define TIF_FSCHECK31 /* Check FS is USER_DS on return */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) @@ -131,7 +130,6 @@ struct thread_info { #define _TIF_SYSCALL_TRACEPOINT(1 << TIF_SYSCALL_TRACEPOINT) #define _TIF_ADDR32(1 << TIF_ADDR32) #define _TIF_X32 (1 << TIF_X32) -#define _TIF_FSCHECK (1 << TIF_FSCHECK) /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW_BASE \ diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index ecefaffd15d4c8..a4ceda0510ea87 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -12,30 +12,6 @@ #include #include -/* - * The fs value determines whether argument validity checking should be - * performed or not. If get_fs() == USER_DS, checking is performed, with - * get_fs() == KERNEL_DS, checking is bypassed. - * - * For historical reasons, these macros are grossly misnamed. 
- */ - -#define MAKE_MM_SEG(s) ((mm_segment_t) { (s) }) - -#define KERNEL_DS MAKE_MM_SEG(-1UL) -#define USER_DSMAKE_MM_SEG(TASK_SIZE_MAX)
[PATCH 09/10] powerpc: use non-set_fs based maccess routines
Provide __get_kernel_nofault and __put_kernel_nofault routines to implement the maccess routines without messing with set_fs and without opening up access to user space. Signed-off-by: Christoph Hellwig --- arch/powerpc/include/asm/uaccess.h | 16 1 file changed, 16 insertions(+) diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index 00699903f1efca..7fe3531ad36a77 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -623,4 +623,20 @@ do { \ __put_user_goto(*(u8*)(_src + _i), (u8 __user *)(_dst + _i), e);\ } while (0) +#define HAVE_GET_KERNEL_NOFAULT + +#define __get_kernel_nofault(dst, src, type, err_label) \ +do { \ + int __kr_err; \ + \ + __get_user_size_allowed(*((type *)(dst)), (__force type __user *)(src),\ + sizeof(type), __kr_err);\ + if (unlikely(__kr_err)) \ + goto err_label; \ +} while (0) + +#define __put_kernel_nofault(dst, src, type, err_label) \ + __put_user_size_goto(*((type *)(src)), \ + (__force type __user *)(dst), sizeof(type), err_label) + #endif /* _ARCH_POWERPC_UACCESS_H */ -- 2.28.0
[PATCH 07/10] x86: make TASK_SIZE_MAX usable from assembly code
For 64-bit the only thing missing was a strategic _AC, and for 32-bit we need to use __PAGE_OFFSET instead of PAGE_OFFSET in the TASK_SIZE definition to escape the explicit unsigned long cast. This just works because __PAGE_OFFSET is defined using _AC itself and thus never needs the cast anyway. Signed-off-by: Christoph Hellwig Reviewed-by: Kees Cook --- arch/x86/include/asm/page_32_types.h | 4 ++-- arch/x86/include/asm/page_64_types.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h index 26236925fb2c36..f462895a33e452 100644 --- a/arch/x86/include/asm/page_32_types.h +++ b/arch/x86/include/asm/page_32_types.h @@ -44,8 +44,8 @@ /* * User space process size: 3GB (default). */ -#define IA32_PAGE_OFFSET PAGE_OFFSET -#define TASK_SIZE PAGE_OFFSET +#define IA32_PAGE_OFFSET __PAGE_OFFSET +#define TASK_SIZE __PAGE_OFFSET #define TASK_SIZE_LOW TASK_SIZE #define TASK_SIZE_MAX TASK_SIZE #define DEFAULT_MAP_WINDOW TASK_SIZE diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 996595c9897e0a..838515daf87b36 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -76,7 +76,7 @@ * * With page table isolation enabled, we map the LDT in ... [stay tuned] */ -#define TASK_SIZE_MAX ((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE) +#define TASK_SIZE_MAX ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE) #define DEFAULT_MAP_WINDOW ((1UL << 47) - PAGE_SIZE) -- 2.28.0
[PATCH 06/10] x86: move PAGE_OFFSET, TASK_SIZE & friends to page_{32, 64}_types.h
At least for 64-bit this moves them closer to some of the defines they are based on, and it prepares for using the TASK_SIZE_MAX definition from assembly. Signed-off-by: Christoph Hellwig Reviewed-by: Kees Cook --- arch/x86/include/asm/page_32_types.h | 11 +++ arch/x86/include/asm/page_64_types.h | 38 + arch/x86/include/asm/processor.h | 49 3 files changed, 49 insertions(+), 49 deletions(-) diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h index 565ad755c785e2..26236925fb2c36 100644 --- a/arch/x86/include/asm/page_32_types.h +++ b/arch/x86/include/asm/page_32_types.h @@ -41,6 +41,17 @@ #define __VIRTUAL_MASK_SHIFT 32 #endif /* CONFIG_X86_PAE */ +/* + * User space process size: 3GB (default). + */ +#define IA32_PAGE_OFFSET PAGE_OFFSET +#define TASK_SIZE PAGE_OFFSET +#define TASK_SIZE_LOW TASK_SIZE +#define TASK_SIZE_MAX TASK_SIZE +#define DEFAULT_MAP_WINDOW TASK_SIZE +#define STACK_TOP TASK_SIZE +#define STACK_TOP_MAX STACK_TOP + /* * Kernel image size is limited to 512 MB (see in arch/x86/kernel/head_32.S) */ diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 288b065955b729..996595c9897e0a 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -58,6 +58,44 @@ #define __VIRTUAL_MASK_SHIFT 47 #endif +/* + * User space process size. This is the first address outside the user range. + * There are a few constraints that determine this: + * + * On Intel CPUs, if a SYSCALL instruction is at the highest canonical + * address, then that syscall will enter the kernel with a + * non-canonical return address, and SYSRET will explode dangerously. + * We avoid this particular problem by preventing anything executable + * from being mapped at the maximum canonical address. + * + * On AMD CPUs in the Ryzen family, there's a nasty bug in which the + * CPUs malfunction if they execute code from the highest canonical page. 
+ * They'll speculate right off the end of the canonical space, and + * bad things happen. This is worked around in the same way as the + * Intel problem. + * + * With page table isolation enabled, we map the LDT in ... [stay tuned] + */ +#define TASK_SIZE_MAX ((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE) + +#define DEFAULT_MAP_WINDOW ((1UL << 47) - PAGE_SIZE) + +/* This decides where the kernel will search for a free chunk of vm + * space during mmap's. + */ +#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \ + 0xc000 : 0xe000) + +#define TASK_SIZE_LOW (test_thread_flag(TIF_ADDR32) ? \ + IA32_PAGE_OFFSET : DEFAULT_MAP_WINDOW) +#define TASK_SIZE (test_thread_flag(TIF_ADDR32) ? \ + IA32_PAGE_OFFSET : TASK_SIZE_MAX) +#define TASK_SIZE_OF(child)((test_tsk_thread_flag(child, TIF_ADDR32)) ? \ + IA32_PAGE_OFFSET : TASK_SIZE_MAX) + +#define STACK_TOP TASK_SIZE_LOW +#define STACK_TOP_MAX TASK_SIZE_MAX + /* * Maximum kernel image size is limited to 1 GiB, due to the fixmap living * in the next 1 GiB (see level2_kernel_pgt in arch/x86/kernel/head_64.S). diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 97143d87994c24..1618eeb08361a9 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -782,17 +782,6 @@ static inline void spin_lock_prefetch(const void *x) }) #ifdef CONFIG_X86_32 -/* - * User space process size: 3GB (default). - */ -#define IA32_PAGE_OFFSET PAGE_OFFSET -#define TASK_SIZE PAGE_OFFSET -#define TASK_SIZE_LOW TASK_SIZE -#define TASK_SIZE_MAX TASK_SIZE -#define DEFAULT_MAP_WINDOW TASK_SIZE -#define STACK_TOP TASK_SIZE -#define STACK_TOP_MAX STACK_TOP - #define INIT_THREAD { \ .sp0= TOP_OF_INIT_STACK, \ .sysenter_cs= __KERNEL_CS,\ @@ -802,44 +791,6 @@ static inline void spin_lock_prefetch(const void *x) #define KSTK_ESP(task) (task_pt_regs(task)->sp) #else -/* - * User space process size. This is the first address outside the user range. 
- * There are a few constraints that determine this: - * - * On Intel CPUs, if a SYSCALL instruction is at the highest canonical - * address, then that syscall will enter the kernel with a - * non-canonical return address, and SYSRET will explode dangerously. - * We avoid this particular problem by preventing anything executable - * from being mapped at the maximum canonical address. - * - * On AMD CPUs in the Ryzen family, there's a nasty bug in which the - * CPUs malfuncti
[PATCH 04/10] test_bitmap: skip user bitmap tests for !CONFIG_SET_FS
We can't run the tests for userspace bitmap parsing if set_fs() doesn't exist. Signed-off-by: Christoph Hellwig Reviewed-by: Kees Cook --- lib/test_bitmap.c | 10 -- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c index df903c53952bb9..49b1d25fbaf546 100644 --- a/lib/test_bitmap.c +++ b/lib/test_bitmap.c @@ -365,6 +365,7 @@ static void __init __test_bitmap_parselist(int is_user) for (i = 0; i < ARRAY_SIZE(parselist_tests); i++) { #define ptest parselist_tests[i] +#ifdef CONFIG_SET_FS if (is_user) { mm_segment_t orig_fs = get_fs(); size_t len = strlen(ptest.in); @@ -375,7 +376,9 @@ static void __init __test_bitmap_parselist(int is_user) bmap, ptest.nbits); time = ktime_get() - time; set_fs(orig_fs); - } else { + } else +#endif /* CONFIG_SET_FS */ + { time = ktime_get(); err = bitmap_parselist(ptest.in, bmap, ptest.nbits); time = ktime_get() - time; @@ -454,6 +457,7 @@ static void __init __test_bitmap_parse(int is_user) for (i = 0; i < ARRAY_SIZE(parse_tests); i++) { struct test_bitmap_parselist test = parse_tests[i]; +#ifdef CONFIG_SET_FS if (is_user) { size_t len = strlen(test.in); mm_segment_t orig_fs = get_fs(); @@ -464,7 +468,9 @@ static void __init __test_bitmap_parse(int is_user) bmap, test.nbits); time = ktime_get() - time; set_fs(orig_fs); - } else { + } else +#endif /* CONFIG_SET_FS */ + { size_t len = test.flags & NO_LEN ? UINT_MAX : strlen(test.in); time = ktime_get(); -- 2.28.0
[PATCH 02/10] fs: don't allow splice read/write without explicit ops
default_file_splice_write is the last piece of generic code that uses set_fs to make the uaccess routines operate on kernel pointers. It implements a "fallback loop" for splicing from files that do not actually provide a proper splice_read method. The usual file systems and other high bandwith instances all provide a ->splice_read, so this just removes support for various device drivers and procfs/debugfs files. If splice support for any of those turns out to be important it can be added back by switching them to the iter ops and using generic_file_splice_read. Signed-off-by: Christoph Hellwig Reviewed-by: Kees Cook --- fs/read_write.c| 2 +- fs/splice.c| 130 + include/linux/fs.h | 2 - 3 files changed, 15 insertions(+), 119 deletions(-) diff --git a/fs/read_write.c b/fs/read_write.c index 702c4301d9eb6b..8c61f67453e3d3 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1077,7 +1077,7 @@ ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos, } EXPORT_SYMBOL(vfs_iter_write); -ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, +static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; diff --git a/fs/splice.c b/fs/splice.c index d7c8a7c4db07ff..412df7b48f9eb7 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -342,89 +342,6 @@ const struct pipe_buf_operations nosteal_pipe_buf_ops = { }; EXPORT_SYMBOL(nosteal_pipe_buf_ops); -static ssize_t kernel_readv(struct file *file, const struct kvec *vec, - unsigned long vlen, loff_t offset) -{ - mm_segment_t old_fs; - loff_t pos = offset; - ssize_t res; - - old_fs = get_fs(); - set_fs(KERNEL_DS); - /* The cast to a user pointer is valid due to the set_fs() */ - res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos, 0); - set_fs(old_fs); - - return res; -} - -static ssize_t default_file_splice_read(struct file *in, loff_t *ppos, -struct pipe_inode_info *pipe, size_t len, 
-unsigned int flags) -{ - struct kvec *vec, __vec[PIPE_DEF_BUFFERS]; - struct iov_iter to; - struct page **pages; - unsigned int nr_pages; - unsigned int mask; - size_t offset, base, copied = 0; - ssize_t res; - int i; - - if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) - return -EAGAIN; - - /* -* Try to keep page boundaries matching to source pagecache ones - -* it probably won't be much help, but... -*/ - offset = *ppos & ~PAGE_MASK; - - iov_iter_pipe(&to, READ, pipe, len + offset); - - res = iov_iter_get_pages_alloc(&to, &pages, len + offset, &base); - if (res <= 0) - return -ENOMEM; - - nr_pages = DIV_ROUND_UP(res + base, PAGE_SIZE); - - vec = __vec; - if (nr_pages > PIPE_DEF_BUFFERS) { - vec = kmalloc_array(nr_pages, sizeof(struct kvec), GFP_KERNEL); - if (unlikely(!vec)) { - res = -ENOMEM; - goto out; - } - } - - mask = pipe->ring_size - 1; - pipe->bufs[to.head & mask].offset = offset; - pipe->bufs[to.head & mask].len -= offset; - - for (i = 0; i < nr_pages; i++) { - size_t this_len = min_t(size_t, len, PAGE_SIZE - offset); - vec[i].iov_base = page_address(pages[i]) + offset; - vec[i].iov_len = this_len; - len -= this_len; - offset = 0; - } - - res = kernel_readv(in, vec, nr_pages, *ppos); - if (res > 0) { - copied = res; - *ppos += res; - } - - if (vec != __vec) - kfree(vec); -out: - for (i = 0; i < nr_pages; i++) - put_page(pages[i]); - kvfree(pages); - iov_iter_advance(&to, copied); /* truncates and discards */ - return res; -} - /* * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' * using sendpage(). Return the number of bytes sent. 
@@ -788,33 +705,6 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, EXPORT_SYMBOL(iter_file_splice_write); -static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf, - struct splice_desc *sd) -{ - int ret; - void *data; - loff_t tmp = sd->pos; - - data = kmap(buf->page); - ret = __kernel_write(sd->u.file, data + buf->offset, sd->len, &tmp); - kunmap(buf->page); - - return ret; -} - -static ssize_t default_file_splice_write(struct pipe_inode_info *pipe, -struct file *out, loff_t *ppos, -size_t len
[PATCH 05/10] lkdtm: disable set_fs-based tests for !CONFIG_SET_FS
Once we can't manipulate the address limit, we also can't test what happens when the manipulation is abused. Signed-off-by: Christoph Hellwig --- drivers/misc/lkdtm/bugs.c | 4 drivers/misc/lkdtm/usercopy.c | 4 2 files changed, 8 insertions(+) diff --git a/drivers/misc/lkdtm/bugs.c b/drivers/misc/lkdtm/bugs.c index 4dfbfd51bdf774..0d5b93694a0183 100644 --- a/drivers/misc/lkdtm/bugs.c +++ b/drivers/misc/lkdtm/bugs.c @@ -315,11 +315,15 @@ void lkdtm_CORRUPT_LIST_DEL(void) /* Test if unbalanced set_fs(KERNEL_DS)/set_fs(USER_DS) check exists. */ void lkdtm_CORRUPT_USER_DS(void) { +#ifdef CONFIG_SET_FS pr_info("setting bad task size limit\n"); set_fs(KERNEL_DS); /* Make sure we do not keep running with a KERNEL_DS! */ force_sig(SIGKILL); +#else + pr_err("XFAIL: this requires set_fs()\n"); +#endif } /* Test that VMAP_STACK is actually allocating with a leading guard page */ diff --git a/drivers/misc/lkdtm/usercopy.c b/drivers/misc/lkdtm/usercopy.c index b833367a45d053..04d10063835241 100644 --- a/drivers/misc/lkdtm/usercopy.c +++ b/drivers/misc/lkdtm/usercopy.c @@ -327,6 +327,7 @@ void lkdtm_USERCOPY_KERNEL(void) void lkdtm_USERCOPY_KERNEL_DS(void) { +#ifdef CONFIG_SET_FS char __user *user_ptr = (char __user *)(0xFUL << (sizeof(unsigned long) * 8 - 4)); mm_segment_t old_fs = get_fs(); @@ -338,6 +339,9 @@ void lkdtm_USERCOPY_KERNEL_DS(void) if (copy_to_user(user_ptr, buf, sizeof(buf)) == 0) pr_err("copy_to_user() to noncanonical address succeeded!?\n"); set_fs(old_fs); +#else + pr_err("XFAIL: this requires set_fs()\n"); +#endif } void __init lkdtm_usercopy_init(void) -- 2.28.0
[PATCH 03/10] uaccess: add infrastructure for kernel builds with set_fs()
Add a CONFIG_SET_FS option that is selected by architecturess that implement set_fs, which is all of them initially. If the option is not set stubs for routines related to overriding the address space are provided so that architectures can start to opt out of providing set_fs. Signed-off-by: Christoph Hellwig Reviewed-by: Kees Cook --- arch/Kconfig| 3 +++ arch/alpha/Kconfig | 1 + arch/arc/Kconfig| 1 + arch/arm/Kconfig| 1 + arch/arm64/Kconfig | 1 + arch/c6x/Kconfig| 1 + arch/csky/Kconfig | 1 + arch/h8300/Kconfig | 1 + arch/hexagon/Kconfig| 1 + arch/ia64/Kconfig | 1 + arch/m68k/Kconfig | 1 + arch/microblaze/Kconfig | 1 + arch/mips/Kconfig | 1 + arch/nds32/Kconfig | 1 + arch/nios2/Kconfig | 1 + arch/openrisc/Kconfig | 1 + arch/parisc/Kconfig | 1 + arch/powerpc/Kconfig| 1 + arch/riscv/Kconfig | 1 + arch/s390/Kconfig | 1 + arch/sh/Kconfig | 1 + arch/sparc/Kconfig | 1 + arch/um/Kconfig | 1 + arch/x86/Kconfig| 1 + arch/xtensa/Kconfig | 1 + include/linux/uaccess.h | 18 ++ 26 files changed, 45 insertions(+) diff --git a/arch/Kconfig b/arch/Kconfig index af14a567b493fc..3fab619a6aa51a 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -24,6 +24,9 @@ config KEXEC_ELF config HAVE_IMA_KEXEC bool +config SET_FS + bool + config HOTPLUG_SMT bool diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index 9c5f06e8eb9bc0..d6e9fc7a7b19e2 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -39,6 +39,7 @@ config ALPHA select OLD_SIGSUSPEND select CPU_NO_EFFICIENT_FFS if !ALPHA_EV67 select MMU_GATHER_NO_RANGE + select SET_FS help The Alpha is a 64-bit general-purpose processor designed and marketed by the Digital Equipment Corporation of blessed memory, diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index ba00c4e1e1c271..c49f5754a11e40 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -48,6 +48,7 @@ config ARC select PCI_SYSCALL if PCI select PERF_USE_VMALLOC if ARC_CACHE_VIPT_ALIASING select HAVE_ARCH_JUMP_LABEL if ISA_ARCV2 && !CPU_ENDIAN_BE32 + select SET_FS config 
ARCH_HAS_CACHE_LINE_SIZE def_bool y diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index e00d94b1665876..87e1478a42dc4f 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -118,6 +118,7 @@ config ARM select PCI_SYSCALL if PCI select PERF_USE_VMALLOC select RTC_LIB + select SET_FS select SYS_SUPPORTS_APM_EMULATION # Above selects are sorted alphabetically; please add new ones # according to that. Thanks. diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 6d232837cbeee8..fbd9e35bef096f 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -192,6 +192,7 @@ config ARM64 select PCI_SYSCALL if PCI select POWER_RESET select POWER_SUPPLY + select SET_FS select SPARSE_IRQ select SWIOTLB select SYSCTL_EXCEPTION_TRACE diff --git a/arch/c6x/Kconfig b/arch/c6x/Kconfig index 6444ebfd06a665..48d66bf0465d68 100644 --- a/arch/c6x/Kconfig +++ b/arch/c6x/Kconfig @@ -22,6 +22,7 @@ config C6X select GENERIC_CLOCKEVENTS select MODULES_USE_ELF_RELA select MMU_GATHER_NO_RANGE if MMU + select SET_FS config MMU def_bool n diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index 3d5afb5f568543..2836f6e76fdb2d 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -78,6 +78,7 @@ config CSKY select PCI_DOMAINS_GENERIC if PCI select PCI_SYSCALL if PCI select PCI_MSI if PCI + select SET_FS config LOCKDEP_SUPPORT def_bool y diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig index d11666d538fea8..7945de067e9fcc 100644 --- a/arch/h8300/Kconfig +++ b/arch/h8300/Kconfig @@ -25,6 +25,7 @@ config H8300 select HAVE_ARCH_KGDB select HAVE_ARCH_HASH select CPU_NO_EFFICIENT_FFS + select SET_FS select UACCESS_MEMCPY config CPU_BIG_ENDIAN diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig index 667cfc511cf999..f2afabbadd430e 100644 --- a/arch/hexagon/Kconfig +++ b/arch/hexagon/Kconfig @@ -31,6 +31,7 @@ config HEXAGON select GENERIC_CLOCKEVENTS_BROADCAST select MODULES_USE_ELF_RELA select GENERIC_CPU_DEVICES + select SET_FS help Qualcomm Hexagon is a processor architecture 
designed for high performance and low power across a wide variety of applications. diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 5b4ec80bf5863a..22a6853840e235 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -56,6 +56,7 @@ config IA64 select NEED_DMA_MAP_STATE select NEED_SG_DMA_LENGTH select NUMA if !FLATMEM + select SET_FS
remove the last set_fs() in common code, and remove it for x86 and powerpc v2
Hi all, this series removes the last set_fs() used to force a kernel address space for the uaccess code in the kernel read/write/splice code, and then stops implementing the address space overrides entirely for x86 and powerpc. The file system part has been posted a few times, and the read/write side has been pretty much unchanged. For splice this series drops the conversion of the seq_file and sysctl code to the iter ops, and thus loses the splice support for them. The reasons for that is that it caused a lot of churn for not much use - splice for these small files really isn't much of a win, even if existing userspace uses it. All callers I found do the proper fallback, but if this turns out to be an issue the conversion can be resurrected. Besides x86 and powerpc I plan to eventually convert all other architectures, although this will be a slow process, starting with the easier ones once the infrastructure is merged. The process to convert architectures is roughly: (1) ensure there is no set_fs(KERNEL_DS) left in arch specific code (2) implement __get_kernel_nofault and __put_kernel_nofault (3) remove the arch specific address limitation functionality Changes since v1: - drop the patch to remove the non-iter ops for /dev/zero and /dev/null as they caused a performance regression - don't enable user access in __get_kernel on powerpc - xfail the set_fs() based lkdtm tests Diffstat:
[PATCH 01/10] fs: don't allow kernel reads and writes without iter ops
Don't allow calling ->read or ->write with set_fs as a preparation for killing off set_fs. All the instances that we use kernel_read/write on are using the iter ops already. If a file has both the regular ->read/->write methods and the iter variants those could have different semantics for messed up enough drivers. Also fails the kernel access to them in that case. Signed-off-by: Christoph Hellwig Reviewed-by: Kees Cook --- fs/read_write.c | 67 +++-- 1 file changed, 42 insertions(+), 25 deletions(-) diff --git a/fs/read_write.c b/fs/read_write.c index 5db58b8c78d0dd..702c4301d9eb6b 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -419,27 +419,41 @@ static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, lo return ret; } +static int warn_unsupported(struct file *file, const char *op) +{ + pr_warn_ratelimited( + "kernel %s not supported for file %pD4 (pid: %d comm: %.20s)\n", + op, file, current->pid, current->comm); + return -EINVAL; +} + ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) { - mm_segment_t old_fs = get_fs(); + struct kvec iov = { + .iov_base = buf, + .iov_len= min_t(size_t, count, MAX_RW_COUNT), + }; + struct kiocb kiocb; + struct iov_iter iter; ssize_t ret; if (WARN_ON_ONCE(!(file->f_mode & FMODE_READ))) return -EINVAL; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; + /* +* Also fail if ->read_iter and ->read are both wired up as that +* implies very convoluted semantics. 
+*/ + if (unlikely(!file->f_op->read_iter || file->f_op->read)) + return warn_unsupported(file, "read"); - if (count > MAX_RW_COUNT) - count = MAX_RW_COUNT; - set_fs(KERNEL_DS); - if (file->f_op->read) - ret = file->f_op->read(file, (void __user *)buf, count, pos); - else if (file->f_op->read_iter) - ret = new_sync_read(file, (void __user *)buf, count, pos); - else - ret = -EINVAL; - set_fs(old_fs); + init_sync_kiocb(&kiocb, file); + kiocb.ki_pos = *pos; + iov_iter_kvec(&iter, READ, &iov, 1, iov.iov_len); + ret = file->f_op->read_iter(&kiocb, &iter); if (ret > 0) { + *pos = kiocb.ki_pos; fsnotify_access(file); add_rchar(current, ret); } @@ -510,28 +524,31 @@ static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t /* caller is responsible for file_start_write/file_end_write */ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) { - mm_segment_t old_fs; - const char __user *p; + struct kvec iov = { + .iov_base = (void *)buf, + .iov_len= min_t(size_t, count, MAX_RW_COUNT), + }; + struct kiocb kiocb; + struct iov_iter iter; ssize_t ret; if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE))) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; + /* +* Also fail if ->write_iter and ->write are both wired up as that +* implies very convoluted semantics. 
+*/ + if (unlikely(!file->f_op->write_iter || file->f_op->write)) + return warn_unsupported(file, "write"); - old_fs = get_fs(); - set_fs(KERNEL_DS); - p = (__force const char __user *)buf; - if (count > MAX_RW_COUNT) - count = MAX_RW_COUNT; - if (file->f_op->write) - ret = file->f_op->write(file, p, count, pos); - else if (file->f_op->write_iter) - ret = new_sync_write(file, p, count, pos); - else - ret = -EINVAL; - set_fs(old_fs); + init_sync_kiocb(&kiocb, file); + kiocb.ki_pos = *pos; + iov_iter_kvec(&iter, WRITE, &iov, 1, iov.iov_len); + ret = file->f_op->write_iter(&kiocb, &iter); if (ret > 0) { + *pos = kiocb.ki_pos; fsnotify_modify(file); add_wchar(current, ret); } -- 2.28.0
Re: kernel since 5.6 do not boot anymore on Apple PowerBook
Il giorno gio, 27/08/2020 alle 12.39 +0200, Christophe Leroy ha scritto: > Hi, > > Le 27/08/2020 à 10:28, Giuseppe Sacco a écrit : [...] > > Sorry, I made a mistake. The real problem is down, on the same > > function, when it calls low_sleep_handler(). This is where the problem > > probably is. > > Great, you spotted the problem. > > I see what it is, it is in low_sleep_handler() in > arch/powerpc/platforms/powermac/sleep.S > > All critical registers are saved on the stack. At restore, they are > restore BEFORE re-enabling MMU (because they are needed for that). But > when we have VMAP_STACK, the stack can hardly be accessed without the > MMU enabled. tophys() doesn't work for virtual stack addresses. > > Therefore, the low_sleep_handler() has to be reworked for using an area > in the linear mem instead of the stack. I am sorry, but I don't know how to fix it. Should I open a bug for tracking this problem? Thank you, Giuseppe
Re: [PATCH v1 4/9] powerpc/vdso: Remove unnecessary ifdefs in vdso_pagelist initialization
Christophe Leroy writes: > On 08/26/2020 02:58 PM, Michael Ellerman wrote: >> Christophe Leroy writes: >>> diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c >>> index daef14a284a3..bbb69832fd46 100644 >>> --- a/arch/powerpc/kernel/vdso.c >>> +++ b/arch/powerpc/kernel/vdso.c >>> @@ -718,16 +710,14 @@ static int __init vdso_init(void) >> ... >>> >>> - >>> -#ifdef CONFIG_VDSO32 >>> vdso32_kbase = &vdso32_start; >>> >>> /* >>> @@ -735,8 +725,6 @@ static int __init vdso_init(void) >>> */ >>> vdso32_pages = (&vdso32_end - &vdso32_start) >> PAGE_SHIFT; >>> DBG("vdso32_kbase: %p, 0x%x pages\n", vdso32_kbase, vdso32_pages); >>> -#endif >> >> This didn't build for ppc64le: >> >> >> /opt/cross/gcc-8.20_binutils-2.32/powerpc64-unknown-linux-gnu/bin/powerpc64-unknown-linux-gnu-ld: >> arch/powerpc/kernel/vdso.o:(.toc+0x0): undefined reference to `vdso32_end' >> >> /opt/cross/gcc-8.20_binutils-2.32/powerpc64-unknown-linux-gnu/bin/powerpc64-unknown-linux-gnu-ld: >> arch/powerpc/kernel/vdso.o:(.toc+0x8): undefined reference to `vdso32_start' >>make[1]: *** [/scratch/michael/build/maint/Makefile:1166: vmlinux] Error 1 >>make: *** [Makefile:185: __sub-make] Error 2 >> >> So I just put that ifdef back. >> > > The problem is because is_32bit() can still return true even when > CONFIG_VDSO32 is not set. Hmm, you're right. My config had CONFIG_COMPAT enabled. But that seems like a bug, if someone enables COMPAT on ppc64le they are almost certainly going to want VDSO32 as well. So I think I'll do a lead up patch as below. cheers diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index d4fd109f177e..cf2da1e401ef 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -501,13 +501,12 @@ endmenu config VDSO32 def_bool y - depends on PPC32 || CPU_BIG_ENDIAN + depends on PPC32 || COMPAT help This symbol controls whether we build the 32-bit VDSO. 
We obviously want to do that if we're building a 32-bit kernel. If we're building - a 64-bit kernel then we only want a 32-bit VDSO if we're building for - big endian. That is because the only little endian configuration we - support is ppc64le which is 64-bit only. + a 64-bit kernel then we only want a 32-bit VDSO if we're also enabling + COMPAT. choice prompt "Endianness selection"
Re: [PATCH v3 13/13] mm/debug_vm_pgtable: populate a pte entry before fetching it
Hi "Aneesh, I love your patch! Perhaps something to improve: [auto build test WARNING on hnaz-linux-mm/master] [also build test WARNING on powerpc/next v5.9-rc2 next-20200827] [cannot apply to mmotm/master] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch] url: https://github.com/0day-ci/linux/commits/Aneesh-Kumar-K-V/mm-debug_vm_pgtable-fixes/20200827-160758 base: https://github.com/hnaz/linux-mm master config: x86_64-randconfig-s022-20200827 (attached as .config) compiler: gcc-9 (Debian 9.3.0-15) 9.3.0 reproduce: # apt-get install sparse # sparse version: v0.6.2-191-g10164920-dirty # save the attached .config to linux build tree make W=1 C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' ARCH=x86_64 If you fix the issue, kindly add following tag as appropriate Reported-by: kernel test robot sparse warnings: (new ones prefixed by >>) mm/debug_vm_pgtable.c:509:9: sparse: sparse: incompatible types in conditional expression (different base types): mm/debug_vm_pgtable.c:509:9: sparse:void mm/debug_vm_pgtable.c:509:9: sparse:int mm/debug_vm_pgtable.c:528:9: sparse: sparse: incompatible types in conditional expression (different base types): mm/debug_vm_pgtable.c:528:9: sparse:void mm/debug_vm_pgtable.c:528:9: sparse:int mm/debug_vm_pgtable.c: note: in included file (through include/linux/pgtable.h, include/linux/mm.h, include/linux/highmem.h): >> arch/x86/include/asm/pgtable.h:587:27: sparse: sparse: context imbalance in >> 'debug_vm_pgtable' - unexpected unlock # https://github.com/0day-ci/linux/commit/9370726f47eaffdf772fdc273d180ec03b245cca git remote add linux-review https://github.com/0day-ci/linux git fetch --no-tags linux-review Aneesh-Kumar-K-V/mm-debug_vm_pgtable-fixes/20200827-160758 git checkout 9370726f47eaffdf772fdc273d180ec03b245cca vim +/debug_vm_pgtable +587 arch/x86/include/asm/pgtable.h b534816b552d35 Jeremy 
Fitzhardinge 2009-02-04 586 fb43d6cb91ef57 Dave Hansen 2018-04-06 @587 static inline pgprotval_t check_pgprot(pgprot_t pgprot) fb43d6cb91ef57 Dave Hansen 2018-04-06 588 { fb43d6cb91ef57 Dave Hansen 2018-04-06 589 pgprotval_t massaged_val = massage_pgprot(pgprot); fb43d6cb91ef57 Dave Hansen 2018-04-06 590 fb43d6cb91ef57 Dave Hansen 2018-04-06 591 /* mmdebug.h can not be included here because of dependencies */ fb43d6cb91ef57 Dave Hansen 2018-04-06 592 #ifdef CONFIG_DEBUG_VM fb43d6cb91ef57 Dave Hansen 2018-04-06 593 WARN_ONCE(pgprot_val(pgprot) != massaged_val, fb43d6cb91ef57 Dave Hansen 2018-04-06 594"attempted to set unsupported pgprot: %016llx " fb43d6cb91ef57 Dave Hansen 2018-04-06 595"bits: %016llx supported: %016llx\n", fb43d6cb91ef57 Dave Hansen 2018-04-06 596 (u64)pgprot_val(pgprot), fb43d6cb91ef57 Dave Hansen 2018-04-06 597 (u64)pgprot_val(pgprot) ^ massaged_val, fb43d6cb91ef57 Dave Hansen 2018-04-06 598 (u64)__supported_pte_mask); fb43d6cb91ef57 Dave Hansen 2018-04-06 599 #endif fb43d6cb91ef57 Dave Hansen 2018-04-06 600 fb43d6cb91ef57 Dave Hansen 2018-04-06 601 return massaged_val; fb43d6cb91ef57 Dave Hansen 2018-04-06 602 } fb43d6cb91ef57 Dave Hansen 2018-04-06 603 --- 0-DAY CI Kernel Test Service, Intel Corporation https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org .config.gz Description: application/gzip
Re: [PATCH 2/2] ASoC: fsl: imx-es8328: add missing put_device() call in imx_es8328_probe()
On 20-08-25 20:05, Yu Kuai wrote: > if of_find_device_by_node() succeed, imx_es8328_probe() doesn't have > a corresponding put_device(). Why do we need the ssi_pdev reference here at all? Regards, Marco
Re: kernel since 5.6 do not boot anymore on Apple PowerBook
Hi, Le 27/08/2020 à 10:28, Giuseppe Sacco a écrit : Il giorno gio, 27/08/2020 alle 09.46 +0200, Giuseppe Sacco ha scritto: Il giorno gio, 27/08/2020 alle 00.28 +0200, Giuseppe Sacco ha scritto: Hello Christophe, Il giorno mer, 26/08/2020 alle 15.53 +0200, Christophe Leroy ha scritto: [...] If there is no warning, then the issue is something else, bad luck. Could you increase the loglevel and try again both with and without VMAP_STACK ? Maybe we'll get more information on where it stops. The problem is related to the CPU frequency changes. This is where the system stop: cpufreq get the CPU frequency limits and then start the default governor (performance) and then calls function cpufreq_gov_performance_limits() that never returns. Rebuilding after enabling pr_debug for cpufreq.c, I've got some more lines of output: cpufreq: setting new policy for CPU 0: 667000 - 867000 kHz cpufreq: new min and max freqs are 667000 - 867000 kHz cpufreq: governor switch cpufreq: cpufreq_init_governor: for CPU 0 cpufreq: cpufreq_start_governor: for CPU 0 cpufreq: target for CPU 0: 867000 kHz, relation 1, requested 867000 kHz cpufreq: __target_index: cpu: 0, oldfreq: 667000, new freq: 867000 cpufreq: notification 0 of frequency transition to 867000 kHz cpufreq: saving 133328 as reference value for loops_per_jiffy; freq is 667000 kHz no more lines are printed. I think this output only refers to the notification sent prior to the frequency change. I was thinking that selecting a governor that would run at 667mHz would probably skip the problem. I added cpufreq.default_governor=powersave to the command line parameters but it did not work: the selected governor was still performance and the system crashed. I kept following the thread and found that CPU frequency is changed in function pmu_set_cpu_speed() in file drivers/cpufreq/pmac32-cpufreq.c. As first thing, the function calls the macro preempt_disable() and this is where it stops. Sorry, I made a mistake. 
The real problem is down, on the same function, when it calls low_sleep_handler(). This is where the problem probably is. Great, you spotted the problem. I see what it is, it is in low_sleep_handler() in arch/powerpc/platforms/powermac/sleep.S All critical registers are saved on the stack. At restore, they are restore BEFORE re-enabling MMU (because they are needed for that). But when we have VMAP_STACK, the stack can hardly be accessed without the MMU enabled. tophys() doesn't work for virtual stack addresses. Therefore, the low_sleep_handler() has to be reworked for using an area in the linear mem instead of the stack. Christophe
Re: [PATCH v3 3/6] Add LKDTM test to hijack a patch mapping (powerpc, x86_64)
Hi "Christopher, Thank you for the patch! Yet something to improve: [auto build test ERROR on powerpc/next] [also build test ERROR on char-misc/char-misc-testing tip/x86/core v5.9-rc2 next-20200827] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch] url: https://github.com/0day-ci/linux/commits/Christopher-M-Riedl/Use-per-CPU-temporary-mappings-for-patching/20200827-161532 base: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next config: x86_64-allmodconfig (attached as .config) compiler: gcc-9 (Debian 9.3.0-15) 9.3.0 reproduce (this is a W=1 build): # save the attached .config to linux build tree make W=1 ARCH=x86_64 If you fix the issue, kindly add following tag as appropriate Reported-by: kernel test robot All errors (new ones prefixed by >>): drivers/misc/lkdtm/perms.c: In function 'lkdtm_HIJACK_PATCH': >> drivers/misc/lkdtm/perms.c:318:38: error: implicit declaration of function >> 'read_cpu_patching_addr' [-Werror=implicit-function-declaration] 318 | addr = offset_in_page(patch_site) | read_cpu_patching_addr(patching_cpu); | ^~ cc1: some warnings being treated as errors # https://github.com/0day-ci/linux/commit/36a98d779ee4620e6e091cbe3b438b52faa108ad git remote add linux-review https://github.com/0day-ci/linux git fetch --no-tags linux-review Christopher-M-Riedl/Use-per-CPU-temporary-mappings-for-patching/20200827-161532 git checkout 36a98d779ee4620e6e091cbe3b438b52faa108ad vim +/read_cpu_patching_addr +318 drivers/misc/lkdtm/perms.c 289 290 void lkdtm_HIJACK_PATCH(void) 291 { 292 #ifdef CONFIG_PPC 293 struct ppc_inst original_insn = ppc_inst_read(READ_ONCE(patch_site)); 294 #endif 295 #ifdef CONFIG_X86_64 296 int original_insn = READ_ONCE(*patch_site); 297 #endif 298 struct task_struct *patching_kthrd; 299 int patching_cpu, hijacker_cpu, attempts; 300 unsigned long addr; 301 bool hijacked; 302 const int 
bad_data = 0xbad00bad; 303 304 if (num_online_cpus() < 2) { 305 pr_warn("need at least two cpus\n"); 306 return; 307 } 308 309 hijacker_cpu = smp_processor_id(); 310 patching_cpu = cpumask_any_but(cpu_online_mask, hijacker_cpu); 311 312 patching_kthrd = kthread_create_on_node(&lkdtm_patching_cpu, NULL, 313 cpu_to_node(patching_cpu), 314 "lkdtm_patching_cpu"); 315 kthread_bind(patching_kthrd, patching_cpu); 316 wake_up_process(patching_kthrd); 317 > 318 addr = offset_in_page(patch_site) | > read_cpu_patching_addr(patching_cpu); 319 320 pr_info("starting hijacker_cpu=%d\n", hijacker_cpu); 321 for (attempts = 0; attempts < 10; ++attempts) { 322 /* Use __put_user to catch faults without an Oops */ 323 hijacked = !__put_user(bad_data, (int *)addr); 324 325 if (hijacked) { 326 if (kthread_stop(patching_kthrd)) 327 pr_err("error trying to stop patching thread\n"); 328 break; 329 } 330 } 331 pr_info("hijack attempts: %d\n", attempts); 332 333 if (hijacked) { 334 if (lkdtm_verify_patch(bad_data)) 335 pr_err("overwrote kernel text\n"); 336 /* 337 * There are window conditions where the hijacker cpu manages to 338 * write to the patch site but the site gets overwritten again by 339 * the patching cpu. We still consider that a "successful" hijack 340 * since the hijacker cpu did not fault on the write. 341 */ 342 pr_err("FAIL: wrote to another cpu's patching area\n"); 343 } else { 344 kthread_stop(patching_kthrd); 345 } 346 347 /* Restore the original insn for any future lkdtm tests */ 348 #ifdef CONFIG_PPC 349 patch_instruction(patch_site, original_insn); 350 #endif 351 #ifdef CONFIG_X86_64 352 lkdtm_do_patch(original_insn); 353 #endif 354 } 355 --- 0-DAY CI Kernel Test Service, Intel Corporation https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org .config.gz Description: application/gzip
Re: [PATCH 09/11] x86: remove address space overrides using set_fs()
On Mon, Aug 17, 2020 at 08:23:11AM +, David Laight wrote: > From: Christoph Hellwig > > Sent: 17 August 2020 08:32 > > > > Stop providing the possibility to override the address space using > > set_fs() now that there is no need for that any more. To properly > > handle the TASK_SIZE_MAX checking for 4 vs 5-level page tables on > > x86 a new alternative is introduced, which just like the one in > > entry_64.S has to use the hardcoded virtual address bits to escape > > the fact that TASK_SIZE_MAX isn't actually a constant when 5-level > > page tables are enabled. > > > @@ -93,7 +69,7 @@ static inline bool pagefault_disabled(void); > > #define access_ok(addr, size) \ > > ({ \ > > WARN_ON_IN_IRQ(); \ > > - likely(!__range_not_ok(addr, size, user_addr_max())); \ > > + likely(!__range_not_ok(addr, size, TASK_SIZE_MAX)); \ > > }) > > Can't that always compare against a constant even when 5-levl > page tables are enabled on x86-64? > > On x86-64 it can (probably) reduce to (addr | (addr + size)) < 0. I'll leave that to the x86 maintainers as a future cleanup if wanted.
Re: [PATCHv5 1/2] powerpc/pseries: group lmb operation and memblock's
Le 10/08/2020 à 10:52, Pingfan Liu a écrit : This patch prepares for the incoming patch which swaps the order of KOBJ_ADD/REMOVE uevent and dt's updating. The dt updating should come after lmb operations, and before __remove_memory()/__add_memory(). Accordingly, grouping all lmb operations before the memblock's. I can't find the link between this commit description and the code's changes below. Signed-off-by: Pingfan Liu Cc: Michael Ellerman Cc: Hari Bathini Cc: Nathan Lynch Cc: Nathan Fontenot Cc: Laurent Dufour To: linuxppc-dev@lists.ozlabs.org Cc: ke...@lists.infradead.org --- v4 -> v5: fix the miss of clearing DRCONF_MEM_ASSIGNED in a failure path arch/powerpc/platforms/pseries/hotplug-memory.c | 28 + 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index 5d545b7..46cbcd1 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -355,7 +355,8 @@ static int dlpar_add_lmb(struct drmem_lmb *); static int dlpar_remove_lmb(struct drmem_lmb *lmb) { unsigned long block_sz; - int rc; + phys_addr_t base_addr; + int rc, nid; if (!lmb_is_removable(lmb)) return -EINVAL; @@ -364,17 +365,19 @@ static int dlpar_remove_lmb(struct drmem_lmb *lmb) if (rc) return rc; + base_addr = lmb->base_addr; + nid = lmb->nid; block_sz = pseries_memory_block_size(); - __remove_memory(lmb->nid, lmb->base_addr, block_sz); - - /* Update memory regions for memory remove */ - memblock_remove(lmb->base_addr, block_sz); - invalidate_lmb_associativity_index(lmb); lmb_clear_nid(lmb); lmb->flags &= ~DRCONF_MEM_ASSIGNED; + __remove_memory(nid, base_addr, block_sz); + + /* Update memory regions for memory remove */ + memblock_remove(base_addr, block_sz); + return 0; } @@ -603,22 +606,29 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb) } lmb_set_nid(lmb); + lmb->flags |= DRCONF_MEM_ASSIGNED; + block_sz = memory_block_size_bytes(); /* 
Add the memory */ rc = __add_memory(lmb->nid, lmb->base_addr, block_sz); if (rc) { invalidate_lmb_associativity_index(lmb); + lmb_clear_nid(lmb); + lmb->flags &= ~DRCONF_MEM_ASSIGNED; return rc; } rc = dlpar_online_lmb(lmb); if (rc) { - __remove_memory(lmb->nid, lmb->base_addr, block_sz); + int nid = lmb->nid; + phys_addr_t base_addr = lmb->base_addr; + invalidate_lmb_associativity_index(lmb); lmb_clear_nid(lmb); - } else { - lmb->flags |= DRCONF_MEM_ASSIGNED; + lmb->flags &= ~DRCONF_MEM_ASSIGNED; + + __remove_memory(nid, base_addr, block_sz); } return rc;
Re: kernel since 5.6 do not boot anymore on Apple PowerBook
Il giorno gio, 27/08/2020 alle 09.46 +0200, Giuseppe Sacco ha scritto: > Il giorno gio, 27/08/2020 alle 00.28 +0200, Giuseppe Sacco ha scritto: > > Hello Christophe, > > > > Il giorno mer, 26/08/2020 alle 15.53 +0200, Christophe Leroy ha > > scritto: > > [...] > > > If there is no warning, then the issue is something else, bad luck. > > > > > > Could you increase the loglevel and try again both with and without > > > VMAP_STACK ? Maybe we'll get more information on where it stops. > > > > The problem is related to the CPU frequency changes. This is where the > > system stop: cpufreq get the CPU frequency limits and then start the > > default governor (performance) and then calls function > > cpufreq_gov_performance_limits() that never returns. > > > > Rebuilding after enabling pr_debug for cpufreq.c, I've got some more > > lines of output: > > > > cpufreq: setting new policy for CPU 0: 667000 - 867000 kHz > > cpufreq: new min and max freqs are 667000 - 867000 kHz > > cpufreq: governor switch > > cpufreq: cpufreq_init_governor: for CPU 0 > > cpufreq: cpufreq_start_governor: for CPU 0 > > cpufreq: target for CPU 0: 867000 kHz, relation 1, requested 867000 kHz > > cpufreq: __target_index: cpu: 0, oldfreq: 667000, new freq: 867000 > > cpufreq: notification 0 of frequency transition to 867000 kHz > > cpufreq: saving 133328 as reference value for loops_per_jiffy; freq is > > 667000 kHz > > > > no more lines are printed. I think this output only refers to the > > notification sent prior to the frequency change. > > > > I was thinking that selecting a governor that would run at 667mHz would > > probably skip the problem. I added cpufreq.default_governor=powersave > > to the command line parameters but it did not work: the selected > > governor was still performance and the system crashed. > > I kept following the thread and found that CPU frequency is changed in > function pmu_set_cpu_speed() in file drivers/cpufreq/pmac32-cpufreq.c. 
> As first thing, the function calls the macro preempt_disable() and this > is where it stops. Sorry, I made a mistake. The real problem is down, on the same function, when it calls low_sleep_handler(). This is where the problem probably is. Bye, Giuseppe
Re: kernel since 5.6 do not boot anymore on Apple PowerBook
Il giorno gio, 27/08/2020 alle 00.28 +0200, Giuseppe Sacco ha scritto: > Hello Christophe, > > Il giorno mer, 26/08/2020 alle 15.53 +0200, Christophe Leroy ha > scritto: > [...] > > If there is no warning, then the issue is something else, bad luck. > > > > Could you increase the loglevel and try again both with and without > > VMAP_STACK ? Maybe we'll get more information on where it stops. > > The problem is related to the CPU frequency changes. This is where the > system stop: cpufreq get the CPU frequency limits and then start the > default governor (performance) and then calls function > cpufreq_gov_performance_limits() that never returns. > > Rebuilding after enabling pr_debug for cpufreq.c, I've got some more > lines of output: > > cpufreq: setting new policy for CPU 0: 667000 - 867000 kHz > cpufreq: new min and max freqs are 667000 - 867000 kHz > cpufreq: governor switch > cpufreq: cpufreq_init_governor: for CPU 0 > cpufreq: cpufreq_start_governor: for CPU 0 > cpufreq: target for CPU 0: 867000 kHz, relation 1, requested 867000 kHz > cpufreq: __target_index: cpu: 0, oldfreq: 667000, new freq: 867000 > cpufreq: notification 0 of frequency transition to 867000 kHz > cpufreq: saving 133328 as reference value for loops_per_jiffy; freq is 667000 > kHz > > no more lines are printed. I think this output only refers to the > notification sent prior to the frequency change. > > I was thinking that selecting a governor that would run at 667mHz would > probably skip the problem. I added cpufreq.default_governor=powersave > to the command line parameters but it did not work: the selected > governor was still performance and the system crashed. I kept following the thread and found that CPU frequency is changed in function pmu_set_cpu_speed() in file drivers/cpufreq/pmac32-cpufreq.c. As first thing, the function calls the macro preempt_disable() and this is where it stops. Bye, Giuseppe
[PATCH v3 13/13] mm/debug_vm_pgtable: populate a pte entry before fetching it
pte_clear_tests operate on an existing pte entry. Make sure that is not a none pte entry. Signed-off-by: Aneesh Kumar K.V --- mm/debug_vm_pgtable.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 21329c7d672f..8527ebb75f2c 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -546,7 +546,7 @@ static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp, static void __init pte_clear_tests(struct mm_struct *mm, pte_t *ptep, unsigned long vaddr) { - pte_t pte = ptep_get(ptep); + pte_t pte = ptep_get_and_clear(mm, vaddr, ptep); pr_debug("Validating PTE clear\n"); pte = __pte(pte_val(pte) | RANDOM_ORVALUE); @@ -944,7 +944,7 @@ static int __init debug_vm_pgtable(void) p4d_t *p4dp, *saved_p4dp; pud_t *pudp, *saved_pudp; pmd_t *pmdp, *saved_pmdp, pmd; - pte_t *ptep; + pte_t *ptep, pte; pgtable_t saved_ptep; pgprot_t prot, protnone; phys_addr_t paddr; @@ -1049,6 +1049,8 @@ static int __init debug_vm_pgtable(void) */ ptep = pte_alloc_map_lock(mm, pmdp, vaddr, &ptl); + pte = pfn_pte(pte_aligned, prot); + set_pte_at(mm, vaddr, ptep, pte); pte_clear_tests(mm, ptep, vaddr); pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot); pte_unmap_unlock(ptep, ptl); -- 2.26.2
[PATCH v3 12/13] mm/debug_vm_pgtable/hugetlb: Disable hugetlb test on ppc64
The test seems to be missing quite a lot of details w.r.t allocating the correct pgtable_t page (huge_pte_alloc()), holding the right lock (huge_pte_lock()) etc. The vma used is also not a hugetlb VMA. ppc64 do have runtime checks within CONFIG_DEBUG_VM for most of these. Hence disable the test on ppc64. Signed-off-by: Aneesh Kumar K.V --- mm/debug_vm_pgtable.c | 4 1 file changed, 4 insertions(+) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index a188b6e4e37e..21329c7d672f 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -813,6 +813,7 @@ static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot) #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ } +#ifndef CONFIG_PPC_BOOK3S_64 static void __init hugetlb_advanced_tests(struct mm_struct *mm, struct vm_area_struct *vma, pte_t *ptep, unsigned long pfn, @@ -855,6 +856,7 @@ static void __init hugetlb_advanced_tests(struct mm_struct *mm, pte = huge_ptep_get(ptep); WARN_ON(!(huge_pte_write(pte) && huge_pte_dirty(pte))); } +#endif #else /* !CONFIG_HUGETLB_PAGE */ static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot) { } static void __init hugetlb_advanced_tests(struct mm_struct *mm, @@ -1065,7 +1067,9 @@ static int __init debug_vm_pgtable(void) pud_populate_tests(mm, pudp, saved_pmdp); spin_unlock(ptl); +#ifndef CONFIG_PPC_BOOK3S_64 hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot); +#endif spin_lock(&mm->page_table_lock); p4d_clear_tests(mm, p4dp); -- 2.26.2
[PATCH v3 11/13] mm/debug_vm_pgtable/pmd_clear: Don't use pmd/pud_clear on pte entries
pmd_clear() should not be used to clear pmd level pte entries. Signed-off-by: Aneesh Kumar K.V --- mm/debug_vm_pgtable.c | 7 --- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 0a6e771ebd13..a188b6e4e37e 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -196,6 +196,8 @@ static void __init pmd_advanced_tests(struct mm_struct *mm, pmd = READ_ONCE(*pmdp); WARN_ON(pmd_young(pmd)); + /* Clear the pte entries */ + pmdp_huge_get_and_clear(mm, vaddr, pmdp); pgtable = pgtable_trans_huge_withdraw(mm, pmdp); } @@ -321,6 +323,8 @@ static void __init pud_advanced_tests(struct mm_struct *mm, pudp_test_and_clear_young(vma, vaddr, pudp); pud = READ_ONCE(*pudp); WARN_ON(pud_young(pud)); + + pudp_huge_get_and_clear(mm, vaddr, pudp); } static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) @@ -444,8 +448,6 @@ static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp, * This entry points to next level page table page. * Hence this must not qualify as pud_bad(). */ - pmd_clear(pmdp); - pud_clear(pudp); pud_populate(mm, pudp, pmdp); pud = READ_ONCE(*pudp); WARN_ON(pud_bad(pud)); @@ -577,7 +579,6 @@ static void __init pmd_populate_tests(struct mm_struct *mm, pmd_t *pmdp, * This entry points to next level page table page. * Hence this must not qualify as pmd_bad(). */ - pmd_clear(pmdp); pmd_populate(mm, pmdp, pgtable); pmd = READ_ONCE(*pmdp); WARN_ON(pmd_bad(pmd)); -- 2.26.2
[PATCH v3 09/13] mm/debug_vm_pgtable/locks: Move non page table modifying test together
This will help in adding proper locks in a later patch Signed-off-by: Aneesh Kumar K.V --- mm/debug_vm_pgtable.c | 52 --- 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 0ce5c6a24c5b..78c8af3445ac 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -992,7 +992,7 @@ static int __init debug_vm_pgtable(void) p4dp = p4d_alloc(mm, pgdp, vaddr); pudp = pud_alloc(mm, p4dp, vaddr); pmdp = pmd_alloc(mm, pudp, vaddr); - ptep = pte_alloc_map_lock(mm, pmdp, vaddr, &ptl); + ptep = pte_alloc_map(mm, pmdp, vaddr); /* * Save all the page table page addresses as the page table @@ -1012,33 +1012,12 @@ static int __init debug_vm_pgtable(void) p4d_basic_tests(p4d_aligned, prot); pgd_basic_tests(pgd_aligned, prot); - pte_clear_tests(mm, ptep, vaddr); - pmd_clear_tests(mm, pmdp); - pud_clear_tests(mm, pudp); - p4d_clear_tests(mm, p4dp); - pgd_clear_tests(mm, pgdp); - - pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot); - pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot, saved_ptep); - pud_advanced_tests(mm, vma, pudp, pud_aligned, vaddr, prot); - hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot); - pmd_leaf_tests(pmd_aligned, prot); pud_leaf_tests(pud_aligned, prot); - pmd_huge_tests(pmdp, pmd_aligned, prot); - pud_huge_tests(pudp, pud_aligned, prot); - pte_savedwrite_tests(pte_aligned, protnone); pmd_savedwrite_tests(pmd_aligned, protnone); - pte_unmap_unlock(ptep, ptl); - - pmd_populate_tests(mm, pmdp, saved_ptep); - pud_populate_tests(mm, pudp, saved_pmdp); - p4d_populate_tests(mm, p4dp, saved_pudp); - pgd_populate_tests(mm, pgdp, saved_p4dp); - pte_special_tests(pte_aligned, prot); pte_protnone_tests(pte_aligned, protnone); pmd_protnone_tests(pmd_aligned, protnone); @@ -1056,11 +1035,38 @@ static int __init debug_vm_pgtable(void) pmd_swap_tests(pmd_aligned, prot); swap_migration_tests(); - hugetlb_basic_tests(pte_aligned, prot); pmd_thp_tests(pmd_aligned, prot); 
pud_thp_tests(pud_aligned, prot); + /* +* Page table modifying tests +*/ + pte_clear_tests(mm, ptep, vaddr); + pmd_clear_tests(mm, pmdp); + pud_clear_tests(mm, pudp); + p4d_clear_tests(mm, p4dp); + pgd_clear_tests(mm, pgdp); + + ptep = pte_alloc_map_lock(mm, pmdp, vaddr, &ptl); + pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot); + pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot, saved_ptep); + pud_advanced_tests(mm, vma, pudp, pud_aligned, vaddr, prot); + hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot); + + + pmd_huge_tests(pmdp, pmd_aligned, prot); + pud_huge_tests(pudp, pud_aligned, prot); + + pte_unmap_unlock(ptep, ptl); + + pmd_populate_tests(mm, pmdp, saved_ptep); + pud_populate_tests(mm, pudp, saved_pmdp); + p4d_populate_tests(mm, p4dp, saved_pudp); + pgd_populate_tests(mm, pgdp, saved_p4dp); + + hugetlb_basic_tests(pte_aligned, prot); + p4d_free(mm, saved_p4dp); pud_free(mm, saved_pudp); pmd_free(mm, saved_pmdp); -- 2.26.2
[PATCH v3 10/13] mm/debug_vm_pgtable/locks: Take correct page table lock
Make sure we call pte accessors with correct lock held. Signed-off-by: Aneesh Kumar K.V --- mm/debug_vm_pgtable.c | 34 -- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 78c8af3445ac..0a6e771ebd13 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -1039,33 +1039,39 @@ static int __init debug_vm_pgtable(void) pmd_thp_tests(pmd_aligned, prot); pud_thp_tests(pud_aligned, prot); + hugetlb_basic_tests(pte_aligned, prot); + /* * Page table modifying tests */ - pte_clear_tests(mm, ptep, vaddr); - pmd_clear_tests(mm, pmdp); - pud_clear_tests(mm, pudp); - p4d_clear_tests(mm, p4dp); - pgd_clear_tests(mm, pgdp); ptep = pte_alloc_map_lock(mm, pmdp, vaddr, &ptl); + pte_clear_tests(mm, ptep, vaddr); pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot); - pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot, saved_ptep); - pud_advanced_tests(mm, vma, pudp, pud_aligned, vaddr, prot); - hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot); - + pte_unmap_unlock(ptep, ptl); + ptl = pmd_lock(mm, pmdp); + pmd_clear_tests(mm, pmdp); + pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot, saved_ptep); pmd_huge_tests(pmdp, pmd_aligned, prot); + pmd_populate_tests(mm, pmdp, saved_ptep); + spin_unlock(ptl); + + ptl = pud_lock(mm, pudp); + pud_clear_tests(mm, pudp); + pud_advanced_tests(mm, vma, pudp, pud_aligned, vaddr, prot); pud_huge_tests(pudp, pud_aligned, prot); + pud_populate_tests(mm, pudp, saved_pmdp); + spin_unlock(ptl); - pte_unmap_unlock(ptep, ptl); + hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot); - pmd_populate_tests(mm, pmdp, saved_ptep); - pud_populate_tests(mm, pudp, saved_pmdp); + spin_lock(&mm->page_table_lock); + p4d_clear_tests(mm, p4dp); + pgd_clear_tests(mm, pgdp); p4d_populate_tests(mm, p4dp, saved_pudp); pgd_populate_tests(mm, pgdp, saved_p4dp); - - hugetlb_basic_tests(pte_aligned, prot); + spin_unlock(&mm->page_table_lock); p4d_free(mm, 
saved_p4dp); pud_free(mm, saved_pudp); -- 2.26.2
[PATCH v3 08/13] mm/debug_vm_pgtable/thp: Use page table deposit/withdraw with THP
Architectures like ppc64 use deposited page table while updating the huge pte entries. Signed-off-by: Aneesh Kumar K.V --- mm/debug_vm_pgtable.c | 10 +++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index f9f6358899a8..0ce5c6a24c5b 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -154,7 +154,7 @@ static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot) static void __init pmd_advanced_tests(struct mm_struct *mm, struct vm_area_struct *vma, pmd_t *pmdp, unsigned long pfn, unsigned long vaddr, - pgprot_t prot) + pgprot_t prot, pgtable_t pgtable) { pmd_t pmd; @@ -165,6 +165,8 @@ static void __init pmd_advanced_tests(struct mm_struct *mm, /* Align the address wrt HPAGE_PMD_SIZE */ vaddr = (vaddr & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE; + pgtable_trans_huge_deposit(mm, pmdp, pgtable); + pmd = pmd_mkhuge(pfn_pmd(pfn, prot)); set_pmd_at(mm, vaddr, pmdp, pmd); pmdp_set_wrprotect(mm, vaddr, pmdp); @@ -193,6 +195,8 @@ static void __init pmd_advanced_tests(struct mm_struct *mm, pmdp_test_and_clear_young(vma, vaddr, pmdp); pmd = READ_ONCE(*pmdp); WARN_ON(pmd_young(pmd)); + + pgtable = pgtable_trans_huge_withdraw(mm, pmdp); } static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot) @@ -373,7 +377,7 @@ static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { } static void __init pmd_advanced_tests(struct mm_struct *mm, struct vm_area_struct *vma, pmd_t *pmdp, unsigned long pfn, unsigned long vaddr, - pgprot_t prot) + pgprot_t prot, pgtable_t pgtable) { } static void __init pud_advanced_tests(struct mm_struct *mm, @@ -1015,7 +1019,7 @@ static int __init debug_vm_pgtable(void) pgd_clear_tests(mm, pgdp); pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot); - pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot); + pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot, saved_ptep); pud_advanced_tests(mm, vma, pudp, pud_aligned, vaddr, prot); 
hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot); -- 2.26.2
[PATCH v3 07/13] mm/debug_vm_pgtable/set_pte/pmd/pud: Don't use set_*_at to update an existing pte entry
set_pte_at() should not be used to set a pte entry at locations that already holds a valid pte entry. Architectures like ppc64 don't do TLB invalidate in set_pte_at() and hence expect it to be used to set locations that are not a valid PTE. Signed-off-by: Aneesh Kumar K.V --- mm/debug_vm_pgtable.c | 35 +++ 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index de83a20c1b30..f9f6358899a8 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -79,15 +79,18 @@ static void __init pte_advanced_tests(struct mm_struct *mm, { pte_t pte = pfn_pte(pfn, prot); + /* +* Architectures optimize set_pte_at by avoiding TLB flush. +* This requires set_pte_at to be not used to update an +* existing pte entry. Clear pte before we do set_pte_at +*/ + pr_debug("Validating PTE advanced\n"); pte = pfn_pte(pfn, prot); set_pte_at(mm, vaddr, ptep, pte); ptep_set_wrprotect(mm, vaddr, ptep); pte = ptep_get(ptep); WARN_ON(pte_write(pte)); - - pte = pfn_pte(pfn, prot); - set_pte_at(mm, vaddr, ptep, pte); ptep_get_and_clear(mm, vaddr, ptep); pte = ptep_get(ptep); WARN_ON(!pte_none(pte)); @@ -101,13 +104,11 @@ static void __init pte_advanced_tests(struct mm_struct *mm, ptep_set_access_flags(vma, vaddr, ptep, pte, 1); pte = ptep_get(ptep); WARN_ON(!(pte_write(pte) && pte_dirty(pte))); - - pte = pfn_pte(pfn, prot); - set_pte_at(mm, vaddr, ptep, pte); ptep_get_and_clear_full(mm, vaddr, ptep, 1); pte = ptep_get(ptep); WARN_ON(!pte_none(pte)); + pte = pfn_pte(pfn, prot); pte = pte_mkyoung(pte); set_pte_at(mm, vaddr, ptep, pte); ptep_test_and_clear_young(vma, vaddr, ptep); @@ -169,9 +170,6 @@ static void __init pmd_advanced_tests(struct mm_struct *mm, pmdp_set_wrprotect(mm, vaddr, pmdp); pmd = READ_ONCE(*pmdp); WARN_ON(pmd_write(pmd)); - - pmd = pmd_mkhuge(pfn_pmd(pfn, prot)); - set_pmd_at(mm, vaddr, pmdp, pmd); pmdp_huge_get_and_clear(mm, vaddr, pmdp); pmd = READ_ONCE(*pmdp); WARN_ON(!pmd_none(pmd)); @@ -185,13 +183,11 @@ static 
void __init pmd_advanced_tests(struct mm_struct *mm, pmdp_set_access_flags(vma, vaddr, pmdp, pmd, 1); pmd = READ_ONCE(*pmdp); WARN_ON(!(pmd_write(pmd) && pmd_dirty(pmd))); - - pmd = pmd_mkhuge(pfn_pmd(pfn, prot)); - set_pmd_at(mm, vaddr, pmdp, pmd); pmdp_huge_get_and_clear_full(vma, vaddr, pmdp, 1); pmd = READ_ONCE(*pmdp); WARN_ON(!pmd_none(pmd)); + pmd = pmd_mkhuge(pfn_pmd(pfn, prot)); pmd = pmd_mkyoung(pmd); set_pmd_at(mm, vaddr, pmdp, pmd); pmdp_test_and_clear_young(vma, vaddr, pmdp); @@ -293,18 +289,10 @@ static void __init pud_advanced_tests(struct mm_struct *mm, WARN_ON(pud_write(pud)); #ifndef __PAGETABLE_PMD_FOLDED - - pud = pud_mkhuge(pfn_pud(pfn, prot)); - set_pud_at(mm, vaddr, pudp, pud); pudp_huge_get_and_clear(mm, vaddr, pudp); pud = READ_ONCE(*pudp); WARN_ON(!pud_none(pud)); - pud = pud_mkhuge(pfn_pud(pfn, prot)); - set_pud_at(mm, vaddr, pudp, pud); - pudp_huge_get_and_clear_full(mm, vaddr, pudp, 1); - pud = READ_ONCE(*pudp); - WARN_ON(!pud_none(pud)); #endif /* __PAGETABLE_PMD_FOLDED */ pud = pud_mkhuge(pfn_pud(pfn, prot)); @@ -317,6 +305,13 @@ static void __init pud_advanced_tests(struct mm_struct *mm, pud = READ_ONCE(*pudp); WARN_ON(!(pud_write(pud) && pud_dirty(pud))); +#ifndef __PAGETABLE_PMD_FOLDED + pudp_huge_get_and_clear_full(mm, vaddr, pudp, 1); + pud = READ_ONCE(*pudp); + WARN_ON(!pud_none(pud)); +#endif /* __PAGETABLE_PMD_FOLDED */ + + pud = pud_mkhuge(pfn_pud(pfn, prot)); pud = pud_mkyoung(pud); set_pud_at(mm, vaddr, pudp, pud); pudp_test_and_clear_young(vma, vaddr, pudp); -- 2.26.2
[PATCH v3 06/13] mm/debug_vm_pgtable/THP: Mark the pte entry huge before using set_pmd/pud_at
kernel expects entries to be marked huge before we use set_pmd_at()/set_pud_at(). Signed-off-by: Aneesh Kumar K.V --- mm/debug_vm_pgtable.c | 21 - 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 5c0680836fe9..de83a20c1b30 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -155,7 +155,7 @@ static void __init pmd_advanced_tests(struct mm_struct *mm, unsigned long pfn, unsigned long vaddr, pgprot_t prot) { - pmd_t pmd = pfn_pmd(pfn, prot); + pmd_t pmd; if (!has_transparent_hugepage()) return; @@ -164,19 +164,19 @@ static void __init pmd_advanced_tests(struct mm_struct *mm, /* Align the address wrt HPAGE_PMD_SIZE */ vaddr = (vaddr & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE; - pmd = pfn_pmd(pfn, prot); + pmd = pmd_mkhuge(pfn_pmd(pfn, prot)); set_pmd_at(mm, vaddr, pmdp, pmd); pmdp_set_wrprotect(mm, vaddr, pmdp); pmd = READ_ONCE(*pmdp); WARN_ON(pmd_write(pmd)); - pmd = pfn_pmd(pfn, prot); + pmd = pmd_mkhuge(pfn_pmd(pfn, prot)); set_pmd_at(mm, vaddr, pmdp, pmd); pmdp_huge_get_and_clear(mm, vaddr, pmdp); pmd = READ_ONCE(*pmdp); WARN_ON(!pmd_none(pmd)); - pmd = pfn_pmd(pfn, prot); + pmd = pmd_mkhuge(pfn_pmd(pfn, prot)); pmd = pmd_wrprotect(pmd); pmd = pmd_mkclean(pmd); set_pmd_at(mm, vaddr, pmdp, pmd); @@ -237,7 +237,7 @@ static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot) static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot) { - pmd_t pmd = pfn_pmd(pfn, prot); + pmd_t pmd = pmd_mkhuge(pfn_pmd(pfn, prot)); if (!IS_ENABLED(CONFIG_NUMA_BALANCING)) return; @@ -277,7 +277,7 @@ static void __init pud_advanced_tests(struct mm_struct *mm, unsigned long pfn, unsigned long vaddr, pgprot_t prot) { - pud_t pud = pfn_pud(pfn, prot); + pud_t pud; if (!has_transparent_hugepage()) return; @@ -286,25 +286,28 @@ static void __init pud_advanced_tests(struct mm_struct *mm, /* Align the address wrt HPAGE_PUD_SIZE */ vaddr = (vaddr & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE; + pud = 
pud_mkhuge(pfn_pud(pfn, prot)); set_pud_at(mm, vaddr, pudp, pud); pudp_set_wrprotect(mm, vaddr, pudp); pud = READ_ONCE(*pudp); WARN_ON(pud_write(pud)); #ifndef __PAGETABLE_PMD_FOLDED - pud = pfn_pud(pfn, prot); + + pud = pud_mkhuge(pfn_pud(pfn, prot)); set_pud_at(mm, vaddr, pudp, pud); pudp_huge_get_and_clear(mm, vaddr, pudp); pud = READ_ONCE(*pudp); WARN_ON(!pud_none(pud)); - pud = pfn_pud(pfn, prot); + pud = pud_mkhuge(pfn_pud(pfn, prot)); set_pud_at(mm, vaddr, pudp, pud); pudp_huge_get_and_clear_full(mm, vaddr, pudp, 1); pud = READ_ONCE(*pudp); WARN_ON(!pud_none(pud)); #endif /* __PAGETABLE_PMD_FOLDED */ - pud = pfn_pud(pfn, prot); + + pud = pud_mkhuge(pfn_pud(pfn, prot)); pud = pud_wrprotect(pud); pud = pud_mkclean(pud); set_pud_at(mm, vaddr, pudp, pud); -- 2.26.2
[PATCH v3 05/13] mm/debug_vm_pgtable/savedwrite: Enable savedwrite test with CONFIG_NUMA_BALANCING
Saved write support was added to track the write bit of a pte after marking the pte protnone. This was done so that AUTONUMA can convert a write pte to protnone and still track the old write bit. When converting it back we set the pte write bit correctly thereby avoiding a write fault again. Hence enable the test only when CONFIG_NUMA_BALANCING is enabled and use protnone protflags. Signed-off-by: Aneesh Kumar K.V --- mm/debug_vm_pgtable.c | 11 +-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 28f9d0558c20..5c0680836fe9 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -119,10 +119,14 @@ static void __init pte_savedwrite_tests(unsigned long pfn, pgprot_t prot) { pte_t pte = pfn_pte(pfn, prot); + if (!IS_ENABLED(CONFIG_NUMA_BALANCING)) + return; + pr_debug("Validating PTE saved write\n"); WARN_ON(!pte_savedwrite(pte_mk_savedwrite(pte_clear_savedwrite(pte; WARN_ON(pte_savedwrite(pte_clear_savedwrite(pte_mk_savedwrite(pte; } + #ifdef CONFIG_TRANSPARENT_HUGEPAGE static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot) { @@ -235,6 +239,9 @@ static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot) { pmd_t pmd = pfn_pmd(pfn, prot); + if (!IS_ENABLED(CONFIG_NUMA_BALANCING)) + return; + pr_debug("Validating PMD saved write\n"); WARN_ON(!pmd_savedwrite(pmd_mk_savedwrite(pmd_clear_savedwrite(pmd; WARN_ON(pmd_savedwrite(pmd_clear_savedwrite(pmd_mk_savedwrite(pmd; @@ -1020,8 +1027,8 @@ static int __init debug_vm_pgtable(void) pmd_huge_tests(pmdp, pmd_aligned, prot); pud_huge_tests(pudp, pud_aligned, prot); - pte_savedwrite_tests(pte_aligned, prot); - pmd_savedwrite_tests(pmd_aligned, prot); + pte_savedwrite_tests(pte_aligned, protnone); + pmd_savedwrite_tests(pmd_aligned, protnone); pte_unmap_unlock(ptep, ptl); -- 2.26.2
[PATCH v3 04/13] mm/debug_vm_pgtables/hugevmap: Use the arch helper to identify huge vmap support.
ppc64 supports huge vmap only with radix translation. Hence use arch helper to determine the huge vmap support. Signed-off-by: Aneesh Kumar K.V --- mm/debug_vm_pgtable.c | 15 +-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index bbf9df0e64c6..28f9d0558c20 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -206,11 +207,12 @@ static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot) WARN_ON(!pmd_leaf(pmd)); } +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot) { pmd_t pmd; - if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP)) + if (!arch_ioremap_pmd_supported()) return; pr_debug("Validating PMD huge\n"); @@ -224,6 +226,10 @@ static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot) pmd = READ_ONCE(*pmdp); WARN_ON(!pmd_none(pmd)); } +#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ +static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot) { } +#endif /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ + static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot) { @@ -320,11 +326,12 @@ static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) WARN_ON(!pud_leaf(pud)); } +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) { pud_t pud; - if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP)) + if (!arch_ioremap_pud_supported()) return; pr_debug("Validating PUD huge\n"); @@ -338,6 +345,10 @@ static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) pud = READ_ONCE(*pudp); WARN_ON(!pud_none(pud)); } +#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ +static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) { } +#endif /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ + #else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ static void __init 
pud_basic_tests(unsigned long pfn, pgprot_t prot) { } static void __init pud_advanced_tests(struct mm_struct *mm, -- 2.26.2
[PATCH v3 03/13] mm/debug_vm_pgtable/ppc64: Avoid setting top bits in random value
ppc64 use bit 62 to indicate a pte entry (_PAGE_PTE). Avoid setting that bit in random value. Signed-off-by: Aneesh Kumar K.V --- mm/debug_vm_pgtable.c | 13 ++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 086309fb9b6f..bbf9df0e64c6 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -44,10 +44,17 @@ * entry type. But these bits might affect the ability to clear entries with * pxx_clear() because of how dynamic page table folding works on s390. So * while loading up the entries do not change the lower 4 bits. It does not - * have affect any other platform. + * have affect any other platform. Also avoid the 62nd bit on ppc64 that is + * used to mark a pte entry. */ -#define S390_MASK_BITS 4 -#define RANDOM_ORVALUE GENMASK(BITS_PER_LONG - 1, S390_MASK_BITS) +#define S390_SKIP_MASK GENMASK(3, 0) +#ifdef CONFIG_PPC_BOOK3S_64 +#define PPC64_SKIP_MASKGENMASK(62, 62) +#else +#define PPC64_SKIP_MASK0x0 +#endif +#define ARCH_SKIP_MASK (S390_SKIP_MASK | PPC64_SKIP_MASK) +#define RANDOM_ORVALUE (GENMASK(BITS_PER_LONG - 1, 0) & ~ARCH_SKIP_MASK) #define RANDOM_NZVALUE GENMASK(7, 0) static void __init pte_basic_tests(unsigned long pfn, pgprot_t prot) -- 2.26.2
[PATCH v3 02/13] powerpc/mm: Move setting pte specific flags to pfn_pte
powerpc used to set the pte specific flags in set_pte_at(). This is different from other architectures. To be consistent with other architecture update pfn_pte to set _PAGE_PTE on ppc64. Also, drop now unused pte_mkpte. We add a VM_WARN_ON() to catch the usage of calling set_pte_at() without setting _PAGE_PTE bit. We will remove that after a few releases. With respect to huge pmd entries, pmd_mkhuge() takes care of adding the _PAGE_PTE bit. Signed-off-by: Aneesh Kumar K.V --- arch/powerpc/include/asm/book3s/64/pgtable.h | 15 +-- arch/powerpc/include/asm/nohash/pgtable.h| 5 - arch/powerpc/mm/pgtable.c| 5 - 3 files changed, 9 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 079211968987..2382fd516f6b 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -619,7 +619,7 @@ static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot) VM_BUG_ON(pfn >> (64 - PAGE_SHIFT)); VM_BUG_ON((pfn << PAGE_SHIFT) & ~PTE_RPN_MASK); - return __pte(((pte_basic_t)pfn << PAGE_SHIFT) | pgprot_val(pgprot)); + return __pte(((pte_basic_t)pfn << PAGE_SHIFT) | pgprot_val(pgprot) | _PAGE_PTE); } static inline unsigned long pte_pfn(pte_t pte) @@ -655,11 +655,6 @@ static inline pte_t pte_mkexec(pte_t pte) return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_EXEC)); } -static inline pte_t pte_mkpte(pte_t pte) -{ - return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_PTE)); -} - static inline pte_t pte_mkwrite(pte_t pte) { /* @@ -823,6 +818,14 @@ static inline int pte_none(pte_t pte) static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, int percpu) { + + VM_WARN_ON(!(pte_raw(pte) & cpu_to_be64(_PAGE_PTE))); + /* +* Keep the _PAGE_PTE added till we are sure we handle _PAGE_PTE +* in all the callers. 
+*/ +pte = __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_PTE)); + if (radix_enabled()) return radix__set_pte_at(mm, addr, ptep, pte, percpu); return hash__set_pte_at(mm, addr, ptep, pte, percpu); diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index 4b7c3472eab1..6277e7596ae5 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -140,11 +140,6 @@ static inline pte_t pte_mkold(pte_t pte) return __pte(pte_val(pte) & ~_PAGE_ACCESSED); } -static inline pte_t pte_mkpte(pte_t pte) -{ - return pte; -} - static inline pte_t pte_mkspecial(pte_t pte) { return __pte(pte_val(pte) | _PAGE_SPECIAL); diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index 9c0547d77af3..ab57b07ef39a 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -184,9 +184,6 @@ void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, */ VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep)); - /* Add the pte bit when trying to set a pte */ - pte = pte_mkpte(pte); - /* Note: mm->context.id might not yet have been assigned as * this context might not have been activated yet when this * is called. @@ -275,8 +272,6 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_ */ VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep)); - pte = pte_mkpte(pte); - pte = set_pte_filter(pte); val = pte_val(pte); -- 2.26.2
[PATCH v3 01/13] powerpc/mm: Add DEBUG_VM WARN for pmd_clear
With the hash page table, the kernel should not use pmd_clear for clearing huge pte entries. Add a DEBUG_VM WARN to catch the wrong usage. Signed-off-by: Aneesh Kumar K.V --- arch/powerpc/include/asm/book3s/64/pgtable.h | 14 ++ 1 file changed, 14 insertions(+) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 6de56c3b33c4..079211968987 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -868,6 +868,13 @@ static inline bool pte_ci(pte_t pte) static inline void pmd_clear(pmd_t *pmdp) { + if (IS_ENABLED(CONFIG_DEBUG_VM) && !radix_enabled()) { + /* +* Don't use this if we can possibly have a hash page table +* entry mapping this. +*/ + WARN_ON((pmd_val(*pmdp) & (H_PAGE_HASHPTE | _PAGE_PTE)) == (H_PAGE_HASHPTE | _PAGE_PTE)); + } *pmdp = __pmd(0); } @@ -916,6 +923,13 @@ static inline int pmd_bad(pmd_t pmd) static inline void pud_clear(pud_t *pudp) { + if (IS_ENABLED(CONFIG_DEBUG_VM) && !radix_enabled()) { + /* +* Don't use this if we can possibly have a hash page table +* entry mapping this. +*/ + WARN_ON((pud_val(*pudp) & (H_PAGE_HASHPTE | _PAGE_PTE)) == (H_PAGE_HASHPTE | _PAGE_PTE)); + } *pudp = __pud(0); } -- 2.26.2
[PATCH v3 00/13] mm/debug_vm_pgtable fixes
This patch series includes fixes for debug_vm_pgtable test code so that they follow page table updates rules correctly. The first two patches introduce changes w.r.t ppc64. The patches are included in this series for completeness. We can merge them via ppc64 tree if required. Hugetlb test is disabled on ppc64 because that needs larger change to satisfy page table update rules. The patches are on top of 15bc20c6af4ceee97a1f90b43c0e386643c071b4 (linus/master) Changes from v2: * Fix build failure with different configs and architecture. Changes from v1: * Address review feedback * drop test specific pfn_pte and pfn_pmd. * Update ppc64 page table helper to add _PAGE_PTE Aneesh Kumar K.V (13): powerpc/mm: Add DEBUG_VM WARN for pmd_clear powerpc/mm: Move setting pte specific flags to pfn_pte mm/debug_vm_pgtable/ppc64: Avoid setting top bits in radom value mm/debug_vm_pgtables/hugevmap: Use the arch helper to identify huge vmap support. mm/debug_vm_pgtable/savedwrite: Enable savedwrite test with CONFIG_NUMA_BALANCING mm/debug_vm_pgtable/THP: Mark the pte entry huge before using set_pmd/pud_at mm/debug_vm_pgtable/set_pte/pmd/pud: Don't use set_*_at to update an existing pte entry mm/debug_vm_pgtable/thp: Use page table depost/withdraw with THP mm/debug_vm_pgtable/locks: Move non page table modifying test together mm/debug_vm_pgtable/locks: Take correct page table lock mm/debug_vm_pgtable/pmd_clear: Don't use pmd/pud_clear on pte entries mm/debug_vm_pgtable/hugetlb: Disable hugetlb test on ppc64 mm/debug_vm_pgtable: populate a pte entry before fetching it arch/powerpc/include/asm/book3s/64/pgtable.h | 29 +++- arch/powerpc/include/asm/nohash/pgtable.h| 5 - arch/powerpc/mm/pgtable.c| 5 - mm/debug_vm_pgtable.c| 170 --- 4 files changed, 131 insertions(+), 78 deletions(-) -- 2.26.2
[PATCH v3 5/6] powerpc: Initialize a temporary mm for code patching
When code patching a STRICT_KERNEL_RWX kernel the page containing the address to be patched is temporarily mapped with permissive memory protections. Currently, a per-cpu vmalloc patch area is used for this purpose. While the patch area is per-cpu, the temporary page mapping is inserted into the kernel page tables for the duration of the patching. The mapping is exposed to CPUs other than the patching CPU - this is undesirable from a hardening perspective. Use the `poking_init` init hook to prepare a temporary mm and patching address. Initialize the temporary mm by copying the init mm. Choose a randomized patching address inside the temporary mm userspace address portion. The next patch uses the temporary mm and patching address for code patching. Based on x86 implementation: commit 4fc19708b165 ("x86/alternatives: Initialize temporary mm for patching") Signed-off-by: Christopher M. Riedl --- arch/powerpc/lib/code-patching.c | 40 1 file changed, 40 insertions(+) diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index 89b37ece6d2f..051d7ae6d8ee 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -11,6 +11,8 @@ #include #include #include +#include +#include #include #include @@ -109,6 +111,44 @@ static inline void unuse_temporary_mm(struct temp_mm *temp_mm) } } +static struct mm_struct *patching_mm __ro_after_init; +static unsigned long patching_addr __ro_after_init; + +void __init poking_init(void) +{ + spinlock_t *ptl; /* for protecting pte table */ + pte_t *ptep; + + /* +* Some parts of the kernel (static keys for example) depend on +* successful code patching. Code patching under STRICT_KERNEL_RWX +* requires this setup - otherwise we cannot patch at all. We use +* BUG_ON() here and later since an early failure is preferred to +* buggy behavior and/or strange crashes later. 
+*/ + patching_mm = copy_init_mm(); + BUG_ON(!patching_mm); + + /* +* Choose a randomized, page-aligned address from the range: +* [PAGE_SIZE, DEFAULT_MAP_WINDOW - PAGE_SIZE] +* The lower address bound is PAGE_SIZE to avoid the zero-page. +* The upper address bound is DEFAULT_MAP_WINDOW - PAGE_SIZE to stay +* under DEFAULT_MAP_WINDOW in hash. +*/ + patching_addr = PAGE_SIZE + ((get_random_long() & PAGE_MASK) + % (DEFAULT_MAP_WINDOW - 2 * PAGE_SIZE)); + + /* +* PTE allocation uses GFP_KERNEL which means we need to pre-allocate +* the PTE here. We cannot do the allocation during patching with IRQs +* disabled (ie. "atomic" context). +*/ + ptep = get_locked_pte(patching_mm, patching_addr, &ptl); + BUG_ON(!ptep); + pte_unmap_unlock(ptep, ptl); +} + static DEFINE_PER_CPU(struct vm_struct *, text_poke_area); #ifdef CONFIG_LKDTM -- 2.28.0
Re: [PATCHv5 2/2] powerpc/pseries: update device tree before ejecting hotplug uevents
Le 10/08/2020 à 10:52, Pingfan Liu a écrit : A bug is observed on pseries by taking the following steps on rhel: -1. drmgr -c mem -r -q 5 -2. echo c > /proc/sysrq-trigger And then, the failure looks like: kdump: saving to /sysroot//var/crash/127.0.0.1-2020-01-16-02:06:14/ kdump: saving vmcore-dmesg.txt kdump: saving vmcore-dmesg.txt complete kdump: saving vmcore Checking for memory holes : [ 0.0 %] / Checking for memory holes : [100.0 %] | Excluding unnecessary pages : [100.0 %] \ Copying data : [ 0.3 %] - eta: 38s[ 44.337636] hash-mmu: mm: Hashing failure ! EA=0x7fffba40 access=0x8004 current=makedumpfile [ 44.337663] hash-mmu: trap=0x300 vsid=0x13a109c ssize=1 base psize=2 psize 2 pte=0xc0005504 [ 44.337677] hash-mmu: mm: Hashing failure ! EA=0x7fffba40 access=0x8004 current=makedumpfile [ 44.337692] hash-mmu: trap=0x300 vsid=0x13a109c ssize=1 base psize=2 psize 2 pte=0xc0005504 [ 44.337708] makedumpfile[469]: unhandled signal 7 at 7fffba40 nip 7fffbbc4d7fc lr 00011356ca3c code 2 [ 44.338548] Core dump to |/bin/false pipe failed /lib/kdump-lib-initramfs.sh: line 98: 469 Bus error $CORE_COLLECTOR /proc/vmcore $_mp/$KDUMP_PATH/$HOST_IP-$DATEDIR/vmcore-incomplete kdump: saving vmcore failed * Root cause * After analyzing, it turns out that in the current implementation, when hot-removing lmb, the KOBJ_REMOVE event ejects before the dt updating as the code __remove_memory() comes before drmem_update_dt(). So in kdump kernel, when read_from_oldmem() resorts to pSeries_lpar_hpte_insert() to install hpte, but fails with -2 due to non-exist pfn. And finally, low_hash_fault() raise SIGBUS to process, as it can be observed "Bus error" From a viewpoint of listener and publisher, the publisher notifies the listener before data is ready. This introduces a problem where udev launches kexec-tools (due to KOBJ_REMOVE) and loads a stale dt before updating. 
And in capture kernel, makedumpfile will access the memory based on the stale dt info, and hit a SIGBUS error due to a non-existent lmb. * Fix * This bug was introduced by commit 063b8b1251fd ("powerpc/pseries/memory-hotplug: Only update DT once per memory DLPAR request"), which tried to combine all the dt updating into one. To fix this issue, meanwhile not to introduce a quadratic runtime complexity by the model: dlpar_memory_add_by_count for_each_drmem_lmb <-- dlpar_add_lmb drmem_update_dt(_v1|_v2) for_each_drmem_lmb <-- The dt should still be only updated once, and just before the last memory online/offline event is ejected to user space. Achieve this by tracking the number of lmbs added or removed. Signed-off-by: Pingfan Liu Cc: Michael Ellerman Cc: Hari Bathini Cc: Nathan Lynch Cc: Nathan Fontenot Cc: Laurent Dufour To: linuxppc-dev@lists.ozlabs.org Cc: ke...@lists.infradead.org --- v4 -> v5: change dlpar_add_lmb()/dlpar_remove_lmb() prototype to report whether dt is updated successfully. Fix a condition boundary check bug v3 -> v4: resolve a quadratic runtime complexity issue. 
This series is applied on next-test branch arch/powerpc/platforms/pseries/hotplug-memory.c | 102 +++- 1 file changed, 80 insertions(+), 22 deletions(-) diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index 46cbcd1..1567d9f 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -350,13 +350,22 @@ static bool lmb_is_removable(struct drmem_lmb *lmb) return true; } -static int dlpar_add_lmb(struct drmem_lmb *); +enum dt_update_status { + DT_NOUPDATE, + DT_TOUPDATE, + DT_UPDATED, +}; + +/* "*dt_update" returns DT_UPDATED if updated */ +static int dlpar_add_lmb(struct drmem_lmb *lmb, + enum dt_update_status *dt_update); -static int dlpar_remove_lmb(struct drmem_lmb *lmb) +static int dlpar_remove_lmb(struct drmem_lmb *lmb, + enum dt_update_status *dt_update) { unsigned long block_sz; phys_addr_t base_addr; - int rc, nid; + int rc, ret, nid; if (!lmb_is_removable(lmb)) return -EINVAL; @@ -372,6 +381,13 @@ static int dlpar_remove_lmb(struct drmem_lmb *lmb) invalidate_lmb_associativity_index(lmb); lmb_clear_nid(lmb); lmb->flags &= ~DRCONF_MEM_ASSIGNED; + if (*dt_update) { That test is wrong, you should do: if (*dt_update && *dt_update == DT_TOUPDATE) { With the current code, the device tree is updated all the time. Another option would be to pass a valid pointer (!= NULL) only when DT update is required, this way you d
Re: [PATCH v2 0/3] Reintroduce PROT_SAO
On Fri, 21 Aug 2020 13:55:55 -0500, Shawn Anastasio wrote: > Changes in v2: > - Update prot_sao selftest to skip ISA 3.1 > > This set re-introduces the PROT_SAO prot flag removed in > Commit 5c9fa16e8abd ("powerpc/64s: Remove PROT_SAO support"). > > To address concerns regarding live migration of guests using SAO > to P10 hosts without SAO support, the flag is disabled by default > in LPARs. A new config option, PPC_PROT_SAO_LPAR was added to > allow users to explicitly enable it if they will not be running > in an environment where this is a conern. > > [...] Applied to powerpc/fixes. [1/3] Revert "powerpc/64s: Remove PROT_SAO support" https://git.kernel.org/powerpc/c/12564485ed8caac3c18572793ec01330792c7191 [2/3] powerpc/64s: Disallow PROT_SAO in LPARs by default https://git.kernel.org/powerpc/c/9b725a90a8f127802e19466d4e336e701bcea0d2 [3/3] selftests/powerpc: Update PROT_SAO test to skip ISA 3.1 https://git.kernel.org/powerpc/c/24ded46f53f954b9cf246c5d4e3770c7a8aa84ce cheers
Re: [PATCH] Documentation/powerpc: fix malformed table in syscall64-abi
On Sun, 23 Aug 2020 17:31:16 -0700, Randy Dunlap wrote: > Fix malformed table warning in powerpc/syscall64-abi.rst by making > two tables and moving the headings. > > Documentation/powerpc/syscall64-abi.rst:53: WARNING: Malformed table. > Text in column margin in table line 2. > > === = > --- For the sc instruction, differences with the ELF ABI --- > r0 Volatile (System call number.) > r3 Volatile (Parameter 1, and return value.) > r4-r8 Volatile (Parameters 2-6.) > cr0 Volatile (cr0.SO is the return error condition.) > cr1, cr5-7 Nonvolatile > lr Nonvolatile > > [...] Applied to powerpc/fixes. [1/1] Documentation/powerpc: fix malformed table in syscall64-abi https://git.kernel.org/powerpc/c/aa661d7fab436d8a782618b3138da1a84ca28a31 cheers
Re: [PATCH] Revert "powerpc/powernv/idle: Replace CPU feature check with PVR check"
On Wed, 26 Aug 2020 13:59:18 +0530, Pratik Rajesh Sampat wrote: > Cpuidle stop state implementation has minor optimizations for P10 > where hardware preserves more SPR registers compared to P9. > The current P9 driver works for P10, although does few extra > save-restores. P9 driver can provide the required power management > features like SMT thread folding and core level power savings > on a P10 platform. > > [...] Applied to powerpc/fixes. [1/1] Revert "powerpc/powernv/idle: Replace CPU feature check with PVR check" https://git.kernel.org/powerpc/c/16d83a540ca4e7f1ebb2b3756869b77451d31414 cheers
Re: [PATCH] powerpc/64s: scv entry should set PPR
On Tue, 25 Aug 2020 17:53:09 +1000, Nicholas Piggin wrote: > Kernel entry sets PPR to HMT_MEDIUM by convention. The scv entry > path missed this. Applied to powerpc/fixes. [1/1] powerpc/64s: scv entry should set PPR https://git.kernel.org/powerpc/c/e5fe56092e753c50093c60e757561984abff335e cheers
Re: [PATCH] video: fbdev: controlfb: Fix build for COMPILE_TEST=y && PPC_PMAC=n
On Fri, 21 Aug 2020 20:49:10 +1000, Michael Ellerman wrote: > The build is currently broken, if COMPILE_TEST=y and PPC_PMAC=n: > > linux/drivers/video/fbdev/controlfb.c: In function > ‘control_set_hardware’: > linux/drivers/video/fbdev/controlfb.c:276:2: error: implicit declaration of > function ‘btext_update_display’ > 276 | btext_update_display(p->frame_buffer_phys + CTRLFB_OFF, > | ^~~~ > > [...] Applied to powerpc/fixes. [1/1] video: fbdev: controlfb: Fix build for COMPILE_TEST=y && PPC_PMAC=n https://git.kernel.org/powerpc/c/4d618b9f3fcab84e9ec28c180de46fb2c929d096 cheers
Re: [PATCH] powerpc/64s: Fix crash in load_fp_state() due to fpexc_mode
On Tue, 25 Aug 2020 19:34:24 +1000, Michael Ellerman wrote: > The recent commit 01eb01877f33 ("powerpc/64s: Fix restore_math > unnecessarily changing MSR") changed some of the handling of floating > point/vector restore. > > In particular it caused current->thread.fpexc_mode to be copied into > the current MSR (via msr_check_and_set()), rather than just into > regs->msr (which is moved into MSR on return to userspace). > > [...] Applied to powerpc/fixes. [1/1] powerpc/64s: Fix crash in load_fp_state() due to fpexc_mode https://git.kernel.org/powerpc/c/b91eb5182405b01a8aeb42e9b5207831767e97ee cheers
Re: [PATCH] powerpc/32s: Fix module loading failure when VMALLOC_END is over 0xf0000000
On Fri, 21 Aug 2020 07:15:25 +0000 (UTC), Christophe Leroy wrote: > In is_module_segment(), when VMALLOC_END is over 0xf0000000, > ALIGN(VMALLOC_END, SZ_256M) has value 0. > > In that case, addr >= ALIGN(VMALLOC_END, SZ_256M) is always > true then is_module_segment() always returns false. > > Use (ALIGN(VMALLOC_END, SZ_256M) - 1) which will have > value 0xffffffff and will be suitable for the comparison. Applied to powerpc/fixes. [1/1] powerpc/32s: Fix module loading failure when VMALLOC_END is over 0xf0000000 https://git.kernel.org/powerpc/c/541cebb51f3422d4f2c6cb95c1e5cc3dcc9e5021 cheers
Re: [PATCH] powerpc/perf/hv-24x7: Move cpumask file to top folder of hv-24x7 driver
On Fri, 21 Aug 2020 13:36:10 +0530, Kajol Jain wrote: > Commit 792f73f747b8 ("powerpc/hv-24x7: Add sysfs files inside hv-24x7 > device to show cpumask") added cpumask file as part of hv-24x7 driver > inside the interface folder. Cpumask file suppose to be in the top > folder of the pmu driver inorder to make hotplug works. > > This patch fix that issue and create new group 'cpumask_attr_group' > to add cpumask file and make sure it added on top folder. > > [...] Applied to powerpc/fixes. [1/1] powerpc/perf/hv-24x7: Move cpumask file to top folder of hv-24x7 driver https://git.kernel.org/powerpc/c/64ef8f2c4791940d7f3945507b6a45c20d959260 cheers
Re: [PATCH kernel] powerpc/perf: Stop crashing with generic_compat_pmu
On Tue, 2 Jun 2020 12:56:12 +1000, Alexey Kardashevskiy wrote: > The bhrb_filter_map ("The Branch History Rolling Buffer") callback is > only defined in raw CPUs' power_pmu structs. The "architected" CPUs use > generic_compat_pmu which does not have this callback and crashes occur. > > This adds a NULL pointer check for bhrb_filter_map() which behaves as if > the callback returned an error. > > [...] Applied to powerpc/fixes. [1/1] powerpc/perf: Fix crashes with generic_compat_pmu & BHRB https://git.kernel.org/powerpc/c/b460b512417ae9c8b51a3bdcc09020cd6c60ff69 cheers
Re: [PATCH] powerpc/perf: Fix reading of MSR[HV PR] bits in trace-imc
On Wed, 26 Aug 2020 02:40:29 -0400, Athira Rajeev wrote: > IMC trace-mode uses MSR[HV PR] bits to set the cpumode > for the instruction pointer captured in each sample. > The bits are fetched from third DW of the trace record. > Reading third DW from IMC trace record should use be64_to_cpu > along with READ_ONCE in order to fetch correct MSR[HV PR] bits. > Patch addresses this change. > > [...] Applied to powerpc/fixes. [1/1] powerpc/perf: Fix reading of MSR[HV/PR] bits in trace-imc https://git.kernel.org/powerpc/c/82715a0f332843d3a1830d7ebc9ac7c99a00c880 cheers
Re: fsl_espi errors on v5.7.15
Excerpts from Heiner Kallweit's message of August 26, 2020 4:38 pm: > On 26.08.2020 08:07, Chris Packham wrote: >> >> On 26/08/20 1:48 pm, Chris Packham wrote: >>> >>> On 26/08/20 10:22 am, Chris Packham wrote: On 25/08/20 7:22 pm, Heiner Kallweit wrote: > I've been staring at spi-fsl-espi.c for while now and I think I've >> identified a couple of deficiencies that may or may not be related >> to my >> issue. >> >> First I think the 'Transfer done but SPIE_DON isn't set' message >> can be >> generated spuriously. In fsl_espi_irq() we read the ESPI_SPIE >> register. >> We also write back to it to clear the current events. We re-read it in >> fsl_espi_cpu_irq() and complain when SPIE_DON is not set. But we can >> naturally end up in that situation if we're doing a large read. >> Consider >> the messages for reading a block of data from a spi-nor chip >> >> tx = READ_OP + ADDR >> rx = data >> >> We setup the transfer and pump out the tx_buf. The first interrupt >> goes >> off and ESPI_SPIE has SPIM_DON and SPIM_RXT set. We empty the rx fifo, >> clear ESPI_SPIE and wait for the next interrupt. The next interrupt >> fires and this time we have ESPI_SPIE with just SPIM_RXT set. This >> continues until we've received all the data and we finish with >> ESPI_SPIE >> having only SPIM_RXT set. When we re-read it we complain that SPIE_DON >> isn't set. >> >> The other deficiency is that we only get an interrupt when the >> amount of >> data in the rx fifo is above FSL_ESPI_RXTHR. If there are fewer than >> FSL_ESPI_RXTHR left to be received we will never pull them out of >> the fifo. >> > SPIM_DON will trigger an interrupt once the last characters have been > transferred, and read the remaining characters from the FIFO. The T2080RM that I have says the following about the DON bit "Last character was transmitted. The last character was transmitted and a new command can be written for the next frame." 
That does at least seem to fit with my assertion that it's all about the TX direction. But the fact that it doesn't happen all the time throws some doubt on it. > I think the reason I'm seeing some variability is because of how fast >> (or slow) the interrupts get processed and how fast the spi-nor >> chip can >> fill the CPUs rx fifo. >> > To rule out timing issues at high bus frequencies I initially asked > for re-testing at lower frequencies. If you e.g. limit the bus to 1 MHz > or even less, then timing shouldn't be an issue. Yes I've currently got spi-max-frequency = <100>; in my dts. I would also expect a slower frequency would fit my "DON is for TX" narrative. > Last relevant functional changes have been done almost 4 years ago. > And yours is the first such report I see. So question is what could > be so > special with your setup that it seems you're the only one being > affected. > The scenarios you describe are standard, therefore much more people > should be affected in case of a driver bug. Agreed. But even on my hardware (which may have a latent issue despite being in the field for going on 5 years) the issue only triggers under some fairly specific circumstances. > You said that kernel config impacts how frequently the issue happens. > Therefore question is what's the diff in kernel config, and how could > the differences be related to SPI. It did seem to be somewhat random. Things like CONFIG_PREEMPT have an impact but every time I found something that seemed to be having an impact I've been able to disprove it. I actually think its about how busy the system is which may or may not affect when we get round to processing the interrupts. I have managed to get the 'Transfer done but SPIE_DON isn't set!' to occur on the T2080RDB. 
I've had to add the following to expose the environment as a mtd partition diff --git a/arch/powerpc/boot/dts/fsl/t208xrdb.dtsi b/arch/powerpc/boot/dts/fsl/t208xrdb.dtsi index ff87e67c70da..fbf95fc1fd68 100644 --- a/arch/powerpc/boot/dts/fsl/t208xrdb.dtsi +++ b/arch/powerpc/boot/dts/fsl/t208xrdb.dtsi @@ -116,6 +116,15 @@ flash@0 { compatible = "micron,n25q512ax3", "jedec,spi-nor"; reg = <0>; spi-max-frequency = <1000>; /* input clock */ + + partition@u-boot { + reg = <0x 0x0010>; + label = "u-boot"; + }; + partition@u-b
[PATCH v3 4/6] powerpc: Introduce temporary mm
x86 supports the notion of a temporary mm which restricts access to temporary PTEs to a single CPU. A temporary mm is useful for situations where a CPU needs to perform sensitive operations (such as patching a STRICT_KERNEL_RWX kernel) requiring temporary mappings without exposing said mappings to other CPUs. A side benefit is that other CPU TLBs do not need to be flushed when the temporary mm is torn down. Mappings in the temporary mm can be set in the userspace portion of the address-space. Interrupts must be disabled while the temporary mm is in use. HW breakpoints, which may have been set by userspace as watchpoints on addresses now within the temporary mm, are saved and disabled when loading the temporary mm. The HW breakpoints are restored when unloading the temporary mm. All HW breakpoints are indiscriminately disabled while the temporary mm is in use. Based on x86 implementation: commit cefa929c034e ("x86/mm: Introduce temporary mm structs") Signed-off-by: Christopher M. Riedl --- arch/powerpc/include/asm/debug.h | 1 + arch/powerpc/kernel/process.c| 5 +++ arch/powerpc/lib/code-patching.c | 65 3 files changed, 71 insertions(+) diff --git a/arch/powerpc/include/asm/debug.h b/arch/powerpc/include/asm/debug.h index ec57daf87f40..827350c9bcf3 100644 --- a/arch/powerpc/include/asm/debug.h +++ b/arch/powerpc/include/asm/debug.h @@ -46,6 +46,7 @@ static inline int debugger_fault_handler(struct pt_regs *regs) { return 0; } #endif void __set_breakpoint(int nr, struct arch_hw_breakpoint *brk); +void __get_breakpoint(int nr, struct arch_hw_breakpoint *brk); bool ppc_breakpoint_available(void); #ifdef CONFIG_PPC_ADV_DEBUG_REGS extern void do_send_trap(struct pt_regs *regs, unsigned long address, diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 016bd831908e..0758a8db6342 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -843,6 +843,11 @@ static inline int set_breakpoint_8xx(struct arch_hw_breakpoint *brk) 
return 0; } +void __get_breakpoint(int nr, struct arch_hw_breakpoint *brk) +{ + memcpy(brk, this_cpu_ptr(¤t_brk[nr]), sizeof(*brk)); +} + void __set_breakpoint(int nr, struct arch_hw_breakpoint *brk) { memcpy(this_cpu_ptr(¤t_brk[nr]), brk, sizeof(*brk)); diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index 85d3fdca9452..89b37ece6d2f 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -17,6 +17,7 @@ #include #include #include +#include static int __patch_instruction(struct ppc_inst *exec_addr, struct ppc_inst instr, struct ppc_inst *patch_addr) @@ -44,6 +45,70 @@ int raw_patch_instruction(struct ppc_inst *addr, struct ppc_inst instr) } #ifdef CONFIG_STRICT_KERNEL_RWX + +struct temp_mm { + struct mm_struct *temp; + struct mm_struct *prev; + bool is_kernel_thread; + struct arch_hw_breakpoint brk[HBP_NUM_MAX]; +}; + +static inline void init_temp_mm(struct temp_mm *temp_mm, struct mm_struct *mm) +{ + temp_mm->temp = mm; + temp_mm->prev = NULL; + temp_mm->is_kernel_thread = false; + memset(&temp_mm->brk, 0, sizeof(temp_mm->brk)); +} + +static inline void use_temporary_mm(struct temp_mm *temp_mm) +{ + lockdep_assert_irqs_disabled(); + + temp_mm->is_kernel_thread = current->mm == NULL; + if (temp_mm->is_kernel_thread) + temp_mm->prev = current->active_mm; + else + temp_mm->prev = current->mm; + + /* +* Hash requires a non-NULL current->mm to allocate a userspace address +* when handling a page fault. Does not appear to hurt in Radix either. 
+*/ + current->mm = temp_mm->temp; + switch_mm_irqs_off(NULL, temp_mm->temp, current); + + if (ppc_breakpoint_available()) { + struct arch_hw_breakpoint null_brk = {0}; + int i = 0; + + for (; i < nr_wp_slots(); ++i) { + __get_breakpoint(i, &temp_mm->brk[i]); + if (temp_mm->brk[i].type != 0) + __set_breakpoint(i, &null_brk); + } + } +} + +static inline void unuse_temporary_mm(struct temp_mm *temp_mm) +{ + lockdep_assert_irqs_disabled(); + + if (temp_mm->is_kernel_thread) + current->mm = NULL; + else + current->mm = temp_mm->prev; + switch_mm_irqs_off(NULL, temp_mm->prev, current); + + if (ppc_breakpoint_available()) { + int i = 0; + + for (; i < nr_wp_slots(); ++i) + if (temp_mm->brk[i].type != 0) + __set_breakpoint(i, &temp_mm->brk[i]); + } +} + static DEFINE_PER_CPU(struct vm_struct *, text_poke_area); #ifdef CONFIG_LKDTM -- 2.28.0