Re: [PATCH] powerpc: Allow selection of CONFIG_LD_DEAD_CODE_DATA_ELIMINATION
Le 17/04/2018 à 19:10, Mathieu Malaterre a écrit : On Tue, Apr 17, 2018 at 6:49 PM, Christophe LEROY wrote: Le 17/04/2018 à 18:45, Mathieu Malaterre a écrit : On Tue, Apr 17, 2018 at 12:49 PM, Christophe Leroy wrote: This option does dead code and data elimination with the linker by compiling with -ffunction-sections -fdata-sections and linking with --gc-sections. By selecting this option on mpc885_ads_defconfig, vmlinux LOAD segment size gets reduced by 10% Program Header before the patch: LOAD off0x0001 vaddr 0xc000 paddr 0x align 2**16 filesz 0x0036eda4 memsz 0x0038de04 flags rwx Program Header after the patch: LOAD off0x0001 vaddr 0xc000 paddr 0x align 2**16 filesz 0x00316da4 memsz 0x00334268 flags rwx Signed-off-by: Christophe Leroy --- arch/powerpc/Kconfig | 8 1 file changed, 8 insertions(+) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 8fe4353be5e3..e1fac49cf465 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -888,6 +888,14 @@ config PPC_MEM_KEYS If unsure, say y. +config PPC_UNUSED_ELIMINATION + bool "Eliminate unused functions and data from vmlinux" + default n + select LD_DEAD_CODE_DATA_ELIMINATION + help + Select this to do dead code and data elimination with the linker + by compiling with -ffunction-sections -fdata-sections and linking + with --gc-sections. endmenu Just for reference, I cannot boot my Mac Mini G4 anymore (yaboot). The messages I can see (prom_init) are: Which version of GCC do you use ? $ powerpc-linux-gnu-gcc --version powerpc-linux-gnu-gcc (Debian 6.3.0-18) 6.3.0 20170516 Copyright (C) 2016 Free Software Foundation, Inc. This is free software; see the source for copying conditions. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
this is simply coming from: $ apt-cache policy crossbuild-essential-powerpc crossbuild-essential-powerpc: Installed: 12.3 Candidate: 12.3 Version table: *** 12.3 500 500 http://ftp.fr.debian.org/debian stretch/main amd64 Packages 500 http://ftp.fr.debian.org/debian stretch/main i386 Packages 100 /var/lib/dpkg/status Can you provide the generated System.map with and without that option active ? $ du -sh g4/System.map.* 1.7M g4/System.map.with 1.8M g4/System.map.without Here below is the list of objects removed with the option selected. I can't see anything suspect at first. Do you use one of the defconfigs of the kernel ? Otherwise, can you provide your .config ? Can you also provide a copy of the messages you can see (prom_init ...) when boot is ok ? Maybe you can also send me the two vmlinux objects. Thanks Christophe account_steal_time adbhid_exit adb_reset_bus add_range add_range_with_merge aes_fini af_unix_exit agp_exit agp_find_client_by_pid agp_find_mem_by_key agp_find_private agp_free_memory_wrap agpioc_protect_wrap agpioc_release_wrap agp_uninorth_cleanup __alloc_reserved_percpu all_stat_sessions all_stat_sessions_mutex apple_driver_exit arch_cpu_idle_dead arch_setup_msi_irq arch_teardown_msi_irq arch_tlb_gather_mmu asymmetric_key_cleanup asymmetric_key_hex_to_key_id ata_exit ata_tf_to_lba ata_tf_to_lba48 attribute_container_add_class_device_adapter attribute_container_trigger backlight_class_exit bdi_lock bhrb_table biovec_create_pool blk_stat_enable_accounting boot_mapsize bpf_map_meta_equal bvec_free bvec_nr_vecs calc_load_fold_active can_request_irq capacity_margin cap_inode_getsecurity cap_mmap_file cfq_exit cgroup_is_threaded cgroup_is_thread_root cgroup_migrate_add_src cgroup_migrate_vet_dst cgroup_on_dfl cgroup_sk_update_lock cgroupstats_build cgroup_task_count cgroup_transfer_tasks change_protection clean_sort_range clear_ftrace_function clear_zone_contiguous __clockevents_update_freq clockevents_update_freq clocksource_mark_unstable 
clocksource_touch_watchdog clone_property.isra.2 cmp_range cn_fini cn_queue_free_dev collect_mounts compaction_restarting copy_fpr_from_user copy_fpr_to_user copy_mount_string copy_msg cpu_check_up_prepare cpufreq_boost_trigger_state cpufreq_gov_performance_exit cpu_hotplug_state cpu_in_idle cpu_report_state cpu_set_state_online cpu_temp crashk_low_res crash_wake_offline create_prof_cpu_mask crypto_algapi_exit crypto_exit_proc crypto_null_mod_fini crypto_wq_exit css_rightmost_descendant css_set_lock cubictcp_unregister __current_kernel_time d_absolute_path dbg_release_bp_slot dbg_reserve_bp_slot deadline_exit deadline_exit debug_guardpage_ops default_restore_msi_irqs default_teardown_msi_irqs del_named_trigger dereference_module_function_descriptor __dev_pm_qos_flags dev_pm_qos_read_value devtree_lock die_will_crash disable_cpufreq dma_buf_deinit dma_common_contiguous_remap dma_common_pages_remap __dma_get_required_mask dma_pfn_limit_to_zone do_execveat do_fork __domain_nr do_msg_redirect_map do_pipe_fl
RE: [PATCH 2/6 v2] iommu: of: make of_pci_map_rid() available for other devices too
> -Original Message- > From: Robin Murphy [mailto:robin.mur...@arm.com] > Sent: Tuesday, April 17, 2018 10:23 PM > To: Nipun Gupta ; robh...@kernel.org; > frowand.l...@gmail.com > Cc: will.dea...@arm.com; mark.rutl...@arm.com; catalin.mari...@arm.com; > h...@lst.de; gre...@linuxfoundation.org; j...@8bytes.org; > m.szyprow...@samsung.com; shawn...@kernel.org; bhelg...@google.com; > io...@lists.linux-foundation.org; linux-ker...@vger.kernel.org; > devicet...@vger.kernel.org; linux-arm-ker...@lists.infradead.org; linuxppc- > d...@lists.ozlabs.org; linux-...@vger.kernel.org; Bharat Bhushan > ; stuyo...@gmail.com; Laurentiu Tudor > ; Leo Li > Subject: Re: [PATCH 2/6 v2] iommu: of: make of_pci_map_rid() available for > other devices too > > On 17/04/18 11:21, Nipun Gupta wrote: > > iommu-map property is also used by devices with fsl-mc. This patch > > moves the of_pci_map_rid to generic location, so that it can be used > > by other busses too. > > > > Signed-off-by: Nipun Gupta > > --- > > drivers/iommu/of_iommu.c | 106 > > +-- > > Doesn't this break "msi-parent" parsing for !CONFIG_OF_IOMMU? I guess you > don't want fsl-mc to have to depend on PCI, but this looks like a step in the > wrong direction. Thanks for pointing out. Agree, this will break "msi-parent" parsing for !CONFIG_OF_IOMMU case. > > I'm not entirely sure where of_map_rid() fits best, but from a quick look > around > the least-worst option might be drivers/of/of_address.c, unless Rob and Frank > have a better idea of where generic DT-based ID translation routines could > live? > > > drivers/of/irq.c | 6 +-- > > drivers/pci/of.c | 101 > > > > include/linux/of_iommu.h | 11 + > > include/linux/of_pci.h | 10 - > > 5 files changed, 117 insertions(+), 117 deletions(-) > > [...] 
> > struct of_pci_iommu_alias_info { > > struct device *dev; > > struct device_node *np; > > @@ -149,9 +249,9 @@ static int of_pci_iommu_init(struct pci_dev *pdev, u16 > alias, void *data) > > struct of_phandle_args iommu_spec = { .args_count = 1 }; > > int err; > > > > - err = of_pci_map_rid(info->np, alias, "iommu-map", > > -"iommu-map-mask", &iommu_spec.np, > > -iommu_spec.args); > > + err = of_map_rid(info->np, alias, "iommu-map", > > +"iommu-map-mask", &iommu_spec.np, > > +iommu_spec.args); > > Super-nit: Apparently I missed rewrapping this to 2 lines in d87beb749281, > but if > it's being touched again, that would be nice ;) Sure.. I'll take care of this in the next version :) Regards, Nipun
Re: [PATCH] powerpc/8xx: Build fix with Hugetlbfs enabled
"Aneesh Kumar K.V" writes: > 8xx use slice code when hugetlbfs is enabled. We missed a header include on > 8xx which resulted in the below build failure. > > config: mpc885_ads_defconfig + CONFIG_HUGETLBFS > >CC arch/powerpc/mm/slice.o > arch/powerpc/mm/slice.c: In function 'slice_get_unmapped_area': > arch/powerpc/mm/slice.c:655:2: error: implicit declaration of function > 'need_extra_context' [-Werror=implicit-function-declaration] > arch/powerpc/mm/slice.c:656:3: error: implicit declaration of function > 'alloc_extended_context' [-Werror=implicit-function-declaration] > cc1: all warnings being treated as errors > make[1]: *** [arch/powerpc/mm/slice.o] Error 1 > make: *** [arch/powerpc/mm] Error 2 > > on PPC64 the mmu_context.h was included via linux/pkeys.h > > CC: Christophe LEROY > Signed-off-by: Aneesh Kumar K.V > --- > arch/powerpc/mm/slice.c | 1 + > 1 file changed, 1 insertion(+) > > diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c > index 9cd87d11fe4e..205fe557ca10 100644 > --- a/arch/powerpc/mm/slice.c > +++ b/arch/powerpc/mm/slice.c > @@ -35,6 +35,7 @@ > #include > #include > #include > +#include I already merged this, didn't I? cheers
Re: [RFC] virtio: Use DMA MAP API for devices without an IOMMU
On 04/15/2018 05:41 PM, Christoph Hellwig wrote: > On Fri, Apr 06, 2018 at 06:37:18PM +1000, Benjamin Herrenschmidt wrote: implemented as DMA API which the virtio core understands. There is no need for an IOMMU to be involved for the device representation in this case IMHO. >>> >>> This whole virtio translation issue is a mess. I think we need to >>> switch it to the dma API, and then quirk the legacy case to always >>> use the direct mapping inside the dma API. >> >> Fine with using a dma API always on the Linux side, but we do want to >> special case virtio still at the arch and qemu side to have a "direct >> mapping" mode. Not sure how (special flags on PCI devices) to avoid >> actually going through an emulated IOMMU on the qemu side, because that >> slows things down, esp. with vhost. >> >> IE, we can't I think just treat it the same as a physical device. > > We should have treated it like a physical device from the start, but > that device has unfortunately sailed. > > But yes, we'll need a per-device quirk that says 'don't attach an > iommu'. How about doing it per platform basis as suggested in this RFC through an arch specific callback. Because all the virtio devices in the given platform would require and exercise this option (to avail bounce buffer mechanism for secure guests as an example). So the flag basically is a platform specific one not a device specific one.
[PATCH v2 7/7] ocxl: Document new OCXL IOCTLs
From: Alastair D'Silva Signed-off-by: Alastair D'Silva --- Documentation/accelerators/ocxl.rst | 11 +++ 1 file changed, 11 insertions(+) diff --git a/Documentation/accelerators/ocxl.rst b/Documentation/accelerators/ocxl.rst index 7904adcc07fd..3b8d3b99795c 100644 --- a/Documentation/accelerators/ocxl.rst +++ b/Documentation/accelerators/ocxl.rst @@ -157,6 +157,17 @@ OCXL_IOCTL_GET_METADATA: Obtains configuration information from the card, such as the size of MMIO areas, the AFU version, and the PASID for the current context. +OCXL_IOCTL_ENABLE_P9_WAIT: + + Allows the AFU to wake a userspace thread executing 'wait'. Returns + information to userspace to allow it to configure the AFU. Note that + this is only available on Power 9. + +OCXL_IOCTL_GET_FEATURES: + + Reports on which CPU features that affect OpenCAPI are usable from + userspace. + mmap -- 2.14.3
[PATCH v2 1/7] powerpc: Add TIDR CPU feature for Power9
From: Alastair D'Silva This patch adds a CPU feature bit to show whether the CPU has the TIDR register available, enabling as_notify/wait in userspace. Signed-off-by: Alastair D'Silva --- arch/powerpc/include/asm/cputable.h | 3 ++- arch/powerpc/kernel/dt_cpu_ftrs.c | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h index 4e332f3531c5..54c4cbbe57b4 100644 --- a/arch/powerpc/include/asm/cputable.h +++ b/arch/powerpc/include/asm/cputable.h @@ -215,6 +215,7 @@ static inline void cpu_feature_keys_init(void) { } #define CPU_FTR_P9_TM_HV_ASSIST LONG_ASM_CONST(0x1000) #define CPU_FTR_P9_TM_XER_SO_BUG LONG_ASM_CONST(0x2000) #define CPU_FTR_P9_TLBIE_BUG LONG_ASM_CONST(0x4000) +#define CPU_FTR_P9_TIDR LONG_ASM_CONST(0x8000) #ifndef __ASSEMBLY__ @@ -462,7 +463,7 @@ static inline void cpu_feature_keys_init(void) { } CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \ CPU_FTR_DBELL | CPU_FTR_HAS_PPR | CPU_FTR_ARCH_207S | \ CPU_FTR_TM_COMP | CPU_FTR_ARCH_300 | CPU_FTR_PKEY | \ - CPU_FTR_P9_TLBIE_BUG) + CPU_FTR_P9_TLBIE_BUG | CPU_FTR_P9_TIDR) #define CPU_FTRS_POWER9_DD1 ((CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD1) & \ (~CPU_FTR_SAO)) #define CPU_FTRS_POWER9_DD2_0 CPU_FTRS_POWER9 diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index 11a3a4fed3fb..10f8b7f55637 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -722,6 +722,7 @@ static __init void cpufeatures_cpu_quirks(void) if ((version & 0x) == 0x004e) { cur_cpu_spec->cpu_features &= ~(CPU_FTR_DAWR); cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_BUG; + cur_cpu_spec->cpu_features |= CPU_FTR_P9_TIDR; } } -- 2.14.3
[PATCH v2 6/7] ocxl: Add an IOCTL so userspace knows what CPU features are available
From: Alastair D'Silva In order for a userspace AFU driver to call the Power9 specific OCXL_IOCTL_ENABLE_P9_WAIT, it needs to verify that it can actually make that call. Signed-off-by: Alastair D'Silva --- Documentation/accelerators/ocxl.rst | 1 - drivers/misc/ocxl/file.c| 25 + include/uapi/misc/ocxl.h| 4 3 files changed, 29 insertions(+), 1 deletion(-) diff --git a/Documentation/accelerators/ocxl.rst b/Documentation/accelerators/ocxl.rst index ddcc58d01cfb..7904adcc07fd 100644 --- a/Documentation/accelerators/ocxl.rst +++ b/Documentation/accelerators/ocxl.rst @@ -157,7 +157,6 @@ OCXL_IOCTL_GET_METADATA: Obtains configuration information from the card, such at the size of MMIO areas, the AFU version, and the PASID for the current context. - mmap diff --git a/drivers/misc/ocxl/file.c b/drivers/misc/ocxl/file.c index eb409a469f21..33ae46ce0a8a 100644 --- a/drivers/misc/ocxl/file.c +++ b/drivers/misc/ocxl/file.c @@ -168,12 +168,32 @@ static long afu_ioctl_enable_p9_wait(struct ocxl_context *ctx, } #endif + +static long afu_ioctl_get_features(struct ocxl_context *ctx, + struct ocxl_ioctl_features __user *uarg) +{ + struct ocxl_ioctl_features arg; + + memset(&arg, 0, sizeof(arg)); + +#ifdef CONFIG_PPC64 + if (cpu_has_feature(CPU_FTR_P9_TIDR)) + arg.flags[0] |= OCXL_IOCTL_FEATURES_FLAGS0_P9_WAIT; +#endif + + if (copy_to_user(uarg, &arg, sizeof(arg))) + return -EFAULT; + + return 0; +} + #define CMD_STR(x) (x == OCXL_IOCTL_ATTACH ? "ATTACH" : \ x == OCXL_IOCTL_IRQ_ALLOC ? "IRQ_ALLOC" : \ x == OCXL_IOCTL_IRQ_FREE ? "IRQ_FREE" : \ x == OCXL_IOCTL_IRQ_SET_FD ? "IRQ_SET_FD" : \ x == OCXL_IOCTL_GET_METADATA ? "GET_METADATA" : \ x == OCXL_IOCTL_ENABLE_P9_WAIT ? "ENABLE_P9_WAIT" : \ + x == OCXL_IOCTL_GET_FEATURES ? 
"GET_FEATURES" : \ "UNKNOWN") static long afu_ioctl(struct file *file, unsigned int cmd, @@ -239,6 +259,11 @@ static long afu_ioctl(struct file *file, unsigned int cmd, break; #endif + case OCXL_IOCTL_GET_FEATURES: + rc = afu_ioctl_get_features(ctx, + (struct ocxl_ioctl_features __user *) args); + break; + default: rc = -EINVAL; } diff --git a/include/uapi/misc/ocxl.h b/include/uapi/misc/ocxl.h index 8d2748e69c84..bb80f294b429 100644 --- a/include/uapi/misc/ocxl.h +++ b/include/uapi/misc/ocxl.h @@ -55,6 +55,9 @@ struct ocxl_ioctl_p9_wait { __u64 reserved3[3]; }; +#define OCXL_IOCTL_FEATURES_FLAGS0_P9_WAIT 0x01 +struct ocxl_ioctl_features { + __u64 flags[4]; }; struct ocxl_ioctl_irq_fd { @@ -72,5 +75,6 @@ struct ocxl_ioctl_irq_fd { #define OCXL_IOCTL_IRQ_SET_FD _IOW(OCXL_MAGIC, 0x13, struct ocxl_ioctl_irq_fd) #define OCXL_IOCTL_GET_METADATA _IOR(OCXL_MAGIC, 0x14, struct ocxl_ioctl_metadata) #define OCXL_IOCTL_ENABLE_P9_WAIT _IOR(OCXL_MAGIC, 0x15, struct ocxl_ioctl_p9_wait) +#define OCXL_IOCTL_GET_FEATURES _IOR(OCXL_MAGIC, 0x16, struct ocxl_ioctl_features) #endif /* _UAPI_MISC_OCXL_H */ -- 2.14.3
[PATCH v2 5/7] ocxl: Expose the thread_id needed for wait on p9
From: Alastair D'Silva In order to successfully issue as_notify, an AFU needs to know the TID to notify, which in turn means that this information should be available in userspace so it can be communicated to the AFU. Signed-off-by: Alastair D'Silva --- drivers/misc/ocxl/context.c | 5 +++- drivers/misc/ocxl/file.c | 53 +++ drivers/misc/ocxl/link.c | 36 ++ drivers/misc/ocxl/ocxl_internal.h | 1 + include/misc/ocxl.h | 9 +++ include/uapi/misc/ocxl.h | 10 6 files changed, 113 insertions(+), 1 deletion(-) diff --git a/drivers/misc/ocxl/context.c b/drivers/misc/ocxl/context.c index 909e8807824a..95f74623113e 100644 --- a/drivers/misc/ocxl/context.c +++ b/drivers/misc/ocxl/context.c @@ -34,6 +34,8 @@ int ocxl_context_init(struct ocxl_context *ctx, struct ocxl_afu *afu, mutex_init(&ctx->xsl_error_lock); mutex_init(&ctx->irq_lock); idr_init(&ctx->irq_idr); + ctx->tidr = 0; + /* * Keep a reference on the AFU to make sure it's valid for the * duration of the life of the context @@ -65,6 +67,7 @@ int ocxl_context_attach(struct ocxl_context *ctx, u64 amr) { int rc; + // Locks both status & tidr mutex_lock(&ctx->status_mutex); if (ctx->status != OPENED) { rc = -EIO; @@ -72,7 +75,7 @@ int ocxl_context_attach(struct ocxl_context *ctx, u64 amr) } rc = ocxl_link_add_pe(ctx->afu->fn->link, ctx->pasid, - current->mm->context.id, 0, amr, current->mm, + current->mm->context.id, ctx->tidr, amr, current->mm, xsl_fault_error, ctx); if (rc) goto out; diff --git a/drivers/misc/ocxl/file.c b/drivers/misc/ocxl/file.c index 038509e5d031..eb409a469f21 100644 --- a/drivers/misc/ocxl/file.c +++ b/drivers/misc/ocxl/file.c @@ -5,6 +5,8 @@ #include #include #include +#include +#include #include "ocxl_internal.h" @@ -123,11 +125,55 @@ static long afu_ioctl_get_metadata(struct ocxl_context *ctx, return 0; } +#ifdef CONFIG_PPC64 +static long afu_ioctl_enable_p9_wait(struct ocxl_context *ctx, + struct ocxl_ioctl_p9_wait __user *uarg) +{ + struct ocxl_ioctl_p9_wait arg; + + memset(&arg, 0, sizeof(arg)); + 
+ if (cpu_has_feature(CPU_FTR_P9_TIDR)) { + enum ocxl_context_status status; + + // Locks both status & tidr + mutex_lock(&ctx->status_mutex); + if (!ctx->tidr) { + if (set_thread_tidr(current)) + return -ENOENT; + + ctx->tidr = current->thread.tidr; + } + + status = ctx->status; + mutex_unlock(&ctx->status_mutex); + + if (status == ATTACHED) { + int rc; + struct link *link = ctx->afu->fn->link; + + rc = ocxl_link_update_pe(link, ctx->pasid, ctx->tidr); + if (rc) + return rc; + } + + arg.thread_id = ctx->tidr; + } else + return -ENOENT; + + if (copy_to_user(uarg, &arg, sizeof(arg))) + return -EFAULT; + + return 0; +} +#endif + #define CMD_STR(x) (x == OCXL_IOCTL_ATTACH ? "ATTACH" : \ x == OCXL_IOCTL_IRQ_ALLOC ? "IRQ_ALLOC" : \ x == OCXL_IOCTL_IRQ_FREE ? "IRQ_FREE" : \ x == OCXL_IOCTL_IRQ_SET_FD ? "IRQ_SET_FD" : \ x == OCXL_IOCTL_GET_METADATA ? "GET_METADATA" : \ + x == OCXL_IOCTL_ENABLE_P9_WAIT ? "ENABLE_P9_WAIT" : \ "UNKNOWN") static long afu_ioctl(struct file *file, unsigned int cmd, @@ -186,6 +232,13 @@ static long afu_ioctl(struct file *file, unsigned int cmd, (struct ocxl_ioctl_metadata __user *) args); break; +#ifdef CONFIG_PPC64 + case OCXL_IOCTL_ENABLE_P9_WAIT: + rc = afu_ioctl_enable_p9_wait(ctx, + (struct ocxl_ioctl_p9_wait __user *) args); + break; +#endif + default: rc = -EINVAL; } diff --git a/drivers/misc/ocxl/link.c b/drivers/misc/ocxl/link.c index 656e8610eec2..88876ae8f330 100644 --- a/drivers/misc/ocxl/link.c +++ b/drivers/misc/ocxl/link.c @@ -544,6 +544,42 @@ int ocxl_link_add_pe(void *link_handle, int pasid, u32 pidr, u32 tidr, } EXPORT_SYMBOL_GPL(ocxl_link_add_pe); +int ocxl_link_update_pe(void *link_handle, int pasid, __u16 tid) +{ + struct link *link = (struct link *) link_handle; + struct spa *spa = link->spa; + struct ocxl_process_element *pe; + int pe_handle, rc; + + if (pasid > SPA
[PATCH v2 3/7] powerpc: use task_pid_nr() for TID allocation
From: Alastair D'Silva The current implementation of TID allocation, using a global IDR, may result in an errant process starving the system of available TIDs. Instead, use task_pid_nr(), as mentioned by the original author. The scenario described which prevented it's use is not applicable, as set_thread_tidr can only be called after the task struct has been populated. Signed-off-by: Alastair D'Silva --- arch/powerpc/include/asm/switch_to.h | 1 - arch/powerpc/kernel/process.c| 97 +--- 2 files changed, 1 insertion(+), 97 deletions(-) diff --git a/arch/powerpc/include/asm/switch_to.h b/arch/powerpc/include/asm/switch_to.h index be8c9fa23983..5b03d8a82409 100644 --- a/arch/powerpc/include/asm/switch_to.h +++ b/arch/powerpc/include/asm/switch_to.h @@ -94,6 +94,5 @@ static inline void clear_task_ebb(struct task_struct *t) extern int set_thread_uses_vas(void); extern int set_thread_tidr(struct task_struct *t); -extern void clear_thread_tidr(struct task_struct *t); #endif /* _ASM_POWERPC_SWITCH_TO_H */ diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 3b00da47699b..87f047fd2762 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1496,103 +1496,12 @@ int set_thread_uses_vas(void) } #ifdef CONFIG_PPC64 -static DEFINE_SPINLOCK(vas_thread_id_lock); -static DEFINE_IDA(vas_thread_ida); - -/* - * We need to assign a unique thread id to each thread in a process. - * - * This thread id, referred to as TIDR, and separate from the Linux's tgid, - * is intended to be used to direct an ASB_Notify from the hardware to the - * thread, when a suitable event occurs in the system. - * - * One such event is a "paste" instruction in the context of Fast Thread - * Wakeup (aka Core-to-core wake up in the Virtual Accelerator Switchboard - * (VAS) in POWER9. - * - * To get a unique TIDR per process we could simply reuse task_pid_nr() but - * the problem is that task_pid_nr() is not yet available copy_thread() is - * called. 
Fixing that would require changing more intrusive arch-neutral - * code in code path in copy_process()?. - * - * Further, to assign unique TIDRs within each process, we need an atomic - * field (or an IDR) in task_struct, which again intrudes into the arch- - * neutral code. So try to assign globally unique TIDRs for now. - * - * NOTE: TIDR 0 indicates that the thread does not need a TIDR value. - * For now, only threads that expect to be notified by the VAS - * hardware need a TIDR value and we assign values > 0 for those. - */ -#define MAX_THREAD_CONTEXT ((1 << 16) - 1) -static int assign_thread_tidr(void) -{ - int index; - int err; - unsigned long flags; - -again: - if (!ida_pre_get(&vas_thread_ida, GFP_KERNEL)) - return -ENOMEM; - - spin_lock_irqsave(&vas_thread_id_lock, flags); - err = ida_get_new_above(&vas_thread_ida, 1, &index); - spin_unlock_irqrestore(&vas_thread_id_lock, flags); - - if (err == -EAGAIN) - goto again; - else if (err) - return err; - - if (index > MAX_THREAD_CONTEXT) { - spin_lock_irqsave(&vas_thread_id_lock, flags); - ida_remove(&vas_thread_ida, index); - spin_unlock_irqrestore(&vas_thread_id_lock, flags); - return -ENOMEM; - } - - return index; -} - -static void free_thread_tidr(int id) -{ - unsigned long flags; - - spin_lock_irqsave(&vas_thread_id_lock, flags); - ida_remove(&vas_thread_ida, id); - spin_unlock_irqrestore(&vas_thread_id_lock, flags); -} - -/* - * Clear any TIDR value assigned to this thread. - */ -void clear_thread_tidr(struct task_struct *t) -{ - if (!t->thread.tidr) - return; - - if (!cpu_has_feature(CPU_FTR_P9_TIDR)) { - WARN_ON_ONCE(1); - return; - } - - mtspr(SPRN_TIDR, 0); - free_thread_tidr(t->thread.tidr); - t->thread.tidr = 0; -} - -void arch_release_task_struct(struct task_struct *t) -{ - clear_thread_tidr(t); -} - /* * Assign a unique TIDR (thread id) for task @t and set it in the thread * structure. For now, we only support setting TIDR for 'current' task. 
*/ int set_thread_tidr(struct task_struct *t) { - int rc; - if (!cpu_has_feature(CPU_FTR_P9_TIDR)) return -EINVAL; @@ -1602,11 +1511,7 @@ int set_thread_tidr(struct task_struct *t) if (t->thread.tidr) return 0; - rc = assign_thread_tidr(); - if (rc < 0) - return rc; - - t->thread.tidr = rc; + t->thread.tidr = (u16)task_pid_nr(t); mtspr(SPRN_TIDR, t->thread.tidr); return 0; -- 2.14.3
[PATCH v2 0/7] ocxl: Implement Power9 as_notify/wait for OpenCAPI
From: Alastair D'Silva The Power 9 as_notify/wait feature provides a lower latency way to signal a thread that work is complete. This series enables the use of this feature from OpenCAPI adapters, as well as addressing a potential starvation issue when allocating thread IDs. Changelog: v2: Rename get_platform IOCTL to get_features Move stray edit from patch 1 to patch 3 Alastair D'Silva (7): powerpc: Add TIDR CPU feature for Power9 powerpc: Use TIDR CPU feature to control TIDR allocation powerpc: use task_pid_nr() for TID allocation ocxl: Rename pnv_ocxl_spa_remove_pe to clarify it's action ocxl: Expose the thread_id needed for wait on p9 ocxl: Add an IOCTL so userspace knows what CPU features are available ocxl: Document new OCXL IOCTLs Documentation/accelerators/ocxl.rst | 10 arch/powerpc/include/asm/cputable.h | 3 +- arch/powerpc/include/asm/pnv-ocxl.h | 2 +- arch/powerpc/include/asm/switch_to.h | 1 - arch/powerpc/kernel/dt_cpu_ftrs.c | 1 + arch/powerpc/kernel/process.c | 101 +- arch/powerpc/platforms/powernv/ocxl.c | 4 +- drivers/misc/ocxl/context.c | 5 +- drivers/misc/ocxl/file.c | 78 ++ drivers/misc/ocxl/link.c | 38 - drivers/misc/ocxl/ocxl_internal.h | 1 + include/misc/ocxl.h | 9 +++ include/uapi/misc/ocxl.h | 14 + 13 files changed, 162 insertions(+), 105 deletions(-) -- 2.14.3
[PATCH v2 4/7] ocxl: Rename pnv_ocxl_spa_remove_pe to clarify its action
From: Alastair D'Silva The function removes the process element from NPU cache. Signed-off-by: Alastair D'Silva --- arch/powerpc/include/asm/pnv-ocxl.h | 2 +- arch/powerpc/platforms/powernv/ocxl.c | 4 ++-- drivers/misc/ocxl/link.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/pnv-ocxl.h b/arch/powerpc/include/asm/pnv-ocxl.h index f6945d3bc971..208b5503f4ed 100644 --- a/arch/powerpc/include/asm/pnv-ocxl.h +++ b/arch/powerpc/include/asm/pnv-ocxl.h @@ -28,7 +28,7 @@ extern int pnv_ocxl_map_xsl_regs(struct pci_dev *dev, void __iomem **dsisr, extern int pnv_ocxl_spa_setup(struct pci_dev *dev, void *spa_mem, int PE_mask, void **platform_data); extern void pnv_ocxl_spa_release(void *platform_data); -extern int pnv_ocxl_spa_remove_pe(void *platform_data, int pe_handle); +extern int pnv_ocxl_spa_remove_pe_from_cache(void *platform_data, int pe_handle); extern int pnv_ocxl_alloc_xive_irq(u32 *irq, u64 *trigger_addr); extern void pnv_ocxl_free_xive_irq(u32 irq); diff --git a/arch/powerpc/platforms/powernv/ocxl.c b/arch/powerpc/platforms/powernv/ocxl.c index fa9b53af3c7b..8c65aacda9c8 100644 --- a/arch/powerpc/platforms/powernv/ocxl.c +++ b/arch/powerpc/platforms/powernv/ocxl.c @@ -475,7 +475,7 @@ void pnv_ocxl_spa_release(void *platform_data) } EXPORT_SYMBOL_GPL(pnv_ocxl_spa_release); -int pnv_ocxl_spa_remove_pe(void *platform_data, int pe_handle) +int pnv_ocxl_spa_remove_pe_from_cache(void *platform_data, int pe_handle) { struct spa_data *data = (struct spa_data *) platform_data; int rc; @@ -483,7 +483,7 @@ int pnv_ocxl_spa_remove_pe(void *platform_data, int pe_handle) rc = opal_npu_spa_clear_cache(data->phb_opal_id, data->bdfn, pe_handle); return rc; } -EXPORT_SYMBOL_GPL(pnv_ocxl_spa_remove_pe); +EXPORT_SYMBOL_GPL(pnv_ocxl_spa_remove_pe_from_cache); int pnv_ocxl_alloc_xive_irq(u32 *irq, u64 *trigger_addr) { diff --git a/drivers/misc/ocxl/link.c b/drivers/misc/ocxl/link.c index f30790582dc0..656e8610eec2 100644 --- 
a/drivers/misc/ocxl/link.c +++ b/drivers/misc/ocxl/link.c @@ -599,7 +599,7 @@ int ocxl_link_remove_pe(void *link_handle, int pasid) * On powerpc, the entry needs to be cleared from the context * cache of the NPU. */ - rc = pnv_ocxl_spa_remove_pe(link->platform_data, pe_handle); + rc = pnv_ocxl_spa_remove_pe_from_cache(link->platform_data, pe_handle); WARN_ON(rc); pe_data = radix_tree_delete(&spa->pe_tree, pe_handle); -- 2.14.3
[PATCH v2 2/7] powerpc: Use TIDR CPU feature to control TIDR allocation
From: Alastair D'Silva Switch the use of TIDR on it's CPU feature, rather than assuming it is available based on architecture. Signed-off-by: Alastair D'Silva --- arch/powerpc/kernel/process.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 1237f13fed51..3b00da47699b 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1154,7 +1154,7 @@ static inline void restore_sprs(struct thread_struct *old_thread, mtspr(SPRN_TAR, new_thread->tar); } - if (cpu_has_feature(CPU_FTR_ARCH_300) && + if (cpu_has_feature(CPU_FTR_P9_TIDR) && old_thread->tidr != new_thread->tidr) mtspr(SPRN_TIDR, new_thread->tidr); #endif @@ -1570,7 +1570,7 @@ void clear_thread_tidr(struct task_struct *t) if (!t->thread.tidr) return; - if (!cpu_has_feature(CPU_FTR_ARCH_300)) { + if (!cpu_has_feature(CPU_FTR_P9_TIDR)) { WARN_ON_ONCE(1); return; } @@ -1593,7 +1593,7 @@ int set_thread_tidr(struct task_struct *t) { int rc; - if (!cpu_has_feature(CPU_FTR_ARCH_300)) + if (!cpu_has_feature(CPU_FTR_P9_TIDR)) return -EINVAL; if (t != current) -- 2.14.3
Re: [PATCH] misc: cxl: Change return type to vm_fault_t
On 18/04/18 00:53, Souptick Joarder wrote: Use new return type vm_fault_t for fault handler. For now, this is just documenting that the function returns a VM_FAULT value rather than an errno. Once all instances are converted, vm_fault_t will become a distinct type. Reference id -> 1c8f422059ae ("mm: change return type to vm_fault_t") previously cxl_mmap_fault returns VM_FAULT_NOPAGE as default value irrespective of vm_insert_pfn() return value. This bug is fixed with new vmf_insert_pfn() which will return VM_FAULT_ type based on err. Signed-off-by: Souptick Joarder This looks good to me Acked-by: Andrew Donnellan -- Andrew Donnellan OzLabs, ADL Canberra andrew.donnel...@au1.ibm.com IBM Australia Limited
Re: [PATCH V1 00/11] powerpc/mm/book3s64: Support for split pmd ptlock
On Mon, 16 Apr 2018 16:57:12 +0530 "Aneesh Kumar K.V" wrote: > This patch series add split pmd pagetable lock for book3s64. nohash64 also > should > be able to switch to this. I need to workout the code dependency. This series > also migh have broken the build on platforms otherthan book3s64. I am sending > this early > to get feedback on whether we should continue with the approach. > > We switch the pmd allocator to use something similar to what we already use > for > level 4 pagetable allocation. We get an order 0 page and divide that to > fragments > and hand over fragments when we get request for a pmd pagetable. The pmd lock > is > now stashed in the struct page backing the allocated page. That's only for the THP case right? > > The series helps in reducing lock contention on mm->page_table_lock. > The numbers look good.
Re: [PATCH 1/2] powernv/npu: Do a PID GPU TLB flush when invalidating a large address range
On Tue, Apr 17, 2018 at 7:17 PM, Balbir Singh wrote: > On Tue, Apr 17, 2018 at 7:11 PM, Alistair Popple > wrote: >> The NPU has a limited number of address translation shootdown (ATSD) >> registers and the GPU has limited bandwidth to process ATSDs. This can >> result in contention of ATSD registers leading to soft lockups on some >> threads, particularly when invalidating a large address range in >> pnv_npu2_mn_invalidate_range(). >> >> At some threshold it becomes more efficient to flush the entire GPU TLB for >> the given MM context (PID) than individually flushing each address in the >> range. This patch will result in ranges greater than 2MB being converted >> from 32+ ATSDs into a single ATSD which will flush the TLB for the given >> PID on each GPU. >> >> Signed-off-by: Alistair Popple >> + } >> } >> > > Acked-by: Balbir Singh Tested-by: Balbir Singh
[PATCH] powerpc: platform: cell: spufs: Change return type to vm_fault_t
Use new return type vm_fault_t for fault handler. For now, this is just documenting that the function returns a VM_FAULT value rather than an errno. Once all instances are converted, vm_fault_t will become a distinct type. Reference id -> 1c8f422059ae ("mm: change return type to vm_fault_t") Previously vm_insert_pfn() returns err but driver returns VM_FAULT_NOPAGE as default. The new function vmf_insert_pfn() will replace this inefficiency by returning correct VM_FAULT_* type. vmf_handle_error is a inline wrapper function which will convert error number to vm_fault_t type err. Signed-off-by: Souptick Joarder Reviewed-by: Matthew Wilcox --- arch/powerpc/platforms/cell/spufs/file.c | 37 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c index 469bdd0..a1dca9a 100644 --- a/arch/powerpc/platforms/cell/spufs/file.c +++ b/arch/powerpc/platforms/cell/spufs/file.c @@ -232,12 +232,13 @@ static ssize_t spufs_attr_write(struct file *file, const char __user *buf, return size; } -static int +static vm_fault_t spufs_mem_mmap_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct spu_context *ctx = vma->vm_file->private_data; unsigned long pfn, offset; + vm_fault_t ret; offset = vmf->pgoff << PAGE_SHIFT; if (offset >= LS_SIZE) @@ -256,11 +257,11 @@ static ssize_t spufs_attr_write(struct file *file, const char __user *buf, vma->vm_page_prot = pgprot_noncached_wc(vma->vm_page_prot); pfn = (ctx->spu->local_store_phys + offset) >> PAGE_SHIFT; } - vm_insert_pfn(vma, vmf->address, pfn); + ret = vmf_insert_pfn(vma, vmf->address, pfn); spu_release(ctx); - return VM_FAULT_NOPAGE; + return ret; } static int spufs_mem_mmap_access(struct vm_area_struct *vma, @@ -312,13 +313,19 @@ static int spufs_mem_mmap(struct file *file, struct vm_area_struct *vma) .mmap = spufs_mem_mmap, }; -static int spufs_ps_fault(struct vm_fault *vmf, +static inline vm_fault_t vmf_handle_error(int err) 
+{ + return VM_FAULT_NOPAGE; +} + +static vm_fault_t spufs_ps_fault(struct vm_fault *vmf, unsigned long ps_offs, unsigned long ps_size) { struct spu_context *ctx = vmf->vma->vm_file->private_data; unsigned long area, offset = vmf->pgoff << PAGE_SHIFT; - int ret = 0; + int err = 0; + vm_fault_t ret = VM_FAULT_NOPAGE; spu_context_nospu_trace(spufs_ps_fault__enter, ctx); @@ -349,12 +356,14 @@ static int spufs_ps_fault(struct vm_fault *vmf, if (ctx->state == SPU_STATE_SAVED) { up_read(¤t->mm->mmap_sem); spu_context_nospu_trace(spufs_ps_fault__sleep, ctx); - ret = spufs_wait(ctx->run_wq, ctx->state == SPU_STATE_RUNNABLE); + err = spufs_wait(ctx->run_wq, ctx->state == SPU_STATE_RUNNABLE); + ret = vmf_handle_error(err); spu_context_trace(spufs_ps_fault__wake, ctx, ctx->spu); down_read(¤t->mm->mmap_sem); } else { area = ctx->spu->problem_phys + ps_offs; - vm_insert_pfn(vmf->vma, vmf->address, (area + offset) >> PAGE_SHIFT); + ret = vmf_insert_pfn(vmf->vma, vmf->address, + (area + offset) >> PAGE_SHIFT); spu_context_trace(spufs_ps_fault__insert, ctx, ctx->spu); } @@ -363,11 +372,11 @@ static int spufs_ps_fault(struct vm_fault *vmf, refault: put_spu_context(ctx); - return VM_FAULT_NOPAGE; + return ret; } #if SPUFS_MMAP_4K -static int spufs_cntl_mmap_fault(struct vm_fault *vmf) +static vm_fault_t spufs_cntl_mmap_fault(struct vm_fault *vmf) { return spufs_ps_fault(vmf, 0x4000, SPUFS_CNTL_MAP_SIZE); } @@ -1040,7 +1049,7 @@ static ssize_t spufs_signal1_write(struct file *file, const char __user *buf, return 4; } -static int +static vm_fault_t spufs_signal1_mmap_fault(struct vm_fault *vmf) { #if SPUFS_SIGNAL_MAP_SIZE == 0x1000 @@ -1178,7 +1187,7 @@ static ssize_t spufs_signal2_write(struct file *file, const char __user *buf, } #if SPUFS_MMAP_4K -static int +static vm_fault_t spufs_signal2_mmap_fault(struct vm_fault *vmf) { #if SPUFS_SIGNAL_MAP_SIZE == 0x1000 @@ -1307,7 +1316,7 @@ static u64 spufs_signal2_type_get(struct spu_context *ctx) spufs_signal2_type_set, "%llu\n", 
SPU_ATTR_ACQUIRE); #if SPUFS_MMAP_4K -static int +static vm_fault_t spufs_mss_mmap_fault(struct vm_fault *vmf) { return spufs_ps_fault(vmf, 0x, SPUFS_MSS_MAP_SIZE); @@ -1369,7 +1378,7 @@ static int spufs_mss_open(struct inode *inode, struct file *file) .llseek = no_llseek, }; -static int +static vm_fault_t spufs_psmap_mmap_faul
[PATCH] misc: cxl: Change return type to vm_fault_t
Use new return type vm_fault_t for fault handler. For now, this is just documenting that the function returns a VM_FAULT value rather than an errno. Once all instances are converted, vm_fault_t will become a distinct type. Reference id -> 1c8f422059ae ("mm: change return type to vm_fault_t") Previously cxl_mmap_fault returns VM_FAULT_NOPAGE as default value irrespective of vm_insert_pfn() return value. This bug is fixed with new vmf_insert_pfn() which will return VM_FAULT_* type based on err. Signed-off-by: Souptick Joarder --- drivers/misc/cxl/context.c | 7 --- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c index 7ff315a..c6ec872 100644 --- a/drivers/misc/cxl/context.c +++ b/drivers/misc/cxl/context.c @@ -128,11 +128,12 @@ void cxl_context_set_mapping(struct cxl_context *ctx, mutex_unlock(&ctx->mapping_lock); } -static int cxl_mmap_fault(struct vm_fault *vmf) +static vm_fault_t cxl_mmap_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct cxl_context *ctx = vma->vm_file->private_data; u64 area, offset; + vm_fault_t ret; offset = vmf->pgoff << PAGE_SHIFT; @@ -169,11 +170,11 @@ static int cxl_mmap_fault(struct vm_fault *vmf) return VM_FAULT_SIGBUS; } - vm_insert_pfn(vma, vmf->address, (area + offset) >> PAGE_SHIFT); + ret = vmf_insert_pfn(vma, vmf->address, (area + offset) >> PAGE_SHIFT); mutex_unlock(&ctx->status_mutex); - return VM_FAULT_NOPAGE; + return ret; } static const struct vm_operations_struct cxl_mmap_vmops = { -- 1.9.1
Re: [PATCH 2/2] powernv/npu: Add a debugfs setting to change ATSD threshold
On Tue, 17 Apr 2018 19:11:29 +1000 Alistair Popple wrote: > The threshold at which it becomes more efficient to coalesce a range of > ATSDs into a single per-PID ATSD is currently not well understood due to a > lack of real-world work loads. This patch adds a debugfs parameter allowing > the threshold to be altered at runtime in order to aid future development > and refinement of the value. > > Signed-off-by: Alistair Popple > --- > arch/powerpc/platforms/powernv/npu-dma.c | 12 ++-- > 1 file changed, 10 insertions(+), 2 deletions(-) > > diff --git a/arch/powerpc/platforms/powernv/npu-dma.c > b/arch/powerpc/platforms/powernv/npu-dma.c > index dc34662e9df9..a765bf576c14 100644 > --- a/arch/powerpc/platforms/powernv/npu-dma.c > +++ b/arch/powerpc/platforms/powernv/npu-dma.c > @@ -17,7 +17,9 @@ > #include > #include > #include > +#include > > +#include > #include > #include > #include > @@ -44,7 +46,8 @@ DEFINE_SPINLOCK(npu_context_lock); > * entire TLB on the GPU for the given PID rather than each specific address > in > * the range. > */ > -#define ATSD_THRESHOLD (2*1024*1024) > +static uint64_t atsd_threshold = 2 * 1024 * 1024; > +static struct dentry *atsd_threshold_dentry; > > /* > * Other types of TCE cache invalidation are not functional in the > @@ -682,7 +685,7 @@ static void pnv_npu2_mn_invalidate_range(struct > mmu_notifier *mn, > struct npu_context *npu_context = mn_to_npu_context(mn); > unsigned long address; > > - if (end - start > ATSD_THRESHOLD) { > + if (end - start > atsd_threshold) { > /* >* Just invalidate the entire PID if the address range is too >* large. > @@ -956,6 +959,11 @@ int pnv_npu2_init(struct pnv_phb *phb) > static int npu_index; > uint64_t rc = 0; > > + if (!atsd_threshold_dentry) { > + atsd_threshold_dentry = debugfs_create_x64("atsd_threshold", Nit-picking can we call this atsd_threshold_in_bytes? 
> +0600, powerpc_debugfs_root, &atsd_threshold); > + } > + > phb->npu.nmmu_flush = > of_property_read_bool(phb->hose->dn, "ibm,nmmu-flush"); > for_each_child_of_node(phb->hose->dn, dn) { Acked-by: Balbir Singh
Re: [PATCH] powerpc: platform: cell: spufs: Change return type to vm_fault_t
On Wed, Apr 18, 2018 at 12:50:38AM +0530, Souptick Joarder wrote: > Use new return type vm_fault_t for fault handler. For > now, this is just documenting that the function returns > a VM_FAULT value rather than an errno. Once all instances > are converted, vm_fault_t will become a distinct type. > > Reference id -> 1c8f422059ae ("mm: change return type to > vm_fault_t") > > Previously vm_insert_pfn() returns err but driver returns > VM_FAULT_NOPAGE as default. The new function vmf_insert_pfn() > will replace this inefficiency by returning correct VM_FAULT_* > type. > > vmf_handle_error is a inline wrapper function which > will convert error number to vm_fault_t type err. I think you sent the wrong version of this one ... The commit message should mention that we're fixing a minor bug, that the error from vm_insert_pfn() was being ignored and the effect of this is likely to be only felt in OOM situations. > @@ -256,11 +257,11 @@ static ssize_t spufs_attr_write(struct file *file, > const char __user *buf, > vma->vm_page_prot = pgprot_noncached_wc(vma->vm_page_prot); > pfn = (ctx->spu->local_store_phys + offset) >> PAGE_SHIFT; > } > - vm_insert_pfn(vma, vmf->address, pfn); > + ret = vmf_insert_pfn(vma, vmf->address, pfn); > > spu_release(ctx); > > - return VM_FAULT_NOPAGE; > + return ret; > } I thought I said not to introduce vmf_handle_error(), because it's too trivial and obfuscates what's actually going on. > -static int spufs_ps_fault(struct vm_fault *vmf, > +static inline vm_fault_t vmf_handle_error(int err) > +{ > + return VM_FAULT_NOPAGE; > +} > + Re-reading spufs_ps_fault(), I wouldn't change anything inside it. Just change its return type to vm_fault_t and call it done.
Re: [PATCH] powerpc: platform: cell: spufs: Change return type to vm_fault_t
On Tue, Apr 17, 2018 at 9:20 PM, Souptick Joarder wrote: > Use new return type vm_fault_t for fault handler. For > now, this is just documenting that the function returns > a VM_FAULT value rather than an errno. Once all instances > are converted, vm_fault_t will become a distinct type. > > Reference id -> 1c8f422059ae ("mm: change return type to > vm_fault_t") > > Previously vm_insert_pfn() returns err but driver returns > VM_FAULT_NOPAGE as default. The new function vmf_insert_pfn() > will replace this inefficiency by returning correct VM_FAULT_* > type. > > vmf_handle_error is a inline wrapper function which > will convert error number to vm_fault_t type err. > > Signed-off-by: Souptick Joarder > Reviewed-by: Matthew Wilcox Acked-by: Arnd Bergmann
Re: [RFC PATCH 1/3] signal: Ensure every siginfo we send has all bits initialized
Dave Martin writes: > Hmmm > > memset()/clear_siginfo() may ensure that there are no uninitialised > explicit fields except for those in inactive union members, but I'm not > sure that this approach is guaranteed to sanitise the padding seen by > userspace. > > Rationale below, though it's a bit theoretical... > > With this in mind, I tend agree with Linus that hiding memset() calls > from the maintainer may be a bad idea unless they are also hidden from > the compiler. If the compiler sees the memset() it may be able to > optimise it in ways that wouldn't be possible for some other random > external function call, including optimising all or part of the call > out. > > As a result, the breakdown into individual put_user()s etc. in > copy_siginfo_to_user() may still be valuable even if all paths have the > memset(). The breakdown into individual put_user()s is known to be problematically slow, and is actually wrong. Even excluding the SI_USER duplication in a small number of cases the fields filled out in siginfo by architecture code are not the fields that copy_siginfo_to_user is copying. Which is much worse. The code looks safe but is not. My intention is to leave 0 instances of clear_siginfo in the architecture specific code. Ideally struct siginfo will be limited to kernel/signal.c but I am not certain I can quite get that far. The function do_coredump appears to have a legit need for siginfo. 
> (Rationale for an arch/arm example:) > >> diff --git a/arch/arm/vfp/vfpmodule.c b/arch/arm/vfp/vfpmodule.c >> index 4c375e11ae95..adda3fc2dde8 100644 >> --- a/arch/arm/vfp/vfpmodule.c >> +++ b/arch/arm/vfp/vfpmodule.c >> @@ -218,8 +218,7 @@ static void vfp_raise_sigfpe(unsigned int sicode, struct >> pt_regs *regs) >> { >> siginfo_t info; >> >> -memset(&info, 0, sizeof(info)); >> - >> +clear_siginfo(&info); >> info.si_signo = SIGFPE; > > /* by c11 (n1570) 6.2.6.1 para 6 [1], all padding bytes in info now take >unspecified values */ > >> info.si_code = sicode; >> info.si_addr = (void __user *)(instruction_pointer(regs) - 4); > > /* by c11 (n1570) 6.2.6.1 para 7 [2], all bytes of the union info._sifields >other than than those corresponding to _sigfault take unspecified >values */ > > So I don't see why the compiler needs to ensure that any of the affected > bytes are zero: it could potentially skip a lot of the memset() as a > result, in theory. > > I've not seen a compiler actually take advantage of that, but I'm now > not sure what forbids it. I took a quick look at gcc-4.9 which I have handy. The passes -f-no-strict-aliasing which helps, and gcc actually documents that if you access things through the union it will not take advantage of c11. gcc-4.9 Documents it this way: > -fstrict-aliasing' > Allow the compiler to assume the strictest aliasing rules > applicable to the language being compiled. For C (and C++), this > activates optimizations based on the type of expressions. In > particular, an object of one type is assumed never to reside at the > same address as an object of a different type, unless the types are > almost the same. For example, an 'unsigned int' can alias an > 'int', but not a 'void*' or a 'double'. A character type may alias > any other type. 
> > Pay special attention to code like this: > union a_union { > int i; > double d; > }; > > int f() { > union a_union t; > t.d = 3.0; > return t.i; > } > The practice of reading from a different union member than the one > most recently written to (called "type-punning") is common. Even > with '-fstrict-aliasing', type-punning is allowed, provided the > memory is accessed through the union type. So, the code above > works as expected. > If this can happen, I only see two watertight workarounds: > > 1) Ensure that there is no implicit padding in any UAPI structure, e.g. > aeb1f39d814b: ("arm64/ptrace: Avoid uninitialised struct padding in > fpr_set()"). This would include tail-padding of any union member that > is smaller than the containing union. > > It would be significantly more effort to ensure this for siginfo though. > > 2) Poke all values directly into allocated or user memory directly > via pointers to paddingless types; never assign to objects on the kernel > stack if you care what ends up in the padding, e.g., what your > copy_siginfo_to_user() does prior to this series. > > > If I'm not barking up the wrong tree, memset() cannot generally be > used to determine the value of padding bytes, but it may still be > useful for forcing otherwise uninitialised members to sane initial > values. > > This likely affects many more things than just siginfo. Unless gcc has changed it's stance on type-punning through unions or it's semantics with -fno-strict_aliasing we should be good. Eric
Re: [PATCH] powerpc: Allow selection of CONFIG_LD_DEAD_CODE_DATA_ELIMINATION
On Tue, Apr 17, 2018 at 6:49 PM, Christophe LEROY wrote: > > > Le 17/04/2018 à 18:45, Mathieu Malaterre a écrit : >> >> On Tue, Apr 17, 2018 at 12:49 PM, Christophe Leroy >> wrote: >>> >>> This option does dead code and data elimination with the linker by >>> compiling with -ffunction-sections -fdata-sections and linking with >>> --gc-sections. >>> >>> By selecting this option on mpc885_ads_defconfig, >>> vmlinux LOAD segment size gets reduced by 10% >>> >>> Program Header before the patch: >>> LOAD off0x0001 vaddr 0xc000 paddr 0x align 2**16 >>> filesz 0x0036eda4 memsz 0x0038de04 flags rwx >>> >>> Program Header after the patch: >>> LOAD off0x0001 vaddr 0xc000 paddr 0x align 2**16 >>> filesz 0x00316da4 memsz 0x00334268 flags rwx >>> >>> Signed-off-by: Christophe Leroy >>> --- >>> arch/powerpc/Kconfig | 8 >>> 1 file changed, 8 insertions(+) >>> >>> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig >>> index 8fe4353be5e3..e1fac49cf465 100644 >>> --- a/arch/powerpc/Kconfig >>> +++ b/arch/powerpc/Kconfig >>> @@ -888,6 +888,14 @@ config PPC_MEM_KEYS >>> >>>If unsure, say y. >>> >>> +config PPC_UNUSED_ELIMINATION >>> + bool "Eliminate unused functions and data from vmlinux" >>> + default n >>> + select LD_DEAD_CODE_DATA_ELIMINATION >>> + help >>> + Select this to do dead code and data elimination with the >>> linker >>> + by compiling with -ffunction-sections -fdata-sections and >>> linking >>> + with --gc-sections. >>> endmenu >>> >> >> Just for reference, I cannot boot my Mac Mini G4 anymore (yaboot). The >> messages I can see (prom_init) are: > > > Which version of GCC do you use ? $ powerpc-linux-gnu-gcc --version powerpc-linux-gnu-gcc (Debian 6.3.0-18) 6.3.0 20170516 Copyright (C) 2016 Free Software Foundation, Inc. This is free software; see the source for copying conditions. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
this is simply coming from: $ apt-cache policy crossbuild-essential-powerpc crossbuild-essential-powerpc: Installed: 12.3 Candidate: 12.3 Version table: *** 12.3 500 500 http://ftp.fr.debian.org/debian stretch/main amd64 Packages 500 http://ftp.fr.debian.org/debian stretch/main i386 Packages 100 /var/lib/dpkg/status > Can you provide the generated System.map with and without that option active > ? $ du -sh g4/System.map.* 1.7M g4/System.map.with 1.8M g4/System.map.without Will send them by private emails. > Thanks > Christophe > > >> >> --- >> done >> copying OF device tree... >> Building dt strings... >> Building dt structure... >> Device tree strings 0x0110 -> 0x01100e02 >> Device tree struct 0x01101000 -> 0x01109000 >> Quiescing Open Firmware ... >> Booting Linux via __start() @ 0x0014 ... >> _ >> --- >> >> >> >>> config ISA_DMA_API >>> -- >>> 2.13.3 >>> >
[PATCH v2 2/2] powerpc/32be: use stmw/lmw for registers save/restore in asm
arch/powerpc/Makefile activates -mmultiple on BE PPC32 configs in order to use multiple word instructions in functions entry/exit The patch does the same for the asm parts, for consistency On processors like the 8xx on which insn fetching is pretty slow, this speeds up registers save/restore Signed-off-by: Christophe Leroy --- v2: Swapped both patches in the serie to reduce number of impacted lines and added the same modification in ppc_save_regs() arch/powerpc/include/asm/ppc_asm.h | 5 + arch/powerpc/kernel/misc.S | 10 ++ arch/powerpc/kernel/ppc_save_regs.S | 4 3 files changed, 19 insertions(+) diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index 13f7f4c0e1ea..4bb765d0b758 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -80,11 +80,16 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR) #else #define SAVE_GPR(n, base) stw n,GPR0+4*(n)(base) #define REST_GPR(n, base) lwz n,GPR0+4*(n)(base) +#ifdef CONFIG_CPU_BIG_ENDIAN +#define SAVE_NVGPRS(base) stmw13, GPR0+4*13(base) +#define REST_NVGPRS(base) lmw 13, GPR0+4*13(base) +#else #define SAVE_NVGPRS(base) SAVE_GPR(13, base); SAVE_8GPRS(14, base); \ SAVE_10GPRS(22, base) #define REST_NVGPRS(base) REST_GPR(13, base); REST_8GPRS(14, base); \ REST_10GPRS(22, base) #endif +#endif #define SAVE_2GPRS(n, base)SAVE_GPR(n, base); SAVE_GPR(n+1, base) #define SAVE_4GPRS(n, base)SAVE_2GPRS(n, base); SAVE_2GPRS(n+2, base) diff --git a/arch/powerpc/kernel/misc.S b/arch/powerpc/kernel/misc.S index 746ee0320ad4..a316d90a5c26 100644 --- a/arch/powerpc/kernel/misc.S +++ b/arch/powerpc/kernel/misc.S @@ -49,6 +49,10 @@ _GLOBAL(setjmp) PPC_STL r0,0(r3) PPC_STL r1,SZL(r3) PPC_STL r2,2*SZL(r3) +#if defined(CONFIG_PPC32) && defined(CONFIG_CPU_BIG_ENDIAN) + mfcrr12 + stmwr12, 3*SZL(r3) +#else mfcrr0 PPC_STL r0,3*SZL(r3) PPC_STL r13,4*SZL(r3) @@ -70,10 +74,15 @@ _GLOBAL(setjmp) PPC_STL r29,20*SZL(r3) PPC_STL r30,21*SZL(r3) PPC_STL r31,22*SZL(r3) +#endif li r3,0 blr 
_GLOBAL(longjmp) +#if defined(CONFIG_PPC32) && defined(CONFIG_CPU_BIG_ENDIAN) + lmw r12, 3*SZL(r3) + mtcrf 0x38, r12 +#else PPC_LL r13,4*SZL(r3) PPC_LL r14,5*SZL(r3) PPC_LL r15,6*SZL(r3) @@ -95,6 +104,7 @@ _GLOBAL(longjmp) PPC_LL r31,22*SZL(r3) PPC_LL r0,3*SZL(r3) mtcrf 0x38,r0 +#endif PPC_LL r0,0(r3) PPC_LL r1,SZL(r3) PPC_LL r2,2*SZL(r3) diff --git a/arch/powerpc/kernel/ppc_save_regs.S b/arch/powerpc/kernel/ppc_save_regs.S index 1b1787d52896..d60316e70514 100644 --- a/arch/powerpc/kernel/ppc_save_regs.S +++ b/arch/powerpc/kernel/ppc_save_regs.S @@ -25,6 +25,9 @@ */ _GLOBAL(ppc_save_regs) PPC_STL r0,0*SZL(r3) +#if defined(CONFIG_PPC32) && defined(CONFIG_CPU_BIG_ENDIAN) + stmwr2, 2*SZL(r3) +#else PPC_STL r2,2*SZL(r3) PPC_STL r3,3*SZL(r3) PPC_STL r4,4*SZL(r3) @@ -55,6 +58,7 @@ _GLOBAL(ppc_save_regs) PPC_STL r29,29*SZL(r3) PPC_STL r30,30*SZL(r3) PPC_STL r31,31*SZL(r3) +#endif /* go up one stack frame for SP */ PPC_LL r4,0(r1) PPC_STL r4,1*SZL(r3) -- 2.13.3
[PATCH v2 1/2] powerpc: avoid an unnecessary test and branch in longjmp()
Doing the test at exit of the function avoids an unnecessary test and branch inside longjmp() Signed-off-by: Christophe Leroy --- v2: Swapped both patches in the serie to reduce number of impacted lines arch/powerpc/kernel/misc.S | 9 - 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kernel/misc.S b/arch/powerpc/kernel/misc.S index e1f3a5d054c4..746ee0320ad4 100644 --- a/arch/powerpc/kernel/misc.S +++ b/arch/powerpc/kernel/misc.S @@ -74,10 +74,7 @@ _GLOBAL(setjmp) blr _GLOBAL(longjmp) - PPC_LCMPI r4,0 - bne 1f - li r4,1 -1: PPC_LL r13,4*SZL(r3) + PPC_LL r13,4*SZL(r3) PPC_LL r14,5*SZL(r3) PPC_LL r15,6*SZL(r3) PPC_LL r16,7*SZL(r3) @@ -102,7 +99,9 @@ _GLOBAL(longjmp) PPC_LL r1,SZL(r3) PPC_LL r2,2*SZL(r3) mtlrr0 - mr r3,r4 + mr. r3, r4 + bnelr + li r3, 1 blr _GLOBAL(current_stack_pointer) -- 2.13.3
Re: [PATCH 2/6 v2] iommu: of: make of_pci_map_rid() available for other devices too
On 17/04/18 11:21, Nipun Gupta wrote: iommu-map property is also used by devices with fsl-mc. This patch moves the of_pci_map_rid to generic location, so that it can be used by other busses too. Signed-off-by: Nipun Gupta --- drivers/iommu/of_iommu.c | 106 +-- Doesn't this break "msi-parent" parsing for !CONFIG_OF_IOMMU? I guess you don't want fsl-mc to have to depend on PCI, but this looks like a step in the wrong direction. I'm not entirely sure where of_map_rid() fits best, but from a quick look around the least-worst option might be drivers/of/of_address.c, unless Rob and Frank have a better idea of where generic DT-based ID translation routines could live? drivers/of/irq.c | 6 +-- drivers/pci/of.c | 101 include/linux/of_iommu.h | 11 + include/linux/of_pci.h | 10 - 5 files changed, 117 insertions(+), 117 deletions(-) diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c index 5c36a8b..4e7712f 100644 --- a/drivers/iommu/of_iommu.c +++ b/drivers/iommu/of_iommu.c @@ -138,6 +138,106 @@ static int of_iommu_xlate(struct device *dev, return ops->of_xlate(dev, iommu_spec); } +/** + * of_map_rid - Translate a requester ID through a downstream mapping. + * @np: root complex device node. + * @rid: device requester ID to map. + * @map_name: property name of the map to use. + * @map_mask_name: optional property name of the mask to use. + * @target: optional pointer to a target device node. + * @id_out: optional pointer to receive the translated ID. + * + * Given a device requester ID, look up the appropriate implementation-defined + * platform ID and/or the target device which receives transactions on that + * ID, as per the "iommu-map" and "msi-map" bindings. Either of @target or + * @id_out may be NULL if only the other is required. 
If @target points to + * a non-NULL device node pointer, only entries targeting that node will be + * matched; if it points to a NULL value, it will receive the device node of + * the first matching target phandle, with a reference held. + * + * Return: 0 on success or a standard error code on failure. + */ +int of_map_rid(struct device_node *np, u32 rid, + const char *map_name, const char *map_mask_name, + struct device_node **target, u32 *id_out) +{ + u32 map_mask, masked_rid; + int map_len; + const __be32 *map = NULL; + + if (!np || !map_name || (!target && !id_out)) + return -EINVAL; + + map = of_get_property(np, map_name, &map_len); + if (!map) { + if (target) + return -ENODEV; + /* Otherwise, no map implies no translation */ + *id_out = rid; + return 0; + } + + if (!map_len || map_len % (4 * sizeof(*map))) { + pr_err("%pOF: Error: Bad %s length: %d\n", np, + map_name, map_len); + return -EINVAL; + } + + /* The default is to select all bits. */ + map_mask = 0x; + + /* +* Can be overridden by "{iommu,msi}-map-mask" property. 
+*/ + if (map_mask_name) + of_property_read_u32(np, map_mask_name, &map_mask); + + masked_rid = map_mask & rid; + for ( ; map_len > 0; map_len -= 4 * sizeof(*map), map += 4) { + struct device_node *phandle_node; + u32 rid_base = be32_to_cpup(map + 0); + u32 phandle = be32_to_cpup(map + 1); + u32 out_base = be32_to_cpup(map + 2); + u32 rid_len = be32_to_cpup(map + 3); + + if (rid_base & ~map_mask) { + pr_err("%pOF: Invalid %s translation - %s-mask (0x%x) ignores rid-base (0x%x)\n", + np, map_name, map_name, + map_mask, rid_base); + return -EFAULT; + } + + if (masked_rid < rid_base || masked_rid >= rid_base + rid_len) + continue; + + phandle_node = of_find_node_by_phandle(phandle); + if (!phandle_node) + return -ENODEV; + + if (target) { + if (*target) + of_node_put(phandle_node); + else + *target = phandle_node; + + if (*target != phandle_node) + continue; + } + + if (id_out) + *id_out = masked_rid - rid_base + out_base; + + pr_debug("%pOF: %s, using mask %08x, rid-base: %08x, out-base: %08x, length: %08x, rid: %08x -> %08x\n", + np, map_name, map_mask, rid_base, out_base, + rid_len, rid, masked_rid - rid_base + out_base); +
Re: [PATCH] powerpc: Allow selection of CONFIG_LD_DEAD_CODE_DATA_ELIMINATION
Le 17/04/2018 à 18:45, Mathieu Malaterre a écrit : On Tue, Apr 17, 2018 at 12:49 PM, Christophe Leroy wrote: This option does dead code and data elimination with the linker by compiling with -ffunction-sections -fdata-sections and linking with --gc-sections. By selecting this option on mpc885_ads_defconfig, vmlinux LOAD segment size gets reduced by 10% Program Header before the patch: LOAD off0x0001 vaddr 0xc000 paddr 0x align 2**16 filesz 0x0036eda4 memsz 0x0038de04 flags rwx Program Header after the patch: LOAD off0x0001 vaddr 0xc000 paddr 0x align 2**16 filesz 0x00316da4 memsz 0x00334268 flags rwx Signed-off-by: Christophe Leroy --- arch/powerpc/Kconfig | 8 1 file changed, 8 insertions(+) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 8fe4353be5e3..e1fac49cf465 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -888,6 +888,14 @@ config PPC_MEM_KEYS If unsure, say y. +config PPC_UNUSED_ELIMINATION + bool "Eliminate unused functions and data from vmlinux" + default n + select LD_DEAD_CODE_DATA_ELIMINATION + help + Select this to do dead code and data elimination with the linker + by compiling with -ffunction-sections -fdata-sections and linking + with --gc-sections. endmenu Just for reference, I cannot boot my Mac Mini G4 anymore (yaboot). The messages I can see (prom_init) are: Which version of GCC do you use ? Can you provide the generated System.map with and without that option active ? Thanks Christophe --- done copying OF device tree... Building dt strings... Building dt structure... Device tree strings 0x0110 -> 0x01100e02 Device tree struct 0x01101000 -> 0x01109000 Quiescing Open Firmware ... Booting Linux via __start() @ 0x0014 ... _ --- config ISA_DMA_API -- 2.13.3
Re: [PATCH] powerpc: Allow selection of CONFIG_LD_DEAD_CODE_DATA_ELIMINATION
On Tue, Apr 17, 2018 at 12:49 PM, Christophe Leroy wrote: > This option does dead code and data elimination with the linker by > compiling with -ffunction-sections -fdata-sections and linking with > --gc-sections. > > By selecting this option on mpc885_ads_defconfig, > vmlinux LOAD segment size gets reduced by 10% > > Program Header before the patch: > LOAD off0x0001 vaddr 0xc000 paddr 0x align 2**16 > filesz 0x0036eda4 memsz 0x0038de04 flags rwx > > Program Header after the patch: > LOAD off0x0001 vaddr 0xc000 paddr 0x align 2**16 > filesz 0x00316da4 memsz 0x00334268 flags rwx > > Signed-off-by: Christophe Leroy > --- > arch/powerpc/Kconfig | 8 > 1 file changed, 8 insertions(+) > > diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig > index 8fe4353be5e3..e1fac49cf465 100644 > --- a/arch/powerpc/Kconfig > +++ b/arch/powerpc/Kconfig > @@ -888,6 +888,14 @@ config PPC_MEM_KEYS > > If unsure, say y. > > +config PPC_UNUSED_ELIMINATION > + bool "Eliminate unused functions and data from vmlinux" > + default n > + select LD_DEAD_CODE_DATA_ELIMINATION > + help > + Select this to do dead code and data elimination with the linker > + by compiling with -ffunction-sections -fdata-sections and linking > + with --gc-sections. > endmenu > Just for reference, I cannot boot my Mac Mini G4 anymore (yaboot). The messages I can see (prom_init) are: --- done copying OF device tree... Building dt strings... Building dt structure... Device tree strings 0x0110 -> 0x01100e02 Device tree struct 0x01101000 -> 0x01109000 Quiescing Open Firmware ... Booting Linux via __start() @ 0x0014 ... _ --- > config ISA_DMA_API > -- > 2.13.3 >
Re: powerpc/modules: Fix crashes by adding CONFIG_RELOCATABLE to vermagic
On 16 April 2018 at 16:10, Michael Ellerman wrote: > Ard Biesheuvel writes: > >> On 11 April 2018 at 16:49, Michael Ellerman >> wrote: >>> On Tue, 2018-04-10 at 01:22:06 UTC, Michael Ellerman wrote: If you build the kernel with CONFIG_RELOCATABLE=n, then install the modules, rebuild the kernel with CONFIG_RELOCATABLE=y and leave the old modules installed, we crash something like: Unable to handle kernel paging request for data at address 0xd00018d66cef Faulting instruction address: 0xc21ddd08 Oops: Kernel access of bad area, sig: 11 [#1] Modules linked in: x_tables autofs4 CPU: 2 PID: 1 Comm: systemd Not tainted 4.16.0-rc6-gcc_ubuntu_le-g99fec39 #1 ... NIP check_version.isra.22+0x118/0x170 Call Trace: __ksymtab_xt_unregister_table+0x58/0xfcb8 [x_tables] (unreliable) resolve_symbol+0xb4/0x150 load_module+0x10e8/0x29a0 SyS_finit_module+0x110/0x140 system_call+0x58/0x6c This happens because since commit 71810db27c1c ("modversions: treat symbol CRCs as 32 bit quantities"), a relocatable kernel encodes and handles symbol CRCs differently from a non-relocatable kernel. Although it's possible we could try and detect this situation and handle it, it's much more robust to simply make the state of CONFIG_RELOCATABLE part of the module vermagic. Fixes: 71810db27c1c ("modversions: treat symbol CRCs as 32 bit quantities") Signed-off-by: Michael Ellerman >>> >>> Applied to powerpc fixes. >>> >>> https://git.kernel.org/powerpc/c/73aca179d78eaa11604ba0783a6d8b >> >> Thanks for the cc. I guess this only affects powerpc, given that it is >> the only arch that switches between CRC immediate values and CRC >> offsets depending on the configuration. > > No worries. > > Is there any reason we shouldn't always turn on CONFIG_MODULE_REL_CRCS? > It seems to work, but I wanted to test it more before switching to that, > hence the quick fix above. 
> > > arch/um looks like it might be switching based on config, but I don't > know enough to say: > > config LD_SCRIPT_STATIC > bool > default y > depends on STATIC_LINK > > config LD_SCRIPT_DYN > bool > default y > depends on !LD_SCRIPT_STATIC > select MODULE_REL_CRCS if MODVERSIONS > The only reason not to enable it is that it ends up taking more space on a 32-bit architecture with CONFIG_RELOCATABLE=n, given that you need to record both the relative offset and the actual CRC value (both 32-bit quantities) rather than just the CRC itself. On a 64-bit arch with CONFIG_RELOCATABLE=n, you end up replacing a single 64-bit quantity with two 32-bit quantities, so it doesn't really matter.
[PATCH v10 22/25] mm: speculative page fault handler return VMA
When the speculative page fault handler is returning VM_RETRY, there is a chance that VMA fetched without grabbing the mmap_sem can be reused by the legacy page fault handler. By reusing it, we avoid calling find_vma() again. To achieve, that we must ensure that the VMA structure will not be freed in our back. This is done by getting the reference on it (get_vma()) and by assuming that the caller will call the new service can_reuse_spf_vma() once it has grabbed the mmap_sem. can_reuse_spf_vma() is first checking that the VMA is still in the RB tree , and then that the VMA's boundaries matched the passed address and release the reference on the VMA so that it can be freed if needed. In the case the VMA is freed, can_reuse_spf_vma() will have returned false as the VMA is no more in the RB tree. In the architecture page fault handler, the call to the new service reuse_spf_or_find_vma() should be made in place of find_vma(), this will handle the check on the spf_vma and if needed call find_vma(). Signed-off-by: Laurent Dufour --- include/linux/mm.h | 22 +++-- mm/memory.c| 140 - 2 files changed, 103 insertions(+), 59 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 08540c98d63b..50b6fd3bf9e2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1382,25 +1382,37 @@ extern int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, #ifdef CONFIG_SPECULATIVE_PAGE_FAULT extern int __handle_speculative_fault(struct mm_struct *mm, unsigned long address, - unsigned int flags); + unsigned int flags, + struct vm_area_struct **vma); static inline int handle_speculative_fault(struct mm_struct *mm, unsigned long address, - unsigned int flags) + unsigned int flags, + struct vm_area_struct **vma) { /* * Try speculative page fault for multithreaded user space task only. 
*/ - if (!(flags & FAULT_FLAG_USER) || atomic_read(&mm->mm_users) == 1) + if (!(flags & FAULT_FLAG_USER) || atomic_read(&mm->mm_users) == 1) { + *vma = NULL; return VM_FAULT_RETRY; - return __handle_speculative_fault(mm, address, flags); + } + return __handle_speculative_fault(mm, address, flags, vma); } +extern bool can_reuse_spf_vma(struct vm_area_struct *vma, + unsigned long address); #else static inline int handle_speculative_fault(struct mm_struct *mm, unsigned long address, - unsigned int flags) + unsigned int flags, + struct vm_area_struct **vma) { return VM_FAULT_RETRY; } +static inline bool can_reuse_spf_vma(struct vm_area_struct *vma, +unsigned long address) +{ + return false; +} #endif /* CONFIG_SPECULATIVE_PAGE_FAULT */ extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, diff --git a/mm/memory.c b/mm/memory.c index 76178feff000..425f07e0bf38 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4311,13 +4311,22 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, /* This is required by vm_normal_page() */ #error "Speculative page fault handler requires __HAVE_ARCH_PTE_SPECIAL" #endif - /* * vm_normal_page() adds some processing which should be done while * hodling the mmap_sem. */ + +/* + * Tries to handle the page fault in a speculative way, without grabbing the + * mmap_sem. + * When VM_FAULT_RETRY is returned, the vma pointer is valid and this vma must + * be checked later when the mmap_sem has been grabbed by calling + * can_reuse_spf_vma(). + * This is needed as the returned vma is kept in memory until the call to + * can_reuse_spf_vma() is made. 
+ */ int __handle_speculative_fault(struct mm_struct *mm, unsigned long address, - unsigned int flags) + unsigned int flags, struct vm_area_struct **vma) { struct vm_fault vmf = { .address = address, @@ -4325,21 +4334,22 @@ int __handle_speculative_fault(struct mm_struct *mm, unsigned long address, pgd_t *pgd, pgdval; p4d_t *p4d, p4dval; pud_t pudval; - int seq, ret = VM_FAULT_RETRY; - struct vm_area_struct *vma; + int seq, ret; /* Clear flags that may lead to release the mmap_sem to retry */ flags &= ~(FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_KILLABLE); flags |= FAULT_FLAG_SPECULATIVE; - vma = get_vma(mm, address); - if (!vma) -
[PATCH v10 25/25] powerpc/mm: add speculative page fault
This patch enable the speculative page fault on the PowerPC architecture. This will try a speculative page fault without holding the mmap_sem, if it returns with VM_FAULT_RETRY, the mmap_sem is acquired and the traditional page fault processing is done. The speculative path is only tried for multithreaded process as there is no risk of contention on the mmap_sem otherwise. Signed-off-by: Laurent Dufour --- arch/powerpc/mm/fault.c | 33 +++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index c01d627e687a..37191147026e 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -464,6 +464,26 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address, if (is_exec) flags |= FAULT_FLAG_INSTRUCTION; + if (IS_ENABLED(CONFIG_SPECULATIVE_PAGE_FAULT)) { + fault = handle_speculative_fault(mm, address, flags, &vma); + /* +* Page fault is done if VM_FAULT_RETRY is not returned. +* But if the memory protection keys are active, we don't know +* if the fault is due to key mistmatch or due to a +* classic protection check. +* To differentiate that, we will need the VMA we no +* more have, so let's retry with the mmap_sem held. +*/ + if (fault != VM_FAULT_RETRY && + (IS_ENABLED(CONFIG_PPC_MEM_KEYS) && +fault != VM_FAULT_SIGSEGV)) { + perf_sw_event(PERF_COUNT_SW_SPF, 1, regs, address); + goto done; + } + } else { + vma = NULL; + } + /* When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the * kernel and should generate an OOPS. 
Unfortunately, in the case of an @@ -494,7 +514,8 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address, might_sleep(); } - vma = find_vma(mm, address); + if (!vma || !can_reuse_spf_vma(vma, address)) + vma = find_vma(mm, address); if (unlikely(!vma)) return bad_area(regs, address); if (likely(vma->vm_start <= address)) @@ -551,8 +572,15 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address, */ flags &= ~FAULT_FLAG_ALLOW_RETRY; flags |= FAULT_FLAG_TRIED; - if (!fatal_signal_pending(current)) + if (!fatal_signal_pending(current)) { + /* +* Do not try to reuse this vma and fetch it +* again since we will release the mmap_sem. +*/ + if (IS_ENABLED(CONFIG_SPECULATIVE_PAGE_FAULT)) + vma = NULL; goto retry; + } } /* @@ -564,6 +592,7 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address, up_read(¤t->mm->mmap_sem); +done: if (unlikely(fault & VM_FAULT_ERROR)) return mm_fault_error(regs, address, fault); -- 2.7.4
[PATCH v10 24/25] x86/mm: add speculative pagefault handling
From: Peter Zijlstra Try a speculative fault before acquiring mmap_sem, if it returns with VM_FAULT_RETRY continue with the mmap_sem acquisition and do the traditional fault. Signed-off-by: Peter Zijlstra (Intel) [Clearing of FAULT_FLAG_ALLOW_RETRY is now done in handle_speculative_fault()] [Retry with usual fault path in the case VM_ERROR is returned by handle_speculative_fault(). This allows signal to be delivered] [Don't build SPF call if !CONFIG_SPECULATIVE_PAGE_FAULT] [Try speculative fault path only for multi threaded processes] [Try reuse to the VMA fetch during the speculative path in case of retry] [Call reuse_spf_or_find_vma()] [Handle memory protection key fault] Signed-off-by: Laurent Dufour --- arch/x86/mm/fault.c | 42 ++ 1 file changed, 38 insertions(+), 4 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 73bd8c95ac71..59f778386df5 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1220,7 +1220,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, struct mm_struct *mm; int fault, major = 0; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; - u32 pkey; + u32 pkey, *pt_pkey = &pkey; tsk = current; mm = tsk->mm; @@ -1310,6 +1310,30 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, flags |= FAULT_FLAG_INSTRUCTION; /* +* Do not try speculative page fault for kernel's pages and if +* the fault was due to protection keys since it can't be resolved. +*/ + if (IS_ENABLED(CONFIG_SPECULATIVE_PAGE_FAULT) && + !(error_code & X86_PF_PK)) { + fault = handle_speculative_fault(mm, address, flags, &vma); + if (fault != VM_FAULT_RETRY) { + perf_sw_event(PERF_COUNT_SW_SPF, 1, regs, address); + /* +* Do not advertise for the pkey value since we don't +* know it. +* This is not a matter as we checked for X86_PF_PK +* earlier, so we should not handle pkey fault here, +* but to be sure that mm_fault_error() callees will +* not try to use it, we invalidate the pointer. 
+*/ + pt_pkey = NULL; + goto done; + } + } else { + vma = NULL; + } + + /* * When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in * the kernel and should generate an OOPS. Unfortunately, in the @@ -1342,7 +1366,8 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, might_sleep(); } - vma = find_vma(mm, address); + if (!vma || !can_reuse_spf_vma(vma, address)) + vma = find_vma(mm, address); if (unlikely(!vma)) { bad_area(regs, error_code, address); return; @@ -1409,8 +1434,15 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, if (flags & FAULT_FLAG_ALLOW_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; flags |= FAULT_FLAG_TRIED; - if (!fatal_signal_pending(tsk)) + if (!fatal_signal_pending(tsk)) { + /* +* Do not try to reuse this vma and fetch it +* again since we will release the mmap_sem. +*/ + if (IS_ENABLED(CONFIG_SPECULATIVE_PAGE_FAULT)) + vma = NULL; goto retry; + } } /* User mode? Just return to handle the fatal exception */ @@ -1423,8 +1455,10 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, } up_read(&mm->mmap_sem); + +done: if (unlikely(fault & VM_FAULT_ERROR)) { - mm_fault_error(regs, error_code, address, &pkey, fault); + mm_fault_error(regs, error_code, address, pt_pkey, fault); return; } -- 2.7.4
[PATCH v10 23/25] mm: add speculative page fault vmstats
Add speculative_pgfault vmstat counter to count successful speculative page fault handling. Also fixing a minor typo in include/linux/vm_event_item.h. Signed-off-by: Laurent Dufour --- include/linux/vm_event_item.h | 3 +++ mm/memory.c | 1 + mm/vmstat.c | 5 - 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 5c7f010676a7..a240acc09684 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -111,6 +111,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, SWAP_RA, SWAP_RA_HIT, #endif +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT + SPECULATIVE_PGFAULT, +#endif NR_VM_EVENT_ITEMS }; diff --git a/mm/memory.c b/mm/memory.c index 425f07e0bf38..1cd5bc000643 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4508,6 +4508,7 @@ int __handle_speculative_fault(struct mm_struct *mm, unsigned long address, * If there is no need to retry, don't return the vma to the caller. */ if (ret != VM_FAULT_RETRY) { + count_vm_event(SPECULATIVE_PGFAULT); put_vma(vmf.vma); *vma = NULL; } diff --git a/mm/vmstat.c b/mm/vmstat.c index 536332e988b8..c6b49bfa8139 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1289,7 +1289,10 @@ const char * const vmstat_text[] = { "swap_ra", "swap_ra_hit", #endif -#endif /* CONFIG_VM_EVENTS_COUNTERS */ +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT + "speculative_pgfault" +#endif +#endif /* CONFIG_VM_EVENT_COUNTERS */ }; #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */ -- 2.7.4
[PATCH v10 21/25] perf tools: add support for the SPF perf event
Add support for the new speculative faults event. Acked-by: David Rientjes Signed-off-by: Laurent Dufour --- tools/include/uapi/linux/perf_event.h | 1 + tools/perf/util/evsel.c | 1 + tools/perf/util/parse-events.c| 4 tools/perf/util/parse-events.l| 1 + tools/perf/util/python.c | 1 + 5 files changed, 8 insertions(+) diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 912b85b52344..9aad243607fe 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -112,6 +112,7 @@ enum perf_sw_ids { PERF_COUNT_SW_EMULATION_FAULTS = 8, PERF_COUNT_SW_DUMMY = 9, PERF_COUNT_SW_BPF_OUTPUT= 10, + PERF_COUNT_SW_SPF = 11, PERF_COUNT_SW_MAX, /* non-ABI */ }; diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 1ac8d9236efd..e14a754c3675 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -429,6 +429,7 @@ const char *perf_evsel__sw_names[PERF_COUNT_SW_MAX] = { "alignment-faults", "emulation-faults", "dummy", + "speculative-faults", }; static const char *__perf_evsel__sw_name(u64 config) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 2fb0272146d8..54719f566314 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -140,6 +140,10 @@ struct event_symbol event_symbols_sw[PERF_COUNT_SW_MAX] = { .symbol = "bpf-output", .alias = "", }, + [PERF_COUNT_SW_SPF] = { + .symbol = "speculative-faults", + .alias = "spf", + }, }; #define __PERF_EVENT_FIELD(config, name) \ diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l index a1a01b1ac8b8..86584d3a3068 100644 --- a/tools/perf/util/parse-events.l +++ b/tools/perf/util/parse-events.l @@ -308,6 +308,7 @@ emulation-faults{ return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_EM dummy { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_DUMMY); } duration_time { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_DUMMY); } bpf-output { 
return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_BPF_OUTPUT); } +speculative-faults|spf { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_SPF); } /* * We have to handle the kernel PMU event cycles-ct/cycles-t/mem-loads/mem-stores separately. diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index 863b61478edd..df4f7ff9bdff 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -1181,6 +1181,7 @@ static struct { PERF_CONST(COUNT_SW_ALIGNMENT_FAULTS), PERF_CONST(COUNT_SW_EMULATION_FAULTS), PERF_CONST(COUNT_SW_DUMMY), + PERF_CONST(COUNT_SW_SPF), PERF_CONST(SAMPLE_IP), PERF_CONST(SAMPLE_TID), -- 2.7.4
[PATCH v10 20/25] perf: add a speculative page fault sw event
Add a new software event to count succeeded speculative page faults. Acked-by: David Rientjes Signed-off-by: Laurent Dufour --- include/uapi/linux/perf_event.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 912b85b52344..9aad243607fe 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -112,6 +112,7 @@ enum perf_sw_ids { PERF_COUNT_SW_EMULATION_FAULTS = 8, PERF_COUNT_SW_DUMMY = 9, PERF_COUNT_SW_BPF_OUTPUT= 10, + PERF_COUNT_SW_SPF = 11, PERF_COUNT_SW_MAX, /* non-ABI */ }; -- 2.7.4
[PATCH v10 19/25] mm: adding speculative page fault failure trace events
This patch a set of new trace events to collect the speculative page fault event failures. Signed-off-by: Laurent Dufour --- include/trace/events/pagefault.h | 88 mm/memory.c | 62 ++-- 2 files changed, 137 insertions(+), 13 deletions(-) create mode 100644 include/trace/events/pagefault.h diff --git a/include/trace/events/pagefault.h b/include/trace/events/pagefault.h new file mode 100644 index ..a9643b3759f2 --- /dev/null +++ b/include/trace/events/pagefault.h @@ -0,0 +1,88 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM pagefault + +#if !defined(_TRACE_PAGEFAULT_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_PAGEFAULT_H + +#include +#include + +DECLARE_EVENT_CLASS(spf, + + TP_PROTO(unsigned long caller, +struct vm_area_struct *vma, unsigned long address), + + TP_ARGS(caller, vma, address), + + TP_STRUCT__entry( + __field(unsigned long, caller) + __field(unsigned long, vm_start) + __field(unsigned long, vm_end) + __field(unsigned long, address) + ), + + TP_fast_assign( + __entry->caller = caller; + __entry->vm_start = vma->vm_start; + __entry->vm_end = vma->vm_end; + __entry->address= address; + ), + + TP_printk("ip:%lx vma:%lx-%lx address:%lx", + __entry->caller, __entry->vm_start, __entry->vm_end, + __entry->address) +); + +DEFINE_EVENT(spf, spf_pte_lock, + + TP_PROTO(unsigned long caller, +struct vm_area_struct *vma, unsigned long address), + + TP_ARGS(caller, vma, address) +); + +DEFINE_EVENT(spf, spf_vma_changed, + + TP_PROTO(unsigned long caller, +struct vm_area_struct *vma, unsigned long address), + + TP_ARGS(caller, vma, address) +); + +DEFINE_EVENT(spf, spf_vma_noanon, + + TP_PROTO(unsigned long caller, +struct vm_area_struct *vma, unsigned long address), + + TP_ARGS(caller, vma, address) +); + +DEFINE_EVENT(spf, spf_vma_notsup, + + TP_PROTO(unsigned long caller, +struct vm_area_struct *vma, unsigned long address), + + TP_ARGS(caller, vma, address) +); + +DEFINE_EVENT(spf, spf_vma_access, + + TP_PROTO(unsigned 
long caller, +struct vm_area_struct *vma, unsigned long address), + + TP_ARGS(caller, vma, address) +); + +DEFINE_EVENT(spf, spf_pmd_changed, + + TP_PROTO(unsigned long caller, +struct vm_area_struct *vma, unsigned long address), + + TP_ARGS(caller, vma, address) +); + +#endif /* _TRACE_PAGEFAULT_H */ + +/* This part must be outside protection */ +#include diff --git a/mm/memory.c b/mm/memory.c index 8addf78deadb..76178feff000 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -80,6 +80,9 @@ #include "internal.h" +#define CREATE_TRACE_POINTS +#include + #if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST) #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid. #endif @@ -2317,8 +2320,10 @@ static bool pte_spinlock(struct vm_fault *vmf) } local_irq_disable(); - if (vma_has_changed(vmf)) + if (vma_has_changed(vmf)) { + trace_spf_vma_changed(_RET_IP_, vmf->vma, vmf->address); goto out; + } #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* @@ -2326,16 +2331,21 @@ static bool pte_spinlock(struct vm_fault *vmf) * is not a huge collapse operation in progress in our back. */ pmdval = READ_ONCE(*vmf->pmd); - if (!pmd_same(pmdval, vmf->orig_pmd)) + if (!pmd_same(pmdval, vmf->orig_pmd)) { + trace_spf_pmd_changed(_RET_IP_, vmf->vma, vmf->address); goto out; + } #endif vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd); - if (unlikely(!spin_trylock(vmf->ptl))) + if (unlikely(!spin_trylock(vmf->ptl))) { + trace_spf_pte_lock(_RET_IP_, vmf->vma, vmf->address); goto out; + } if (vma_has_changed(vmf)) { spin_unlock(vmf->ptl); + trace_spf_vma_changed(_RET_IP_, vmf->vma, vmf->address); goto out; } @@ -2368,8 +2378,10 @@ static bool pte_map_lock(struct vm_fault *vmf) * block on the PTL and thus we're safe. 
*/ local_irq_disable(); - if (vma_has_changed(vmf)) + if (vma_has_changed(vmf)) { + trace_spf_vma_changed(_RET_IP_, vmf->vma, vmf->address); goto out; + } #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* @@ -2377,8 +2389,10 @@ static bool pte_map_lock(struct vm_fault *vmf) * is not a huge collapse operation in progress in our back. */ pmdval = READ_ON
[PATCH v10 18/25] mm: provide speculative fault infrastructure
From: Peter Zijlstra Provide infrastructure to do a speculative fault (not holding mmap_sem). The not holding of mmap_sem means we can race against VMA change/removal and page-table destruction. We use the SRCU VMA freeing to keep the VMA around. We use the VMA seqcount to detect change (including unmapping / page-table deletion) and we use gup_fast() style page-table walking to deal with page-table races. Once we've obtained the page and are ready to update the PTE, we validate if the state we started the fault with is still valid, if not, we'll fail the fault with VM_FAULT_RETRY, otherwise we update the PTE and we're done. Signed-off-by: Peter Zijlstra (Intel) [Manage the newly introduced pte_spinlock() for speculative page fault to fail if the VMA is touched behind our back] [Rename vma_is_dead() to vma_has_changed() and declare it here] [Fetch p4d and pud] [Set vmd.sequence in __handle_mm_fault()] [Abort speculative path when handle_userfault() has to be called] [Add additional VMA's flags checks in handle_speculative_fault()] [Clear FAULT_FLAG_ALLOW_RETRY in handle_speculative_fault()] [Don't set vmf->pte and vmf->ptl if pte_map_lock() failed] [Remove warning comment about waiting for !seq&1 since we don't want to wait] [Remove warning about no huge page support, mention it explicitly] [Don't call do_fault() in the speculative path as __do_fault() calls vma->vm_ops->fault() which may want to release mmap_sem] [Only vm_fault pointer argument for vma_has_changed()] [Fix check against huge page, calling pmd_trans_huge()] [Use READ_ONCE() when reading VMA's fields in the speculative path] [Explicitly check for __HAVE_ARCH_PTE_SPECIAL as we can't support the processing done in vm_normal_page()] [Check that vma->anon_vma is already set when starting the speculative path] [Check for memory policy as we can't support MPOL_INTERLEAVE case due to the processing done in mpol_misplaced()] [Don't support VMA growing up or down] [Move check on vm_sequence just before calling 
handle_pte_fault()] [Don't build SPF services if !CONFIG_SPECULATIVE_PAGE_FAULT] [Add mem cgroup oom check] [Use READ_ONCE to access p*d entries] [Replace deprecated ACCESS_ONCE() by READ_ONCE() in vma_has_changed()] [Don't fetch pte again in handle_pte_fault() when running the speculative path] [Check PMD against concurrent collapsing operation] [Try spin lock the pte during the speculative path to avoid deadlock with other CPU's invalidating the TLB and requiring this CPU to catch the inter processor's interrupt] [Move define of FAULT_FLAG_SPECULATIVE here] [Introduce __handle_speculative_fault() and add a check against mm->mm_users in handle_speculative_fault() defined in mm.h] Signed-off-by: Laurent Dufour --- include/linux/hugetlb_inline.h | 2 +- include/linux/mm.h | 30 include/linux/pagemap.h| 4 +- mm/internal.h | 16 +- mm/memory.c| 340 - 5 files changed, 385 insertions(+), 7 deletions(-) diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h index 0660a03d37d9..9e25283d6fc9 100644 --- a/include/linux/hugetlb_inline.h +++ b/include/linux/hugetlb_inline.h @@ -8,7 +8,7 @@ static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) { - return !!(vma->vm_flags & VM_HUGETLB); + return !!(READ_ONCE(vma->vm_flags) & VM_HUGETLB); } #else diff --git a/include/linux/mm.h b/include/linux/mm.h index e2c24ea58d94..08540c98d63b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -309,6 +309,7 @@ extern pgprot_t protection_map[16]; #define FAULT_FLAG_USER0x40/* The fault originated in userspace */ #define FAULT_FLAG_REMOTE 0x80/* faulting for non current tsk/mm */ #define FAULT_FLAG_INSTRUCTION 0x100 /* The fault was during an instruction fetch */ +#define FAULT_FLAG_SPECULATIVE 0x200 /* Speculative fault, not holding mmap_sem */ #define FAULT_FLAG_TRACE \ { FAULT_FLAG_WRITE, "WRITE" }, \ @@ -337,6 +338,10 @@ struct vm_fault { gfp_t gfp_mask; /* gfp mask to be used for allocations */ pgoff_t pgoff; /* Logical page offset based on vma */ 
unsigned long address; /* Faulting virtual address */ +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT + unsigned int sequence; + pmd_t orig_pmd; /* value of PMD at the time of fault */ +#endif pmd_t *pmd; /* Pointer to pmd entry matching * the 'address' */ pud_t *pud; /* Pointer to pud entry matching @@ -1373,6 +1378,31 @@ int invalidate_inode_page(struct page *page); #ifdef CONFIG_MMU extern int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags); + +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT +extern int __handle_speculative_fault(struct mm_struct *mm, +
[PATCH v10 17/25] mm: protect mm_rb tree with a rwlock
This change is inspired by Peter's proposal patch [1] which was protecting the VMA using SRCU. Unfortunately, SRCU is not scaling well in that particular case, and it is introducing major performance degradation due to excessive scheduling operations. To allow access to the mm_rb tree without grabbing the mmap_sem, this patch is protecting access to it using a rwlock. As the mm_rb tree is a O(log n) search it is safe to protect it using such a lock. The VMA cache is not protected by the new rwlock and it should not be used without holding the mmap_sem. To allow the picked VMA structure to be used once the rwlock is released, a use count is added to the VMA structure. When the VMA is allocated it is set to 1. Each time the VMA is picked with the rwlock held its use count is incremented. Each time the VMA is released it is decremented. When the use count hits zero, this means that the VMA is no longer used and should be freed. This patch is preparing for 2 kinds of VMA access : - as usual, under the control of the mmap_sem, - without holding the mmap_sem for the speculative page fault handler. Access done under the control of the mmap_sem doesn't require grabbing the rwlock to protect read access to the mm_rb tree, but access in write must be done under the protection of the rwlock too. This affects inserting and removing of elements in the RB tree. The patch is introducing 2 new functions: - vma_get() to find a VMA based on an address by holding the new rwlock. - vma_put() to release the VMA when it is no longer used. These services are designed to be used when accesses are made to the RB tree without holding the mmap_sem. When a VMA is removed from the RB tree, its vma->vm_rb field is cleared and we rely on the WMB done when releasing the rwlock to serialize the write with the RMB done in a later patch to check for the VMA's validity. 
When free_vma is called, the file associated with the VMA is closed immediately, but the policy and the file structure remained in used until the VMA's use count reach 0, which may happens later when exiting an in progress speculative page fault. [1] https://patchwork.kernel.org/patch/5108281/ Cc: Peter Zijlstra (Intel) Cc: Matthew Wilcox Signed-off-by: Laurent Dufour --- include/linux/mm.h | 1 + include/linux/mm_types.h | 4 ++ kernel/fork.c| 3 ++ mm/init-mm.c | 3 ++ mm/internal.h| 6 +++ mm/mmap.c| 115 +++ 6 files changed, 104 insertions(+), 28 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index f967bf84094f..e2c24ea58d94 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1272,6 +1272,7 @@ static inline void INIT_VMA(struct vm_area_struct *vma) INIT_LIST_HEAD(&vma->anon_vma_chain); #ifdef CONFIG_SPECULATIVE_PAGE_FAULT seqcount_init(&vma->vm_sequence); + atomic_set(&vma->vm_ref_count, 1); #endif } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index db5e9d630e7a..faf3844dd815 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -337,6 +337,7 @@ struct vm_area_struct { struct vm_userfaultfd_ctx vm_userfaultfd_ctx; #ifdef CONFIG_SPECULATIVE_PAGE_FAULT seqcount_t vm_sequence; + atomic_t vm_ref_count; /* see vma_get(), vma_put() */ #endif } __randomize_layout; @@ -355,6 +356,9 @@ struct kioctx_table; struct mm_struct { struct vm_area_struct *mmap;/* list of VMAs */ struct rb_root mm_rb; +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT + rwlock_t mm_rb_lock; +#endif u32 vmacache_seqnum; /* per-thread vmacache */ #ifdef CONFIG_MMU unsigned long (*get_unmapped_area) (struct file *filp, diff --git a/kernel/fork.c b/kernel/fork.c index d937e5945f77..9f8d235a3df8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -891,6 +891,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm->mmap = NULL; mm->mm_rb = RB_ROOT; mm->vmacache_seqnum = 0; +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT + 
rwlock_init(&mm->mm_rb_lock); +#endif atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); init_rwsem(&mm->mmap_sem); diff --git a/mm/init-mm.c b/mm/init-mm.c index f94d5d15ebc0..e71ac37a98c4 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -17,6 +17,9 @@ struct mm_struct init_mm = { .mm_rb = RB_ROOT, +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT + .mm_rb_lock = __RW_LOCK_UNLOCKED(init_mm.mm_rb_lock), +#endif .pgd= swapper_pg_dir, .mm_users = ATOMIC_INIT(2), .mm_count = ATOMIC_INIT(1), diff --git a/mm/internal.h b/mm/internal.h index 62d8c34e63d5..fb2667b20f0a 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -40,6 +40,12 @@ void page_writeback_init(void); int do_swap_page(struct vm_fault *vmf); +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT +extern
[PATCH v10 16/25] mm: introduce __page_add_new_anon_rmap()
When dealing with the speculative page fault handler, we may race with the VMA being split or merged. In this case the vma->vm_start and vma->vm_end fields may not match the address at which the page fault is occurring. This can only happen when the VMA is split but in that case, the anon_vma pointer of the new VMA will be the same as the original one, because in __split_vma the new->anon_vma is set to src->anon_vma when *new = *vma. So even if the VMA boundaries are not correct, the anon_vma pointer is still valid. If the VMA has been merged, then the VMA in which it has been merged must have the same anon_vma pointer otherwise the merge can't be done. So in all cases we know that the anon_vma is valid, since we have checked before starting the speculative page fault that the anon_vma pointer is valid for this VMA and since there is an anon_vma this means that at one time a page has been backed and that before the VMA is cleaned, the page table lock would have to be grabbed to clean the PTE, and the anon_vma field is checked once the PTE is locked. This patch introduces a new __page_add_new_anon_rmap() service which doesn't check for the VMA boundaries, and creates a new inline one which does the check. When called from a page fault handler, if this is not a speculative one, there is a guarantee that vm_start and vm_end match the faulting address, so this check is useless. In the context of the speculative page fault handler, this check may be wrong but anon_vma is still valid as explained above. 
Signed-off-by: Laurent Dufour --- include/linux/rmap.h | 12 ++-- mm/memory.c | 8 mm/rmap.c| 5 ++--- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 988d176472df..a5d282573093 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -174,8 +174,16 @@ void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long, bool); void do_page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long, int); -void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, - unsigned long, bool); +void __page_add_new_anon_rmap(struct page *, struct vm_area_struct *, + unsigned long, bool); +static inline void page_add_new_anon_rmap(struct page *page, + struct vm_area_struct *vma, + unsigned long address, bool compound) +{ + VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); + __page_add_new_anon_rmap(page, vma, address, compound); +} + void page_add_file_rmap(struct page *, bool); void page_remove_rmap(struct page *, bool); diff --git a/mm/memory.c b/mm/memory.c index d9146a0c3d25..9c220ac0e2c5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2563,7 +2563,7 @@ static int wp_page_copy(struct vm_fault *vmf) * thread doing COW. 
*/ ptep_clear_flush_notify(vma, vmf->address, vmf->pte); - page_add_new_anon_rmap(new_page, vma, vmf->address, false); + __page_add_new_anon_rmap(new_page, vma, vmf->address, false); mem_cgroup_commit_charge(new_page, memcg, false, false); __lru_cache_add_active_or_unevictable(new_page, vmf->vma_flags); /* @@ -3100,7 +3100,7 @@ int do_swap_page(struct vm_fault *vmf) /* ksm created a completely new copy */ if (unlikely(page != swapcache && swapcache)) { - page_add_new_anon_rmap(page, vma, vmf->address, false); + __page_add_new_anon_rmap(page, vma, vmf->address, false); mem_cgroup_commit_charge(page, memcg, false, false); __lru_cache_add_active_or_unevictable(page, vmf->vma_flags); } else { @@ -3251,7 +3251,7 @@ static int do_anonymous_page(struct vm_fault *vmf) } inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, vmf->address, false); + __page_add_new_anon_rmap(page, vma, vmf->address, false); mem_cgroup_commit_charge(page, memcg, false, false); __lru_cache_add_active_or_unevictable(page, vmf->vma_flags); setpte: @@ -3505,7 +3505,7 @@ int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, /* copy-on-write page */ if (write && !(vmf->vma_flags & VM_SHARED)) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, vmf->address, false); + __page_add_new_anon_rmap(page, vma, vmf->address, false); mem_cgroup_commit_charge(page, memcg, false, false); __lru_cache_add_active_or_unevictable(page, vmf->vma_flags); } else { diff --git a/mm/rmap.c b/mm/rmap.c index 8d5337fed37b..9307f6140796 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1136,7 +1136,7 @@ void do_page_add_
[PATCH v10 15/25] mm: introduce __vm_normal_page()
When dealing with the speculative fault path we should use the VMA's field cached value stored in the vm_fault structure. Currently vm_normal_page() is using the pointer to the VMA to fetch the vm_flags value. This patch provides a new __vm_normal_page() which is receiving the vm_flags flags value as parameter. Note: The speculative path is turned on for architecture providing support for special PTE flag. So only the first block of vm_normal_page is used during the speculative path. Signed-off-by: Laurent Dufour --- include/linux/mm.h | 18 +++--- mm/memory.c| 25 - 2 files changed, 31 insertions(+), 12 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index c65205c8c558..f967bf84094f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1275,9 +1275,21 @@ static inline void INIT_VMA(struct vm_area_struct *vma) #endif } -struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr, -pte_t pte, bool with_public_device); -#define vm_normal_page(vma, addr, pte) _vm_normal_page(vma, addr, pte, false) +struct page *__vm_normal_page(struct vm_area_struct *vma, unsigned long addr, + pte_t pte, bool with_public_device, + unsigned long vma_flags); +static inline struct page *_vm_normal_page(struct vm_area_struct *vma, + unsigned long addr, pte_t pte, + bool with_public_device) +{ + return __vm_normal_page(vma, addr, pte, with_public_device, + vma->vm_flags); +} +static inline struct page *vm_normal_page(struct vm_area_struct *vma, + unsigned long addr, pte_t pte) +{ + return _vm_normal_page(vma, addr, pte, false); +} struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd); diff --git a/mm/memory.c b/mm/memory.c index 47af9e97f02a..d9146a0c3d25 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -780,7 +780,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, } /* - * vm_normal_page -- This function gets the "struct page" associated with a pte. 
+ * __vm_normal_page -- This function gets the "struct page" associated with + * a pte. * * "Special" mappings do not wish to be associated with a "struct page" (either * it doesn't exist, or it exists but they don't want to touch it). In this @@ -826,8 +827,9 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, #else # define HAVE_PTE_SPECIAL 0 #endif -struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr, -pte_t pte, bool with_public_device) +struct page *__vm_normal_page(struct vm_area_struct *vma, unsigned long addr, + pte_t pte, bool with_public_device, + unsigned long vma_flags) { unsigned long pfn = pte_pfn(pte); @@ -836,7 +838,7 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr, goto check_pfn; if (vma->vm_ops && vma->vm_ops->find_special_page) return vma->vm_ops->find_special_page(vma, addr); - if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) + if (vma_flags & (VM_PFNMAP | VM_MIXEDMAP)) return NULL; if (is_zero_pfn(pfn)) return NULL; @@ -867,9 +869,13 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr, } /* !HAVE_PTE_SPECIAL case follows: */ + /* +* This part should never get called when CONFIG_SPECULATIVE_PAGE_FAULT +* is set. This is mainly because we can't rely on vm_start. 
+*/ - if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { - if (vma->vm_flags & VM_MIXEDMAP) { + if (unlikely(vma_flags & (VM_PFNMAP|VM_MIXEDMAP))) { + if (vma_flags & VM_MIXEDMAP) { if (!pfn_valid(pfn)) return NULL; goto out; @@ -878,7 +884,7 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr, off = (addr - vma->vm_start) >> PAGE_SHIFT; if (pfn == vma->vm_pgoff + off) return NULL; - if (!is_cow_mapping(vma->vm_flags)) + if (!is_cow_mapping(vma_flags)) return NULL; } } @@ -2743,7 +2749,8 @@ static int do_wp_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte); + vmf->page = __vm_normal_page(vma, vmf->address, vmf->orig_pte, false,
[PATCH v10 14/25] mm: introduce __lru_cache_add_active_or_unevictable
The speculative page fault handler which is run without holding the mmap_sem is calling lru_cache_add_active_or_unevictable() but the vm_flags is not guaranteed to remain constant. Introducing __lru_cache_add_active_or_unevictable() which has the vma flags value parameter instead of the vma pointer. Acked-by: David Rientjes Signed-off-by: Laurent Dufour --- include/linux/swap.h | 10 -- mm/memory.c | 8 mm/swap.c| 6 +++--- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 1985940af479..a7dc37e0e405 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -338,8 +338,14 @@ extern void deactivate_file_page(struct page *page); extern void mark_page_lazyfree(struct page *page); extern void swap_setup(void); -extern void lru_cache_add_active_or_unevictable(struct page *page, - struct vm_area_struct *vma); +extern void __lru_cache_add_active_or_unevictable(struct page *page, + unsigned long vma_flags); + +static inline void lru_cache_add_active_or_unevictable(struct page *page, + struct vm_area_struct *vma) +{ + return __lru_cache_add_active_or_unevictable(page, vma->vm_flags); +} /* linux/mm/vmscan.c */ extern unsigned long zone_reclaimable_pages(struct zone *zone); diff --git a/mm/memory.c b/mm/memory.c index e28cbbae3f3d..47af9e97f02a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2559,7 +2559,7 @@ static int wp_page_copy(struct vm_fault *vmf) ptep_clear_flush_notify(vma, vmf->address, vmf->pte); page_add_new_anon_rmap(new_page, vma, vmf->address, false); mem_cgroup_commit_charge(new_page, memcg, false, false); - lru_cache_add_active_or_unevictable(new_page, vma); + __lru_cache_add_active_or_unevictable(new_page, vmf->vma_flags); /* * We call the notify macro here because, when using secondary * mmu page tables (such as kvm shadow page tables), we want the @@ -3095,7 +3095,7 @@ int do_swap_page(struct vm_fault *vmf) if (unlikely(page != swapcache && swapcache)) { page_add_new_anon_rmap(page, vma, 
vmf->address, false); mem_cgroup_commit_charge(page, memcg, false, false); - lru_cache_add_active_or_unevictable(page, vma); + __lru_cache_add_active_or_unevictable(page, vmf->vma_flags); } else { do_page_add_anon_rmap(page, vma, vmf->address, exclusive); mem_cgroup_commit_charge(page, memcg, true, false); @@ -3246,7 +3246,7 @@ static int do_anonymous_page(struct vm_fault *vmf) inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, vmf->address, false); mem_cgroup_commit_charge(page, memcg, false, false); - lru_cache_add_active_or_unevictable(page, vma); + __lru_cache_add_active_or_unevictable(page, vmf->vma_flags); setpte: set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); @@ -3500,7 +3500,7 @@ int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, vmf->address, false); mem_cgroup_commit_charge(page, memcg, false, false); - lru_cache_add_active_or_unevictable(page, vma); + __lru_cache_add_active_or_unevictable(page, vmf->vma_flags); } else { inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); page_add_file_rmap(page, false); diff --git a/mm/swap.c b/mm/swap.c index 3dd518832096..f2f9c587246f 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -455,12 +455,12 @@ void lru_cache_add(struct page *page) * directly back onto it's zone's unevictable list, it does NOT use a * per cpu pagevec. */ -void lru_cache_add_active_or_unevictable(struct page *page, -struct vm_area_struct *vma) +void __lru_cache_add_active_or_unevictable(struct page *page, + unsigned long vma_flags) { VM_BUG_ON_PAGE(PageLRU(page), page); - if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) + if (likely((vma_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) SetPageActive(page); else if (!TestSetPageMlocked(page)) { /* -- 2.7.4
[PATCH v10 13/25] mm/migrate: Pass vm_fault pointer to migrate_misplaced_page()
migrate_misplaced_page() is only called during the page fault handling so it's better to pass the pointer to the struct vm_fault instead of the vma. This way during the speculative page fault path the saved vma->vm_flags could be used. Acked-by: David Rientjes Signed-off-by: Laurent Dufour --- include/linux/migrate.h | 4 ++-- mm/memory.c | 2 +- mm/migrate.c| 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index f2b4abbca55e..fd4c3ab7bd9c 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -126,14 +126,14 @@ static inline void __ClearPageMovable(struct page *page) #ifdef CONFIG_NUMA_BALANCING extern bool pmd_trans_migrating(pmd_t pmd); extern int migrate_misplaced_page(struct page *page, - struct vm_area_struct *vma, int node); + struct vm_fault *vmf, int node); #else static inline bool pmd_trans_migrating(pmd_t pmd) { return false; } static inline int migrate_misplaced_page(struct page *page, -struct vm_area_struct *vma, int node) +struct vm_fault *vmf, int node) { return -EAGAIN; /* can't migrate now */ } diff --git a/mm/memory.c b/mm/memory.c index 2fb9920e06a5..e28cbbae3f3d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3894,7 +3894,7 @@ static int do_numa_page(struct vm_fault *vmf) } /* Migrate to the requested node */ - migrated = migrate_misplaced_page(page, vma, target_nid); + migrated = migrate_misplaced_page(page, vmf, target_nid); if (migrated) { page_nid = target_nid; flags |= TNF_MIGRATED; diff --git a/mm/migrate.c b/mm/migrate.c index 44d7007cfc1c..5d5cf9b5ac16 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1944,7 +1944,7 @@ bool pmd_trans_migrating(pmd_t pmd) * node. Caller is expected to have an elevated reference count on * the page that will be dropped by this function before returning. 
*/ -int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, +int migrate_misplaced_page(struct page *page, struct vm_fault *vmf, int node) { pg_data_t *pgdat = NODE_DATA(node); @@ -1957,7 +1957,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, * with execute permissions as they are probably shared libraries. */ if (page_mapcount(page) != 1 && page_is_file_cache(page) && - (vma->vm_flags & VM_EXEC)) + (vmf->vma_flags & VM_EXEC)) goto out; /* -- 2.7.4
[PATCH v10 12/25] mm: cache some VMA fields in the vm_fault structure
When handling speculative page fault, the vma->vm_flags and vma->vm_page_prot fields are read once the page table lock is released. So there is no more guarantee that these fields would not change in our back. They will be saved in the vm_fault structure before the VMA is checked for changes. This patch also set the fields in hugetlb_no_page() and __collapse_huge_page_swapin even if it is not need for the callee. Signed-off-by: Laurent Dufour --- include/linux/mm.h | 10 -- mm/huge_memory.c | 6 +++--- mm/hugetlb.c | 2 ++ mm/khugepaged.c| 2 ++ mm/memory.c| 50 ++ mm/migrate.c | 2 +- 6 files changed, 42 insertions(+), 30 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index f6edd15563bc..c65205c8c558 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -367,6 +367,12 @@ struct vm_fault { * page table to avoid allocation from * atomic context. */ + /* +* These entries are required when handling speculative page fault. +* This way the page handling is done using consistent field values. +*/ + unsigned long vma_flags; + pgprot_t vma_page_prot; }; /* page entry size for vm->huge_fault() */ @@ -687,9 +693,9 @@ void free_compound_page(struct page *page); * pte_mkwrite. But get_user_pages can cause write faults for mappings * that do not have writing enabled, when used by access_process_vm. 
*/ -static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) +static inline pte_t maybe_mkwrite(pte_t pte, unsigned long vma_flags) { - if (likely(vma->vm_flags & VM_WRITE)) + if (likely(vma_flags & VM_WRITE)) pte = pte_mkwrite(pte); return pte; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a3a1815f8e11..da2afda67e68 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1194,8 +1194,8 @@ static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd, for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { pte_t entry; - entry = mk_pte(pages[i], vma->vm_page_prot); - entry = maybe_mkwrite(pte_mkdirty(entry), vma); + entry = mk_pte(pages[i], vmf->vma_page_prot); + entry = maybe_mkwrite(pte_mkdirty(entry), vmf->vma_flags); memcg = (void *)page_private(pages[i]); set_page_private(pages[i], 0); page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false); @@ -2168,7 +2168,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, entry = pte_swp_mksoft_dirty(entry); } else { entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot)); - entry = maybe_mkwrite(entry, vma); + entry = maybe_mkwrite(entry, vma->vm_flags); if (!write) entry = pte_wrprotect(entry); if (!young) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 218679138255..774864153407 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3718,6 +3718,8 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, .vma = vma, .address = address, .flags = flags, + .vma_flags = vma->vm_flags, + .vma_page_prot = vma->vm_page_prot, /* * Hard to debug if it ends up being * used by a callee that assumes diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 0b28af4b950d..2b02a9f9589e 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -887,6 +887,8 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, .flags = FAULT_FLAG_ALLOW_RETRY, .pmd = pmd, .pgoff = linear_page_index(vma, address), + .vma_flags = vma->vm_flags, + .vma_page_prot = 
vma->vm_page_prot, }; /* we only decide to swapin, if there is enough young ptes */ diff --git a/mm/memory.c b/mm/memory.c index f76f5027d251..2fb9920e06a5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1826,7 +1826,7 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, out_mkwrite: if (mkwrite) { entry = pte_mkyoung(entry); - entry = maybe_mkwrite(pte_mkdirty(entry), vma); + entry = maybe_mkwrite(pte_mkdirty(entry), vma->vm_flags); } set_pte_at(mm, addr, pte, entry); @@ -2472,7 +2472,7 @@ static inline void wp_page_reuse(struct vm_fault *vmf) flush_cache_page(vma, vmf->address,
[PATCH v10 11/25] mm: protect SPF handler against anon_vma changes
The speculative page fault handler must be protected against anon_vma changes. This is because page_add_new_anon_rmap() is called during the speculative path. In addition, don't try speculative page fault if the VMA don't have an anon_vma structure allocated because its allocation should be protected by the mmap_sem. In __vma_adjust() when importer->anon_vma is set, there is no need to protect against speculative page faults since speculative page fault is aborted if the vma->anon_vma is not set. When calling page_add_new_anon_rmap() vma->anon_vma is necessarily valid since we checked for it when locking the pte and the anon_vma is removed once the pte is unlocked. So even if the speculative page fault handler is running concurrently with do_unmap(), as the pte is locked in unmap_region() - through unmap_vmas() - and the anon_vma unlinked later, because we check for the vma sequence counter which is updated in unmap_page_range() before locking the pte, and then in free_pgtables() so when locking the pte the change will be detected. Signed-off-by: Laurent Dufour --- mm/memory.c | 4 1 file changed, 4 insertions(+) diff --git a/mm/memory.c b/mm/memory.c index f7fed053df80..f76f5027d251 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -624,7 +624,9 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, * Hide vma from rmap and truncate_pagecache before freeing * pgtables */ + vm_write_begin(vma); unlink_anon_vmas(vma); + vm_write_end(vma); unlink_file_vma(vma); if (is_vm_hugetlb_page(vma)) { @@ -638,7 +640,9 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, && !is_vm_hugetlb_page(next)) { vma = next; next = vma->vm_next; + vm_write_begin(vma); unlink_anon_vmas(vma); + vm_write_end(vma); unlink_file_vma(vma); } free_pgd_range(tlb, addr, vma->vm_end, -- 2.7.4
[PATCH v10 10/25] mm: protect mremap() against SPF handler
If a thread is remapping an area while another one is faulting on the destination area, the SPF handler may fetch the vma from the RB tree before the pte has been moved by the other thread. This means that the moved ptes will overwrite those create by the page fault handler leading to page leaked. CPU 1 CPU2 enter mremap() unmap the dest area copy_vma() Enter speculative page fault handler >> at this time the dest area is present in the RB tree fetch the vma matching dest area create a pte as the VMA matched Exit the SPF handler move_ptes() > it is assumed that the dest area is empty, > the move ptes overwrite the page mapped by the CPU2. To prevent that, when the VMA matching the dest area is extended or created by copy_vma(), it should be marked as non available to the SPF handler. The usual way to so is to rely on vm_write_begin()/end(). This is already in __vma_adjust() called by copy_vma() (through vma_merge()). But __vma_adjust() is calling vm_write_end() before returning which create a window for another thread. This patch adds a new parameter to vma_merge() which is passed down to vma_adjust(). The assumption is that copy_vma() is returning a vma which should be released by calling vm_raw_write_end() by the callee once the ptes have been moved. 
Signed-off-by: Laurent Dufour --- include/linux/mm.h | 24 +++- mm/mmap.c | 53 + mm/mremap.c| 13 + 3 files changed, 73 insertions(+), 17 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 988daf7030c9..f6edd15563bc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2211,18 +2211,32 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node); /* mmap.c */ extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin); + extern int __vma_adjust(struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert, - struct vm_area_struct *expand); + struct vm_area_struct *expand, bool keep_locked); + static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) { - return __vma_adjust(vma, start, end, pgoff, insert, NULL); + return __vma_adjust(vma, start, end, pgoff, insert, NULL, false); } -extern struct vm_area_struct *vma_merge(struct mm_struct *, + +extern struct vm_area_struct *__vma_merge(struct mm_struct *mm, + struct vm_area_struct *prev, unsigned long addr, unsigned long end, + unsigned long vm_flags, struct anon_vma *anon, struct file *file, + pgoff_t pgoff, struct mempolicy *mpol, + struct vm_userfaultfd_ctx uff, bool keep_locked); + +static inline struct vm_area_struct *vma_merge(struct mm_struct *mm, struct vm_area_struct *prev, unsigned long addr, unsigned long end, - unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, - struct mempolicy *, struct vm_userfaultfd_ctx); + unsigned long vm_flags, struct anon_vma *anon, struct file *file, + pgoff_t off, struct mempolicy *pol, struct vm_userfaultfd_ctx uff) +{ + return __vma_merge(mm, prev, addr, end, vm_flags, anon, file, off, + pol, uff, false); +} + extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); extern int __split_vma(struct mm_struct *, struct vm_area_struct *, unsigned long addr, int 
new_below); diff --git a/mm/mmap.c b/mm/mmap.c index 921f20cc6df0..5601f1ef8bb9 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -680,7 +680,7 @@ static inline void __vma_unlink_prev(struct mm_struct *mm, */ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert, - struct vm_area_struct *expand) + struct vm_area_struct *expand, bool keep_locked) { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *next = vma->vm_next, *orig_vma = vma; @@ -796,8 +796,12 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, importer->anon_vma = exporter->anon_vma; error = anon_vma_clone(importer, exporter); - if (error) + if (error) { + if (next && next != vma) + vm_raw_write_end(next); + vm_raw_write_end(vma); return error; + } } } again: @@ -992,7 +996,8 @@ int __vma_adjust(struct vm_area_struc
[PATCH v10 09/25] mm: protect VMA modifications using VMA sequence count
The VMA sequence count has been introduced to allow fast detection of VMA modification when running a page fault handler without holding the mmap_sem. This patch provides protection against the VMA modification done in : - madvise() - mpol_rebind_policy() - vma_replace_policy() - change_prot_numa() - mlock(), munlock() - mprotect() - mmap_region() - collapse_huge_page() - userfaultd registering services In addition, VMA fields which will be read during the speculative fault path needs to be written using WRITE_ONCE to prevent write to be split and intermediate values to be pushed to other CPUs. Signed-off-by: Laurent Dufour --- fs/proc/task_mmu.c | 5 - fs/userfaultfd.c | 17 + mm/khugepaged.c| 3 +++ mm/madvise.c | 6 +- mm/mempolicy.c | 51 ++- mm/mlock.c | 13 - mm/mmap.c | 22 +- mm/mprotect.c | 4 +++- mm/swap_state.c| 8 ++-- 9 files changed, 89 insertions(+), 40 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index c486ad4b43f0..aeb417f28839 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1136,8 +1136,11 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, goto out_mm; } for (vma = mm->mmap; vma; vma = vma->vm_next) { - vma->vm_flags &= ~VM_SOFTDIRTY; + vm_write_begin(vma); + WRITE_ONCE(vma->vm_flags, + vma->vm_flags & ~VM_SOFTDIRTY); vma_set_page_prot(vma); + vm_write_end(vma); } downgrade_write(&mm->mmap_sem); break; diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index cec550c8468f..b8212ba17695 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -659,8 +659,11 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) octx = vma->vm_userfaultfd_ctx.ctx; if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) { + vm_write_begin(vma); vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; - vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING); + WRITE_ONCE(vma->vm_flags, + vma->vm_flags & ~(VM_UFFD_WP | VM_UFFD_MISSING)); + vm_write_end(vma); return 0; } @@ -885,8 +888,10 @@ static int 
userfaultfd_release(struct inode *inode, struct file *file) vma = prev; else prev = vma; - vma->vm_flags = new_flags; + vm_write_begin(vma); + WRITE_ONCE(vma->vm_flags, new_flags); vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; + vm_write_end(vma); } up_write(&mm->mmap_sem); mmput(mm); @@ -1434,8 +1439,10 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, * the next vma was merged into the current one and * the current one has not been updated yet. */ - vma->vm_flags = new_flags; + vm_write_begin(vma); + WRITE_ONCE(vma->vm_flags, new_flags); vma->vm_userfaultfd_ctx.ctx = ctx; + vm_write_end(vma); skip: prev = vma; @@ -1592,8 +1599,10 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, * the next vma was merged into the current one and * the current one has not been updated yet. */ - vma->vm_flags = new_flags; + vm_write_begin(vma); + WRITE_ONCE(vma->vm_flags, new_flags); vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; + vm_write_end(vma); skip: prev = vma; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index d7b2a4bf8671..0b28af4b950d 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1011,6 +1011,7 @@ static void collapse_huge_page(struct mm_struct *mm, if (mm_find_pmd(mm, address) != pmd) goto out; + vm_write_begin(vma); anon_vma_lock_write(vma->anon_vma); pte = pte_offset_map(pmd, address); @@ -1046,6 +1047,7 @@ static void collapse_huge_page(struct mm_struct *mm, pmd_populate(mm, pmd, pmd_pgtable(_pmd)); spin_unlock(pmd_ptl); anon_vma_unlock_write(vma->anon_vma); + vm_write_end(vma); result = SCAN_FAIL; goto out; } @@ -1080
[PATCH v10 08/25] mm: VMA sequence count
From: Peter Zijlstra Wrap the VMA modifications (vma_adjust/unmap_page_range) with sequence counts such that we can easily test if a VMA is changed. The unmap_page_range() one allows us to make assumptions about page-tables; when we find the seqcount hasn't changed we can assume page-tables are still valid. The flip side is that we cannot distinguish between a vma_adjust() and the unmap_page_range() -- where with the former we could have re-checked the vma bounds against the address. Signed-off-by: Peter Zijlstra (Intel) [Port to 4.12 kernel] [Build depends on CONFIG_SPECULATIVE_PAGE_FAULT] [Introduce vm_write_* inline function depending on CONFIG_SPECULATIVE_PAGE_FAULT] [Fix lock dependency between mapping->i_mmap_rwsem and vma->vm_sequence by using vm_raw_write* functions] [Fix a lock dependency warning in mmap_region() when entering the error path] [move sequence initialisation INIT_VMA()] Signed-off-by: Laurent Dufour --- include/linux/mm.h | 44 include/linux/mm_types.h | 3 +++ mm/memory.c | 2 ++ mm/mmap.c| 31 +++ 4 files changed, 80 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index efc1248b82bd..988daf7030c9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1264,6 +1264,9 @@ struct zap_details { static inline void INIT_VMA(struct vm_area_struct *vma) { INIT_LIST_HEAD(&vma->anon_vma_chain); +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT + seqcount_init(&vma->vm_sequence); +#endif } struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr, @@ -1386,6 +1389,47 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping, unmap_mapping_range(mapping, holebegin, holelen, 0); } +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT +static inline void vm_write_begin(struct vm_area_struct *vma) +{ + write_seqcount_begin(&vma->vm_sequence); +} +static inline void vm_write_begin_nested(struct vm_area_struct *vma, +int subclass) +{ + write_seqcount_begin_nested(&vma->vm_sequence, subclass); +} +static inline void 
vm_write_end(struct vm_area_struct *vma) +{ + write_seqcount_end(&vma->vm_sequence); +} +static inline void vm_raw_write_begin(struct vm_area_struct *vma) +{ + raw_write_seqcount_begin(&vma->vm_sequence); +} +static inline void vm_raw_write_end(struct vm_area_struct *vma) +{ + raw_write_seqcount_end(&vma->vm_sequence); +} +#else +static inline void vm_write_begin(struct vm_area_struct *vma) +{ +} +static inline void vm_write_begin_nested(struct vm_area_struct *vma, +int subclass) +{ +} +static inline void vm_write_end(struct vm_area_struct *vma) +{ +} +static inline void vm_raw_write_begin(struct vm_area_struct *vma) +{ +} +static inline void vm_raw_write_end(struct vm_area_struct *vma) +{ +} +#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */ + extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, unsigned int gup_flags); extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 21612347d311..db5e9d630e7a 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -335,6 +335,9 @@ struct vm_area_struct { struct mempolicy *vm_policy;/* NUMA policy for the VMA */ #endif struct vm_userfaultfd_ctx vm_userfaultfd_ctx; +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT + seqcount_t vm_sequence; +#endif } __randomize_layout; struct core_thread { diff --git a/mm/memory.c b/mm/memory.c index f86efcb8e268..f7fed053df80 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1503,6 +1503,7 @@ void unmap_page_range(struct mmu_gather *tlb, unsigned long next; BUG_ON(addr >= end); + vm_write_begin(vma); tlb_start_vma(tlb, vma); pgd = pgd_offset(vma->vm_mm, addr); do { @@ -1512,6 +1513,7 @@ void unmap_page_range(struct mmu_gather *tlb, next = zap_p4d_range(tlb, vma, pgd, addr, next, details); } while (pgd++, addr = next, addr != end); tlb_end_vma(tlb, vma); + vm_write_end(vma); } diff --git a/mm/mmap.c b/mm/mmap.c index 8bd9ae1dfacc..813e49589ea1 100644 --- a/mm/mmap.c +++ 
b/mm/mmap.c @@ -692,6 +692,30 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, long adjust_next = 0; int remove_next = 0; + /* +* Why using vm_raw_write*() functions here to avoid lockdep's warning ? +* +* Locked is complaining about a theoretical lock dependency, involving +* 3 locks: +* mapping->i_mmap_rwsem --> vma->vm_sequence --> fs_reclaim +* +* Here are the major path leading to this dependency : +* 1. __vma_adjust() mmap_sem -> vm_sequence -> i_mmap_rwsem +* 2. move_vmap(
[PATCH v10 07/25] mm: introduce INIT_VMA()
Some VMA struct fields need to be initialized once the VMA structure is allocated. Currently this only concerns anon_vma_chain field but some other will be added to support the speculative page fault. Instead of spreading the initialization calls all over the code, let's introduce a dedicated inline function. Signed-off-by: Laurent Dufour --- fs/exec.c | 2 +- include/linux/mm.h | 5 + kernel/fork.c | 2 +- mm/mmap.c | 10 +- mm/nommu.c | 2 +- 5 files changed, 13 insertions(+), 8 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 32eea4c65909..bd03689aa358 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -311,7 +311,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm) vma->vm_start = vma->vm_end - PAGE_SIZE; vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); - INIT_LIST_HEAD(&vma->anon_vma_chain); + INIT_VMA(vma); err = insert_vm_struct(mm, vma); if (err) diff --git a/include/linux/mm.h b/include/linux/mm.h index 714da99d77a3..efc1248b82bd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1261,6 +1261,11 @@ struct zap_details { pgoff_t last_index; /* Highest page->index to unmap */ }; +static inline void INIT_VMA(struct vm_area_struct *vma) +{ + INIT_LIST_HEAD(&vma->anon_vma_chain); +} + struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte, bool with_public_device); #define vm_normal_page(vma, addr, pte) _vm_normal_page(vma, addr, pte, false) diff --git a/kernel/fork.c b/kernel/fork.c index b1d877f1a0ac..d937e5945f77 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -451,7 +451,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, if (!tmp) goto fail_nomem; *tmp = *mpnt; - INIT_LIST_HEAD(&tmp->anon_vma_chain); + INIT_VMA(tmp); retval = vma_dup_policy(mpnt, tmp); if (retval) goto fail_nomem_policy; diff --git a/mm/mmap.c b/mm/mmap.c index 188f195883b9..8bd9ae1dfacc 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1700,7 +1700,7 @@ unsigned long 
mmap_region(struct file *file, unsigned long addr, vma->vm_flags = vm_flags; vma->vm_page_prot = vm_get_page_prot(vm_flags); vma->vm_pgoff = pgoff; - INIT_LIST_HEAD(&vma->anon_vma_chain); + INIT_VMA(vma); if (file) { if (vm_flags & VM_DENYWRITE) { @@ -2586,7 +2586,7 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, /* most fields are the same, copy all, and then fixup */ *new = *vma; - INIT_LIST_HEAD(&new->anon_vma_chain); + INIT_VMA(new); if (new_below) new->vm_end = addr; @@ -2956,7 +2956,7 @@ static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long return -ENOMEM; } - INIT_LIST_HEAD(&vma->anon_vma_chain); + INIT_VMA(vma); vma->vm_mm = mm; vma->vm_start = addr; vma->vm_end = addr + len; @@ -3167,7 +3167,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, new_vma->vm_pgoff = pgoff; if (vma_dup_policy(vma, new_vma)) goto out_free_vma; - INIT_LIST_HEAD(&new_vma->anon_vma_chain); + INIT_VMA(new_vma); if (anon_vma_clone(new_vma, vma)) goto out_free_mempol; if (new_vma->vm_file) @@ -3310,7 +3310,7 @@ static struct vm_area_struct *__install_special_mapping( if (unlikely(vma == NULL)) return ERR_PTR(-ENOMEM); - INIT_LIST_HEAD(&vma->anon_vma_chain); + INIT_VMA(vma); vma->vm_mm = mm; vma->vm_start = addr; vma->vm_end = addr + len; diff --git a/mm/nommu.c b/mm/nommu.c index 13723736d38f..6909ea0bf88d 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1212,7 +1212,7 @@ unsigned long do_mmap(struct file *file, region->vm_flags = vm_flags; region->vm_pgoff = pgoff; - INIT_LIST_HEAD(&vma->anon_vma_chain); + INIT_VMA(vma); vma->vm_flags = vm_flags; vma->vm_pgoff = pgoff; -- 2.7.4
[PATCH v10 06/25] mm: make pte_unmap_same compatible with SPF
pte_unmap_same() is making the assumption that the page table are still around because the mmap_sem is held. This is no more the case when running a speculative page fault and additional check must be made to ensure that the final page table are still there. This is now done by calling pte_spinlock() to check for the VMA's consistency while locking for the page tables. This is requiring passing a vm_fault structure to pte_unmap_same() which is containing all the needed parameters. As pte_spinlock() may fail in the case of a speculative page fault, if the VMA has been touched in our back, pte_unmap_same() should now return 3 cases : 1. pte are the same (0) 2. pte are different (VM_FAULT_PTNOTSAME) 3. a VMA's changes has been detected (VM_FAULT_RETRY) The case 2 is handled by the introduction of a new VM_FAULT flag named VM_FAULT_PTNOTSAME which is then trapped in cow_user_page(). If VM_FAULT_RETRY is returned, it is passed up to the callers to retry the page fault while holding the mmap_sem. 
Acked-by: David Rientjes Signed-off-by: Laurent Dufour --- include/linux/mm.h | 1 + mm/memory.c| 39 --- 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 4d1aff80669c..714da99d77a3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1208,6 +1208,7 @@ static inline void clear_page_pfmemalloc(struct page *page) #define VM_FAULT_NEEDDSYNC 0x2000 /* ->fault did not modify page tables * and needs fsync() to complete (for * synchronous page faults in DAX) */ +#define VM_FAULT_PTNOTSAME 0x4000 /* Page table entries have changed */ #define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | \ VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE | \ diff --git a/mm/memory.c b/mm/memory.c index 0b9a51f80e0e..f86efcb8e268 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2309,21 +2309,29 @@ static inline bool pte_map_lock(struct vm_fault *vmf) * parts, do_swap_page must check under lock before unmapping the pte and * proceeding (but do_wp_page is only called after already making such a check; * and do_anonymous_page can safely check later on). + * + * pte_unmap_same() returns: + * 0 if the PTE are the same + * VM_FAULT_PTNOTSAME if the PTE are different + * VM_FAULT_RETRY if the VMA has changed in our back during + * a speculative page fault handling. 
*/ -static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, - pte_t *page_table, pte_t orig_pte) +static inline int pte_unmap_same(struct vm_fault *vmf) { - int same = 1; + int ret = 0; + #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) if (sizeof(pte_t) > sizeof(unsigned long)) { - spinlock_t *ptl = pte_lockptr(mm, pmd); - spin_lock(ptl); - same = pte_same(*page_table, orig_pte); - spin_unlock(ptl); + if (pte_spinlock(vmf)) { + if (!pte_same(*vmf->pte, vmf->orig_pte)) + ret = VM_FAULT_PTNOTSAME; + spin_unlock(vmf->ptl); + } else + ret = VM_FAULT_RETRY; } #endif - pte_unmap(page_table); - return same; + pte_unmap(vmf->pte); + return ret; } static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) @@ -2912,10 +2920,19 @@ int do_swap_page(struct vm_fault *vmf) pte_t pte; int locked; int exclusive = 0; - int ret = 0; + int ret; - if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) + ret = pte_unmap_same(vmf); + if (ret) { + /* +* If pte != orig_pte, this means another thread did the +* swap operation in our back. +* So nothing else to do. +*/ + if (ret == VM_FAULT_PTNOTSAME) + ret = 0; goto out; + } entry = pte_to_swp_entry(vmf->orig_pte); if (unlikely(non_swap_entry(entry))) { -- 2.7.4
[PATCH v10 05/25] mm: introduce pte_spinlock for FAULT_FLAG_SPECULATIVE
When handling page fault without holding the mmap_sem the fetch of the pte lock pointer and the locking will have to be done while ensuring that the VMA is not touched in our back. So move the fetch and locking operations in a dedicated function. Signed-off-by: Laurent Dufour --- mm/memory.c | 15 +++ 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 4528bd584b7a..0b9a51f80e0e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2288,6 +2288,13 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, } EXPORT_SYMBOL_GPL(apply_to_page_range); +static inline bool pte_spinlock(struct vm_fault *vmf) +{ + vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd); + spin_lock(vmf->ptl); + return true; +} + static inline bool pte_map_lock(struct vm_fault *vmf) { vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, @@ -3804,8 +3811,8 @@ static int do_numa_page(struct vm_fault *vmf) * validation through pte_unmap_same(). It's of NUMA type but * the pfn may be screwed if the read is non atomic. */ - vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd); - spin_lock(vmf->ptl); + if (!pte_spinlock(vmf)) + return VM_FAULT_RETRY; if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) { pte_unmap_unlock(vmf->pte, vmf->ptl); goto out; @@ -3998,8 +4005,8 @@ static int handle_pte_fault(struct vm_fault *vmf) if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma)) return do_numa_page(vmf); - vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd); - spin_lock(vmf->ptl); + if (!pte_spinlock(vmf)) + return VM_FAULT_RETRY; entry = vmf->orig_pte; if (unlikely(!pte_same(*vmf->pte, entry))) goto unlock; -- 2.7.4
[PATCH v10 04/25] mm: prepare for FAULT_FLAG_SPECULATIVE
From: Peter Zijlstra When speculating faults (without holding mmap_sem) we need to validate that the vma against which we loaded pages is still valid when we're ready to install the new PTE. Therefore, replace the pte_offset_map_lock() calls that (re)take the PTL with pte_map_lock() which can fail in case we find the VMA changed since we started the fault. Signed-off-by: Peter Zijlstra (Intel) [Port to 4.12 kernel] [Remove the comment about the fault_env structure which has been implemented as the vm_fault structure in the kernel] [move pte_map_lock()'s definition upper in the file] [move the define of FAULT_FLAG_SPECULATIVE later in the series] [review error path in do_swap_page(), do_anonymous_page() and wp_page_copy()] Signed-off-by: Laurent Dufour --- mm/memory.c | 87 - 1 file changed, 58 insertions(+), 29 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index a1f990e33e38..4528bd584b7a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2288,6 +2288,13 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, } EXPORT_SYMBOL_GPL(apply_to_page_range); +static inline bool pte_map_lock(struct vm_fault *vmf) +{ + vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + return true; +} + /* * handle_pte_fault chooses page fault handler according to an entry which was * read non-atomically. 
Before making any commitment, on those architectures @@ -2477,25 +2484,26 @@ static int wp_page_copy(struct vm_fault *vmf) const unsigned long mmun_start = vmf->address & PAGE_MASK; const unsigned long mmun_end = mmun_start + PAGE_SIZE; struct mem_cgroup *memcg; + int ret = VM_FAULT_OOM; if (unlikely(anon_vma_prepare(vma))) - goto oom; + goto out; if (is_zero_pfn(pte_pfn(vmf->orig_pte))) { new_page = alloc_zeroed_user_highpage_movable(vma, vmf->address); if (!new_page) - goto oom; + goto out; } else { new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); if (!new_page) - goto oom; + goto out; cow_user_page(new_page, old_page, vmf->address, vma); } if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) - goto oom_free_new; + goto out_free_new; __SetPageUptodate(new_page); @@ -2504,7 +2512,10 @@ static int wp_page_copy(struct vm_fault *vmf) /* * Re-check the pte - we dropped the lock */ - vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl); + if (!pte_map_lock(vmf)) { + ret = VM_FAULT_RETRY; + goto out_uncharge; + } if (likely(pte_same(*vmf->pte, vmf->orig_pte))) { if (old_page) { if (!PageAnon(old_page)) { @@ -2591,12 +2602,14 @@ static int wp_page_copy(struct vm_fault *vmf) put_page(old_page); } return page_copied ? VM_FAULT_WRITE : 0; -oom_free_new: +out_uncharge: + mem_cgroup_cancel_charge(new_page, memcg, false); +out_free_new: put_page(new_page); -oom: +out: if (old_page) put_page(old_page); - return VM_FAULT_OOM; + return ret; } /** @@ -2617,8 +2630,8 @@ static int wp_page_copy(struct vm_fault *vmf) int finish_mkwrite_fault(struct vm_fault *vmf) { WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED)); - vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, - &vmf->ptl); + if (!pte_map_lock(vmf)) + return VM_FAULT_RETRY; /* * We might have raced with another page fault while we released the * pte_offset_map_lock. 
@@ -2736,8 +2749,11 @@ static int do_wp_page(struct vm_fault *vmf) get_page(vmf->page); pte_unmap_unlock(vmf->pte, vmf->ptl); lock_page(vmf->page); - vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, - vmf->address, &vmf->ptl); + if (!pte_map_lock(vmf)) { + unlock_page(vmf->page); + put_page(vmf->page); + return VM_FAULT_RETRY; + } if (!pte_same(*vmf->pte, vmf->orig_pte)) { unlock_page(vmf->page); pte_unmap_unlock(vmf->pte, vmf->ptl); @@ -2944,11 +2960,15 @@ int do_swap_page(struct vm_fault *vmf) if (!page) {
[PATCH v10 03/25] powerpc/mm: set ARCH_SUPPORTS_SPECULATIVE_PAGE_FAULT
Set ARCH_SUPPORTS_SPECULATIVE_PAGE_FAULT for BOOK3S_64. This enables the Speculative Page Fault handler. Support is only provided for BOOK3S_64 currently because: - require CONFIG_PPC_STD_MMU because checks done in set_access_flags_filter() - require BOOK3S because we can't support book3e_hugetlb_preload() called by update_mmu_cache() Cc: Michael Ellerman Signed-off-by: Laurent Dufour --- arch/powerpc/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index c32a181a7cbb..21ef887da7a3 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -233,6 +233,7 @@ config PPC select OLD_SIGACTIONif PPC32 select OLD_SIGSUSPEND select SPARSE_IRQ + select ARCH_SUPPORTS_SPECULATIVE_PAGE_FAULT if PPC_BOOK3S_64 select SYSCTL_EXCEPTION_TRACE select VIRT_TO_BUS if !PPC64 # -- 2.7.4
[PATCH v10 02/25] x86/mm: define ARCH_SUPPORTS_SPECULATIVE_PAGE_FAULT
Set ARCH_SUPPORTS_SPECULATIVE_PAGE_FAULT which turns on the Speculative Page Fault handler when building for 64bit. Cc: Thomas Gleixner Signed-off-by: Laurent Dufour --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d8983df5a2bc..ebdeb48e4a4a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -30,6 +30,7 @@ config X86_64 select MODULES_USE_ELF_RELA select X86_DEV_DMA_OPS select ARCH_HAS_SYSCALL_WRAPPER + select ARCH_SUPPORTS_SPECULATIVE_PAGE_FAULT # # Arch settings -- 2.7.4
[PATCH v10 01/25] mm: introduce CONFIG_SPECULATIVE_PAGE_FAULT
This configuration variable will be used to build the code needed to handle speculative page fault. By default it is turned off, and activated depending on architecture support, SMP and MMU. Suggested-by: Thomas Gleixner Suggested-by: David Rientjes Signed-off-by: Laurent Dufour --- mm/Kconfig | 22 ++ 1 file changed, 22 insertions(+) diff --git a/mm/Kconfig b/mm/Kconfig index d5004d82a1d6..5484dca11199 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -752,3 +752,25 @@ config GUP_BENCHMARK performance of get_user_pages_fast(). See tools/testing/selftests/vm/gup_benchmark.c + +config ARCH_SUPPORTS_SPECULATIVE_PAGE_FAULT + def_bool n + +config SPECULATIVE_PAGE_FAULT + bool "Speculative page faults" + default y + depends on ARCH_SUPPORTS_SPECULATIVE_PAGE_FAULT + depends on MMU && SMP + help + Try to handle user space page faults without holding the mmap_sem. + +This should allow better concurrency for massively threaded process +since the page fault handler will not wait for other threads memory +layout change to be done, assuming that this change is done in another +part of the process's memory space. This type of page fault is named +speculative page fault. + +If the speculative page fault fails because of a concurrency is +detected or because underlying PMD or PTE tables are not yet +allocating, it is failing its processing and a classic page fault +is then tried. -- 2.7.4
[PATCH v10 00/25] Speculative page faults
This is a port on kernel 4.16 of the work done by Peter Zijlstra to handle page fault without holding the mm semaphore [1]. The idea is to try to handle user space page faults without holding the mmap_sem. This should allow better concurrency for massively threaded processes since the page fault handler will not wait for other threads' memory layout change to be done, assuming that this change is done in another part of the process's memory space. This type of page fault is named speculative page fault. If the speculative page fault fails because concurrency is detected or because underlying PMD or PTE tables are not yet allocated, it fails its processing and a classic page fault is then tried. The speculative page fault (SPF) has to look for the VMA matching the fault address without holding the mmap_sem; this is done by introducing a rwlock which protects the access to the mm_rb tree. Previously this was done using SRCU but it was introducing a lot of scheduling to process the VMA's freeing operation which was hitting the performance by 20% as reported by Kemi Wang [2]. Using a rwlock to protect access to the mm_rb tree limits the locking contention to these operations which are expected to be in a O(log n) order. In addition, to ensure that the VMA is not freed behind our back, a reference count is added and 2 services (get_vma() and put_vma()) are introduced to handle the reference count. When a VMA is fetched from the RB tree using get_vma() it must later be freed using put_vma(). Furthermore, to allow the VMA to be used again by the classic page fault handler, a service, can_reuse_spf_vma(), is introduced. This service is expected to be called with the mmap_sem held. It checks that the VMA still matches the specified address and releases its reference count; as the mmap_sem is held, it is ensured that the VMA will not be freed behind our back. 
In general, the VMA's reference count could be decremented when holding the mmap_sem but it should not be increased as holding the mmap_sem is ensuring that the VMA is stable. I can no longer see the overhead I got with the will-it-scale benchmark. The VMA's attributes checked during the speculative page fault processing have to be protected against parallel changes. This is done by using a per VMA sequence lock. This sequence lock allows the speculative page fault handler to fast check for parallel changes in progress and to abort the speculative page fault in that case. Once the VMA is found, the speculative page fault handler would check for the VMA's attributes to verify that the page fault has to be handled correctly or not. Thus the VMA is protected through a sequence lock which allows fast detection of concurrent VMA changes. If such a change is detected, the speculative page fault is aborted and a *classic* page fault is tried. VMA sequence lockings are added when VMA attributes which are checked during the page fault are modified. When the PTE is fetched, the VMA is checked to see if it has been changed, so once the page table is locked, the VMA is valid, so any other changes leading to touching this PTE will need to lock the page table, so no parallel change is possible at this time. The locking of the PTE is done with interrupts disabled; this allows checking the PMD to ensure that there is not an ongoing collapsing operation. Since khugepaged first sets the PMD to pmd_none and then waits for the other CPU to have caught the IPI interrupt, if the pmd is valid at the time the PTE is locked, we have the guarantee that the collapsing operation will have to wait on the PTE lock to move forward. This allows the SPF handler to map the PTE safely. If the PMD value is different from the one recorded at the beginning of the SPF operation, the classic page fault handler will be called to handle the operation while holding the mmap_sem. 
As the PTE lock is done with the interrupts disabled, the lock is done using spin_trylock() to avoid deadlock when handling a page fault while a TLB invalidate is requested by another CPU holding the PTE. In pseudo code, this could be seen as: speculative_page_fault() { vma = GET_VMA_vma() check vma sequence count check vma's support disable interrupt check pgd,p4d,...,pte save pmd and pte in vmf save vma sequence counter in vmf enable interrupt check vma sequence count handle_pte_fault(vma) .. page = alloc_page() pte_map_lock() disable interrupt abort if sequence counter has changed abort if pmd or pte has changed pte map and lock enable interrupt if abort free page abort
Re: [RFC PATCH 1/3] signal: Ensure every siginfo we send has all bits initialized
On Sun, Apr 15, 2018 at 10:57:33AM -0500, Eric W. Biederman wrote: > > Call clear_siginfo to ensure every stack allocated siginfo is properly > initialized before being passed to the signal sending functions. > > Note: It is not safe to depend on C initializers to initialize struct > siginfo on the stack because C is allowed to skip holes when > initializing a structure. > > The initialization of struct siginfo in tracehook_report_syscall_exit > was moved from the helper user_single_step_siginfo into > tracehook_report_syscall_exit itself, to make it clear that the local > variable siginfo gets fully initialized. > > In a few cases the scope of struct siginfo has been reduced to make it > clear that siginfo siginfo is not used on other paths in the function > in which it is declared. > > Instances of using memset to initialize siginfo have been replaced > with calls clear_siginfo for clarity. > > Signed-off-by: "Eric W. Biederman" [...] Hmmm memset()/clear_siginfo() may ensure that there are no uninitialised explicit fields except for those in inactive union members, but I'm not sure that this approach is guaranteed to sanitise the padding seen by userspace. Rationale below, though it's a bit theoretical... With this in mind, I tend agree with Linus that hiding memset() calls from the maintainer may be a bad idea unless they are also hidden from the compiler. If the compiler sees the memset() it may be able to optimise it in ways that wouldn't be possible for some other random external function call, including optimising all or part of the call out. As a result, the breakdown into individual put_user()s etc. in copy_siginfo_to_user() may still be valuable even if all paths have the memset(). 
(Rationale for an arch/arm example:) > diff --git a/arch/arm/vfp/vfpmodule.c b/arch/arm/vfp/vfpmodule.c > index 4c375e11ae95..adda3fc2dde8 100644 > --- a/arch/arm/vfp/vfpmodule.c > +++ b/arch/arm/vfp/vfpmodule.c > @@ -218,8 +218,7 @@ static void vfp_raise_sigfpe(unsigned int sicode, struct > pt_regs *regs) > { > siginfo_t info; > > - memset(&info, 0, sizeof(info)); > - > + clear_siginfo(&info); > info.si_signo = SIGFPE; /* by c11 (n1570) 6.2.6.1 para 6 [1], all padding bytes in info now take unspecified values */ > info.si_code = sicode; > info.si_addr = (void __user *)(instruction_pointer(regs) - 4); /* by c11 (n1570) 6.2.6.1 para 7 [2], all bytes of the union info._sifields other than than those corresponding to _sigfault take unspecified values */ So I don't see why the compiler needs to ensure that any of the affected bytes are zero: it could potentially skip a lot of the memset() as a result, in theory. I've not seen a compiler actually take advantage of that, but I'm now not sure what forbids it. If this can happen, I only see two watertight workarounds: 1) Ensure that there is no implicit padding in any UAPI structure, e.g. aeb1f39d814b: ("arm64/ptrace: Avoid uninitialised struct padding in fpr_set()"). This would include tail-padding of any union member that is smaller than the containing union. It would be significantly more effort to ensure this for siginfo though. 2) Poke all values directly into allocated or user memory directly via pointers to paddingless types; never assign to objects on the kernel stack if you care what ends up in the padding, e.g., what your copy_siginfo_to_user() does prior to this series. If I'm not barking up the wrong tree, memset() cannot generally be used to determine the value of padding bytes, but it may still be useful for forcing otherwise uninitialised members to sane initial values. This likely affects many more things than just siginfo. [...] 
Cheers ---Dave [1] n1570 6.2.6.1.6: When a value is stored in an object of structure or union type, including in a member object, the bytes of the object representation that correspond to any padding bytes take unspecified values [...] [2] n1570 6.2.6.1.7: When a value is stored in a member of an object of union type, the bytes of the object representation that do not correspond to that member but do correspond to other members take unspecified values.
[PATCH] powerpc/time: remove to_tm and use RTC_LIB
RTC_LIB includes a generic function to convert RTC data into struct rtc_time. Use it and remove to_tm(). Signed-off-by: Christophe Leroy --- arch/powerpc/Kconfig| 1 + arch/powerpc/include/asm/time.h | 1 - arch/powerpc/kernel/rtas-proc.c | 4 +-- arch/powerpc/kernel/time.c | 52 + arch/powerpc/platforms/8xx/m8xx_setup.c | 2 +- arch/powerpc/platforms/powermac/time.c | 2 +- arch/powerpc/platforms/ps3/time.c | 2 +- 7 files changed, 7 insertions(+), 57 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index edbbd2ea1298..e1fac49cf465 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -232,6 +232,7 @@ config PPC select OF_RESERVED_MEM select OLD_SIGACTIONif PPC32 select OLD_SIGSUSPEND + select RTC_LIB select SPARSE_IRQ select SYSCTL_EXCEPTION_TRACE select VIRT_TO_BUS if !PPC64 diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h index db546c034905..0ad1cf2285b1 100644 --- a/arch/powerpc/include/asm/time.h +++ b/arch/powerpc/include/asm/time.h @@ -27,7 +27,6 @@ extern unsigned long tb_ticks_per_sec; extern struct clock_event_device decrementer_clockevent; struct rtc_time; -extern void to_tm(int tim, struct rtc_time * tm); extern void tick_broadcast_ipi_handler(void); extern void generic_calibrate_decr(void); diff --git a/arch/powerpc/kernel/rtas-proc.c b/arch/powerpc/kernel/rtas-proc.c index fb070d8cad07..6de77f9434b0 100644 --- a/arch/powerpc/kernel/rtas-proc.c +++ b/arch/powerpc/kernel/rtas-proc.c @@ -314,7 +314,7 @@ static ssize_t ppc_rtas_poweron_write(struct file *file, power_on_time = nowtime; /* save the time */ - to_tm(nowtime, &tm); + rtc_time64_to_tm(nowtime, &tm); error = rtas_call(rtas_token("set-time-for-power-on"), 7, 1, NULL, tm.tm_year, tm.tm_mon, tm.tm_mday, @@ -378,7 +378,7 @@ static ssize_t ppc_rtas_clock_write(struct file *file, if (error) return error; - to_tm(nowtime, &tm); + rtc_time64_to_tm(nowtime, &tm); error = rtas_call(rtas_token("set-time-of-day"), 7, 1, NULL, tm.tm_year, tm.tm_mon, 
tm.tm_mday, tm.tm_hour, tm.tm_min, tm.tm_sec, 0); diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 56869fd879ed..362673cc09f2 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -788,7 +788,7 @@ int update_persistent_clock(struct timespec now) if (!ppc_md.set_rtc_time) return -ENODEV; - to_tm(now.tv_sec + 1 + timezone_offset, &tm); + rtc_time64_to_tm(now.tv_sec + 1 + timezone_offset, &tm); tm.tm_year -= 1900; tm.tm_mon -= 1; @@ -1141,56 +1141,6 @@ void __init time_init(void) #endif } - -#define FEBRUARY 2 -#defineSTARTOFTIME 1970 -#define SECDAY 86400L -#define SECYR (SECDAY * 365) -#defineleapyear(year) ((year) % 4 == 0 && \ -((year) % 100 != 0 || (year) % 400 == 0)) -#definedays_in_year(a) (leapyear(a) ? 366 : 365) -#definedays_in_month(a)(month_days[(a) - 1]) - -static int month_days[12] = { - 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 -}; - -void to_tm(int tim, struct rtc_time * tm) -{ - register inti; - register long hms, day; - - day = tim / SECDAY; - hms = tim % SECDAY; - - /* Hours, minutes, seconds are easy */ - tm->tm_hour = hms / 3600; - tm->tm_min = (hms % 3600) / 60; - tm->tm_sec = (hms % 3600) % 60; - - /* Number of years in days */ - for (i = STARTOFTIME; day >= days_in_year(i); i++) - day -= days_in_year(i); - tm->tm_year = i; - - /* Number of months in days left */ - if (leapyear(tm->tm_year)) - days_in_month(FEBRUARY) = 29; - for (i = 1; day >= days_in_month(i); i++) - day -= days_in_month(i); - days_in_month(FEBRUARY) = 28; - tm->tm_mon = i; - - /* Days are what is left over (+1) from all that. */ - tm->tm_mday = day + 1; - - /* -* No-one uses the day of the week. -*/ - tm->tm_wday = -1; -} -EXPORT_SYMBOL(to_tm); - /* * Divide a 128-bit dividend by a 32-bit divisor, leaving a 128 bit * result. 
diff --git a/arch/powerpc/platforms/8xx/m8xx_setup.c b/arch/powerpc/platforms/8xx/m8xx_setup.c index 2188d691a40f..0f9740185eb9 100644 --- a/arch/powerpc/platforms/8xx/m8xx_setup.c +++ b/arch/powerpc/platforms/8xx/m8xx_setup.c @@ -192,7 +192,7 @@ void mpc8xx_get_rtc_time(struct rtc_time *tm) /* Get time from the RTC. */ data = in_be32(&sys_tmr->sit_rtc); - to_tm(data, tm); + rtc_time64_to_tm(data, tm); tm->tm_year -= 1900;
Re: [PATCH] powerpc/8xx: Build fix with Hugetlbfs enabled
Le 16/04/2018 à 13:27, Aneesh Kumar K.V a écrit : 8xx use slice code when hugetlbfs is enabled. We missed a header include on 8xx which resulted in the below build failure. config: mpc885_ads_defconfig + CONFIG_HUGETLBFS CC arch/powerpc/mm/slice.o arch/powerpc/mm/slice.c: In function 'slice_get_unmapped_area': arch/powerpc/mm/slice.c:655:2: error: implicit declaration of function 'need_extra_context' [-Werror=implicit-function-declaration] arch/powerpc/mm/slice.c:656:3: error: implicit declaration of function 'alloc_extended_context' [-Werror=implicit-function-declaration] cc1: all warnings being treated as errors make[1]: *** [arch/powerpc/mm/slice.o] Error 1 make: *** [arch/powerpc/mm] Error 2 on PPC64 the mmu_context.h was included via linux/pkeys.h CC: Christophe LEROY Signed-off-by: Aneesh Kumar K.V Tested-by: Christophe Leroy --- arch/powerpc/mm/slice.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index 9cd87d11fe4e..205fe557ca10 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -35,6 +35,7 @@ #include #include #include +#include static DEFINE_SPINLOCK(slice_convert_lock);
[PATCH] powerpc/8xx: Remove RTC clock on 88x
The 885 family processors don't have the Real Time Clock Signed-off-by: Christophe Leroy --- arch/powerpc/platforms/8xx/adder875.c| 2 -- arch/powerpc/platforms/8xx/ep88xc.c | 2 -- arch/powerpc/platforms/8xx/mpc885ads_setup.c | 2 -- 3 files changed, 6 deletions(-) diff --git a/arch/powerpc/platforms/8xx/adder875.c b/arch/powerpc/platforms/8xx/adder875.c index 333dece79394..bcef9f66191e 100644 --- a/arch/powerpc/platforms/8xx/adder875.c +++ b/arch/powerpc/platforms/8xx/adder875.c @@ -111,7 +111,5 @@ define_machine(adder875) { .get_irq = mpc8xx_get_irq, .restart = mpc8xx_restart, .calibrate_decr = generic_calibrate_decr, - .set_rtc_time = mpc8xx_set_rtc_time, - .get_rtc_time = mpc8xx_get_rtc_time, .progress = udbg_progress, }; diff --git a/arch/powerpc/platforms/8xx/ep88xc.c b/arch/powerpc/platforms/8xx/ep88xc.c index cd0d90f1fb1c..ebcf34a14789 100644 --- a/arch/powerpc/platforms/8xx/ep88xc.c +++ b/arch/powerpc/platforms/8xx/ep88xc.c @@ -170,7 +170,5 @@ define_machine(ep88xc) { .get_irq= mpc8xx_get_irq, .restart = mpc8xx_restart, .calibrate_decr = mpc8xx_calibrate_decr, - .set_rtc_time = mpc8xx_set_rtc_time, - .get_rtc_time = mpc8xx_get_rtc_time, .progress = udbg_progress, }; diff --git a/arch/powerpc/platforms/8xx/mpc885ads_setup.c b/arch/powerpc/platforms/8xx/mpc885ads_setup.c index e821a42d5816..a0c83c1905c6 100644 --- a/arch/powerpc/platforms/8xx/mpc885ads_setup.c +++ b/arch/powerpc/platforms/8xx/mpc885ads_setup.c @@ -220,7 +220,5 @@ define_machine(mpc885_ads) { .get_irq= mpc8xx_get_irq, .restart= mpc8xx_restart, .calibrate_decr = mpc8xx_calibrate_decr, - .set_rtc_time = mpc8xx_set_rtc_time, - .get_rtc_time = mpc8xx_get_rtc_time, .progress = udbg_progress, }; -- 2.13.3
[PATCH] powerpc/boot: remove unused variable in mpc8xx
Variable div is set but never used. Remove it. Signed-off-by: Christophe Leroy --- arch/powerpc/boot/mpc8xx.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/powerpc/boot/mpc8xx.c b/arch/powerpc/boot/mpc8xx.c index add55a7f184f..c9bd9285c548 100644 --- a/arch/powerpc/boot/mpc8xx.c +++ b/arch/powerpc/boot/mpc8xx.c @@ -24,7 +24,7 @@ u32 mpc885_get_clock(u32 crystal) { u32 *immr; u32 plprcr; - int mfi, mfn, mfd, pdf, div; + int mfi, mfn, mfd, pdf; u32 ret; immr = fsl_get_immr(); @@ -43,7 +43,6 @@ u32 mpc885_get_clock(u32 crystal) } pdf = (plprcr >> 1) & 0xf; - div = (plprcr >> 20) & 3; mfd = (plprcr >> 22) & 0x1f; mfn = (plprcr >> 27) & 0x1f; -- 2.13.3
[PATCH] powerpc/misc: merge reloc_offset() and add_reloc_offset()
reloc_offset() is the same as add_reloc_offset(0) Signed-off-by: Christophe Leroy --- arch/powerpc/kernel/misc.S | 17 +++-- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/kernel/misc.S b/arch/powerpc/kernel/misc.S index 384357cb8bc0..e1f3a5d054c4 100644 --- a/arch/powerpc/kernel/misc.S +++ b/arch/powerpc/kernel/misc.S @@ -25,23 +25,12 @@ /* * Returns (address we are running at) - (address we were linked at) * for use before the text and data are mapped to KERNELBASE. - */ - -_GLOBAL(reloc_offset) - mflrr0 - bl 1f -1: mflrr3 - PPC_LL r4,(2f-1b)(r3) - subfr3,r4,r3 - mtlrr0 - blr - .align 3 -2: PPC_LONG 1b - -/* * add_reloc_offset(x) returns x + reloc_offset(). */ + +_GLOBAL(reloc_offset) + li r3, 0 _GLOBAL(add_reloc_offset) mflrr0 bl 1f -- 2.13.3
[PATCH] powerpc: Allow selection of CONFIG_LD_DEAD_CODE_DATA_ELIMINATION
This option does dead code and data elimination with the linker by compiling with -ffunction-sections -fdata-sections and linking with --gc-sections. By selecting this option on mpc885_ads_defconfig, vmlinux LOAD segment size gets reduced by 10% Program Header before the patch: LOAD off0x0001 vaddr 0xc000 paddr 0x align 2**16 filesz 0x0036eda4 memsz 0x0038de04 flags rwx Program Header after the patch: LOAD off0x0001 vaddr 0xc000 paddr 0x align 2**16 filesz 0x00316da4 memsz 0x00334268 flags rwx Signed-off-by: Christophe Leroy --- arch/powerpc/Kconfig | 8 1 file changed, 8 insertions(+) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 8fe4353be5e3..e1fac49cf465 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -888,6 +888,14 @@ config PPC_MEM_KEYS If unsure, say y. +config PPC_UNUSED_ELIMINATION + bool "Eliminate unused functions and data from vmlinux" + default n + select LD_DEAD_CODE_DATA_ELIMINATION + help + Select this to do dead code and data elimination with the linker + by compiling with -ffunction-sections -fdata-sections and linking + with --gc-sections. endmenu config ISA_DMA_API -- 2.13.3
[PATCH 6/6 v2] arm64: dts: ls208xa: comply with the iommu map binding for fsl_mc
Fsl-mc bus now support the iommu-map property. Comply to this binding for fsl_mc bus. This patch also updates the dts w.r.t. the DMA configuration. Signed-off-by: Nipun Gupta --- arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi index f3a40af..1b1c5eb 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi @@ -135,6 +135,7 @@ #address-cells = <2>; #size-cells = <2>; ranges; + dma-ranges = <0x0 0x0 0x0 0x0 0x1 0x>; clockgen: clocking@130 { compatible = "fsl,ls2080a-clockgen"; @@ -357,6 +358,8 @@ reg = <0x0008 0x0c00 0 0x40>,/* MC portal base */ <0x 0x0834 0 0x4>; /* MC control reg */ msi-parent = <&its>; + iommu-map = <0 &smmu 0 0>; /* This is fixed-up by u-boot */ + dma-coherent; #address-cells = <3>; #size-cells = <1>; @@ -460,6 +463,8 @@ compatible = "arm,mmu-500"; reg = <0 0x500 0 0x80>; #global-interrupts = <12>; + #iommu-cells = <1>; + stream-match-mask = <0x7C00>; interrupts = <0 13 4>, /* global secure fault */ <0 14 4>, /* combined secure interrupt */ <0 15 4>, /* global non-secure fault */ @@ -502,7 +507,6 @@ <0 204 4>, <0 205 4>, <0 206 4>, <0 207 4>, <0 208 4>, <0 209 4>; - mmu-masters = <&fsl_mc 0x300 0>; }; dspi: dspi@210 { -- 1.9.1
[PATCH 5/6 v2] bus: fsl-mc: support dma configure for devices on fsl-mc bus
Signed-off-by: Nipun Gupta --- drivers/bus/fsl-mc/fsl-mc-bus.c | 16 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/bus/fsl-mc/fsl-mc-bus.c b/drivers/bus/fsl-mc/fsl-mc-bus.c index 5d8266c..624828b 100644 --- a/drivers/bus/fsl-mc/fsl-mc-bus.c +++ b/drivers/bus/fsl-mc/fsl-mc-bus.c @@ -127,6 +127,16 @@ static int fsl_mc_bus_uevent(struct device *dev, struct kobj_uevent_env *env) return 0; } +static int fsl_mc_dma_configure(struct device *dev) +{ + struct device *dma_dev = dev; + + while (dev_is_fsl_mc(dma_dev)) + dma_dev = dma_dev->parent; + + return of_dma_configure(dev, dma_dev->of_node, 0); +} + static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -148,6 +158,7 @@ struct bus_type fsl_mc_bus_type = { .name = "fsl-mc", .match = fsl_mc_bus_match, .uevent = fsl_mc_bus_uevent, + .dma_configure = fsl_mc_dma_configure, .dev_groups = fsl_mc_dev_groups, }; EXPORT_SYMBOL_GPL(fsl_mc_bus_type); @@ -616,6 +627,7 @@ int fsl_mc_device_add(struct fsl_mc_obj_desc *obj_desc, mc_dev->icid = parent_mc_dev->icid; mc_dev->dma_mask = FSL_MC_DEFAULT_DMA_MASK; mc_dev->dev.dma_mask = &mc_dev->dma_mask; + mc_dev->dev.coherent_dma_mask = mc_dev->dma_mask; dev_set_msi_domain(&mc_dev->dev, dev_get_msi_domain(&parent_mc_dev->dev)); } @@ -633,10 +645,6 @@ int fsl_mc_device_add(struct fsl_mc_obj_desc *obj_desc, goto error_cleanup_dev; } - /* Objects are coherent, unless 'no shareability' flag set. */ - if (!(obj_desc->flags & FSL_MC_OBJ_FLAG_NO_MEM_SHAREABILITY)) - arch_setup_dma_ops(&mc_dev->dev, 0, 0, NULL, true); - /* * The device-specific probe callback will get invoked by device_add() */ -- 1.9.1
[PATCH 4/6 v2] iommu: arm-smmu: Add support for the fsl-mc bus
Implement bus specific support for the fsl-mc bus including registering arm_smmu_ops and bus specific device add operations. Signed-off-by: Nipun Gupta --- drivers/iommu/arm-smmu.c | 7 +++ drivers/iommu/iommu.c| 21 + include/linux/fsl/mc.h | 8 include/linux/iommu.h| 2 ++ 4 files changed, 38 insertions(+) diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c index 69e7c60..e1d5090 100644 --- a/drivers/iommu/arm-smmu.c +++ b/drivers/iommu/arm-smmu.c @@ -52,6 +52,7 @@ #include #include +#include #include "io-pgtable.h" #include "arm-smmu-regs.h" @@ -1459,6 +1460,8 @@ static struct iommu_group *arm_smmu_device_group(struct device *dev) if (dev_is_pci(dev)) group = pci_device_group(dev); + else if (dev_is_fsl_mc(dev)) + group = fsl_mc_device_group(dev); else group = generic_device_group(dev); @@ -2037,6 +2040,10 @@ static void arm_smmu_bus_init(void) bus_set_iommu(&pci_bus_type, &arm_smmu_ops); } #endif +#ifdef CONFIG_FSL_MC_BUS + if (!iommu_present(&fsl_mc_bus_type)) + bus_set_iommu(&fsl_mc_bus_type, &arm_smmu_ops); +#endif } static int arm_smmu_device_probe(struct platform_device *pdev) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 69fef99..fbeebb2 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -32,6 +32,7 @@ #include #include #include +#include #include static struct kset *iommu_group_kset; @@ -987,6 +988,26 @@ struct iommu_group *pci_device_group(struct device *dev) return iommu_group_alloc(); } +/* Get the IOMMU group for device on fsl-mc bus */ +struct iommu_group *fsl_mc_device_group(struct device *dev) +{ + struct device *cont_dev = fsl_mc_cont_dev(dev); + struct iommu_group *group; + + /* Container device is responsible for creating the iommu group */ + if (fsl_mc_is_cont_dev(dev)) { + group = iommu_group_alloc(); + if (IS_ERR(group)) + return NULL; + } else { + get_device(cont_dev); + group = iommu_group_get(cont_dev); + put_device(cont_dev); + } + + return group; +} + /** * iommu_group_get_for_dev - Find or 
create the IOMMU group for a device * @dev: target device diff --git a/include/linux/fsl/mc.h b/include/linux/fsl/mc.h index f27cb14..dddaca1 100644 --- a/include/linux/fsl/mc.h +++ b/include/linux/fsl/mc.h @@ -351,6 +351,14 @@ struct fsl_mc_io { #define dev_is_fsl_mc(_dev) (0) #endif +/* Macro to check if a device is a container device */ +#define fsl_mc_is_cont_dev(_dev) (to_fsl_mc_device(_dev)->flags & \ + FSL_MC_IS_DPRC) + +/* Macro to get the container device of a MC device */ +#define fsl_mc_cont_dev(_dev) (fsl_mc_is_cont_dev(_dev) ? \ + (_dev) : (_dev)->parent) + /* * module_fsl_mc_driver() - Helper macro for drivers that don't do * anything special in module init/exit. This eliminates a lot of diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 41b8c57..00a460b 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -389,6 +389,8 @@ static inline size_t iommu_map_sg(struct iommu_domain *domain, extern struct iommu_group *pci_device_group(struct device *dev); /* Generic device grouping function */ extern struct iommu_group *generic_device_group(struct device *dev); +/* FSL-MC device grouping function */ +struct iommu_group *fsl_mc_device_group(struct device *dev); /** * struct iommu_fwspec - per-device IOMMU instance data -- 1.9.1
[PATCH 3/6 v2] iommu: support iommu configuration for fsl-mc devices
Signed-off-by: Nipun Gupta --- drivers/iommu/of_iommu.c | 20 1 file changed, 20 insertions(+) diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c index 4e7712f..af4fc3b 100644 --- a/drivers/iommu/of_iommu.c +++ b/drivers/iommu/of_iommu.c @@ -24,6 +24,7 @@ #include #include #include +#include #define NO_IOMMU 1 @@ -260,6 +261,23 @@ static int of_pci_iommu_init(struct pci_dev *pdev, u16 alias, void *data) return err; } +static int of_fsl_mc_iommu_init(struct fsl_mc_device *mc_dev, + struct device_node *master_np) +{ + struct of_phandle_args iommu_spec = { .args_count = 1 }; + int err; + + err = of_map_rid(master_np, mc_dev->icid, "iommu-map", +"iommu-map-mask", &iommu_spec.np, +iommu_spec.args); + if (err) + return err == -ENODEV ? NO_IOMMU : err; + + err = of_iommu_xlate(&mc_dev->dev, &iommu_spec); + of_node_put(iommu_spec.np); + return err; +} + const struct iommu_ops *of_iommu_configure(struct device *dev, struct device_node *master_np) { @@ -291,6 +309,8 @@ const struct iommu_ops *of_iommu_configure(struct device *dev, err = pci_for_each_dma_alias(to_pci_dev(dev), of_pci_iommu_init, &info); + } else if (dev_is_fsl_mc(dev)) { + err = of_fsl_mc_iommu_init(to_fsl_mc_device(dev), master_np); } else { struct of_phandle_args iommu_spec; int idx = 0; -- 1.9.1
[PATCH 2/6 v2] iommu: of: make of_pci_map_rid() available for other devices too
iommu-map property is also used by devices with fsl-mc. This patch moves the of_pci_map_rid to generic location, so that it can be used by other busses too. Signed-off-by: Nipun Gupta --- drivers/iommu/of_iommu.c | 106 +-- drivers/of/irq.c | 6 +-- drivers/pci/of.c | 101 include/linux/of_iommu.h | 11 + include/linux/of_pci.h | 10 - 5 files changed, 117 insertions(+), 117 deletions(-) diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c index 5c36a8b..4e7712f 100644 --- a/drivers/iommu/of_iommu.c +++ b/drivers/iommu/of_iommu.c @@ -138,6 +138,106 @@ static int of_iommu_xlate(struct device *dev, return ops->of_xlate(dev, iommu_spec); } +/** + * of_map_rid - Translate a requester ID through a downstream mapping. + * @np: root complex device node. + * @rid: device requester ID to map. + * @map_name: property name of the map to use. + * @map_mask_name: optional property name of the mask to use. + * @target: optional pointer to a target device node. + * @id_out: optional pointer to receive the translated ID. + * + * Given a device requester ID, look up the appropriate implementation-defined + * platform ID and/or the target device which receives transactions on that + * ID, as per the "iommu-map" and "msi-map" bindings. Either of @target or + * @id_out may be NULL if only the other is required. If @target points to + * a non-NULL device node pointer, only entries targeting that node will be + * matched; if it points to a NULL value, it will receive the device node of + * the first matching target phandle, with a reference held. + * + * Return: 0 on success or a standard error code on failure. 
+ */ +int of_map_rid(struct device_node *np, u32 rid, + const char *map_name, const char *map_mask_name, + struct device_node **target, u32 *id_out) +{ + u32 map_mask, masked_rid; + int map_len; + const __be32 *map = NULL; + + if (!np || !map_name || (!target && !id_out)) + return -EINVAL; + + map = of_get_property(np, map_name, &map_len); + if (!map) { + if (target) + return -ENODEV; + /* Otherwise, no map implies no translation */ + *id_out = rid; + return 0; + } + + if (!map_len || map_len % (4 * sizeof(*map))) { + pr_err("%pOF: Error: Bad %s length: %d\n", np, + map_name, map_len); + return -EINVAL; + } + + /* The default is to select all bits. */ + map_mask = 0x; + + /* +* Can be overridden by "{iommu,msi}-map-mask" property. +*/ + if (map_mask_name) + of_property_read_u32(np, map_mask_name, &map_mask); + + masked_rid = map_mask & rid; + for ( ; map_len > 0; map_len -= 4 * sizeof(*map), map += 4) { + struct device_node *phandle_node; + u32 rid_base = be32_to_cpup(map + 0); + u32 phandle = be32_to_cpup(map + 1); + u32 out_base = be32_to_cpup(map + 2); + u32 rid_len = be32_to_cpup(map + 3); + + if (rid_base & ~map_mask) { + pr_err("%pOF: Invalid %s translation - %s-mask (0x%x) ignores rid-base (0x%x)\n", + np, map_name, map_name, + map_mask, rid_base); + return -EFAULT; + } + + if (masked_rid < rid_base || masked_rid >= rid_base + rid_len) + continue; + + phandle_node = of_find_node_by_phandle(phandle); + if (!phandle_node) + return -ENODEV; + + if (target) { + if (*target) + of_node_put(phandle_node); + else + *target = phandle_node; + + if (*target != phandle_node) + continue; + } + + if (id_out) + *id_out = masked_rid - rid_base + out_base; + + pr_debug("%pOF: %s, using mask %08x, rid-base: %08x, out-base: %08x, length: %08x, rid: %08x -> %08x\n", + np, map_name, map_mask, rid_base, out_base, + rid_len, rid, masked_rid - rid_base + out_base); + return 0; + } + + pr_err("%pOF: Invalid %s translation - no match for rid 0x%x on %pOF\n", + np, map_name, rid, 
target && *target ? *target : NULL); + return -EFAULT; +} + struct of_pci_iommu_alias_info { struct device *dev; struct device_node *np; @@ -149,9 +249,9 @@ static int of_pci_iommu_init(struct pci_dev *pdev, u16 alias, void *data) struct of_phandle_args iommu_spec = { .args_count = 1 }; i
[PATCH 1/6 v2] Docs: dt: add fsl-mc iommu-map device-tree binding
The existing IOMMU bindings cannot be used to specify the relationship between fsl-mc devices and IOMMUs. This patch adds a generic binding for mapping fsl-mc devices to IOMMUs, using iommu-map property. Signed-off-by: Nipun Gupta --- .../devicetree/bindings/misc/fsl,qoriq-mc.txt | 39 ++ 1 file changed, 39 insertions(+) diff --git a/Documentation/devicetree/bindings/misc/fsl,qoriq-mc.txt b/Documentation/devicetree/bindings/misc/fsl,qoriq-mc.txt index 6611a7c..8cbed4f 100644 --- a/Documentation/devicetree/bindings/misc/fsl,qoriq-mc.txt +++ b/Documentation/devicetree/bindings/misc/fsl,qoriq-mc.txt @@ -9,6 +9,25 @@ blocks that can be used to create functional hardware objects/devices such as network interfaces, crypto accelerator instances, L2 switches, etc. +For an overview of the DPAA2 architecture and fsl-mc bus see: +drivers/staging/fsl-mc/README.txt + +As described in the above overview, all DPAA2 objects in a DPRC share the +same hardware "isolation context" and a 10-bit value called an ICID +(isolation context id) is expressed by the hardware to identify +the requester. + +The generic 'iommus' property is insufficient to describe the relationship +between ICIDs and IOMMUs, so an iommu-map property is used to define +the set of possible ICIDs under a root DPRC and how they map to +an IOMMU. + +For generic IOMMU bindings, see +Documentation/devicetree/bindings/iommu/iommu.txt. + +For arm-smmu binding, see: +Documentation/devicetree/bindings/iommu/arm,smmu.txt. + Required properties: - compatible @@ -88,14 +107,34 @@ Sub-nodes: Value type: Definition: Specifies the phandle to the PHY device node associated with the this dpmac. +Optional properties: + +- iommu-map: Maps an ICID to an IOMMU and associated iommu-specifier + data. + + The property is an arbitrary number of tuples of + (icid-base,iommu,iommu-base,length). 
+ + Any ICID i in the interval [icid-base, icid-base + length) is + associated with the listed IOMMU, with the iommu-specifier + (i - icid-base + iommu-base). Example: +smmu: iommu@500 { + compatible = "arm,mmu-500"; + #iommu-cells = <2>; + stream-match-mask = <0x7C00>; + ... +}; + fsl_mc: fsl-mc@80c00 { compatible = "fsl,qoriq-mc"; reg = <0x0008 0x0c00 0 0x40>,/* MC portal base */ <0x 0x0834 0 0x4>; /* MC control reg */ msi-parent = <&its>; +/* define map for ICIDs 23-64 */ +iommu-map = <23 &smmu 23 41>; #address-cells = <3>; #size-cells = <1>; -- 1.9.1
[PATCH 0/6 v2] Support for fsl-mc bus and its devices in SMMU
This patchset defines IOMMU DT binding for fsl-mc bus and adds support in SMMU for fsl-mc bus. This patch series is dependent on patset: https://patchwork.kernel.org/patch/10317337/ These patches - Define property 'iommu-map' for fsl-mc bus (patch 1) - Integrates the fsl-mc bus with the SMMU using this IOMMU binding (patch 2,3,4) - Adds the dma configuration support for fsl-mc bus (patch 5) - Updates the fsl-mc device node with iommu/dma related changes (patch6) Nipun Gupta (6): Docs: dt: add fsl-mc iommu-map device-tree binding iommu: of: make of_pci_map_rid() available for other devices too iommu: support iommu configuration for fsl-mc devices iommu: arm-smmu: Add support for the fsl-mc bus bus: fsl-mc: supoprt dma configure for devices on fsl-mc bus arm64: dts: ls208xa: comply with the iommu map binding for fsl_mc Changes in v2: - use iommu-map property for fsl-mc bus - rebase over patchset https://patchwork.kernel.org/patch/10317337/ and make corresponding changes for dma configuration of devices on fsl-mc bus .../devicetree/bindings/misc/fsl,qoriq-mc.txt | 39 +++ arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi | 6 +- drivers/bus/fsl-mc/fsl-mc-bus.c| 16 ++- drivers/iommu/arm-smmu.c | 7 ++ drivers/iommu/iommu.c | 21 drivers/iommu/of_iommu.c | 126 - drivers/of/irq.c | 6 +- drivers/pci/of.c | 101 - include/linux/fsl/mc.h | 8 ++ include/linux/iommu.h | 2 + include/linux/of_iommu.h | 11 ++ include/linux/of_pci.h | 10 -- 12 files changed, 231 insertions(+), 122 deletions(-) -- 1.9.1
Re: [1/5] powerpc/lib: Fix off-by-one in alternate feature patching
On Mon, 2018-04-16 at 14:39:01 UTC, Michael Ellerman wrote: > When we patch an alternate feature section, we have to adjust any > relative branches that branch out of the alternate section. > > But currently we have a bug if we have a branch that points to past > the last instruction of the alternate section, eg: > > FTR_SECTION_ELSE > 1: b 2f > or 6,6,6 > 2: > ALT_FTR_SECTION_END(...) > nop > > This will result in a relative branch at 1 with a target that equals > the end of the alternate section. > > That branch does not need adjusting when it's moved to the non-else > location. Currently we do adjust it, resulting in a branch that goes > off into the link-time location of the else section, which is junk. > > The fix is to not patch branches that have a target == end of the > alternate section. > > Fixes: d20fe50a7b3c ("KVM: PPC: Book3S HV: Branch inside feature section") > Fixes: 9b1a735de64c ("powerpc: Add logic to patch alternative feature > sections") > Cc: sta...@vger.kernel.org # v2.6.27+ > Signed-off-by: Michael Ellerman Applied to powerpc fixes. https://git.kernel.org/powerpc/c/b8858581febb050688e276b956796b cheers
Re: powerpc/64s: Default l1d_size to 64K in RFI fallback flush
On Tue, 2018-04-17 at 01:49:20 UTC, Michael Ellerman wrote: > From: Madhavan Srinivasan > > If there is no d-cache-size property in the device tree, l1d_size could > be zero. We don't actually expect that to happen, it's only been seen > on mambo (simulator) in some configurations. > > A zero-size l1d_size leads to the loop in the asm wrapping around to > 2^64-1, and then walking off the end of the fallback area and > eventually causing a page fault which is fatal. > > Just default to 64K which is correct on some CPUs, and sane enough to > not cause a crash on others. > > Fixes: aa8a5e0062ac9 ('powerpc/64s: Add support for RFI flush of L1-D cache') > Signed-off-by: Madhavan Srinivasan > [mpe: Rewrite comment and change log] > Signed-off-by: Michael Ellerman Applied to powerpc fixes. https://git.kernel.org/powerpc/c/9dfbf78e4114fcaf4ef61c49885c3a cheers
[RESEND PATCH 1/3] powerpc: dts: use 'atmel' as at24 manufacturer for pdm360ng
Using 'at' as the part of the compatible string is now deprecated. Use a correct string: 'atmel,'. Signed-off-by: Bartosz Golaszewski --- arch/powerpc/boot/dts/pdm360ng.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/boot/dts/pdm360ng.dts b/arch/powerpc/boot/dts/pdm360ng.dts index 445b88114009..df1283b63d9b 100644 --- a/arch/powerpc/boot/dts/pdm360ng.dts +++ b/arch/powerpc/boot/dts/pdm360ng.dts @@ -98,7 +98,7 @@ fsl,preserve-clocking; eeprom@50 { - compatible = "at,24c01"; + compatible = "atmel,24c01"; reg = <0x50>; }; -- 2.17.0
[RESEND PATCH 3/3] powerpc: dts: use a correct at24 compatible fallback in ac14xx
Using 'at24' as fallback is now deprecated - use the full 'atmel,' string. Signed-off-by: Bartosz Golaszewski --- arch/powerpc/boot/dts/ac14xx.dts | 20 ++-- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/boot/dts/ac14xx.dts b/arch/powerpc/boot/dts/ac14xx.dts index 83bcfd865167..0be5c4f3265d 100644 --- a/arch/powerpc/boot/dts/ac14xx.dts +++ b/arch/powerpc/boot/dts/ac14xx.dts @@ -176,12 +176,12 @@ clock-frequency = <40>; at24@30 { - compatible = "at24,24c01"; + compatible = "atmel,24c01"; reg = <0x30>; }; at24@31 { - compatible = "at24,24c01"; + compatible = "atmel,24c01"; reg = <0x31>; }; @@ -191,42 +191,42 @@ }; at24@50 { - compatible = "at24,24c01"; + compatible = "atmel,24c01"; reg = <0x50>; }; at24@51 { - compatible = "at24,24c01"; + compatible = "atmel,24c01"; reg = <0x51>; }; at24@52 { - compatible = "at24,24c01"; + compatible = "atmel,24c01"; reg = <0x52>; }; at24@53 { - compatible = "at24,24c01"; + compatible = "atmel,24c01"; reg = <0x53>; }; at24@54 { - compatible = "at24,24c01"; + compatible = "atmel,24c01"; reg = <0x54>; }; at24@55 { - compatible = "at24,24c01"; + compatible = "atmel,24c01"; reg = <0x55>; }; at24@56 { - compatible = "at24,24c01"; + compatible = "atmel,24c01"; reg = <0x56>; }; at24@57 { - compatible = "at24,24c01"; + compatible = "atmel,24c01"; reg = <0x57>; }; -- 2.17.0
[RESEND PATCH 2/3] powerpc: dts: use 'atmel' as at24 manufacturer for kmcent2
Using compatible strings without the part for at24 is now deprecated. Use a correct 'atmel,' value. Signed-off-by: Bartosz Golaszewski --- arch/powerpc/boot/dts/fsl/kmcent2.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/boot/dts/fsl/kmcent2.dts b/arch/powerpc/boot/dts/fsl/kmcent2.dts index 5922c1ea0e96..3094df05f5ea 100644 --- a/arch/powerpc/boot/dts/fsl/kmcent2.dts +++ b/arch/powerpc/boot/dts/fsl/kmcent2.dts @@ -130,7 +130,7 @@ #size-cells = <0>; eeprom@54 { - compatible = "24c02"; + compatible = "atmel,24c02"; reg = <0x54>; pagesize = <2>; read-only; -- 2.17.0
Re: [PATCH] powerpc/misc: get rid of add_reloc_offset()
On Tue, Apr 17, 2018 at 09:56:24AM +0200, Christophe Leroy wrote: > add_reloc_offset() is almost redundant with reloc_offset() > > Signed-off-by: Christophe Leroy > --- > arch/powerpc/include/asm/setup.h | 3 +-- > arch/powerpc/kernel/misc.S | 16 > arch/powerpc/kernel/prom_init_check.sh | 2 +- > 3 files changed, 2 insertions(+), 19 deletions(-) > > diff --git a/arch/powerpc/include/asm/setup.h > b/arch/powerpc/include/asm/setup.h > index 27fa52ed6d00..115e0896ffa7 100644 > --- a/arch/powerpc/include/asm/setup.h > +++ b/arch/powerpc/include/asm/setup.h > @@ -17,10 +17,9 @@ extern void note_scsi_host(struct device_node *, void *); > > /* Used in very early kernel initialization. */ > extern unsigned long reloc_offset(void); > -extern unsigned long add_reloc_offset(unsigned long); > extern void reloc_got2(unsigned long); > > -#define PTRRELOC(x) ((typeof(x)) add_reloc_offset((unsigned long)(x))) > +#define PTRRELOC(x) ((typeof(x)) ((unsigned long)(x) + reloc_offset())) NAK. This is how it used to be, and we changed it in order to prevent gcc from making incorrect assumptions. If you use the form with the explicit addition, and x is the address of an array, gcc will assume that the result is within the bounds of the array (apparently the C standard says it can do that) and potentially generate incorrect code. I recall that we had an actual case where gcc was generating incorrect code, though I don't recall the details, as this was some time before 2002. Paul.
Re: [PATCH 1/2] powernv/npu: Do a PID GPU TLB flush when invalidating a large address range
On Tue, Apr 17, 2018 at 7:11 PM, Alistair Popple wrote: > The NPU has a limited number of address translation shootdown (ATSD) > registers and the GPU has limited bandwidth to process ATSDs. This can > result in contention of ATSD registers leading to soft lockups on some > threads, particularly when invalidating a large address range in > pnv_npu2_mn_invalidate_range(). > > At some threshold it becomes more efficient to flush the entire GPU TLB for > the given MM context (PID) than individually flushing each address in the > range. This patch will result in ranges greater than 2MB being converted > from 32+ ATSDs into a single ATSD which will flush the TLB for the given > PID on each GPU. > > Signed-off-by: Alistair Popple > --- > arch/powerpc/platforms/powernv/npu-dma.c | 23 +++ > 1 file changed, 19 insertions(+), 4 deletions(-) > > diff --git a/arch/powerpc/platforms/powernv/npu-dma.c > b/arch/powerpc/platforms/powernv/npu-dma.c > index 94801d8e7894..dc34662e9df9 100644 > --- a/arch/powerpc/platforms/powernv/npu-dma.c > +++ b/arch/powerpc/platforms/powernv/npu-dma.c > @@ -40,6 +40,13 @@ > DEFINE_SPINLOCK(npu_context_lock); > > /* > + * When an address shootdown range exceeds this threshold we invalidate the > + * entire TLB on the GPU for the given PID rather than each specific address > in > + * the range. > + */ > +#define ATSD_THRESHOLD (2*1024*1024) > + > +/* > * Other types of TCE cache invalidation are not functional in the > * hardware. > */ > @@ -675,11 +682,19 @@ static void pnv_npu2_mn_invalidate_range(struct > mmu_notifier *mn, > struct npu_context *npu_context = mn_to_npu_context(mn); > unsigned long address; > > - for (address = start; address < end; address += PAGE_SIZE) > - mmio_invalidate(npu_context, 1, address, false); > + if (end - start > ATSD_THRESHOLD) { I'm nitpicking, but (end - start) > ATSD_THRESHOLD is clearer > + /* > +* Just invalidate the entire PID if the address range is too > +* large. 
> +*/ > + mmio_invalidate(npu_context, 0, 0, true); > + } else { > + for (address = start; address < end; address += PAGE_SIZE) > + mmio_invalidate(npu_context, 1, address, false); > > - /* Do the flush only on the final addess == end */ > - mmio_invalidate(npu_context, 1, address, true); > + /* Do the flush only on the final addess == end */ > + mmio_invalidate(npu_context, 1, address, true); > + } > } > Acked-by: Balbir Singh
[PATCH 2/2] powernv/npu: Add a debugfs setting to change ATSD threshold
The threshold at which it becomes more efficient to coalesce a range of ATSDs into a single per-PID ATSD is currently not well understood due to a lack of real-world work loads. This patch adds a debugfs parameter allowing the threshold to be altered at runtime in order to aid future development and refinement of the value. Signed-off-by: Alistair Popple --- arch/powerpc/platforms/powernv/npu-dma.c | 12 ++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index dc34662e9df9..a765bf576c14 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -17,7 +17,9 @@ #include #include #include +#include +#include #include #include #include @@ -44,7 +46,8 @@ DEFINE_SPINLOCK(npu_context_lock); * entire TLB on the GPU for the given PID rather than each specific address in * the range. */ -#define ATSD_THRESHOLD (2*1024*1024) +static uint64_t atsd_threshold = 2 * 1024 * 1024; +static struct dentry *atsd_threshold_dentry; /* * Other types of TCE cache invalidation are not functional in the @@ -682,7 +685,7 @@ static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn, struct npu_context *npu_context = mn_to_npu_context(mn); unsigned long address; - if (end - start > ATSD_THRESHOLD) { + if (end - start > atsd_threshold) { /* * Just invalidate the entire PID if the address range is too * large. @@ -956,6 +959,11 @@ int pnv_npu2_init(struct pnv_phb *phb) static int npu_index; uint64_t rc = 0; + if (!atsd_threshold_dentry) { + atsd_threshold_dentry = debugfs_create_x64("atsd_threshold", + 0600, powerpc_debugfs_root, &atsd_threshold); + } + phb->npu.nmmu_flush = of_property_read_bool(phb->hose->dn, "ibm,nmmu-flush"); for_each_child_of_node(phb->hose->dn, dn) { -- 2.11.0
[PATCH 1/2] powernv/npu: Do a PID GPU TLB flush when invalidating a large address range
The NPU has a limited number of address translation shootdown (ATSD) registers and the GPU has limited bandwidth to process ATSDs. This can result in contention of ATSD registers leading to soft lockups on some threads, particularly when invalidating a large address range in pnv_npu2_mn_invalidate_range(). At some threshold it becomes more efficient to flush the entire GPU TLB for the given MM context (PID) than individually flushing each address in the range. This patch will result in ranges greater than 2MB being converted from 32+ ATSDs into a single ATSD which will flush the TLB for the given PID on each GPU. Signed-off-by: Alistair Popple --- arch/powerpc/platforms/powernv/npu-dma.c | 23 +++ 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index 94801d8e7894..dc34662e9df9 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -40,6 +40,13 @@ DEFINE_SPINLOCK(npu_context_lock); /* + * When an address shootdown range exceeds this threshold we invalidate the + * entire TLB on the GPU for the given PID rather than each specific address in + * the range. + */ +#define ATSD_THRESHOLD (2*1024*1024) + +/* * Other types of TCE cache invalidation are not functional in the * hardware. */ @@ -675,11 +682,19 @@ static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn, struct npu_context *npu_context = mn_to_npu_context(mn); unsigned long address; - for (address = start; address < end; address += PAGE_SIZE) - mmio_invalidate(npu_context, 1, address, false); + if (end - start > ATSD_THRESHOLD) { + /* +* Just invalidate the entire PID if the address range is too +* large. 
+*/ + mmio_invalidate(npu_context, 0, 0, true); + } else { + for (address = start; address < end; address += PAGE_SIZE) + mmio_invalidate(npu_context, 1, address, false); - /* Do the flush only on the final addess == end */ - mmio_invalidate(npu_context, 1, address, true); + /* Do the flush only on the final addess == end */ + mmio_invalidate(npu_context, 1, address, true); + } } static const struct mmu_notifier_ops nv_nmmu_notifier_ops = { -- 2.11.0
[PATCH] powerpc/misc: get rid of add_reloc_offset()
add_reloc_offset() is almost redundant with reloc_offset() Signed-off-by: Christophe Leroy --- arch/powerpc/include/asm/setup.h | 3 +-- arch/powerpc/kernel/misc.S | 16 arch/powerpc/kernel/prom_init_check.sh | 2 +- 3 files changed, 2 insertions(+), 19 deletions(-) diff --git a/arch/powerpc/include/asm/setup.h b/arch/powerpc/include/asm/setup.h index 27fa52ed6d00..115e0896ffa7 100644 --- a/arch/powerpc/include/asm/setup.h +++ b/arch/powerpc/include/asm/setup.h @@ -17,10 +17,9 @@ extern void note_scsi_host(struct device_node *, void *); /* Used in very early kernel initialization. */ extern unsigned long reloc_offset(void); -extern unsigned long add_reloc_offset(unsigned long); extern void reloc_got2(unsigned long); -#define PTRRELOC(x)((typeof(x)) add_reloc_offset((unsigned long)(x))) +#define PTRRELOC(x)((typeof(x)) ((unsigned long)(x) + reloc_offset())) void check_for_initrd(void); void mem_topology_setup(void); diff --git a/arch/powerpc/kernel/misc.S b/arch/powerpc/kernel/misc.S index 384357cb8bc0..2711b10ebdb3 100644 --- a/arch/powerpc/kernel/misc.S +++ b/arch/powerpc/kernel/misc.S @@ -39,22 +39,6 @@ _GLOBAL(reloc_offset) .align 3 2: PPC_LONG 1b -/* - * add_reloc_offset(x) returns x + reloc_offset(). 
- */ -_GLOBAL(add_reloc_offset) - mflrr0 - bl 1f -1: mflrr5 - PPC_LL r4,(2f-1b)(r5) - subfr5,r4,r5 - add r3,r3,r5 - mtlrr0 - blr - - .align 3 -2: PPC_LONG 1b - _GLOBAL(setjmp) mflrr0 PPC_STL r0,0(r3) diff --git a/arch/powerpc/kernel/prom_init_check.sh b/arch/powerpc/kernel/prom_init_check.sh index acb6b9226352..ee9f63186b72 100644 --- a/arch/powerpc/kernel/prom_init_check.sh +++ b/arch/powerpc/kernel/prom_init_check.sh @@ -16,7 +16,7 @@ # If you really need to reference something from prom_init.o add # it to the list below: -WHITELIST="add_reloc_offset __bss_start __bss_stop copy_and_flush +WHITELIST="__bss_start __bss_stop copy_and_flush _end enter_prom memcpy memset reloc_offset __secondary_hold __secondary_hold_acknowledge __secondary_hold_spinloop __start strcmp strcpy strlcpy strlen strncmp strstr kstrtobool logo_linux_clut224 -- 2.13.3
[PATCH 7/7] powerpc/lib: Remove .balign inside string functions for PPC32
commit 87a156fb18fe1 ("Align hot loops of some string functions") degraded the performance of string functions by adding useless nops A simple benchmark on an 8xx calling 10x a memchr() that matches the first byte runs in 41668 TB ticks before this patch and in 35986 TB ticks after this patch. So this gives an improvement of approx 10% Another benchmark doing the same with a memchr() matching the 128th byte runs in 1011365 TB ticks before this patch and 1005682 TB ticks after this patch, so regardless on the number of loops, removing those useless nops improves the test by 5683 TB ticks. Fixes: 87a156fb18fe1 ("Align hot loops of some string functions") Signed-off-by: Christophe Leroy --- arch/powerpc/lib/string.S | 6 ++ 1 file changed, 6 insertions(+) diff --git a/arch/powerpc/lib/string.S b/arch/powerpc/lib/string.S index 89af53b08b4a..9e96f1c102c6 100644 --- a/arch/powerpc/lib/string.S +++ b/arch/powerpc/lib/string.S @@ -25,7 +25,9 @@ _GLOBAL(strncpy) mtctr r5 addir6,r3,-1 addir4,r4,-1 +#ifdef CONFIG_PPC64 .balign 16 +#endif 1: lbzur0,1(r4) cmpwi 0,r0,0 stbur0,1(r6) @@ -47,7 +49,9 @@ _GLOBAL(strncmp) mtctr r5 addir5,r3,-1 addir4,r4,-1 +#ifdef CONFIG_PPC64 .balign 16 +#endif 1: lbzur3,1(r5) cmpwi 1,r3,0 lbzur0,1(r4) @@ -68,7 +72,9 @@ _GLOBAL(memchr) #endif mtctr r5 addir3,r3,-1 +#ifdef CONFIG_PPC64 .balign 16 +#endif 1: lbzur0,1(r3) cmpw0,r0,r4 bdnzf 2,1b -- 2.13.3
[PATCH 6/7] powerpc/lib: inline more NUL size verifications
strncmp(), strncpy(), memchr() are often called with constant size. This patch gives GCC a chance to optimise NULL size verification out Signed-off-by: Christophe Leroy --- arch/powerpc/include/asm/string.h | 24 arch/powerpc/lib/string.S | 8 2 files changed, 32 insertions(+) diff --git a/arch/powerpc/include/asm/string.h b/arch/powerpc/include/asm/string.h index 196ac5d587fb..1465d5629ef2 100644 --- a/arch/powerpc/include/asm/string.h +++ b/arch/powerpc/include/asm/string.h @@ -30,6 +30,22 @@ extern void * memchr(const void *,int,__kernel_size_t); extern void * memcpy_flushcache(void *,const void *,__kernel_size_t); #ifndef CONFIG_FORTIFY_SOURCE +static inline char *__strncpy(char *p, const char *q, __kernel_size_t size) +{ + if (unlikely(!size)) + return p; + return strncpy(p, q, size); +} +#define strncpy __strncpy + +static inline int __strncmp(const char *p, const char *q, __kernel_size_t size) +{ + if (unlikely(!size)) + return 0; + return strncmp(p, q, size); +} +#define strncmp __strncmp + static inline int ___memcmp(const void *p,const void *q,__kernel_size_t size, int offset) { int dif; @@ -72,6 +88,14 @@ static inline int __memcmp(const void *p,const void *q,__kernel_size_t size) return memcmp(p, q, size); } #define memcmp __memcmp + +static inline void *__memchr(const void *p, int c, __kernel_size_t size) +{ + if (unlikely(!size)) + return NULL; + return memchr(p, c, size); +} +#define memchr __memchr #endif #ifdef CONFIG_PPC64 diff --git a/arch/powerpc/lib/string.S b/arch/powerpc/lib/string.S index cbb90fdc672d..89af53b08b4a 100644 --- a/arch/powerpc/lib/string.S +++ b/arch/powerpc/lib/string.S @@ -18,8 +18,10 @@ /* This clears out any unused part of the destination buffer, just as the libc version does. 
-- paulus */ _GLOBAL(strncpy) +#ifdef CONFIG_FORTIFY_SOURCE PPC_LCMPI 0,r5,0 beqlr +#endif mtctr r5 addir6,r3,-1 addir4,r4,-1 @@ -38,8 +40,10 @@ _GLOBAL(strncpy) EXPORT_SYMBOL(strncpy) _GLOBAL(strncmp) +#ifdef CONFIG_FORTIFY_SOURCE PPC_LCMPI 0,r5,0 beq-2f +#endif mtctr r5 addir5,r3,-1 addir4,r4,-1 @@ -51,13 +55,17 @@ _GLOBAL(strncmp) beqlr 1 bdnzt eq,1b blr +#ifdef CONFIG_FORTIFY_SOURCE 2: li r3,0 blr +#endif EXPORT_SYMBOL(strncmp) _GLOBAL(memchr) +#ifdef CONFIG_FORTIFY_SOURCE PPC_LCMPI 0,r5,0 beq-2f +#endif mtctr r5 addir3,r3,-1 .balign 16 -- 2.13.3
[PATCH 5/7] powerpc/lib: optimise 32 bits __clear_user()
Rewrite clear_user() on the same principle as memset(0), making use of dcbz to clear complete cache lines. This code is a copy/paste of memset(), with some modifications in order to retrieve remaining number of bytes to be cleared, as it needs to be returned in case of error. On a MPC885, throughput is almost doubled: Before: ~# dd if=/dev/zero of=/dev/null bs=1M count=1000 1048576000 bytes (1000.0MB) copied, 18.990779 seconds, 52.7MB/s After: ~# dd if=/dev/zero of=/dev/null bs=1M count=1000 1048576000 bytes (1000.0MB) copied, 9.611468 seconds, 104.0MB/s On a MPC8321, throughput is multiplied by 2.12: Before: root@vgoippro:~# dd if=/dev/zero of=/dev/null bs=1M count=1000 1048576000 bytes (1000.0MB) copied, 6.844352 seconds, 146.1MB/s After: root@vgoippro:~# dd if=/dev/zero of=/dev/null bs=1M count=1000 1048576000 bytes (1000.0MB) copied, 3.218854 seconds, 310.7MB/s Signed-off-by: Christophe Leroy --- arch/powerpc/lib/string_32.S | 85 +++- 1 file changed, 60 insertions(+), 25 deletions(-) diff --git a/arch/powerpc/lib/string_32.S b/arch/powerpc/lib/string_32.S index 5b2a73fb07be..31fc92b0aae6 100644 --- a/arch/powerpc/lib/string_32.S +++ b/arch/powerpc/lib/string_32.S @@ -11,6 +11,7 @@ #include #include #include +#include .text @@ -61,44 +62,78 @@ _GLOBAL(memcmp) #endif EXPORT_SYMBOL(memcmp) +CACHELINE_BYTES = L1_CACHE_BYTES +LG_CACHELINE_BYTES = L1_CACHE_SHIFT +CACHELINE_MASK = (L1_CACHE_BYTES-1) + _GLOBAL(__clear_user) - addir6,r3,-4 - li r3,0 - li r5,0 - cmplwi 0,r4,4 +/* + * Use dcbz on the complete cache lines in the destination + * to set them to zero. This requires that the destination + * area is cacheable. + */ + cmplwi cr0, r4, 4 + mr r10, r3 + li r3, 0 blt 7f - /* clear a single word */ -11:stwur5,4(r6) + +11:stw r3, 0(r10) beqlr - /* clear word sized chunks */ - andi. r0,r6,3 - add r4,r0,r4 - subfr6,r0,r6 - srwir0,r4,2 - andi. r4,r4,3 + andi. 
r0, r10, 3 + add r11, r0, r4 + subfr6, r0, r10 + + clrlwi r7, r6, 32 - LG_CACHELINE_BYTES + add r8, r7, r11 + srwir9, r8, LG_CACHELINE_BYTES + addic. r9, r9, -1 /* total number of complete cachelines */ + ble 2f + xorir0, r7, CACHELINE_MASK & ~3 + srwi. r0, r0, 2 + beq 3f + mtctr r0 +4: stwur3, 4(r6) + bdnz4b +3: mtctr r9 + li r7, 4 +10:dcbzr7, r6 + addir6, r6, CACHELINE_BYTES + bdnz10b + clrlwi r11, r8, 32 - LG_CACHELINE_BYTES + addir11, r11, 4 + +2: srwir0 ,r11 ,2 mtctr r0 - bdz 7f -1: stwur5,4(r6) + bdz 6f +1: stwur3, 4(r6) bdnz1b - /* clear byte sized chunks */ -7: cmpwi 0,r4,0 +6: andi. r11, r11, 3 beqlr - mtctr r4 - addir6,r6,3 -8: stbur5,1(r6) + mtctr r11 + addir6, r6, 3 +8: stbur3, 1(r6) bdnz8b blr -90:mr r3,r4 + +7: cmpwi cr0, r4, 0 + beqlr + mtctr r4 + addir6, r10, -1 +9: stbur3, 1(r6) + bdnz9b blr -91:mfctr r3 - slwir3,r3,2 - add r3,r3,r4 + +90:mr r3, r4 blr -92:mfctr r3 +91:add r3, r10, r4 + subfr3, r6, r3 blr EX_TABLE(11b, 90b) + EX_TABLE(4b, 91b) + EX_TABLE(10b, 91b) EX_TABLE(1b, 91b) - EX_TABLE(8b, 92b) + EX_TABLE(8b, 91b) + EX_TABLE(9b, 91b) EXPORT_SYMBOL(__clear_user) -- 2.13.3
[PATCH 4/7] powerpc/lib: inline memcmp() for small constant sizes
In my 8xx configuration, I get 208 calls to memcmp() Within those 208 calls, about half of them have constant sizes, 46 have a size of 8, 17 have a size of 16, only a few have a size over 16. Other fixed sizes are mostly 4, 6 and 10. This patch inlines calls to memcmp() when size is constant and lower than or equal to 16 In my 8xx configuration, this reduces the number of calls to memcmp() from 208 to 123 The following table shows the number of TB timeticks to perform a constant size memcmp() before and after the patch depending on the size Before After Improvement 01: 7577 5682 25% 02: 41668 5682 86% 03: 51137 13258 74% 04: 45455 5682 87% 05: 58713 13258 77% 06: 58712 13258 77% 07: 68183 20834 70% 08: 56819 15153 73% 09: 70077 28411 60% 10: 70077 28411 60% 11: 79546 35986 55% 12: 68182 28411 58% 13: 81440 35986 55% 14: 81440 39774 51% 15: 94697 43562 54% 16: 79546 37881 52% Signed-off-by: Christophe Leroy --- arch/powerpc/include/asm/string.h | 37 + 1 file changed, 37 insertions(+) diff --git a/arch/powerpc/include/asm/string.h b/arch/powerpc/include/asm/string.h index cf6f495134c3..196ac5d587fb 100644 --- a/arch/powerpc/include/asm/string.h +++ b/arch/powerpc/include/asm/string.h @@ -4,6 +4,8 @@ #ifdef __KERNEL__ +#include + #define __HAVE_ARCH_STRNCPY #define __HAVE_ARCH_STRNCMP #define __HAVE_ARCH_MEMSET @@ -28,10 +30,45 @@ extern void * memchr(const void *,int,__kernel_size_t); extern void * memcpy_flushcache(void *,const void *,__kernel_size_t); #ifndef CONFIG_FORTIFY_SOURCE +static inline int ___memcmp(const void *p,const void *q,__kernel_size_t size, int offset) +{ + int dif; + + BUILD_BUG_ON(!size || size > 8); + + p += offset, q += offset; + if (size == 1) + return *(u8*)p - *(u8*)q; + if (size == 2) + return be16_to_cpu(*(u16*)p) - be16_to_cpu(*(u16*)q); + if (size == 3) { + dif = be16_to_cpu(*(u16*)p) - be16_to_cpu(*(u16*)q); + if (dif) + return dif; + return *(u8*)(p + 2) - *(u8*)(q + 2); + } + if (size == 8) { + s64 tmp = be64_to_cpu(*(u64*)p) - 
be64_to_cpu(*(u64*)q); + return tmp >> 32 ? : (int)tmp; + } + + dif = be32_to_cpu(*(u32*)p) - be32_to_cpu(*(u32*)q); + if (size == 4 || dif) + return dif; + + return ___memcmp(p, q, size - 4, 4); +} + static inline int __memcmp(const void *p,const void *q,__kernel_size_t size) { if (unlikely(!size)) return 0; + if (__builtin_constant_p(size) && size <= 16) { + int dif = ___memcmp(p, q, size < 8 ? size : 8, 0); + if (size <= 8 || dif) + return dif; + return ___memcmp(p, q, size - 8, 8); + } return memcmp(p, q, size); } #define memcmp __memcmp -- 2.13.3
[PATCH 3/7] powerpc/lib: optimise PPC32 memcmp
At the time being, memcmp() compares two chunks of memory byte per byte. This patch optimises the comparison by comparing word by word. A small benchmark performed on an 8xx based on the comparison of two chunks of 512 bytes performed 10 times gives: Before : 5852274 TB ticks After: 1488638 TB ticks This is almost 4 times faster Signed-off-by: Christophe Leroy --- arch/powerpc/lib/string_32.S | 42 +++--- 1 file changed, 35 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/lib/string_32.S b/arch/powerpc/lib/string_32.S index 94e9c9bc31c3..5b2a73fb07be 100644 --- a/arch/powerpc/lib/string_32.S +++ b/arch/powerpc/lib/string_32.S @@ -19,13 +19,41 @@ _GLOBAL(memcmp) PPC_LCMPI 0,r5,0 beq-2f #endif - mtctr r5 - addir6,r3,-1 - addir4,r4,-1 -1: lbzur3,1(r6) - lbzur0,1(r4) - subf. r3,r0,r3 - bdnzt 2,1b + srawi. r7, r5, 2 /* Divide len by 4 */ + mr r6, r3 + beq-3f + mtctr r7 + li r7, 0 +1: +#ifdef __LITTLE_ENDIAN__ + lwbrx r3, r6, r7 + lwbrx r0, r4, r7 +#else + lwzxr3, r6, r7 + lwzxr0, r4, r7 +#endif + addir7, r7, 4 + subf. r3, r0, r3 + bdnzt eq, 1b + bnelr + andi. r5, r5, 3 + beqlr +3: cmplwi cr1, r5, 2 + blt-cr1, 4f +#ifdef __LITTLE_ENDIAN__ + lhbrx r3, r6, r7 + lhbrx r0, r4, r7 +#else + lhzxr3, r6, r7 + lhzxr0, r4, r7 +#endif + addir7, r7, 2 + subf. r3, r0, r3 + beqlr cr1 + bnelr +4: lbzxr3, r6, r7 + lbzxr0, r4, r7 + subf. r3, r0, r3 blr #ifdef CONFIG_FORTIFY_SOURCE 2: li r3,0 -- 2.13.3
[PATCH 2/7] powerpc/lib: inline memcmp() NUL size verification
Many calls to memcmp() are done with constant size. This patch gives GCC a chance to optimise out the NULL size verification. Signed-off-by: Christophe Leroy --- arch/powerpc/include/asm/string.h | 10 ++ arch/powerpc/lib/memcmp_64.S | 4 arch/powerpc/lib/string_32.S | 4 3 files changed, 18 insertions(+) diff --git a/arch/powerpc/include/asm/string.h b/arch/powerpc/include/asm/string.h index 9b8cedf618f4..cf6f495134c3 100644 --- a/arch/powerpc/include/asm/string.h +++ b/arch/powerpc/include/asm/string.h @@ -27,6 +27,16 @@ extern int memcmp(const void *,const void *,__kernel_size_t); extern void * memchr(const void *,int,__kernel_size_t); extern void * memcpy_flushcache(void *,const void *,__kernel_size_t); +#ifndef CONFIG_FORTIFY_SOURCE +static inline int __memcmp(const void *p,const void *q,__kernel_size_t size) +{ + if (unlikely(!size)) + return 0; + return memcmp(p, q, size); +} +#define memcmp __memcmp +#endif + #ifdef CONFIG_PPC64 #define __HAVE_ARCH_MEMSET32 #define __HAVE_ARCH_MEMSET64 diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S index d75d18b7bd55..f6822fabf254 100644 --- a/arch/powerpc/lib/memcmp_64.S +++ b/arch/powerpc/lib/memcmp_64.S @@ -30,7 +30,9 @@ #endif _GLOBAL(memcmp) +#ifdef CONFIG_FORTIFY_SOURCE cmpdi cr1,r5,0 +#endif /* Use the short loop if both strings are not 8B aligned */ or r6,r3,r4 @@ -39,7 +41,9 @@ _GLOBAL(memcmp) /* Use the short loop if length is less than 32B */ cmpdi cr6,r5,31 +#ifdef CONFIG_FORTIFY_SOURCE beq cr1,.Lzero +#endif bne .Lshort bgt cr6,.Llong diff --git a/arch/powerpc/lib/string_32.S b/arch/powerpc/lib/string_32.S index 2519f8bd09e3..94e9c9bc31c3 100644 --- a/arch/powerpc/lib/string_32.S +++ b/arch/powerpc/lib/string_32.S @@ -15,8 +15,10 @@ .text _GLOBAL(memcmp) +#ifdef CONFIG_FORTIFY_SOURCE PPC_LCMPI 0,r5,0 beq-2f +#endif mtctr r5 addir6,r3,-1 addir4,r4,-1 @@ -25,8 +27,10 @@ _GLOBAL(memcmp) subf. 
r3,r0,r3 bdnzt 2,1b blr +#ifdef CONFIG_FORTIFY_SOURCE 2: li r3,0 blr +#endif EXPORT_SYMBOL(memcmp) _GLOBAL(__clear_user) -- 2.13.3
[PATCH 1/7] powerpc/lib: move PPC32 specific functions out of string.S
In preparation of optimisation patches, move PPC32 specific memcmp() and __clear_user() into string_32.S Signed-off-by: Christophe Leroy --- arch/powerpc/lib/Makefile| 5 +-- arch/powerpc/lib/string.S| 61 - arch/powerpc/lib/string_32.S | 72 3 files changed, 75 insertions(+), 63 deletions(-) create mode 100644 arch/powerpc/lib/string_32.S diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index 653901042ad7..2c9b8c0adf22 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -26,13 +26,14 @@ obj-$(CONFIG_PPC_BOOK3S_64) += copyuser_power7.o copypage_power7.o \ memcpy_power7.o obj64-y+= copypage_64.o copyuser_64.o mem_64.o hweight_64.o \ - string_64.o memcpy_64.o memcmp_64.o pmem.o + memcpy_64.o memcmp_64.o pmem.o obj64-$(CONFIG_SMP)+= locks.o obj64-$(CONFIG_ALTIVEC)+= vmx-helper.o obj64-$(CONFIG_KPROBES_SANITY_TEST) += test_emulate_step.o -obj-y += checksum_$(BITS).o checksum_wrappers.o +obj-y += checksum_$(BITS).o checksum_wrappers.o \ + string_$(BITS).o obj-y += sstep.o ldstfp.o quad.o obj64-y+= quad.o diff --git a/arch/powerpc/lib/string.S b/arch/powerpc/lib/string.S index a787776822d8..cbb90fdc672d 100644 --- a/arch/powerpc/lib/string.S +++ b/arch/powerpc/lib/string.S @@ -55,23 +55,6 @@ _GLOBAL(strncmp) blr EXPORT_SYMBOL(strncmp) -#ifdef CONFIG_PPC32 -_GLOBAL(memcmp) - PPC_LCMPI 0,r5,0 - beq-2f - mtctr r5 - addir6,r3,-1 - addir4,r4,-1 -1: lbzur3,1(r6) - lbzur0,1(r4) - subf. r3,r0,r3 - bdnzt 2,1b - blr -2: li r3,0 - blr -EXPORT_SYMBOL(memcmp) -#endif - _GLOBAL(memchr) PPC_LCMPI 0,r5,0 beq-2f @@ -85,47 +68,3 @@ _GLOBAL(memchr) 2: li r3,0 blr EXPORT_SYMBOL(memchr) - -#ifdef CONFIG_PPC32 -_GLOBAL(__clear_user) - addir6,r3,-4 - li r3,0 - li r5,0 - cmplwi 0,r4,4 - blt 7f - /* clear a single word */ -11:stwur5,4(r6) - beqlr - /* clear word sized chunks */ - andi. r0,r6,3 - add r4,r0,r4 - subfr6,r0,r6 - srwir0,r4,2 - andi. 
r4,r4,3 - mtctr r0 - bdz 7f -1: stwur5,4(r6) - bdnz1b - /* clear byte sized chunks */ -7: cmpwi 0,r4,0 - beqlr - mtctr r4 - addir6,r6,3 -8: stbur5,1(r6) - bdnz8b - blr -90:mr r3,r4 - blr -91:mfctr r3 - slwir3,r3,2 - add r3,r3,r4 - blr -92:mfctr r3 - blr - - EX_TABLE(11b, 90b) - EX_TABLE(1b, 91b) - EX_TABLE(8b, 92b) - -EXPORT_SYMBOL(__clear_user) -#endif diff --git a/arch/powerpc/lib/string_32.S b/arch/powerpc/lib/string_32.S new file mode 100644 index ..2519f8bd09e3 --- /dev/null +++ b/arch/powerpc/lib/string_32.S @@ -0,0 +1,72 @@ +/* + * String handling functions for PowerPC32 + * + * Copyright (C) 2018 CS Systemes d'Information + * + * Author: Christophe Leroy + * + * SPDX-License-Identifier: GPL-2.0 + */ +#include +#include +#include +#include + + .text + +_GLOBAL(memcmp) + PPC_LCMPI 0,r5,0 + beq-2f + mtctr r5 + addir6,r3,-1 + addir4,r4,-1 +1: lbzur3,1(r6) + lbzur0,1(r4) + subf. r3,r0,r3 + bdnzt 2,1b + blr +2: li r3,0 + blr +EXPORT_SYMBOL(memcmp) + +_GLOBAL(__clear_user) + addir6,r3,-4 + li r3,0 + li r5,0 + cmplwi 0,r4,4 + blt 7f + /* clear a single word */ +11:stwur5,4(r6) + beqlr + /* clear word sized chunks */ + andi. r0,r6,3 + add r4,r0,r4 + subfr6,r0,r6 + srwir0,r4,2 + andi. r4,r4,3 + mtctr r0 + bdz 7f +1: stwur5,4(r6) + bdnz1b + /* clear byte sized chunks */ +7: cmpwi 0,r4,0 + beqlr + mtctr r4 + addir6,r6,3 +8: stbur5,1(r6) + bdnz8b + blr +90:mr r3,r4 + blr +91:mfctr r3 + slwir3,r3,2 + add r3,r3,r4 + blr +92:mfctr r3 + blr + + EX_TABLE(11b, 90b) + EX_TABLE(1b, 91b) + EX_TABLE(8b, 92b) + +EXPORT_SYMBOL(__clear_user) -- 2.13.3