[PATCH v2 1/4] powerpc: Remove flush_instruction_cache for book3s/32
The only callers of flush_instruction_cache() are: arch/powerpc/kernel/swsusp_booke.S: bl flush_instruction_cache arch/powerpc/mm/nohash/40x.c: flush_instruction_cache(); arch/powerpc/mm/nohash/44x.c: flush_instruction_cache(); arch/powerpc/mm/nohash/fsl_booke.c: flush_instruction_cache(); arch/powerpc/platforms/44x/machine_check.c: flush_instruction_cache(); arch/powerpc/platforms/44x/machine_check.c: flush_instruction_cache(); This function is not used by book3s/32, drop it. Signed-off-by: Christophe Leroy --- arch/powerpc/kernel/misc_32.S | 12 ++-- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index b24f866fef81..5c074c2ff5b5 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -271,9 +271,8 @@ _ASM_NOKPROBE_SYMBOL(real_writeb) /* * Flush instruction cache. - * This is a no-op on the 601. */ -#ifndef CONFIG_PPC_8xx +#if !defined(CONFIG_PPC_8xx) && !defined(CONFIG_PPC_BOOK3S_32) _GLOBAL(flush_instruction_cache) #if defined(CONFIG_4xx) lis r3, KERNELBASE@h @@ -290,18 +289,11 @@ _GLOBAL(flush_instruction_cache) mfspr r3,SPRN_L1CSR1 ori r3,r3,L1CSR1_ICFI|L1CSR1_ICLFR mtspr SPRN_L1CSR1,r3 -#elif defined(CONFIG_PPC_BOOK3S_601) - blr /* for 601, do nothing */ -#else - /* 603/604 processor - use invalidate-all bit in HID0 */ - mfspr r3,SPRN_HID0 - ori r3,r3,HID0_ICFI - mtspr SPRN_HID0,r3 #endif /* CONFIG_4xx */ isync blr EXPORT_SYMBOL(flush_instruction_cache) -#endif /* CONFIG_PPC_8xx */ +#endif /* * Copy a whole page. We use the dcbz instruction on the destination -- 2.25.0
[PATCH v2 3/4] powerpc: Rewrite 4xx flush_instruction_cache() in C
Nothing prevents flush_instruction_cache() from being written in C. Do it to improve readability and maintainability. This function is very small and isn't called from assembly, make it static inline in asm/cacheflush.h Signed-off-by: Christophe Leroy --- v2: Written as a static inline instead of adding a new C file for this function alone. --- arch/powerpc/include/asm/cacheflush.h | 8 arch/powerpc/kernel/misc_32.S | 7 +-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/cacheflush.h b/arch/powerpc/include/asm/cacheflush.h index 481877879fec..138e46d8c04e 100644 --- a/arch/powerpc/include/asm/cacheflush.h +++ b/arch/powerpc/include/asm/cacheflush.h @@ -98,7 +98,15 @@ static inline void invalidate_dcache_range(unsigned long start, mb(); /* sync */ } +#ifdef CONFIG_4xx +static inline void flush_instruction_cache(void) +{ + iccci((void *)KERNELBASE); + isync(); +} +#else void flush_instruction_cache(void); +#endif #include diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index 5c074c2ff5b5..1bda207459a8 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -272,12 +272,8 @@ _ASM_NOKPROBE_SYMBOL(real_writeb) /* * Flush instruction cache. */ -#if !defined(CONFIG_PPC_8xx) && !defined(CONFIG_PPC_BOOK3S_32) +#ifdef CONFIG_FSL_BOOKE _GLOBAL(flush_instruction_cache) -#if defined(CONFIG_4xx) - lis r3, KERNELBASE@h - iccci 0,r3 -#elif defined(CONFIG_FSL_BOOKE) #ifdef CONFIG_E200 mfspr r3,SPRN_L1CSR0 ori r3,r3,L1CSR0_CFI|L1CSR0_CLFC @@ -289,7 +285,6 @@ _GLOBAL(flush_instruction_cache) mfspr r3,SPRN_L1CSR1 ori r3,r3,L1CSR1_ICFI|L1CSR1_ICLFR mtspr SPRN_L1CSR1,r3 -#endif /* CONFIG_4xx */ isync blr EXPORT_SYMBOL(flush_instruction_cache) -- 2.25.0
Re: [PATCH 1/5] powerpc: Remove flush_instruction_cache for book3s/32
Le 13/08/2020 à 14:14, Christoph Hellwig a écrit : On Thu, Aug 13, 2020 at 01:13:08PM +0100, Christoph Hellwig wrote: On Thu, Aug 13, 2020 at 10:12:00AM +, Christophe Leroy wrote: -#ifndef CONFIG_PPC_8xx +#if !defined(CONFIG_PPC_8xx) && !defined(CONFIG_PPC_BOOK3S_32) _GLOBAL(flush_instruction_cache) #if defined(CONFIG_4xx) lis r3, KERNELBASE@h @@ -290,18 +289,11 @@ _GLOBAL(flush_instruction_cache) mfspr r3,SPRN_L1CSR1 ori r3,r3,L1CSR1_ICFI|L1CSR1_ICLFR mtspr SPRN_L1CSR1,r3 -#elif defined(CONFIG_PPC_BOOK3S_601) - blr /* for 601, do nothing */ -#else - /* 603/604 processor - use invalidate-all bit in HID0 */ - mfspr r3,SPRN_HID0 - ori r3,r3,HID0_ICFI - mtspr SPRN_HID0,r3 #endif /* CONFIG_4xx */ isync blr EXPORT_SYMBOL(flush_instruction_cache) -#endif /* CONFIG_PPC_8xx */ +#endif /* CONFIG_PPC_8xx || CONFIG_PPC_BOOK3S_32 */ What about untangling this into entirely separate versions instead of the ifdef mess? Also the export does not seem to be needed at all. Ok, I see that you do that later, sorry. In v2, I drop the untangling patch, because the series completely dismantles flush_instruction_cache() so there is no need for an ephemeral untangled version of it. Christophe
[PATCH v2] powerpc: Remove flush_instruction_cache() on 8xx
flush_instruction_cache() is never used on 8xx, remove it. Signed-off-by: Christophe Leroy --- v2: Becomes a standalone patch independent of the series dismantling the ASM flush_instruction_cache() --- arch/powerpc/mm/nohash/8xx.c | 7 --- 1 file changed, 7 deletions(-) diff --git a/arch/powerpc/mm/nohash/8xx.c b/arch/powerpc/mm/nohash/8xx.c index d2b37146ae6c..231ca95f9ffb 100644 --- a/arch/powerpc/mm/nohash/8xx.c +++ b/arch/powerpc/mm/nohash/8xx.c @@ -244,13 +244,6 @@ void set_context(unsigned long id, pgd_t *pgd) mb(); } -void flush_instruction_cache(void) -{ - isync(); - mtspr(SPRN_IC_CST, IDC_INVALL); - isync(); -} - #ifdef CONFIG_PPC_KUEP void __init setup_kuep(bool disabled) { -- 2.25.0
Re: [PATCH v3] powerpc/pseries: explicitly reschedule during drmem_lmb list traversal
Le 13/08/2020 à 17:11, Nathan Lynch a écrit : The drmem lmb list can have hundreds of thousands of entries, and unfortunately lookups take the form of linear searches. As long as this is the case, traversals have the potential to monopolize the CPU and provoke lockup reports, workqueue stalls, and the like unless they explicitly yield. Rather than placing cond_resched() calls within various for_each_drmem_lmb() loop blocks in the code, put it in the iteration expression of the loop macro itself so users can't omit it. Introduce a drmem_lmb_next() iteration helper function which calls cond_resched() at a regular interval during array traversal. Each iteration of the loop in DLPAR code paths can involve around ten RTAS calls which can each take up to 250us, so this ensures the check is performed at worst every few milliseconds. Fixes: 6c6ea53725b3 ("powerpc/mm: Separate ibm, dynamic-memory data from DT format") Signed-off-by: Nathan Lynch Looks a lot better to me than v2. Reviewed-by: Christophe Leroy --- Notes: Changes since v2: * Make drmem_lmb_next() more general. * Adjust reschedule interval for better code generation. * Add commentary to drmem_lmb_next() to explain the cond_resched() call. * Remove bounds assertions. Changes since v1: * Add bounds assertions in drmem_lmb_next(). * Call cond_resched() in the iterator on only every 20th element instead of on every iteration, to reduce overhead in tight loops. 
arch/powerpc/include/asm/drmem.h | 18 +- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/drmem.h b/arch/powerpc/include/asm/drmem.h index 17ccc6474ab6..6fb928605ed1 100644 --- a/arch/powerpc/include/asm/drmem.h +++ b/arch/powerpc/include/asm/drmem.h @@ -8,6 +8,8 @@ #ifndef _ASM_POWERPC_LMB_H #define _ASM_POWERPC_LMB_H +#include + struct drmem_lmb { u64 base_addr; u32 drc_index; @@ -26,8 +28,22 @@ struct drmem_lmb_info { extern struct drmem_lmb_info *drmem_info; +static inline struct drmem_lmb *drmem_lmb_next(struct drmem_lmb *lmb, + const struct drmem_lmb *start) +{ + /* +* DLPAR code paths can take several milliseconds per element +* when interacting with firmware. Ensure that we don't +* unfairly monopolize the CPU. +*/ + if (((++lmb - start) % 16) == 0) + cond_resched(); + + return lmb; +} + #define for_each_drmem_lmb_in_range(lmb, start, end) \ - for ((lmb) = (start); (lmb) < (end); (lmb)++) + for ((lmb) = (start); (lmb) < (end); lmb = drmem_lmb_next(lmb, start)) #define for_each_drmem_lmb(lmb) \ for_each_drmem_lmb_in_range((lmb), \
[PATCH] kernel/watchdog: fix warning -Wunused-variable for watchdog_allowed_mask in ppc64
In ppc64 config if `CONFIG_SOFTLOCKUP_DETECTOR` is not set then it warns for unused declaration of `watchdog_allowed_mask` while building, move the declaration inside ifdef later in the code. ``` kernel/watchdog.c:47:23: warning: ‘watchdog_allowed_mask’ defined but not used [-Wunused-variable] static struct cpumask watchdog_allowed_mask __read_mostly; ``` Signed-off-by: Balamuruhan S --- kernel/watchdog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 5abb5b22ad13..33c9b8a3d51b 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -44,7 +44,6 @@ int __read_mostly soft_watchdog_user_enabled = 1; int __read_mostly watchdog_thresh = 10; static int __read_mostly nmi_watchdog_available; -static struct cpumask watchdog_allowed_mask __read_mostly; struct cpumask watchdog_cpumask __read_mostly; unsigned long *watchdog_cpumask_bits = cpumask_bits(_cpumask); @@ -166,6 +165,7 @@ int __read_mostly sysctl_softlockup_all_cpu_backtrace; unsigned int __read_mostly softlockup_panic = CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; +static struct cpumask watchdog_allowed_mask __read_mostly; static bool softlockup_initialized __read_mostly; static u64 __read_mostly sample_period; base-commit: a3a28c4451dff698d0c7ef5a3e80423aa5774e2b -- 2.24.1
[PATCH] powerpc: Add POWER10 raw mode cputable entry
Add a raw mode cputable entry for POWER10. Copies most of the fields from commit a3ea40d5c736 ("powerpc: Add POWER10 architected mode") except for oprofile_cpu_type, machine_check_early, pvr_mask and pvr_value fields. On bare metal systems we use DT CPU features, which doesn't need a cputable entry. But in VMs we still rely on the raw cputable entry to set the correct values for the PMU related fields. Signed-off-by: Madhavan Srinivasan --- arch/powerpc/kernel/cputable.c | 19 +++ 1 file changed, 19 insertions(+) diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c index b4066354f0730..1e052f53e5dca 100644 --- a/arch/powerpc/kernel/cputable.c +++ b/arch/powerpc/kernel/cputable.c @@ -541,6 +541,25 @@ static struct cpu_spec __initdata cpu_specs[] = { .machine_check_early= __machine_check_early_realmode_p9, .platform = "power9", }, + { /* Power10 */ + .pvr_mask = 0x, + .pvr_value = 0x0080, + .cpu_name = "POWER10 (raw)", + .cpu_features = CPU_FTRS_POWER10, + .cpu_user_features = COMMON_USER_POWER10, + .cpu_user_features2 = COMMON_USER2_POWER10, + .mmu_features = MMU_FTRS_POWER10, + .icache_bsize = 128, + .dcache_bsize = 128, + .num_pmcs = 6, + .pmc_type = PPC_PMC_IBM, + .oprofile_cpu_type = "ppc64/power10", + .oprofile_type = PPC_OPROFILE_INVALID, + .cpu_setup = __setup_cpu_power10, + .cpu_restore= __restore_cpu_power10, + .machine_check_early= __machine_check_early_realmode_p10, + .platform = "power10", + }, { /* Cell Broadband Engine */ .pvr_mask = 0x, .pvr_value = 0x0070, -- 2.26.2
Re: BUG: unable to handle kernel paging request in fl_dump_key
syzbot has bisected this issue to: commit a51486266c3ba8e035a47fa96df67f274fe0c7d0 Author: Jiri Pirko Date: Sat Jun 15 09:03:49 2019 + net: sched: remove NET_CLS_IND config option bisection log: https://syzkaller.appspot.com/x/bisect.txt?x=1746350990 start commit: 1ca0fafd tcp: md5: allow changing MD5 keys in all socket s.. git tree: net final oops: https://syzkaller.appspot.com/x/report.txt?x=14c6350990 console output: https://syzkaller.appspot.com/x/log.txt?x=10c6350990 kernel config: https://syzkaller.appspot.com/x/.config?x=bf3aec367b9ab569 dashboard link: https://syzkaller.appspot.com/bug?extid=9c1be56e9317b795e874 syz repro: https://syzkaller.appspot.com/x/repro.syz?x=1062a40b10 Reported-by: syzbot+9c1be56e9317b795e...@syzkaller.appspotmail.com Fixes: a51486266c3b ("net: sched: remove NET_CLS_IND config option") For information about bisection process see: https://goo.gl/tpsmEJ#bisection
fsl_espi errors on v5.7.15
Hi, I'm seeing a problem with accessing spi-nor after upgrading a T2081 based system to linux v5.7.15 For this board u-boot and the u-boot environment live on spi-nor. When I use fw_setenv from userspace I get the following kernel logs # fw_setenv foo=1 fsl_espi ffe11.spi: Transfer done but SPIE_DON isn't set! fsl_espi ffe11.spi: Transfer done but SPIE_DON isn't set! fsl_espi ffe11.spi: Transfer done but SPIE_DON isn't set! fsl_espi ffe11.spi: Transfer done but SPIE_DON isn't set! fsl_espi ffe11.spi: Transfer done but SPIE_DON isn't set! fsl_espi ffe11.spi: Transfer done but SPIE_DON isn't set! fsl_espi ffe11.spi: Transfer done but SPIE_DON isn't set! fsl_espi ffe11.spi: Transfer done but SPIE_DON isn't set! fsl_espi ffe11.spi: Transfer done but SPIE_DON isn't set! fsl_espi ffe11.spi: Transfer done but SPIE_DON isn't set! fsl_espi ffe11.spi: Transfer done but SPIE_DON isn't set! fsl_espi ffe11.spi: Transfer done but SPIE_DON isn't set! fsl_espi ffe11.spi: Transfer done but SPIE_DON isn't set! fsl_espi ffe11.spi: Transfer done but SPIE_DON isn't set! fsl_espi ffe11.spi: Transfer done but rx/tx fifo's aren't empty! fsl_espi ffe11.spi: SPIE_RXCNT = 1, SPIE_TXCNT = 32 fsl_espi ffe11.spi: Transfer done but rx/tx fifo's aren't empty! fsl_espi ffe11.spi: SPIE_RXCNT = 1, SPIE_TXCNT = 32 fsl_espi ffe11.spi: Transfer done but rx/tx fifo's aren't empty! fsl_espi ffe11.spi: SPIE_RXCNT = 1, SPIE_TXCNT = 32 ... If I run fw_printenv (before getting it into a bad state) it is able to display the content of the board's u-boot environment. I've been unsuccessful in producing a setup for bisecting the issue. I do know the issue doesn't occur on the old 4.4.x based kernel but that's probably not much help. Any pointers on what the issue (and/or solution) might be? Thanks, Chris
Re: [PATCH] powerpc/papr_scm: Limit the readability of 'perf_stats' sysfs attribute
"Aneesh Kumar K.V" writes: > On 8/13/20 10:04 AM, Vaibhav Jain wrote: >> The newly introduced 'perf_stats' attribute uses the default access >> mode of 0444 letting non-root users access performance stats of an >> nvdimm and potentially force the kernel into issuing large number of >> expensive HCALLs. Since the information exposed by this attribute >> cannot be cached hence its better to ward of access to this attribute >> from users who don't need to access these performance statistics. >> >> Hence this patch adds check in perf_stats_show() to only let users >> that are 'perfmon_capable()' to read the nvdimm performance >> statistics. >> >> Fixes: 2d02bf835e573 ('powerpc/papr_scm: Fetch nvdimm performance stats from >> PHYP') >> Reported-by: Aneesh Kumar K.V >> Signed-off-by: Vaibhav Jain >> --- >> arch/powerpc/platforms/pseries/papr_scm.c | 4 >> 1 file changed, 4 insertions(+) >> >> diff --git a/arch/powerpc/platforms/pseries/papr_scm.c >> b/arch/powerpc/platforms/pseries/papr_scm.c >> index f439f0dfea7d1..36c51bf8af9a8 100644 >> --- a/arch/powerpc/platforms/pseries/papr_scm.c >> +++ b/arch/powerpc/platforms/pseries/papr_scm.c >> @@ -792,6 +792,10 @@ static ssize_t perf_stats_show(struct device *dev, >> struct nvdimm *dimm = to_nvdimm(dev); >> struct papr_scm_priv *p = nvdimm_provider_data(dimm); >> >> +/* Allow access only to perfmon capable users */ >> +if (!perfmon_capable()) >> +return -EACCES; >> + > > An access check is usually done in open(). This is the read callback IIUC. Yes. Otherwise an unprivileged user can open the file, and then trick a suid program into reading from it. cheers
Re: [RFC PATCH 1/2] powerpc/numa: Introduce logical numa id
Hi Aneesh, "Aneesh Kumar K.V" writes: > "Aneesh Kumar K.V" writes: >> On 8/8/20 2:15 AM, Nathan Lynch wrote: >>> "Aneesh Kumar K.V" writes: On 8/7/20 9:54 AM, Nathan Lynch wrote: > "Aneesh Kumar K.V" writes: >> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c >> index e437a9ac4956..6c659aada55b 100644 >> --- a/arch/powerpc/mm/numa.c >> +++ b/arch/powerpc/mm/numa.c >> @@ -221,25 +221,51 @@ static void initialize_distance_lookup_table(int >> nid, >> } >>} >> >> +static u32 nid_map[MAX_NUMNODES] = {[0 ... MAX_NUMNODES - 1] = >> NUMA_NO_NODE}; > > It's odd to me to use MAX_NUMNODES for this array when it's going to be > indexed not by Linux's logical node IDs but by the platform-provided > domain number, which has no relation to MAX_NUMNODES. I didn't want to dynamically allocate this. We could fetch "ibm,max-associativity-domains" to find the size for that. The current code do assume firmware group id to not exceed MAX_NUMNODES. Hence kept the array size to be MAX_NUMNODEs. I do agree that it is confusing. May be we can do #define MAX_AFFINITY_DOMAIN MAX_NUMNODES? >>> >>> Well, consider: >>> >>> - ibm,max-associativity-domains can change at runtime with LPM. This >>>doesn't happen in practice yet, but we should probably start thinking >>>about how to support that. >>> - The domain numbering isn't clearly specified to have any particular >>>properties such as beginning at zero or a contiguous range. >>> >>> While the current code likely contains assumptions contrary to these >>> points, a change such as this is an opportunity to think about whether >>> those assumptions can be reduced or removed. In particular I think it >>> would be good to gracefully degrade when the number of NUMA affinity >>> domains can exceed MAX_NUMNODES. Using the platform-supplied domain >>> numbers to directly index Linux data structures will make that >>> impossible. >>> >>> So, maybe genradix or even xarray wouldn't actually be overengineering >>> here. 
>>> >> >> One of the challenges with such a data structure is that we initialize >> the nid_map before the slab is available. This means a memblock based >> allocation and we would end up implementing such a sparse data structure >> ourselves here. Yes, good point. >> As you mentioned above, since we know that hypervisor as of now limits >> the max affinity domain id below ibm,max-associativity-domains we are >> good with an array-like nid_map we have here. This keeps the code simpler. >> >> This will also allow us to switch to a more sparse data structure as you >> requested here in the future because the main change that is pushed in >> this series is the usage of firmare_group_id_to_nid(). The details of >> the data structure we use to keep track of that mapping are pretty much >> internal to that function. > > How about this? This makes it not a direct index. But it do limit the > search to max numa node on the system. > > static int domain_id_map[MAX_NUMNODES] = {[0 ... MAX_NUMNODES - 1] = -1 }; > > static int __affinity_domain_to_nid(int domain_id, int max_nid) > { > int i; > > for (i = 0; i < max_nid; i++) { > if (domain_id_map[i] == domain_id) > return i; > } > return NUMA_NO_NODE; > } OK, this indexes the array by Linux's node id, good. I was wondering if I could persuade you do flip it around like this :-) Walking through the code below: > int affinity_domain_to_nid(struct affinity_domain *domain) > { > int nid, domain_id; > static int last_nid = 0; > static DEFINE_SPINLOCK(node_id_lock); > > domain_id = domain->id; > /* >* For PowerNV we don't change the node id. This helps to avoid >* confusion w.r.t the expected node ids. On pseries, node numbers >* are virtualized. Hence do logical node id for pseries. >*/ > if (!firmware_has_feature(FW_FEATURE_LPAR)) > return domain_id; > > if (domain_id == -1 || last_nid == MAX_NUMNODES) > return NUMA_NO_NODE; > > nid = __affinity_domain_to_nid(domain_id, last_nid); So this is pseries fast path. 
Attempt to look up the Linux node for the given domain, where last_nid is the highest-numbered node in use so far. If the result is in [0..last_nid] we're done. > > if (nid == NUMA_NO_NODE) { > spin_lock(_id_lock); If the lookup fails enter the critical section. As we discussed offline, this is a precaution for potentially parallel device probing. > /* recheck with lock held */ > nid = __affinity_domain_to_nid(domain_id, last_nid); Attempt the same lookup again. If the result is in [0..last_nid], another thread has just initialized the mapping for this domain and we're done. > if (nid == NUMA_NO_NODE) { > nid =
Re: [PATCH] sfc_ef100: Fix build failure on powerpc
On 13/08/2020 15:39, Christophe Leroy wrote: > ppc6xx_defconfig fails building sfc.ko module, complaining > about the lack of _umoddi3 symbol. > > This is due to the following test > > if (EFX_MIN_DMAQ_SIZE % reader->value) { > > Because reader->value is u64. Already fixed in net.git by 41077c990266 ("sfc: fix ef100 design-param checking"). But thanks anyway.
Re: [PATCH] arch/powerpc: use simple i2c probe function
Hi, On 07/08/20 17:27, Stephen Kitt wrote: > The i2c probe functions here don't use the id information provided in > their second argument, so the single-parameter i2c probe function > ("probe_new") can be used instead. > > This avoids scanning the identifier tables during probes. > > Signed-off-by: Stephen Kitt Reviewed-by: Luca Ceresoli -- Luca
Re: linux-next: runtime warning in Linus' tree
On Thu, Aug 13, 2020 at 11:20:33AM -0400, Johannes Weiner wrote: > On Thu, Aug 13, 2020 at 04:46:54PM +1000, Stephen Rothwell wrote: > > [0.055220][T0] WARNING: CPU: 0 PID: 0 at mm/memcontrol.c:5220 > > mem_cgroup_css_alloc+0x350/0x904 > > > [The line numbers in the final linux next are 5226 and 5141 due to > > later patches.] > > > > Introduced (or exposed) by commit > > > > 3e38e0aaca9e ("mm: memcg: charge memcg percpu memory to the parent > > cgroup") > > > > This commit actually adds the WARN_ON, so it either adds the bug that > > sets it off, or the bug already existed. > > > > Unfotunately, the version of this patch in linux-next up tuntil today > > is different. :-( > > Sorry, I made a last-minute request to include these checks in that > patch to make the code a bit more robust, but they trigger a false > positive here. Let's remove them. > > --- > > From de8ea7c96c056c3cbe7b93995029986a158fb9cd Mon Sep 17 00:00:00 2001 > From: Johannes Weiner > Date: Thu, 13 Aug 2020 10:40:54 -0400 > Subject: [PATCH] mm: memcontrol: fix warning when allocating the root cgroup > > Commit 3e38e0aaca9e ("mm: memcg: charge memcg percpu memory to the > parent cgroup") adds memory tracking to the memcg kernel structures > themselves to make cgroups liable for the memory they are consuming > through the allocation of child groups (which can be significant). > > This code is a bit awkward as it's spread out through several > functions: The outermost function does memalloc_use_memcg(parent) to > set up current->active_memcg, which designates which cgroup to charge, > and the inner functions pass GFP_ACCOUNT to request charging for > specific allocations. To make sure this dependency is satisfied at all > times - to make sure we don't randomly charge whoever is calling the > functions - the inner functions warn on !current->active_memcg. > > However, this triggers a false warning when the root memcg itself is > allocated. 
No parent exists in this case, and so current->active_memcg > is rightfully NULL. It's a false positive, not indicative of a bug. > > Delete the warnings for now, we can revisit this later. > > Fixes: 3e38e0aaca9e ("mm: memcg: charge memcg percpu memory to the parent > cgroup") > Signed-off-by: Johannes Weiner Acked-by: Roman Gushchin Thanks! > --- > mm/memcontrol.c | 6 -- > 1 file changed, 6 deletions(-) > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > index d59fd9af6e63..9d87082e64aa 100644 > --- a/mm/memcontrol.c > +++ b/mm/memcontrol.c > @@ -5137,9 +5137,6 @@ static int alloc_mem_cgroup_per_node_info(struct > mem_cgroup *memcg, int node) > if (!pn) > return 1; > > - /* We charge the parent cgroup, never the current task */ > - WARN_ON_ONCE(!current->active_memcg); > - > pn->lruvec_stat_local = alloc_percpu_gfp(struct lruvec_stat, >GFP_KERNEL_ACCOUNT); > if (!pn->lruvec_stat_local) { > @@ -5222,9 +5219,6 @@ static struct mem_cgroup *mem_cgroup_alloc(void) > goto fail; > } > > - /* We charge the parent cgroup, never the current task */ > - WARN_ON_ONCE(!current->active_memcg); > - > memcg->vmstats_local = alloc_percpu_gfp(struct memcg_vmstats_percpu, > GFP_KERNEL_ACCOUNT); > if (!memcg->vmstats_local) > -- > 2.28.0 >
[PATCH 6/9] powerpc: Remove support for PowerPC 601
PowerPC 601 has been retired. Remove all associated specific code. CPU_FTRS_PPC601 has CPU_FTR_COHERENT_ICACHE and CPU_FTR_COMMON. CPU_FTR_COMMON is already present via other CPU_FTRS. None of the remaining CPU selects CPU_FTR_COHERENT_ICACHE. So CPU_FTRS_PPC601 can be removed from the possible features, hence can be removed completely. Signed-off-by: Christophe Leroy --- arch/powerpc/boot/util.S| 15 +-- arch/powerpc/include/asm/cputable.h | 12 ++--- arch/powerpc/include/asm/ppc_asm.h | 3 +- arch/powerpc/include/asm/ptrace.h | 4 -- arch/powerpc/include/asm/time.h | 2 +- arch/powerpc/include/asm/timex.h| 3 -- arch/powerpc/kernel/btext.c | 8 +--- arch/powerpc/kernel/entry_32.S | 18 arch/powerpc/kernel/head_32.S | 44 ++ arch/powerpc/kernel/setup_32.c | 2 +- arch/powerpc/kernel/traps.c | 4 -- arch/powerpc/kernel/vdso32/datapage.S | 2 - arch/powerpc/kernel/vdso32/vdso32.lds.S | 2 - arch/powerpc/mm/book3s32/mmu.c | 39 +++- arch/powerpc/mm/ptdump/bats.c | 59 - arch/powerpc/platforms/powermac/setup.c | 2 +- arch/powerpc/platforms/powermac/smp.c | 4 -- 17 files changed, 17 insertions(+), 206 deletions(-) diff --git a/arch/powerpc/boot/util.S b/arch/powerpc/boot/util.S index f11f0589a669..d03cdb7606dc 100644 --- a/arch/powerpc/boot/util.S +++ b/arch/powerpc/boot/util.S @@ -18,7 +18,7 @@ .text -/* udelay (on non-601 processors) needs to know the period of the +/* udelay needs to know the period of the * timebase in nanoseconds. This used to be hardcoded to be 60ns * (period of 66MHz/4). Now a variable is used that is initialized to * 60 for backward compatibility, but it can be overridden as necessary @@ -37,19 +37,6 @@ timebase_period_ns: */ .globl udelay udelay: - mfspr r4,SPRN_PVR - srwir4,r4,16 - cmpwi 0,r4,1 /* 601 ? */ - bne .Ludelay_not_601 -00:li r0,86 /* Instructions / microsecond? */ - mtctr r0 -10:addir0,r0,0 /* NOP */ - bdnz10b - subic. 
r3,r3,1 - bne 00b - blr - -.Ludelay_not_601: mulli r4,r3,1000 /* nanoseconds */ /* Change r4 to be the number of ticks using: * (nanoseconds + (timebase_period_ns - 1 )) / timebase_period_ns diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h index fdddb822d564..76ce0ffd8af0 100644 --- a/arch/powerpc/include/asm/cputable.h +++ b/arch/powerpc/include/asm/cputable.h @@ -294,8 +294,6 @@ static inline void cpu_feature_keys_init(void) { } #define CPU_FTR_MAYBE_CAN_NAP 0 #endif -#define CPU_FTRS_PPC601(CPU_FTR_COMMON | \ - CPU_FTR_COHERENT_ICACHE) #define CPU_FTRS_603 (CPU_FTR_COMMON | CPU_FTR_MAYBE_CAN_DOZE | \ CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_PPC_LE | CPU_FTR_NOEXECUTE) #define CPU_FTRS_604 (CPU_FTR_COMMON | CPU_FTR_PPC_LE) @@ -511,10 +509,8 @@ static inline void cpu_feature_keys_init(void) { } #else enum { CPU_FTRS_POSSIBLE = -#ifdef CONFIG_PPC_BOOK3S_601 - CPU_FTRS_PPC601 | -#elif defined(CONFIG_PPC_BOOK3S_32) - CPU_FTRS_PPC601 | CPU_FTRS_603 | CPU_FTRS_604 | CPU_FTRS_740_NOTAU | +#ifdef CONFIG_PPC_BOOK3S_32 + CPU_FTRS_603 | CPU_FTRS_604 | CPU_FTRS_740_NOTAU | CPU_FTRS_740 | CPU_FTRS_750 | CPU_FTRS_750FX1 | CPU_FTRS_750FX2 | CPU_FTRS_750FX | CPU_FTRS_750GX | CPU_FTRS_7400_NOTAU | CPU_FTRS_7400 | CPU_FTRS_7450_20 | @@ -589,9 +585,7 @@ enum { #else enum { CPU_FTRS_ALWAYS = -#ifdef CONFIG_PPC_BOOK3S_601 - CPU_FTRS_PPC601 & -#elif defined(CONFIG_PPC_BOOK3S_32) +#ifdef CONFIG_PPC_BOOK3S_32 CPU_FTRS_603 & CPU_FTRS_604 & CPU_FTRS_740_NOTAU & CPU_FTRS_740 & CPU_FTRS_750 & CPU_FTRS_750FX1 & CPU_FTRS_750FX2 & CPU_FTRS_750FX & CPU_FTRS_750GX & diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index 67a421b81a50..511786f0e40d 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -401,8 +401,7 @@ END_FTR_SECTION_NESTED(CPU_FTR_CELL_TB_BUG, CPU_FTR_CELL_TB_BUG, 96) #define MFTBU(dest)mfspr dest, SPRN_TBRU #endif -/* tlbsync is not implemented on 601 */ -#if 
!defined(CONFIG_SMP) || defined(CONFIG_PPC_BOOK3S_601) +#ifndef CONFIG_SMP #define TLBSYNC #else #define TLBSYNCtlbsync; sync diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index 155a197c0aa1..e2c778c176a3 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -243,11 +243,7 @@ static inline void set_trap_norestart(struct pt_regs *regs) } #define arch_has_single_step() (1) -#ifndef CONFIG_PPC_BOOK3S_601 #define arch_has_block_step() (true) -#else
[PATCH 7/9] powerpc: Tidy up a bit after removal of PowerPC 601.
The removal of the 601 left some standalone blocks from former if/else. Drop the { } and re-indent. Signed-off-by: Christophe Leroy --- arch/powerpc/kernel/btext.c| 11 +++-- arch/powerpc/mm/book3s32/mmu.c | 45 +++--- 2 files changed, 24 insertions(+), 32 deletions(-) diff --git a/arch/powerpc/kernel/btext.c b/arch/powerpc/kernel/btext.c index b609fb39dba8..c22a8e0dbc93 100644 --- a/arch/powerpc/kernel/btext.c +++ b/arch/powerpc/kernel/btext.c @@ -95,13 +95,10 @@ void __init btext_prepare_BAT(void) boot_text_mapped = 0; return; } - { - /* 603, 604, G3, G4, ... */ - lowbits = addr & ~0xFF00UL; - addr &= 0xFF00UL; - disp_BAT[0] = vaddr | (BL_16M<<2) | 2; - disp_BAT[1] = addr | (_PAGE_NO_CACHE | _PAGE_GUARDED | BPP_RW); - } + lowbits = addr & ~0xFF00UL; + addr &= 0xFF00UL; + disp_BAT[0] = vaddr | (BL_16M<<2) | 2; + disp_BAT[1] = addr | (_PAGE_NO_CACHE | _PAGE_GUARDED | BPP_RW); logicalDisplayBase = (void *) (vaddr + lowbits); } #endif diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c index f42b718ea971..16546ca4074e 100644 --- a/arch/powerpc/mm/book3s32/mmu.c +++ b/arch/powerpc/mm/book3s32/mmu.c @@ -73,16 +73,13 @@ unsigned long p_block_mapped(phys_addr_t pa) static int find_free_bat(void) { int b; + int n = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4; - { - int n = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4; + for (b = 0; b < n; b++) { + struct ppc_bat *bat = BATS[b]; - for (b = 0; b < n; b++) { - struct ppc_bat *bat = BATS[b]; - - if (!(bat[1].batu & 3)) - return b; - } + if (!(bat[1].batu & 3)) + return b; } return -1; } @@ -273,24 +270,22 @@ void __init setbat(int index, unsigned long virt, phys_addr_t phys, flags &= ~_PAGE_COHERENT; bl = (size >> 17) - 1; - { - /* Do DBAT first */ - wimgxpp = flags & (_PAGE_WRITETHRU | _PAGE_NO_CACHE - | _PAGE_COHERENT | _PAGE_GUARDED); - wimgxpp |= (flags & _PAGE_RW)? 
BPP_RW: BPP_RX; - bat[1].batu = virt | (bl << 2) | 2; /* Vs=1, Vp=0 */ - bat[1].batl = BAT_PHYS_ADDR(phys) | wimgxpp; - if (flags & _PAGE_USER) - bat[1].batu |= 1; /* Vp = 1 */ - if (flags & _PAGE_GUARDED) { - /* G bit must be zero in IBATs */ - flags &= ~_PAGE_EXEC; - } - if (flags & _PAGE_EXEC) - bat[0] = bat[1]; - else - bat[0].batu = bat[0].batl = 0; + /* Do DBAT first */ + wimgxpp = flags & (_PAGE_WRITETHRU | _PAGE_NO_CACHE + | _PAGE_COHERENT | _PAGE_GUARDED); + wimgxpp |= (flags & _PAGE_RW)? BPP_RW: BPP_RX; + bat[1].batu = virt | (bl << 2) | 2; /* Vs=1, Vp=0 */ + bat[1].batl = BAT_PHYS_ADDR(phys) | wimgxpp; + if (flags & _PAGE_USER) + bat[1].batu |= 1; /* Vp = 1 */ + if (flags & _PAGE_GUARDED) { + /* G bit must be zero in IBATs */ + flags &= ~_PAGE_EXEC; } + if (flags & _PAGE_EXEC) + bat[0] = bat[1]; + else + bat[0].batu = bat[0].batl = 0; bat_addrs[index].start = virt; bat_addrs[index].limit = virt + ((bl + 1) << 17) - 1; -- 2.25.0
[PATCH 9/9] powerpc: Remove get_tb_or_rtc()
601 is gone, get_tb_or_rtc() is equivalent to get_tb(). Replace the former by the later. Signed-off-by: Christophe Leroy --- arch/powerpc/include/asm/time.h | 5 - arch/powerpc/kernel/irq.c | 2 +- arch/powerpc/kernel/time.c | 6 +++--- 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h index 37fa99f9783d..c904a8861fa6 100644 --- a/arch/powerpc/include/asm/time.h +++ b/arch/powerpc/include/asm/time.h @@ -104,11 +104,6 @@ static inline u64 get_tb(void) } #endif /* !CONFIG_PPC64 */ -static inline u64 get_tb_or_rtc(void) -{ - return get_tb(); -} - static inline void set_tb(unsigned int upper, unsigned int lower) { mtspr(SPRN_TBWL, 0); diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index bf21ebd36190..2d188f81ebdb 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -104,7 +104,7 @@ static inline notrace unsigned long get_irq_happened(void) static inline notrace int decrementer_check_overflow(void) { - u64 now = get_tb_or_rtc(); + u64 now = get_tb(); u64 *next_tb = this_cpu_ptr(_next_tb); return now >= *next_tb; diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 13c820c15d37..760ea359a7f7 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -595,7 +595,7 @@ void timer_interrupt(struct pt_regs *regs) irq_work_run(); } - now = get_tb_or_rtc(); + now = get_tb(); if (now >= *next_tb) { *next_tb = ~(u64)0; if (evt->event_handler) @@ -937,7 +937,7 @@ static void __init clocksource_init(void) static int decrementer_set_next_event(unsigned long evt, struct clock_event_device *dev) { - __this_cpu_write(decrementers_next_tb, get_tb_or_rtc() + evt); + __this_cpu_write(decrementers_next_tb, get_tb() + evt); set_dec(evt); /* We may have raced with new irq work */ @@ -1071,7 +1071,7 @@ void __init time_init(void) tb_to_ns_scale = scale; tb_to_ns_shift = shift; /* Save the current timebase to pretty up CONFIG_PRINTK_TIME */ 
- boot_tb = get_tb_or_rtc(); + boot_tb = get_tb(); /* If platform provided a timezone (pmac), we correct the time */ if (timezone_offset) { -- 2.25.0
[PATCH 8/9] powerpc: Remove __USE_RTC()
Now that PowerPC 601 is gone, __USE_RTC() is never true. Remove it. Signed-off-by: Christophe Leroy --- arch/powerpc/include/asm/time.h | 9 +- arch/powerpc/kernel/time.c | 52 + 2 files changed, 9 insertions(+), 52 deletions(-) diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h index ce065589192a..37fa99f9783d 100644 --- a/arch/powerpc/include/asm/time.h +++ b/arch/powerpc/include/asm/time.h @@ -38,9 +38,6 @@ struct div_result { u64 result_low; }; -/* Accessor functions for the timebase (RTC on 601) registers. */ -#define __USE_RTC()(0) - #ifdef CONFIG_PPC64 /* For compatibility, get_tbl() is defined as get_tb() on ppc64 */ @@ -109,7 +106,7 @@ static inline u64 get_tb(void) static inline u64 get_tb_or_rtc(void) { - return __USE_RTC() ? get_rtc() : get_tb(); + return get_tb(); } static inline void set_tb(unsigned int upper, unsigned int lower) @@ -153,10 +150,6 @@ static inline void set_dec(u64 val) static inline unsigned long tb_ticks_since(unsigned long tstamp) { - if (__USE_RTC()) { - int delta = get_rtcl() - (unsigned int) tstamp; - return delta < 0 ? 
delta + 1000000000 : delta; - } return get_tbl() - tstamp; } diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index f85539ebb513..13c820c15d37 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -75,15 +75,6 @@ #include #include -static u64 rtc_read(struct clocksource *); -static struct clocksource clocksource_rtc = { - .name = "rtc", - .rating = 400, - .flags= CLOCK_SOURCE_IS_CONTINUOUS, - .mask = CLOCKSOURCE_MASK(64), - .read = rtc_read, -}; - static u64 timebase_read(struct clocksource *); static struct clocksource clocksource_timebase = { .name = "timebase", @@ -447,19 +438,9 @@ void vtime_flush(struct task_struct *tsk) void __delay(unsigned long loops) { unsigned long start; - int diff; spin_begin(); - if (__USE_RTC()) { - start = get_rtcl(); - do { - /* the RTCL register wraps at 1000000000 */ - diff = get_rtcl() - start; - if (diff < 0) - diff += 1000000000; - spin_cpu_relax(); - } while (diff < loops); - } else if (tb_invalid) { + if (tb_invalid) { /* * TB is in error state and isn't ticking anymore. * HMI handler was unable to recover from TB error. 
@@ -696,8 +677,6 @@ EXPORT_SYMBOL_GPL(tb_to_ns); */ notrace unsigned long long sched_clock(void) { - if (__USE_RTC()) - return get_rtc(); return mulhdu(get_tb() - boot_tb, tb_to_ns_scale) << tb_to_ns_shift; } @@ -847,11 +826,6 @@ void read_persistent_clock64(struct timespec64 *ts) } /* clocksource code */ -static notrace u64 rtc_read(struct clocksource *cs) -{ - return (u64)get_rtc(); -} - static notrace u64 timebase_read(struct clocksource *cs) { return (u64)get_tb(); @@ -948,12 +922,7 @@ void update_vsyscall_tz(void) static void __init clocksource_init(void) { - struct clocksource *clock; - - if (__USE_RTC()) - clock = &clocksource_rtc; - else - clock = &clocksource_timebase; + struct clocksource *clock = &clocksource_timebase; if (clocksource_register_hz(clock, tb_ticks_per_sec)) { printk(KERN_ERR "clocksource: %s is already registered\n", @@ -1071,17 +1040,12 @@ void __init time_init(void) u64 scale; unsigned shift; - if (__USE_RTC()) { - /* 601 processor: dec counts down by 128 every 128ns */ - ppc_tb_freq = 1000000000; - } else { - /* Normal PowerPC with timebase register */ - ppc_md.calibrate_decr(); - printk(KERN_DEBUG "time_init: decrementer frequency = %lu.%.6lu MHz\n", - ppc_tb_freq / 1000000, ppc_tb_freq % 1000000); - printk(KERN_DEBUG "time_init: processor frequency = %lu.%.6lu MHz\n", - ppc_proc_freq / 1000000, ppc_proc_freq % 1000000); - } + /* Normal PowerPC with timebase register */ + ppc_md.calibrate_decr(); + printk(KERN_DEBUG "time_init: decrementer frequency = %lu.%.6lu MHz\n", + ppc_tb_freq / 1000000, ppc_tb_freq % 1000000); + printk(KERN_DEBUG "time_init: processor frequency = %lu.%.6lu MHz\n", + ppc_proc_freq / 1000000, ppc_proc_freq % 1000000); tb_ticks_per_jiffy = ppc_tb_freq / HZ; tb_ticks_per_sec = ppc_tb_freq; -- 2.25.0
[PATCH 4/9] powerpc: Drop SYNC_601() ISYNC_601() and SYNC()
Those macros are now empty at all time. Drop them. Signed-off-by: Christophe Leroy --- arch/powerpc/include/asm/ppc_asm.h | 4 arch/powerpc/kernel/entry_32.S | 17 + arch/powerpc/kernel/fpu.S | 1 - arch/powerpc/kernel/head_32.S | 9 - arch/powerpc/kernel/head_32.h | 1 - arch/powerpc/kernel/l2cr_6xx.S | 3 +-- arch/powerpc/kernel/misc_32.S | 1 - arch/powerpc/mm/book3s32/hash_low.S | 12 8 files changed, 2 insertions(+), 46 deletions(-) diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index 0b9dc814b81c..67a421b81a50 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -382,10 +382,6 @@ GLUE(.,name): #endif /* various errata or part fixups */ -#defineSYNC -#define SYNC_601 -#define ISYNC_601 - #if defined(CONFIG_PPC_CELL) || defined(CONFIG_PPC_FSL_BOOK3E) #define MFTB(dest) \ 90:mfspr dest, SPRN_TBRL; \ diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index f4d0af8e1136..f25ea188ecd3 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -234,7 +234,6 @@ transfer_to_handler_cont: mtspr SPRN_SRR0,r11 mtspr SPRN_SRR1,r10 mtlrr9 - SYNC RFI /* jump to handler, enable MMU */ #if defined (CONFIG_PPC_BOOK3S_32) || defined(CONFIG_E500) @@ -264,7 +263,6 @@ _ASM_NOKPROBE_SYMBOL(transfer_to_handler_cont) LOAD_REG_IMMEDIATE(r0, MSR_KERNEL) mtspr SPRN_SRR0,r12 mtspr SPRN_SRR1,r0 - SYNC RFI reenable_mmu: @@ -323,7 +321,6 @@ stack_ovf: #endif mtspr SPRN_SRR0,r9 mtspr SPRN_SRR1,r10 - SYNC RFI _ASM_NOKPROBE_SYMBOL(stack_ovf) #endif @@ -411,7 +408,6 @@ ret_from_syscall: /* disable interrupts so current_thread_info()->flags can't change */ LOAD_REG_IMMEDIATE(r10,MSR_KERNEL) /* doesn't include MSR_EE */ /* Note: We don't bother telling lockdep about it */ - SYNC mtmsr r10 lwz r9,TI_FLAGS(r2) li r8,-MAX_ERRNO @@ -474,7 +470,6 @@ syscall_exit_finish: #endif mtspr SPRN_SRR0,r7 mtspr SPRN_SRR1,r8 - SYNC RFI _ASM_NOKPROBE_SYMBOL(syscall_exit_finish) #ifdef CONFIG_44x 
@@ -567,7 +562,6 @@ syscall_exit_work: * lockdep as we are supposed to have IRQs on at this point */ ori r10,r10,MSR_EE - SYNC mtmsr r10 /* Save NVGPRS if they're not saved already */ @@ -606,7 +600,6 @@ ret_from_kernel_syscall: #endif mtspr SPRN_SRR0, r9 mtspr SPRN_SRR1, r10 - SYNC RFI _ASM_NOKPROBE_SYMBOL(ret_from_kernel_syscall) @@ -810,7 +803,6 @@ fast_exception_return: REST_GPR(9, r11) REST_GPR(12, r11) lwz r11,GPR11(r11) - SYNC RFI _ASM_NOKPROBE_SYMBOL(fast_exception_return) @@ -872,7 +864,6 @@ ret_from_except: * from the interrupt. */ /* Note: We don't bother telling lockdep about it */ LOAD_REG_IMMEDIATE(r10,MSR_KERNEL) - SYNC/* Some chip revs have problems here... */ mtmsr r10 /* disable interrupts */ lwz r3,_MSR(r1) /* Returning to user mode? */ @@ -1035,7 +1026,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX) * exc_exit_restart below. -- paulus */ LOAD_REG_IMMEDIATE(r10,MSR_KERNEL & ~MSR_RI) - SYNC mtmsr r10 /* clear the RI bit */ .globl exc_exit_restart exc_exit_restart: @@ -1046,7 +1036,6 @@ exc_exit_restart: lwz r1,GPR1(r1) .globl exc_exit_restart_end exc_exit_restart_end: - SYNC RFI _ASM_NOKPROBE_SYMBOL(exc_exit_restart) _ASM_NOKPROBE_SYMBOL(exc_exit_restart_end) @@ -1274,7 +1263,6 @@ do_resched: /* r10 contains MSR_KERNEL here */ mfmsr r10 #endif ori r10,r10,MSR_EE - SYNC mtmsr r10 /* hard-enable interrupts */ bl schedule recheck: @@ -1283,7 +1271,6 @@ recheck: * TI_FLAGS aren't advertised. */ LOAD_REG_IMMEDIATE(r10,MSR_KERNEL) - SYNC mtmsr r10 /* disable interrupts */ lwz r9,TI_FLAGS(r2) andi. r0,r9,_TIF_NEED_RESCHED @@ -1292,7 +1279,6 @@ recheck: beq restore_user do_user_signal:/* r10 contains MSR_KERNEL here */ ori r10,r10,MSR_EE - SYNC mtmsr r10 /* hard-enable interrupts */ /* save r13-r31 in the exception frame, if not already done */ lwz r3,_TRAP(r1) @@ -1382,8 +1368,7 @@ _GLOBAL(enter_rtas) mfmsr r9 stw r9,8(r1) LOAD_REG_IMMEDIATE(r0,MSR_KERNEL) - SYNC/* disable interrupts so
[PATCH 2/9] powerpc: Remove SYNC on non 6xx
SYNC is useful for PowerPC 601 only. On everything else, SYNC is empty. Remove it from code that is not made to run on 6xx. Signed-off-by: Christophe Leroy --- arch/powerpc/kernel/head_40x.S | 1 - arch/powerpc/kernel/head_booke.h | 1 - arch/powerpc/kernel/misc_64.S| 1 - 3 files changed, 3 deletions(-) diff --git a/arch/powerpc/kernel/head_40x.S b/arch/powerpc/kernel/head_40x.S index 5b282d9965a5..44c9018aed1b 100644 --- a/arch/powerpc/kernel/head_40x.S +++ b/arch/powerpc/kernel/head_40x.S @@ -72,7 +72,6 @@ turn_on_mmu: lis r0,start_here@h ori r0,r0,start_here@l mtspr SPRN_SRR0,r0 - SYNC rfi /* enables MMU */ b . /* prevent prefetch past rfi */ diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h index 18f87bf9e32b..71c359d438b5 100644 --- a/arch/powerpc/kernel/head_booke.h +++ b/arch/powerpc/kernel/head_booke.h @@ -176,7 +176,6 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_EMB_HV) #endif mtspr SPRN_SRR1,r10 mtspr SPRN_SRR0,r11 - SYNC RFI /* jump to handler, enable MMU */ 99:b ret_from_kernel_syscall .endm diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S index 7bb46ad98207..070465825c21 100644 --- a/arch/powerpc/kernel/misc_64.S +++ b/arch/powerpc/kernel/misc_64.S @@ -365,7 +365,6 @@ _GLOBAL(kexec_smp_wait) li r4,KEXEC_STATE_REAL_MODE stb r4,PACAKEXECSTATE(r13) - SYNC b kexec_wait -- 2.25.0
[PATCH 5/9] powerpc: Remove PowerPC 601
Powerpc 601 is 25 years old. It is not selected by any defconfig. It requires a lot of special handling as it deviates from the standard 6xx. Retire it. Signed-off-by: Christophe Leroy --- arch/powerpc/kernel/cputable.c | 15 --- arch/powerpc/platforms/Kconfig.cputype | 11 ++- 2 files changed, 2 insertions(+), 24 deletions(-) diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c index 3d406a9626e8..1338ed6e545b 100644 --- a/arch/powerpc/kernel/cputable.c +++ b/arch/powerpc/kernel/cputable.c @@ -592,21 +592,6 @@ static struct cpu_spec __initdata cpu_specs[] = { #endif /* CONFIG_PPC_BOOK3S_64 */ #ifdef CONFIG_PPC32 -#ifdef CONFIG_PPC_BOOK3S_601 - { /* 601 */ - .pvr_mask = 0x, - .pvr_value = 0x0001, - .cpu_name = "601", - .cpu_features = CPU_FTRS_PPC601, - .cpu_user_features = COMMON_USER | PPC_FEATURE_601_INSTR | - PPC_FEATURE_UNIFIED_CACHE | PPC_FEATURE_NO_TB, - .mmu_features = MMU_FTR_HPTE_TABLE, - .icache_bsize = 32, - .dcache_bsize = 32, - .machine_check = machine_check_generic, - .platform = "ppc601", - }, -#endif /* CONFIG_PPC_BOOK3S_601 */ #ifdef CONFIG_PPC_BOOK3S_6xx { /* 603 */ .pvr_mask = 0x, diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 87737ec86d39..2b39589a6a8a 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -20,7 +20,7 @@ choice depends on PPC32 help There are five families of 32 bit PowerPC chips supported. - The most common ones are the desktop and server CPUs (601, 603, + The most common ones are the desktop and server CPUs (603, 604, 740, 750, 74xx) CPUs from Freescale and IBM, with their embedded 512x/52xx/82xx/83xx/86xx counterparts. The other embedded parts, namely 4xx, 8xx, e200 (55xx) and e500 @@ -30,7 +30,7 @@ choice If unsure, select 52xx/6xx/7xx/74xx/82xx/83xx/86xx. 
config PPC_BOOK3S_6xx - bool "512x/52xx/6xx/7xx/74xx/82xx/83xx/86xx except 601" + bool "512x/52xx/6xx/7xx/74xx/82xx/83xx/86xx" select PPC_BOOK3S_32 select PPC_FPU select PPC_HAVE_PMU_SUPPORT @@ -38,13 +38,6 @@ config PPC_BOOK3S_6xx select PPC_HAVE_KUAP select HAVE_ARCH_VMAP_STACK -config PPC_BOOK3S_601 - bool "PowerPC 601" - select PPC_BOOK3S_32 - select PPC_FPU - select PPC_HAVE_KUAP - select HAVE_ARCH_VMAP_STACK - config PPC_85xx bool "Freescale 85xx" select E500 -- 2.25.0
[PATCH 1/9] powerpc: Remove flush_instruction_cache for book3s/32
The only callers of flush_instruction_cache() are: arch/powerpc/kernel/swsusp_booke.S: bl flush_instruction_cache arch/powerpc/mm/nohash/40x.c: flush_instruction_cache(); arch/powerpc/mm/nohash/44x.c: flush_instruction_cache(); arch/powerpc/mm/nohash/fsl_booke.c: flush_instruction_cache(); arch/powerpc/platforms/44x/machine_check.c: flush_instruction_cache(); arch/powerpc/platforms/44x/machine_check.c: flush_instruction_cache(); This function is not used by book3s/32, drop it. Signed-off-by: Christophe Leroy --- arch/powerpc/kernel/misc_32.S | 12 ++-- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index b24f866fef81..bd870743c06f 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -271,9 +271,8 @@ _ASM_NOKPROBE_SYMBOL(real_writeb) /* * Flush instruction cache. - * This is a no-op on the 601. */ -#ifndef CONFIG_PPC_8xx +#if !defined(CONFIG_PPC_8xx) && !defined(CONFIG_PPC_BOOK3S_32) _GLOBAL(flush_instruction_cache) #if defined(CONFIG_4xx) lis r3, KERNELBASE@h @@ -290,18 +289,11 @@ _GLOBAL(flush_instruction_cache) mfspr r3,SPRN_L1CSR1 ori r3,r3,L1CSR1_ICFI|L1CSR1_ICLFR mtspr SPRN_L1CSR1,r3 -#elif defined(CONFIG_PPC_BOOK3S_601) - blr /* for 601, do nothing */ -#else - /* 603/604 processor - use invalidate-all bit in HID0 */ - mfspr r3,SPRN_HID0 - ori r3,r3,HID0_ICFI - mtspr SPRN_HID0,r3 #endif /* CONFIG_4xx */ isync blr EXPORT_SYMBOL(flush_instruction_cache) -#endif /* CONFIG_PPC_8xx */ +#endif /* CONFIG_PPC_8xx || CONFIG_PPC_BOOK3S_32 */ /* * Copy a whole page. We use the dcbz instruction on the destination -- 2.25.0
[PATCH 3/9] powerpc: Remove CONFIG_PPC601_SYNC_FIX
This config option isn't in any defconfig. The very first versions of Powerpc 601 have a bug which requires additional sync before and/or after some instructions. This was more than 25 years ago and time has come to retire those buggy versions of the 601 from the kernel. Signed-off-by: Christophe Leroy --- arch/powerpc/include/asm/ppc_asm.h | 6 -- arch/powerpc/platforms/Kconfig | 15 --- 2 files changed, 21 deletions(-) diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index b4cc6608131c..0b9dc814b81c 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -382,15 +382,9 @@ GLUE(.,name): #endif /* various errata or part fixups */ -#ifdef CONFIG_PPC601_SYNC_FIX -#define SYNC sync; isync -#define SYNC_601 sync -#define ISYNC_601 isync -#else #defineSYNC #define SYNC_601 #define ISYNC_601 -#endif #if defined(CONFIG_PPC_CELL) || defined(CONFIG_PPC_FSL_BOOK3E) #define MFTB(dest) \ diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig index fb7515b4fa9c..f377a56ecc85 100644 --- a/arch/powerpc/platforms/Kconfig +++ b/arch/powerpc/platforms/Kconfig @@ -199,21 +199,6 @@ source "drivers/cpuidle/Kconfig" endmenu -config PPC601_SYNC_FIX - bool "Workarounds for PPC601 bugs" - depends on PPC_BOOK3S_601 && PPC_PMAC - default y - help - Some versions of the PPC601 (the first PowerPC chip) have bugs which - mean that extra synchronization instructions are required near - certain instructions, typically those that make major changes to the - CPU state. These extra instructions reduce performance slightly. - If you say N here, these extra instructions will not be included, - resulting in a kernel which will run faster but may not run at all - on some systems with the PPC601 chip. - - If in doubt, say Y here. - config TAU bool "On-chip CPU temperature sensor support" depends on PPC_BOOK3S_32 -- 2.25.0
Re: [PATCH v2 3/4] powerpc/memhotplug: Make lmb size 64bit
Hi [This is an automated email] This commit has been processed because it contains a -stable tag. The stable tag indicates that it's relevant for the following trees: all The bot has tested the following trees: v5.8, v5.7.14, v5.4.57, v4.19.138, v4.14.193, v4.9.232, v4.4.232. v5.8: Build OK! v5.7.14: Build OK! v5.4.57: Build OK! v4.19.138: Failed to apply! Possible dependencies: Unable to calculate v4.14.193: Failed to apply! Possible dependencies: Unable to calculate v4.9.232: Failed to apply! Possible dependencies: 1a367063ca0c ("powerpc/pseries: Check memory device state before onlining/offlining") 25b587fba9a4 ("powerpc/pseries: Correct possible read beyond dlpar sysfs buffer") 333f7b76865b ("powerpc/pseries: Implement indexed-count hotplug memory add") 753843471cbb ("powerpc/pseries: Implement indexed-count hotplug memory remove") 943db62c316c ("powerpc/pseries: Revert 'Auto-online hotplugged memory'") c21f515c7436 ("powerpc/pseries: Make the acquire/release of the drc for memory a seperate step") e70d59700fc3 ("powerpc/pseries: Introduce memory hotplug READD operation") f84775c2d5d9 ("powerpc/pseries: Fix build break when MEMORY_HOTREMOVE=n") v4.4.232: Failed to apply! 
Possible dependencies: 183deeea5871 ("powerpc/pseries: Consolidate CPU hotplug code to hotplug-cpu.c") 1a367063ca0c ("powerpc/pseries: Check memory device state before onlining/offlining") 1dc759566636 ("powerpc/pseries: Use kernel hotplug queue for PowerVM hotplug events") 1f859adb9253 ("powerpc/pseries: Verify CPU doesn't exist before adding") 25b587fba9a4 ("powerpc/pseries: Correct possible read beyond dlpar sysfs buffer") 333f7b76865b ("powerpc/pseries: Implement indexed-count hotplug memory add") 4a4bdfea7cb7 ("powerpc/pseries: Refactor dlpar_add_lmb() code") 753843471cbb ("powerpc/pseries: Implement indexed-count hotplug memory remove") 9054619ef54a ("powerpc/pseries: Add pseries hotplug workqueue") 943db62c316c ("powerpc/pseries: Revert 'Auto-online hotplugged memory'") 9dc512819e4b ("powerpc: Fix unused function warning 'lmb_to_memblock'") bdf5fc633804 ("powerpc/pseries: Update LMB associativity index during DLPAR add/remove") c21f515c7436 ("powerpc/pseries: Make the acquire/release of the drc for memory a seperate step") e70d59700fc3 ("powerpc/pseries: Introduce memory hotplug READD operation") e9d764f80396 ("powerpc/pseries: Enable kernel CPU dlpar from sysfs") ec999072442a ("powerpc/pseries: Auto-online hotplugged memory") f84775c2d5d9 ("powerpc/pseries: Fix build break when MEMORY_HOTREMOVE=n") fdb4f6e99ffa ("powerpc/pseries: Remove call to memblock_add()") NOTE: The patch will not be queued to stable trees until it is upstream. How should we proceed with this patch? -- Thanks Sasha
[PATCH] powerpc/book3s64/radix: Fix boot failure with large amount of guest memory
If the hypervisor doesn't support hugepages, the kernel ends up allocating a large number of page table pages. The early page table allocation was wrongly setting the max memblock limit to ppc64_rma_size with radix translation which resulted in boot failure as shown below. Kernel panic - not syncing: early_alloc_pgtable: Failed to allocate 16777216 bytes align=0x100 nid=-1 from=0x max_addr=0x CPU: 0 PID: 0 Comm: swapper Not tainted 5.8.0-24.9-default+ #2 Call Trace: [c16f3d00] [c07c6470] dump_stack+0xc4/0x114 (unreliable) [c16f3d40] [c014c78c] panic+0x164/0x418 [c16f3dd0] [c0098890] early_alloc_pgtable+0xe0/0xec [c16f3e60] [c10a5440] radix__early_init_mmu+0x360/0x4b4 [c16f3ef0] [c1099bac] early_init_mmu+0x1c/0x3c [c16f3f10] [c109a320] early_setup+0x134/0x170 This was because the kernel was checking for the radix feature before we enable the feature via mmu_features. This resulted in the kernel using hash restrictions on radix. Rework the early init code such that the kernel boot with memblock restrictions as imposed by hash. At that point, the kernel still hasn't finalized the translation the kernel will end up using. We have three different ways of detecting radix. 1. dt_cpu_ftrs_scan -> used only in case of PowerNV 2. ibm,pa-features -> Used when we don't use cpu_dt_ftr_scan 3. CAS -> Where we negotiate with hypervisor about the supported translation. We look at 1 or 2 early in the boot and after that, we look at the CAS vector to finalize the translation the kernel will use. We also support a kernel command line option (disable_radix) to switch to hash. Update the memblock limit after mmu_early_init_devtree() if the kernel is going to use radix translation. This forces some of the memblock allocations we do before mmu_early_init_devtree() to be within the RMA limit. 
Fixes: 2bfd65e45e87 ("powerpc/mm/radix: Add radix callbacks for early init routines") Reported-by: Shirisha Ganta Signed-off-by: Aneesh Kumar K.V --- arch/powerpc/include/asm/book3s/64/mmu.h | 8 +--- arch/powerpc/kernel/prom.c | 6 ++ arch/powerpc/mm/book3s64/radix_pgtable.c | 2 ++ 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index 55442d45c597..4245f99453f5 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -244,9 +244,11 @@ extern void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base, static inline void setup_initial_memory_limit(phys_addr_t first_memblock_base, phys_addr_t first_memblock_size) { - if (early_radix_enabled()) - return radix__setup_initial_memory_limit(first_memblock_base, - first_memblock_size); + /* +* Hash has more strict restrictions. At this point we don't +* know which translations we will pick. Hence got with hash +* restrictions. 
+*/ return hash__setup_initial_memory_limit(first_memblock_base, first_memblock_size); } diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index d8a2fb87ba0c..340900ae95a4 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -811,6 +811,12 @@ void __init early_init_devtree(void *params) mmu_early_init_devtree(); + /* +* Reset ppc64_rma_size and memblock memory limit +*/ + if (early_radix_enabled()) + radix__setup_initial_memory_limit(memstart_addr, first_memblock_size); + #ifdef CONFIG_PPC_POWERNV /* Scan and build the list of machine check recoverable ranges */ of_scan_flat_dt(early_init_dt_scan_recoverable_ranges, NULL); diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 28c784976bed..094daf16acac 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -747,6 +747,8 @@ void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base, * Radix mode is not limited by RMA / VRMA addressing. */ ppc64_rma_size = ULONG_MAX; + + memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); } #ifdef CONFIG_MEMORY_HOTPLUG -- 2.26.2
Re: [PATCH] sfc_ef100: Fix build failure on powerpc
On Thu, Aug 13, 2020 at 02:39:10PM +, Christophe Leroy wrote: > ppc6xx_defconfig fails building sfc.ko module, complaining > about the lack of _umoddi3 symbol. > > This is due to the following test > > if (EFX_MIN_DMAQ_SIZE % reader->value) { > > Because reader->value is u64. > > As EFX_MIN_DMAQ_SIZE value is 512, reader->value is obviously small > enough for an u32 calculation, so cast it as (u32) for the test, to > avoid the need for _umoddi3. That isn't the same e.g. if reader->value is 2**32 + small. Which probably cannot happen, but :-) Segher
Re: linux-next: runtime warning in Linus' tree
On Thu, Aug 13, 2020 at 04:46:54PM +1000, Stephen Rothwell wrote: > [0.055220][T0] WARNING: CPU: 0 PID: 0 at mm/memcontrol.c:5220 > mem_cgroup_css_alloc+0x350/0x904 > [The line numbers in the final linux next are 5226 and 5141 due to > later patches.] > > Introduced (or exposed) by commit > > 3e38e0aaca9e ("mm: memcg: charge memcg percpu memory to the parent cgroup") > > This commit actually adds the WARN_ON, so it either adds the bug that > sets it off, or the bug already existed. > > Unfotunately, the version of this patch in linux-next up tuntil today > is different. :-( Sorry, I made a last-minute request to include these checks in that patch to make the code a bit more robust, but they trigger a false positive here. Let's remove them. --- >From de8ea7c96c056c3cbe7b93995029986a158fb9cd Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 13 Aug 2020 10:40:54 -0400 Subject: [PATCH] mm: memcontrol: fix warning when allocating the root cgroup Commit 3e38e0aaca9e ("mm: memcg: charge memcg percpu memory to the parent cgroup") adds memory tracking to the memcg kernel structures themselves to make cgroups liable for the memory they are consuming through the allocation of child groups (which can be significant). This code is a bit awkward as it's spread out through several functions: The outermost function does memalloc_use_memcg(parent) to set up current->active_memcg, which designates which cgroup to charge, and the inner functions pass GFP_ACCOUNT to request charging for specific allocations. To make sure this dependency is satisfied at all times - to make sure we don't randomly charge whoever is calling the functions - the inner functions warn on !current->active_memcg. However, this triggers a false warning when the root memcg itself is allocated. No parent exists in this case, and so current->active_memcg is rightfully NULL. It's a false positive, not indicative of a bug. Delete the warnings for now, we can revisit this later. 
Fixes: 3e38e0aaca9e ("mm: memcg: charge memcg percpu memory to the parent cgroup") Signed-off-by: Johannes Weiner --- mm/memcontrol.c | 6 -- 1 file changed, 6 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d59fd9af6e63..9d87082e64aa 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5137,9 +5137,6 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) if (!pn) return 1; - /* We charge the parent cgroup, never the current task */ - WARN_ON_ONCE(!current->active_memcg); - pn->lruvec_stat_local = alloc_percpu_gfp(struct lruvec_stat, GFP_KERNEL_ACCOUNT); if (!pn->lruvec_stat_local) { @@ -5222,9 +5219,6 @@ static struct mem_cgroup *mem_cgroup_alloc(void) goto fail; } - /* We charge the parent cgroup, never the current task */ - WARN_ON_ONCE(!current->active_memcg); - memcg->vmstats_local = alloc_percpu_gfp(struct memcg_vmstats_percpu, GFP_KERNEL_ACCOUNT); if (!memcg->vmstats_local) -- 2.28.0
[PATCH v3] powerpc/pseries: explicitly reschedule during drmem_lmb list traversal
The drmem lmb list can have hundreds of thousands of entries, and unfortunately lookups take the form of linear searches. As long as this is the case, traversals have the potential to monopolize the CPU and provoke lockup reports, workqueue stalls, and the like unless they explicitly yield. Rather than placing cond_resched() calls within various for_each_drmem_lmb() loop blocks in the code, put it in the iteration expression of the loop macro itself so users can't omit it. Introduce a drmem_lmb_next() iteration helper function which calls cond_resched() at a regular interval during array traversal. Each iteration of the loop in DLPAR code paths can involve around ten RTAS calls which can each take up to 250us, so this ensures the check is performed at worst every few milliseconds. Fixes: 6c6ea53725b3 ("powerpc/mm: Separate ibm, dynamic-memory data from DT format") Signed-off-by: Nathan Lynch --- Notes: Changes since v2: * Make drmem_lmb_next() more general. * Adjust reschedule interval for better code generation. * Add commentary to drmem_lmb_next() to explain the cond_resched() call. * Remove bounds assertions. Changes since v1: * Add bounds assertions in drmem_lmb_next(). * Call cond_resched() in the iterator on only every 20th element instead of on every iteration, to reduce overhead in tight loops. 
arch/powerpc/include/asm/drmem.h | 18 +- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/drmem.h b/arch/powerpc/include/asm/drmem.h index 17ccc6474ab6..6fb928605ed1 100644 --- a/arch/powerpc/include/asm/drmem.h +++ b/arch/powerpc/include/asm/drmem.h @@ -8,6 +8,8 @@ #ifndef _ASM_POWERPC_LMB_H #define _ASM_POWERPC_LMB_H +#include + struct drmem_lmb { u64 base_addr; u32 drc_index; @@ -26,8 +28,22 @@ struct drmem_lmb_info { extern struct drmem_lmb_info *drmem_info; +static inline struct drmem_lmb *drmem_lmb_next(struct drmem_lmb *lmb, + const struct drmem_lmb *start) +{ + /* +* DLPAR code paths can take several milliseconds per element +* when interacting with firmware. Ensure that we don't +* unfairly monopolize the CPU. +*/ + if (((++lmb - start) % 16) == 0) + cond_resched(); + + return lmb; +} + #define for_each_drmem_lmb_in_range(lmb, start, end) \ - for ((lmb) = (start); (lmb) < (end); (lmb)++) + for ((lmb) = (start); (lmb) < (end); lmb = drmem_lmb_next(lmb, start)) #define for_each_drmem_lmb(lmb)\ for_each_drmem_lmb_in_range((lmb), \ -- 2.25.4
[PATCH] sfc_ef100: Fix build failure on powerpc
ppc6xx_defconfig fails building sfc.ko module, complaining about the lack of _umoddi3 symbol. This is due to the following test if (EFX_MIN_DMAQ_SIZE % reader->value) { Because reader->value is u64. As EFX_MIN_DMAQ_SIZE value is 512, reader->value is obviously small enough for an u32 calculation, so cast it as (u32) for the test, to avoid the need for _umoddi3. Fixes: adcfc3482fff ("sfc_ef100: read Design Parameters at probe time") Signed-off-by: Christophe Leroy --- drivers/net/ethernet/sfc/ef100_nic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/sfc/ef100_nic.c b/drivers/net/ethernet/sfc/ef100_nic.c index 36598d0542ed..234400b69b07 100644 --- a/drivers/net/ethernet/sfc/ef100_nic.c +++ b/drivers/net/ethernet/sfc/ef100_nic.c @@ -979,7 +979,7 @@ static int ef100_process_design_param(struct efx_nic *efx, * EFX_MIN_DMAQ_SIZE is divisible by GRANULARITY. * This is very unlikely to fail. */ - if (EFX_MIN_DMAQ_SIZE % reader->value) { + if (EFX_MIN_DMAQ_SIZE % (u32)reader->value) { netif_err(efx, probe, efx->net_dev, "%s size granularity is %llu, can't guarantee safety\n", reader->type == ESE_EF100_DP_GZ_RXQ_SIZE_GRANULARITY ? "RXQ" : "TXQ", -- 2.25.0
Re: [PATCH] powerpc/papr_scm: Limit the readability of 'perf_stats' sysfs attribute
On 8/13/20 10:04 AM, Vaibhav Jain wrote: The newly introduced 'perf_stats' attribute uses the default access mode of 0444 letting non-root users access performance stats of an nvdimm and potentially force the kernel into issuing large number of expensive HCALLs. Since the information exposed by this attribute cannot be cached hence its better to ward of access to this attribute from users who don't need to access these performance statistics. Hence this patch adds check in perf_stats_show() to only let users that are 'perfmon_capable()' to read the nvdimm performance statistics. Fixes: 2d02bf835e573 ('powerpc/papr_scm: Fetch nvdimm performance stats from PHYP') Reported-by: Aneesh Kumar K.V Signed-off-by: Vaibhav Jain --- arch/powerpc/platforms/pseries/papr_scm.c | 4 1 file changed, 4 insertions(+) diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c index f439f0dfea7d1..36c51bf8af9a8 100644 --- a/arch/powerpc/platforms/pseries/papr_scm.c +++ b/arch/powerpc/platforms/pseries/papr_scm.c @@ -792,6 +792,10 @@ static ssize_t perf_stats_show(struct device *dev, struct nvdimm *dimm = to_nvdimm(dev); struct papr_scm_priv *p = nvdimm_provider_data(dimm); + /* Allow access only to perfmon capable users */ + if (!perfmon_capable()) + return -EACCES; + An access check is usually done in open(). This is the read callback IIUC. if (!p->stat_buffer_len) return -ENOENT; -aneesh
Re: [PATCH 1/5] powerpc: Remove flush_instruction_cache for book3s/32
On Thu, Aug 13, 2020 at 01:13:08PM +0100, Christoph Hellwig wrote: > On Thu, Aug 13, 2020 at 10:12:00AM +, Christophe Leroy wrote: > > -#ifndef CONFIG_PPC_8xx > > +#if !defined(CONFIG_PPC_8xx) && !defined(CONFIG_PPC_BOOK3S_32) > > _GLOBAL(flush_instruction_cache) > > #if defined(CONFIG_4xx) > > lis r3, KERNELBASE@h > > @@ -290,18 +289,11 @@ _GLOBAL(flush_instruction_cache) > > mfspr r3,SPRN_L1CSR1 > > ori r3,r3,L1CSR1_ICFI|L1CSR1_ICLFR > > mtspr SPRN_L1CSR1,r3 > > -#elif defined(CONFIG_PPC_BOOK3S_601) > > - blr /* for 601, do nothing */ > > -#else > > - /* 603/604 processor - use invalidate-all bit in HID0 */ > > - mfspr r3,SPRN_HID0 > > - ori r3,r3,HID0_ICFI > > - mtspr SPRN_HID0,r3 > > #endif /* CONFIG_4xx */ > > isync > > blr > > EXPORT_SYMBOL(flush_instruction_cache) > > -#endif /* CONFIG_PPC_8xx */ > > +#endif /* CONFIG_PPC_8xx || CONFIG_PPC_BOOK3S_32 */ > > What about untangling this into entirely separate versions instead > of the ifdef mess? Also the export does not seem to be needed at all. Ok, I see that you do that later, sorry.
Re: [PATCH 1/5] powerpc: Remove flush_instruction_cache for book3s/32
On Thu, Aug 13, 2020 at 10:12:00AM +, Christophe Leroy wrote: > -#ifndef CONFIG_PPC_8xx > +#if !defined(CONFIG_PPC_8xx) && !defined(CONFIG_PPC_BOOK3S_32) > _GLOBAL(flush_instruction_cache) > #if defined(CONFIG_4xx) > lis r3, KERNELBASE@h > @@ -290,18 +289,11 @@ _GLOBAL(flush_instruction_cache) > mfspr r3,SPRN_L1CSR1 > ori r3,r3,L1CSR1_ICFI|L1CSR1_ICLFR > mtspr SPRN_L1CSR1,r3 > -#elif defined(CONFIG_PPC_BOOK3S_601) > - blr /* for 601, do nothing */ > -#else > - /* 603/604 processor - use invalidate-all bit in HID0 */ > - mfspr r3,SPRN_HID0 > - ori r3,r3,HID0_ICFI > - mtspr SPRN_HID0,r3 > #endif /* CONFIG_4xx */ > isync > blr > EXPORT_SYMBOL(flush_instruction_cache) > -#endif /* CONFIG_PPC_8xx */ > +#endif /* CONFIG_PPC_8xx || CONFIG_PPC_BOOK3S_32 */ What about untangling this into entirely separate versions instead of the ifdef mess? Also the export does not seem to be needed at all.
[PATCH 5/5] powerpc: Rewrite 4xx flush_instruction_cache() in C
Nothing prevents flush_instruction_cache() from being written in C. Do it to improve readability and maintainability. Signed-off-by: Christophe Leroy --- arch/powerpc/kernel/misc_32.S | 13 - arch/powerpc/mm/nohash/4xx.c| 15 +++ arch/powerpc/mm/nohash/Makefile | 1 + 3 files changed, 16 insertions(+), 13 deletions(-) create mode 100644 arch/powerpc/mm/nohash/4xx.c diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index 4f4a31d9fdd0..87717966f5cd 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -268,19 +268,6 @@ _ASM_NOKPROBE_SYMBOL(real_writeb) #endif /* CONFIG_40x */ - -/* - * Flush instruction cache. - */ -#ifdef CONFIG_4xx -_GLOBAL(flush_instruction_cache) - lis r3, KERNELBASE@h - iccci 0,r3 - isync - blr -EXPORT_SYMBOL(flush_instruction_cache) -#endif - /* * Copy a whole page. We use the dcbz instruction on the destination * to reduce memory traffic (it eliminates the unnecessary reads of diff --git a/arch/powerpc/mm/nohash/4xx.c b/arch/powerpc/mm/nohash/4xx.c new file mode 100644 index ..954c8aa42a32 --- /dev/null +++ b/arch/powerpc/mm/nohash/4xx.c @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * This file contains the routines for initializing the MMU + * on the 4xx series of chips. + */ + +#include +#include +#include + +void flush_instruction_cache(void) +{ + iccci((void*)KERNELBASE); + isync(); +} diff --git a/arch/powerpc/mm/nohash/Makefile b/arch/powerpc/mm/nohash/Makefile index 0424f6ce5bd8..a7f7211b6373 100644 --- a/arch/powerpc/mm/nohash/Makefile +++ b/arch/powerpc/mm/nohash/Makefile @@ -4,6 +4,7 @@ ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) obj-y += mmu_context.o tlb.o tlb_low.o obj-$(CONFIG_PPC_BOOK3E_64)+= tlb_low_64e.o book3e_pgtable.o +obj-$(CONFIG_4xx) += 4xx.o obj-$(CONFIG_40x) += 40x.o obj-$(CONFIG_44x) += 44x.o obj-$(CONFIG_PPC_8xx) += 8xx.o -- 2.25.0
[PATCH 4/5] powerpc: Rewrite FSL_BOOKE flush_instruction_cache() in C
Nothing prevents flush_instruction_cache() from being written in C. Do it to improve readability and maintainability. Signed-off-by: Christophe Leroy --- arch/powerpc/kernel/misc_32.S | 17 - arch/powerpc/mm/nohash/fsl_booke.c | 16 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index a8f6ef513115..4f4a31d9fdd0 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -281,23 +281,6 @@ _GLOBAL(flush_instruction_cache) EXPORT_SYMBOL(flush_instruction_cache) #endif -#ifdef CONFIG_FSL_BOOKE -_GLOBAL(flush_instruction_cache) -#ifdef CONFIG_E200 - mfspr r3,SPRN_L1CSR0 - ori r3,r3,L1CSR0_CFI|L1CSR0_CLFC - /* msync; isync recommended here */ - mtspr SPRN_L1CSR0,r3 -#else - mfspr r3,SPRN_L1CSR1 - ori r3,r3,L1CSR1_ICFI|L1CSR1_ICLFR - mtspr SPRN_L1CSR1,r3 -#endif - isync - blr -EXPORT_SYMBOL(flush_instruction_cache) -#endif - /* * Copy a whole page. We use the dcbz instruction on the destination * to reduce memory traffic (it eliminates the unnecessary reads of diff --git a/arch/powerpc/mm/nohash/fsl_booke.c b/arch/powerpc/mm/nohash/fsl_booke.c index 0c294827d6e5..36bda962d3b3 100644 --- a/arch/powerpc/mm/nohash/fsl_booke.c +++ b/arch/powerpc/mm/nohash/fsl_booke.c @@ -219,6 +219,22 @@ unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) return tlbcam_addrs[tlbcam_index - 1].limit - PAGE_OFFSET + 1; } +void flush_instruction_cache(void) +{ + unsigned long tmp; + + if (IS_ENABLED(CONFIG_E200)) { + tmp = mfspr(SPRN_L1CSR0); + tmp |= L1CSR0_CFI | L1CSR0_CLFC; + mtspr(SPRN_L1CSR0, tmp); + } else { + tmp = mfspr(SPRN_L1CSR1); + tmp |= L1CSR1_ICFI | L1CSR1_ICLFR; + mtspr(SPRN_L1CSR1, tmp); + } + isync(); +} /* * MMU_init_hw does the chip-specific initialization of the MMU hardware. */ -- 2.25.0
[PATCH 3/5] powerpc: Remove flush_instruction_cache() on 8xx
flush_instruction_cache() is never used on 8xx, remove it. Signed-off-by: Christophe Leroy --- arch/powerpc/mm/nohash/8xx.c | 7 --- 1 file changed, 7 deletions(-) diff --git a/arch/powerpc/mm/nohash/8xx.c b/arch/powerpc/mm/nohash/8xx.c index d2b37146ae6c..231ca95f9ffb 100644 --- a/arch/powerpc/mm/nohash/8xx.c +++ b/arch/powerpc/mm/nohash/8xx.c @@ -244,13 +244,6 @@ void set_context(unsigned long id, pgd_t *pgd) mb(); } -void flush_instruction_cache(void) -{ - isync(); - mtspr(SPRN_IC_CST, IDC_INVALL); - isync(); -} - #ifdef CONFIG_PPC_KUEP void __init setup_kuep(bool disabled) { -- 2.25.0
[PATCH 2/5] powerpc: Untangle flush_instruction_cache()
flush_instruction_cache() is a mixup of each PPC32 sub-arch. Untangle it by making one complete function for each sub-arch. This makes it a lot more readable and maintainable. Signed-off-by: Christophe Leroy --- arch/powerpc/kernel/misc_32.S | 19 +++ 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index bd870743c06f..a8f6ef513115 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -272,28 +272,31 @@ _ASM_NOKPROBE_SYMBOL(real_writeb) /* * Flush instruction cache. */ -#if !defined(CONFIG_PPC_8xx) && !defined(CONFIG_PPC_BOOK3S_32) +#ifdef CONFIG_4xx _GLOBAL(flush_instruction_cache) -#if defined(CONFIG_4xx) lis r3, KERNELBASE@h iccci 0,r3 -#elif defined(CONFIG_FSL_BOOKE) + isync + blr +EXPORT_SYMBOL(flush_instruction_cache) +#endif + +#ifdef CONFIG_FSL_BOOKE +_GLOBAL(flush_instruction_cache) #ifdef CONFIG_E200 mfspr r3,SPRN_L1CSR0 ori r3,r3,L1CSR0_CFI|L1CSR0_CLFC /* msync; isync recommended here */ mtspr SPRN_L1CSR0,r3 - isync - blr -#endif +#else mfspr r3,SPRN_L1CSR1 ori r3,r3,L1CSR1_ICFI|L1CSR1_ICLFR mtspr SPRN_L1CSR1,r3 -#endif /* CONFIG_4xx */ +#endif isync blr EXPORT_SYMBOL(flush_instruction_cache) -#endif /* CONFIG_PPC_8xx || CONFIG_PPC_BOOK3S_32 */ +#endif /* * Copy a whole page. We use the dcbz instruction on the destination -- 2.25.0
[PATCH 1/5] powerpc: Remove flush_instruction_cache for book3s/32
The only callers of flush_instruction_cache() are: arch/powerpc/kernel/swsusp_booke.S: bl flush_instruction_cache arch/powerpc/mm/nohash/40x.c: flush_instruction_cache(); arch/powerpc/mm/nohash/44x.c: flush_instruction_cache(); arch/powerpc/mm/nohash/fsl_booke.c: flush_instruction_cache(); arch/powerpc/platforms/44x/machine_check.c: flush_instruction_cache(); arch/powerpc/platforms/44x/machine_check.c: flush_instruction_cache(); This function is not used by book3s/32, drop it. Signed-off-by: Christophe Leroy --- arch/powerpc/kernel/misc_32.S | 12 ++-- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index b24f866fef81..bd870743c06f 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -271,9 +271,8 @@ _ASM_NOKPROBE_SYMBOL(real_writeb) /* * Flush instruction cache. - * This is a no-op on the 601. */ -#ifndef CONFIG_PPC_8xx +#if !defined(CONFIG_PPC_8xx) && !defined(CONFIG_PPC_BOOK3S_32) _GLOBAL(flush_instruction_cache) #if defined(CONFIG_4xx) lis r3, KERNELBASE@h @@ -290,18 +289,11 @@ _GLOBAL(flush_instruction_cache) mfspr r3,SPRN_L1CSR1 ori r3,r3,L1CSR1_ICFI|L1CSR1_ICLFR mtspr SPRN_L1CSR1,r3 -#elif defined(CONFIG_PPC_BOOK3S_601) - blr /* for 601, do nothing */ -#else - /* 603/604 processor - use invalidate-all bit in HID0 */ - mfspr r3,SPRN_HID0 - ori r3,r3,HID0_ICFI - mtspr SPRN_HID0,r3 #endif /* CONFIG_4xx */ isync blr EXPORT_SYMBOL(flush_instruction_cache) -#endif /* CONFIG_PPC_8xx */ +#endif /* CONFIG_PPC_8xx || CONFIG_PPC_BOOK3S_32 */ /* * Copy a whole page. We use the dcbz instruction on the destination -- 2.25.0
[PATCH] powerpc: Drop _nmask_and_or_msr()
_nmask_and_or_msr() is only used at two places to set MSR_IP. The SYNC is unnecessary as the users are not PowerPC 601. Can be easily written in C. Do it, and drop _nmask_and_or_msr(). Signed-off-by: Christophe Leroy --- arch/powerpc/kernel/misc_32.S | 13 - arch/powerpc/platforms/embedded6xx/mpc7448_hpc2.c | 3 ++- arch/powerpc/platforms/embedded6xx/storcenter.c | 3 ++- 3 files changed, 4 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index b24f866fef81..8d9cb5df580e 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -215,19 +215,6 @@ _GLOBAL(low_choose_7447a_dfs) #endif /* CONFIG_CPU_FREQ_PMAC && CONFIG_PPC_BOOK3S_32 */ -/* - * complement mask on the msr then "or" some values on. - * _nmask_and_or_msr(nmask, value_to_or) - */ -_GLOBAL(_nmask_and_or_msr) - mfmsr r0 /* Get current msr */ - andcr0,r0,r3/* And off the bits set in r3 (first parm) */ - or r0,r0,r4/* Or on the bits in r4 (second parm) */ - SYNC/* Some chip revs have problems here... 
*/ - mtmsr r0 /* Update machine state */ - isync - blr /* Done */ - #ifdef CONFIG_40x /* diff --git a/arch/powerpc/platforms/embedded6xx/mpc7448_hpc2.c b/arch/powerpc/platforms/embedded6xx/mpc7448_hpc2.c index 15437abe1f6d..b95c3380d2b5 100644 --- a/arch/powerpc/platforms/embedded6xx/mpc7448_hpc2.c +++ b/arch/powerpc/platforms/embedded6xx/mpc7448_hpc2.c @@ -147,7 +147,8 @@ static void __noreturn mpc7448_hpc2_restart(char *cmd) local_irq_disable(); /* Set exception prefix high - to the firmware */ - _nmask_and_or_msr(0, MSR_IP); + mtmsr(mfmsr() | MSR_IP); + isync(); for (;;) ; /* Spin until reset happens */ } diff --git a/arch/powerpc/platforms/embedded6xx/storcenter.c b/arch/powerpc/platforms/embedded6xx/storcenter.c index ed1914dd34bb..e346ddcef45e 100644 --- a/arch/powerpc/platforms/embedded6xx/storcenter.c +++ b/arch/powerpc/platforms/embedded6xx/storcenter.c @@ -101,7 +101,8 @@ static void __noreturn storcenter_restart(char *cmd) local_irq_disable(); /* Set exception prefix high - to the firmware */ - _nmask_and_or_msr(0, MSR_IP); + mtmsr(mfmsr() | MSR_IP); + isync(); /* Wait for reset to happen */ for (;;) ; -- 2.25.0
Re: [PATCH 13/16] debug_vm_pgtable/pmd_clear: Don't use pmd/pud_clear on pte entries
On 8/13/20 10:57 AM, Anshuman Khandual wrote: On 08/12/2020 12:03 PM, Aneesh Kumar K.V wrote: pmd_clear() should not be used to clear pmd level pte entries. Could you please elaborate on this. The proposed change set does not match the description here. pmd_clear is implemented such that we don't use that to clear a huge pte entry. We use pmdp_huge_get_and_clear() for that. Hence we have check in pmd_clear which add a WARN if we find a _PAGE_PTE set on the entry. In the test we follow a hugepmd usage with a pmd_clear. We should instead at the end of the advanced pmd test use pmdp_huge_get_and_clear(). Signed-off-by: Aneesh Kumar K.V --- mm/debug_vm_pgtable.c | 7 --- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 061c19bba7f0..529892b9be2f 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -191,6 +191,8 @@ static void __init pmd_advanced_tests(struct mm_struct *mm, pmd = READ_ONCE(*pmdp); WARN_ON(pmd_young(pmd)); + /* Clear the pte entries */ + pmdp_huge_get_and_clear(mm, vaddr, pmdp); pgtable = pgtable_trans_huge_withdraw(mm, pmdp); } @@ -313,6 +315,8 @@ static void __init pud_advanced_tests(struct mm_struct *mm, pudp_test_and_clear_young(vma, vaddr, pudp); pud = READ_ONCE(*pudp); WARN_ON(pud_young(pud)); + + pudp_huge_get_and_clear(mm, vaddr, pudp); } static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) @@ -431,8 +435,6 @@ static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp, * This entry points to next level page table page. * Hence this must not qualify as pud_bad(). */ - pmd_clear(pmdp); - pud_clear(pudp); Both entires are cleared before creating a fresh page table entry. Why that is a problem. pud_populate(mm, pudp, pmdp); pud = READ_ONCE(*pudp); WARN_ON(pud_bad(pud)); @@ -564,7 +566,6 @@ static void __init pmd_populate_tests(struct mm_struct *mm, pmd_t *pmdp, * This entry points to next level page table page. 
* Hence this must not qualify as pmd_bad(). */ - pmd_clear(pmdp); Ditto. pmd_populate(mm, pmdp, pgtable); pmd = READ_ONCE(*pmdp); WARN_ON(pmd_bad(pmd));
linux-next: runtime warning in Linus' tree
Hi all, Testing Linus' tree today, my qemu runs (PowerPC powerpc_pseries_le_defconfig) produce the following WARNING: [0.021401][T0] Mount-cache hash table entries: 8192 (order: 0, 65536 bytes, linear) [0.021529][T0] Mountpoint-cache hash table entries: 8192 (order: 0, 65536 bytes, linear) [0.053969][T0] [ cut here ] [0.055220][T0] WARNING: CPU: 0 PID: 0 at mm/memcontrol.c:5220 mem_cgroup_css_alloc+0x350/0x904 [0.055355][T0] Modules linked in: [0.055812][T0] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.8.0 #5 [0.055976][T0] NIP: c0410010 LR: c040fd68 CTR: [0.056097][T0] REGS: c11e7ab0 TRAP: 0700 Not tainted (5.8.0) [0.056162][T0] MSR: 82029033 CR: 24000888 XER: [0.056449][T0] CFAR: c040fd80 IRQMASK: 0 [0.056449][T0] GPR00: c040fd68 c11e7d40 c11e8300 0001 [0.056449][T0] GPR04: 0228 0001 [0.056449][T0] GPR08: c0007d003208 c0007d002fe8 [0.056449][T0] GPR12: 0001 c13d 011dd528 [0.056449][T0] GPR16: 011dd840 011dd690 0018 0003 [0.056449][T0] GPR20: 0001 c10cbcf8 0003 c10cd540 [0.056449][T0] GPR24: c10e8778 c10e9080 c10cbcd8 [0.056449][T0] GPR28: c0007e2a1000 c10cbcc8 c118ea00 [0.057109][T0] NIP [c0410010] mem_cgroup_css_alloc+0x350/0x904 [0.057177][T0] LR [c040fd68] mem_cgroup_css_alloc+0xa8/0x904 [0.057394][T0] Call Trace: [0.057534][T0] [c11e7d40] [c040fd68] mem_cgroup_css_alloc+0xa8/0x904 (unreliable) [0.057814][T0] [c11e7dc0] [c0f5b13c] cgroup_init_subsys+0xbc/0x210 [0.057903][T0] [c11e7e10] [c0f5b690] cgroup_init+0x220/0x598 [0.057973][T0] [c11e7ee0] [c0f34354] start_kernel+0x67c/0x6ec [0.058047][T0] [c11e7f90] [c000cb88] start_here_common+0x1c/0x614 [0.058241][T0] Instruction dump: [0.058420][T0] eac10030 eae10038 eb410050 eb610058 4b60 6000 6000 6000 [0.058550][T0] 3be00100 4bfffdfc 6000 6000 <0fe0> 4bfffd70 6000 6000 [0.059381][T0] ---[ end trace cb2d79b4994ef1fe ]--- [0.059810][T0] [ cut here ] [0.059872][T0] WARNING: CPU: 0 PID: 0 at mm/memcontrol.c:5135 mem_cgroup_css_alloc+0x750/0x904 [0.059930][T0] Modules linked in: [0.060053][T0] CPU: 0 PID: 0 Comm: 
swapper/0 Tainted: GW 5.8.0 #5 [0.060113][T0] NIP: c0410410 LR: c040ff2c CTR: [0.060171][T0] REGS: c11e7ab0 TRAP: 0700 Tainted: GW (5.8.0) [0.060229][T0] MSR: 82029033 CR: 24000880 XER: [0.060332][T0] CFAR: c040fe48 IRQMASK: 0 [0.060332][T0] GPR00: c040ff2c c11e7d40 c11e8300 c0007e234c00 [0.060332][T0] GPR04: c0007e235000 0013 [0.060332][T0] GPR08: 7ec0 0001 [0.060332][T0] GPR12: c13d 011dd528 [0.060332][T0] GPR16: 011dd840 011dd690 0018 0003 [0.060332][T0] GPR20: c1223300 c0e95900 c118ea00 c12232c0 [0.060332][T0] GPR24: c10e8778 c10e9080 00400cc0 [0.060332][T0] GPR28: c0007e2a1000 c0007e234c00 [0.060855][T0] NIP [c0410410] mem_cgroup_css_alloc+0x750/0x904 [0.060911][T0] LR [c040ff2c] mem_cgroup_css_alloc+0x26c/0x904 [0.060958][T0] Call Trace: [0.061003][T0] [c11e7d40] [c040ff2c] mem_cgroup_css_alloc+0x26c/0x904 (unreliable) [0.061081][T0] [c11e7dc0] [c0f5b13c] cgroup_init_subsys+0xbc/0x210 [0.061165][T0] [c11e7e10] [c0f5b690] cgroup_init+0x220/0x598 [0.061233][T0] [c11e7ee0] [c0f34354] start_kernel+0x67c/0x6ec [0.061303][T0] [c11e7f90] [c000cb88] start_here_common+0x1c/0x614 [0.061364][T0] Instruction dump: [0.061408][T0] ebe1fff8 7c0803a6 4e800020 6000 6000 3d220004 e929d230 7c3c4800 [0.061508][T0] 41820190 e93c03d2 4bfffc80 6000 <0fe0> 4bfffa38 6000 6000 [0.061630][T0] ---[ end trace
Re: [PATCH 10/16] debug_vm_pgtable/thp: Use page table depost/withdraw with THP
On 8/13/20 10:55 AM, Anshuman Khandual wrote: On 08/12/2020 12:03 PM, Aneesh Kumar K.V wrote: Architectures like ppc64 use deposited page table while updating the huge pte entries. Signed-off-by: Aneesh Kumar K.V --- mm/debug_vm_pgtable.c | 8 ++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 644d28861ce9..48475d288df1 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -147,7 +147,7 @@ static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot) static void __init pmd_advanced_tests(struct mm_struct *mm, struct vm_area_struct *vma, pmd_t *pmdp, unsigned long pfn, unsigned long vaddr, - pgprot_t prot) + pgprot_t prot, pgtable_t pgtable) { pmd_t pmd; @@ -158,6 +158,8 @@ static void __init pmd_advanced_tests(struct mm_struct *mm, /* Align the address wrt HPAGE_PMD_SIZE */ vaddr = (vaddr & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE; + pgtable_trans_huge_deposit(mm, pmdp, pgtable); + pmd = pmd_mkhuge(pfn_pmd(pfn, prot)); set_pmd_at(mm, vaddr, pmdp, pmd); pmdp_set_wrprotect(mm, vaddr, pmdp); @@ -188,6 +190,8 @@ static void __init pmd_advanced_tests(struct mm_struct *mm, pmdp_test_and_clear_young(vma, vaddr, pmdp); pmd = READ_ONCE(*pmdp); WARN_ON(pmd_young(pmd)); + + pgtable = pgtable_trans_huge_withdraw(mm, pmdp); } static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot) @@ -1002,7 +1006,7 @@ static int __init debug_vm_pgtable(void) pgd_clear_tests(mm, pgdp); pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot); - pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot); + pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot, saved_ptep); pud_advanced_tests(mm, vma, pudp, pud_aligned, vaddr, prot); hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot); Makes sense, if it is required for THP to work correctly but needs to be tested across enabled platforms. Why should not the same apply for pud_advanced_tests() on platforms that supports PUD based THP. 
pud doesn't have page table deposit/withdraw semantics. We use that to support hugepage split. With pud mapping we don't split, we just drop the hugepage and expect it to be faulted back in as regular page. -aneesh
Re: [PATCH 16/16] debug_vm_pgtable/ppc64: Add a variant of pfn_pte/pmd
On 8/13/20 11:00 AM, Anshuman Khandual wrote: On 08/12/2020 12:03 PM, Aneesh Kumar K.V wrote: The tests do expect _PAGE_PTE bit set by different page table accessors. This is not true for the kernel. Within the kernel, _PAGE_PTE bits are usually set by set_pte_at(). To make the below tests work correctly add test specific pfn_pte/pmd helpers that set _PAGE_PTE bit. pte_t pte = pfn_pte(pfn, prot); WARN_ON(!pte_devmap(pte_mkdevmap(pte))); WARN_ON(!pte_savedwrite(pte_mk_savedwrite(pte_clear_savedwrite(pte; Signed-off-by: Aneesh Kumar K.V --- mm/debug_vm_pgtable.c | 65 +++ 1 file changed, 41 insertions(+), 24 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index eea62d5e503b..153c925b5273 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -31,6 +31,23 @@ #include #include +#ifdef CONFIG_PPC_BOOK3S_64 +static inline pte_t debug_vm_pfn_pte(unsigned long pfn, pgprot_t pgprot) +{ + pte_t pte = pfn_pte(pfn, pgprot); + return __pte(pte_val(pte) | _PAGE_PTE); + +} +static inline pmd_t debug_vm_pfn_pmd(unsigned long pfn, pgprot_t pgprot) +{ + pmd_t pmd = pfn_pmd(pfn, pgprot); + return __pmd(pmd_val(pmd) | _PAGE_PTE); +} +#else +#define debug_vm_pfn_pte(pfn, pgprot) pfn_pte(pfn, pgprot) +#define debug_vm_pfn_pmd(pfn, pgprot) pfn_pmd(pfn, pgprot) +#endif Again, no platform specific constructs please. This defeats the whole purpose of this test. If __PAGE_PTE is required for the helpers, then pfn_pmd/pte() could be modified to accommodate that. We dont see similar issues on other platforms, hence could you please explain why ppc64 is different here. It is not platform specific. set_pte_at is the one that set the _PAGE_PTE bit. We don't call that in the test. The test seems to make the assumption that pfn_pte returns a proper pte which is not true. -aneesh
Re: INFO: task hung in pipe_release (2)
syzbot has bisected this issue to: commit fddb5d430ad9fa91b49b1d34d0202ffe2fa0e179 Author: Aleksa Sarai Date: Sat Jan 18 12:07:59 2020 + open: introduce openat2(2) syscall bisection log: https://syzkaller.appspot.com/x/bisect.txt?x=164e716a90 start commit: 6ba1b005 Merge tag 'asm-generic-fixes-5.8' of git://git.ke.. git tree: upstream final oops: https://syzkaller.appspot.com/x/report.txt?x=154e716a90 console output: https://syzkaller.appspot.com/x/log.txt?x=114e716a90 kernel config: https://syzkaller.appspot.com/x/.config?x=84f076779e989e69 dashboard link: https://syzkaller.appspot.com/bug?extid=61acc40a49a3e46e25ea syz repro: https://syzkaller.appspot.com/x/repro.syz?x=142ae22490 Reported-by: syzbot+61acc40a49a3e46e2...@syzkaller.appspotmail.com Fixes: fddb5d430ad9 ("open: introduce openat2(2) syscall") For information about bisection process see: https://goo.gl/tpsmEJ#bisection