Re: [PATCH 2/2] PCI: layerscape: Add the workaround for lost link capabilities during reset
On Mon, Jul 17, 2023 at 02:45:23PM -0400, Frank Li wrote: > On Mon, Jul 17, 2023 at 09:29:10PM +0530, Manivannan Sadhasivam wrote: > > On Thu, Jun 15, 2023 at 12:41:12PM -0400, Frank Li wrote: > > > From: Xiaowei Bao > > > > > > A workaround for the issue where the PCI Express Endpoint (EP) controller > > > loses the values of the Maximum Link Width and Supported Link Speed from > > > the Link Capabilities Register, which initially configured by the Reset > > > Configuration Word (RCW) during a link-down or hot reset event. > > > > > > > If this fixes an issue, then there should be a Fixes tag. > > It is not fixed a exist software issue, just workaround a hardwre errata. > But the hardware errata is there from the start, right? So technically this driver doesn't address that so far and so this patch looks like a fix to me. Plus adding a fixes tag and CCing stable list will allow this patch to be backported to stable kernels. - Mani > > > > > Signed-off-by: Xiaowei Bao > > > Signed-off-by: Hou Zhiqiang > > > Signed-off-by: Frank Li > > > --- > > > drivers/pci/controller/dwc/pci-layerscape-ep.c | 13 + > > > 1 file changed, 13 insertions(+) > > > > > > diff --git a/drivers/pci/controller/dwc/pci-layerscape-ep.c > > > b/drivers/pci/controller/dwc/pci-layerscape-ep.c > > > index 4e4fdd1dfea7..2ef02d827eeb 100644 > > > --- a/drivers/pci/controller/dwc/pci-layerscape-ep.c > > > +++ b/drivers/pci/controller/dwc/pci-layerscape-ep.c > > > @@ -45,6 +45,7 @@ struct ls_pcie_ep { > > > struct pci_epc_features *ls_epc; > > > const struct ls_pcie_ep_drvdata *drvdata; > > > int irq; > > > + u32 lnkcap; > > > boolbig_endian; > > > }; > > > > > > @@ -73,6 +74,7 @@ static irqreturn_t ls_pcie_ep_event_handler(int irq, > > > void *dev_id) > > > struct ls_pcie_ep *pcie = dev_id; > > > struct dw_pcie *pci = pcie->pci; > > > u32 val, cfg; > > > + u8 offset; > > > > > > val = ls_lut_readl(pcie, PEX_PF0_PME_MES_DR); > > > ls_lut_writel(pcie, PEX_PF0_PME_MES_DR, val); > > > @@ -81,6 +83,13 @@ 
static irqreturn_t ls_pcie_ep_event_handler(int irq, > > > void *dev_id) > > > return IRQ_NONE; > > > > > > if (val & PEX_PF0_PME_MES_DR_LUD) { > > > + > > > > Please add a comment on why the LNKCAP is being restored here. > > > > > + offset = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP); > > > + > > > + dw_pcie_dbi_ro_wr_en(pci); > > > + dw_pcie_writew_dbi(pci, offset + PCI_EXP_LNKCAP, pcie->lnkcap); > > > > lnkcap is a 32-bit variable, so you should use dw_pcie_writel_dbi(). > > > > - Mani > > > > > + dw_pcie_dbi_ro_wr_dis(pci); > > > + > > > cfg = ls_lut_readl(pcie, PEX_PF0_CONFIG); > > > cfg |= PEX_PF0_CFG_READY; > > > ls_lut_writel(pcie, PEX_PF0_CONFIG, cfg); > > > @@ -216,6 +225,7 @@ static int __init ls_pcie_ep_probe(struct > > > platform_device *pdev) > > > struct ls_pcie_ep *pcie; > > > struct pci_epc_features *ls_epc; > > > struct resource *dbi_base; > > > + u8 offset; > > > int ret; > > > > > > pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL); > > > @@ -252,6 +262,9 @@ static int __init ls_pcie_ep_probe(struct > > > platform_device *pdev) > > > > > > platform_set_drvdata(pdev, pcie); > > > > > > + offset = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP); > > > + pcie->lnkcap = dw_pcie_readl_dbi(pci, offset + PCI_EXP_LNKCAP); > > > + > > > ret = dw_pcie_ep_init(>ep); > > > if (ret) > > > return ret; > > > -- > > > 2.34.1 > > > > > > > -- > > மணிவண்ணன் சதாசிவம் -- மணிவண்ணன் சதாசிவம்
Re: linux-next: Tree for Jul 13 (drivers/video/fbdev/ps3fb.c)
On Thu, Jul 13, 2023 at 09:11:10AM -0700, Randy Dunlap wrote: > > > On 7/12/23 19:37, Stephen Rothwell wrote: > > Hi all, > > > > Changes since 20230712: > > > > on ppc64: > > In file included from ../include/linux/device.h:15, > from ../arch/powerpc/include/asm/io.h:22, > from ../include/linux/io.h:13, > from ../include/linux/irq.h:20, > from ../arch/powerpc/include/asm/hardirq.h:6, > from ../include/linux/hardirq.h:11, > from ../include/linux/interrupt.h:11, > from ../drivers/video/fbdev/ps3fb.c:25: > ../drivers/video/fbdev/ps3fb.c: In function 'ps3fb_probe': > ../drivers/video/fbdev/ps3fb.c:1172:40: error: 'struct fb_info' has no member > named 'dev' > 1172 | dev_driver_string(info->dev), dev_name(info->dev), > |^~ > ../include/linux/dev_printk.h:110:37: note: in definition of macro > 'dev_printk_index_wrap' > 110 | _p_func(dev, fmt, ##__VA_ARGS__); > \ > | ^~~ > ../drivers/video/fbdev/ps3fb.c:1171:9: note: in expansion of macro 'dev_info' > 1171 | dev_info(info->device, "%s %s, using %u KiB of video > memory\n", > | ^~~~ > ../drivers/video/fbdev/ps3fb.c:1172:61: error: 'struct fb_info' has no member > named 'dev' > 1172 | dev_driver_string(info->dev), dev_name(info->dev), > | ^~ > ../include/linux/dev_printk.h:110:37: note: in definition of macro > 'dev_printk_index_wrap' > 110 | _p_func(dev, fmt, ##__VA_ARGS__); > \ > | ^~~ > ../drivers/video/fbdev/ps3fb.c:1171:9: note: in expansion of macro 'dev_info' > 1171 | dev_info(info->device, "%s %s, using %u KiB of video > memory\n", > | ^~~~ > > Hmm, there is no response from Thomas yet. I guess we should go with reverting bdb616479eff419, right? Regardless, I'm adding this build regression to regzbot so that parties involved are aware of it: #regzbot ^introduced: bdb616479eff419 #regzbot title: build regression in PS3 framebuffer Thanks. -- An old man doll... just what I always wanted! - Clara
[powerpc:fixes-test] BUILD SUCCESS ccb381e1af1ace292153c88eb1fffa5683d16a20
tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git fixes-test branch HEAD: ccb381e1af1ace292153c88eb1fffa5683d16a20 powerpc/kasan: Disable KCOV in KASAN code elapsed time: 796m configs tested: 55 configs skipped: 145 The following configs have been built successfully. More configs may be tested in the coming days. tested configs: arc allyesconfig gcc arc axs101_defconfig gcc archsdk_defconfig gcc arm allmodconfig gcc arm allyesconfig gcc arm defconfig gcc arm pxa910_defconfig gcc armrealview_defconfig gcc arm64allyesconfig gcc arm64 defconfig gcc m68k allmodconfig gcc m68k allyesconfig gcc m68kdefconfig gcc m68km5272c3_defconfig gcc m68kstmark2_defconfig gcc mips allmodconfig gcc mips fuloong2e_defconfig gcc mipsmalta_qemu_32r6_defconfig clang mipsqi_lb60_defconfig clang powerpc allmodconfig gcc powerpc allnoconfig gcc powerpc asp8347_defconfig gcc powerpc eiger_defconfig gcc powerpc katmai_defconfig clang powerpc maple_defconfig gcc powerpc ppc40x_defconfig gcc powerpc ps3_defconfig gcc powerpc randconfig-r026-20230717 gcc riscvallmodconfig gcc riscv allnoconfig gcc riscvallyesconfig gcc riscv defconfig gcc riscv rv32_defconfig gcc sh kfr2r09_defconfig gcc sh lboxre2_defconfig gcc sh rts7751r2dplus_defconfig gcc sh sdk7786_defconfig gcc sh sh7724_generic_defconfig gcc um allmodconfig clang umallnoconfig clang um allyesconfig clang x86_64 allyesconfig gcc x86_64 defconfig gcc x86_64 randconfig-x002-20230717 gcc x86_64 randconfig-x003-20230717 gcc x86_64 randconfig-x004-20230717 gcc x86_64 randconfig-x005-20230717 gcc x86_64 randconfig-x006-20230717 gcc x86_64 randconfig-x011-20230717 clang x86_64 randconfig-x012-20230717 clang x86_64 randconfig-x013-20230717 clang x86_64 randconfig-x014-20230717 clang x86_64 randconfig-x015-20230717 clang x86_64 randconfig-x016-20230717 clang x86_64 rhel-8.3 gcc -- 0-DAY CI Kernel Test Service https://github.com/intel/lkp-tests/wiki
[powerpc:merge] BUILD SUCCESS 7c5878b16f9cd959e232169b967be5b2a0897afa
tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git merge branch HEAD: 7c5878b16f9cd959e232169b967be5b2a0897afa Automatic merge of 'master' into merge (2023-07-17 08:46) elapsed time: 1545m configs tested: 175 configs skipped: 4 The following configs have been built successfully. More configs may be tested in the coming days. tested configs: alphaallyesconfig gcc alpha defconfig gcc arc allyesconfig gcc arc axs101_defconfig gcc arc defconfig gcc archsdk_defconfig gcc arc randconfig-r012-20230717 gcc arc randconfig-r043-20230717 gcc arm allmodconfig gcc arm allyesconfig gcc arm aspeed_g5_defconfig gcc arm defconfig gcc arm ep93xx_defconfig clang arm orion5x_defconfig clang arm pxa910_defconfig gcc arm randconfig-r006-20230717 gcc arm randconfig-r046-20230717 clang armrealview_defconfig gcc arm sama5_defconfig gcc arm64allyesconfig gcc arm64 defconfig gcc arm64randconfig-r001-20230717 clang arm64randconfig-r002-20230717 clang arm64randconfig-r005-20230717 clang arm64randconfig-r014-20230717 gcc arm64randconfig-r034-20230717 clang arm64randconfig-r035-20230717 clang cskydefconfig gcc csky randconfig-r004-20230717 gcc csky randconfig-r013-20230717 gcc csky randconfig-r023-20230717 gcc hexagon randconfig-r041-20230717 clang hexagon randconfig-r045-20230717 clang i386 allyesconfig gcc i386 buildonly-randconfig-r004-20230717 clang i386 buildonly-randconfig-r005-20230717 clang i386 buildonly-randconfig-r006-20230717 clang i386 debian-10.3 gcc i386defconfig gcc i386 randconfig-i001-20230717 clang i386 randconfig-i002-20230717 clang i386 randconfig-i003-20230717 clang i386 randconfig-i004-20230717 clang i386 randconfig-i005-20230717 clang i386 randconfig-i006-20230717 clang i386 randconfig-i011-20230717 gcc i386 randconfig-i012-20230717 gcc i386 randconfig-i013-20230717 gcc i386 randconfig-i014-20230717 gcc i386 randconfig-i015-20230717 gcc i386 randconfig-i016-20230717 gcc i386 randconfig-r016-20230717 gcc i386 randconfig-r025-20230717 gcc i386 
randconfig-r034-20230717 clang i386 randconfig-r035-20230717 clang loongarchallmodconfig gcc loongarch allnoconfig gcc loongarch defconfig gcc loongarchrandconfig-r006-20230717 gcc loongarchrandconfig-r036-20230717 gcc m68k allmodconfig gcc m68k allyesconfig gcc m68kdefconfig gcc m68km5272c3_defconfig gcc m68k randconfig-r015-20230717 gcc m68kstmark2_defconfig gcc mips allmodconfig gcc mips allyesconfig gcc mipsbcm47xx_defconfig gcc mips fuloong2e_defconfig gcc mips malta_defconfig clang mipsmalta_qemu_32r6_defconfig clang mips maltasmvp_eva_defconfig gcc mipsomega2p_defconfig clang mipsqi_lb60_defconfig clang mips randconfig-r021-20230717 clang mips randconfig-r033-20230717 gcc nios2 defconfig gcc openriscdefconfig gcc openrisc randconfig-r005-20230717 gcc openrisc randconfig-r012-20230717 gcc openrisc randconfig-r021-20230717 gcc openrisc randconfig-r024-20230717 gcc openrisc randconfig-r031-20230717 gcc openrisc randconfig-r032-20230717 gcc parisc
[powerpc:next-test] BUILD SUCCESS b059dfc41139ee194c9127b89dbea02afa409443
tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next-test branch HEAD: b059dfc41139ee194c9127b89dbea02afa409443 powerpc/64: Enable accelerated crypto algorithms in defconfig elapsed time: 796m configs tested: 38 configs skipped: 145 The following configs have been built successfully. More configs may be tested in the coming days. tested configs: arc allyesconfig gcc arm pxa910_defconfig gcc armrealview_defconfig gcc m68k allmodconfig gcc m68k allyesconfig gcc m68kdefconfig gcc m68km5272c3_defconfig gcc m68kstmark2_defconfig gcc mips allmodconfig gcc powerpc akebono_defconfig clang powerpc allmodconfig gcc powerpc allnoconfig gcc powerpc asp8347_defconfig gcc powerpc eiger_defconfig gcc powerpc maple_defconfig gcc powerpc ppc40x_defconfig gcc powerpc ps3_defconfig gcc powerpc randconfig-r023-20230717 gcc powerpc randconfig-r026-20230717 gcc sh lboxre2_defconfig gcc sh rts7751r2dplus_defconfig gcc sh sdk7786_defconfig gcc sh sh7724_generic_defconfig gcc um allmodconfig clang umallnoconfig clang um allyesconfig clang x86_64 allyesconfig gcc x86_64 randconfig-x002-20230717 gcc x86_64 randconfig-x003-20230717 gcc x86_64 randconfig-x004-20230717 gcc x86_64 randconfig-x005-20230717 gcc x86_64 randconfig-x006-20230717 gcc x86_64 randconfig-x011-20230717 clang x86_64 randconfig-x012-20230717 clang x86_64 randconfig-x013-20230717 clang x86_64 randconfig-x014-20230717 clang x86_64 randconfig-x015-20230717 clang x86_64 randconfig-x016-20230717 clang -- 0-DAY CI Kernel Test Service https://github.com/intel/lkp-tests/wiki
Re: [PATCH 4/4] powerpc/64s/radix: combine final TLB flush and lazy tlb mm shootdown IPIs
Nicholas Piggin writes: > This performs lazy tlb mm shootdown when doing the exit TLB flush when > all mm users go away and user mappings are removed, which avoids having > to do the lazy tlb mm shootdown IPIs on the final mmput when all kernel > references disappear. > > powerpc/64s uses a broadcast TLBIE for the exit TLB flush if remote CPUs > need to be invalidated (unless TLBIE is disabled), so this doesn't > necessarily save IPIs but it does avoid a broadcast TLBIE which is quite > expensive. > > Signed-off-by: Nicholas Piggin > --- > arch/powerpc/mm/book3s64/radix_tlb.c | 26 +- > 1 file changed, 25 insertions(+), 1 deletion(-) This gives me: [1.438910][T1] Run /init as init process [1.442759][ T96] [ cut here ] [1.442836][ T96] WARNING: CPU: 0 PID: 96 at kernel/smp.c:748 smp_call_function_many_cond+0xe0/0xad0 [1.442920][ T96] Modules linked in: [1.442960][ T96] CPU: 0 PID: 96 Comm: init Not tainted 6.5.0-rc2-g1954d181ea09 #168 [1.443028][ T96] Hardware name: IBM pSeries (emulated by qemu) POWER9 (raw) 0x4e1202 0xf05 of:SLOF,git-6b6c16 hv:linux,kvm pSeries [1.443126][ T96] NIP: c02aab20 LR: c00a5fc4 CTR: [1.443199][ T96] REGS: cc36f5b0 TRAP: 0700 Not tainted (6.5.0-rc2-g1954d181ea09) [1.443280][ T96] MSR: 80029033 CR: 44008244 XER: 2004 [1.443382][ T96] CFAR: c02ab524 IRQMASK: 0 [1.443382][ T96] GPR00: c00a5fc4 cc36f850 c17f9000 c617c580 [1.443382][ T96] GPR04: c00a55b0 c617bd00 0001 0001 [1.443382][ T96] GPR08: c29fc88c cc25aa00 44008244 [1.443382][ T96] GPR12: fd78 c36c c4042a00 [1.443382][ T96] GPR16: 0001 [1.443382][ T96] GPR20: c00a5fc4 c29f85d0 [1.443382][ T96] GPR24: cc25b518 c617be60 c617bd00 [1.443382][ T96] GPR28: c617c580 c00a55b0 [1.443994][ T96] NIP [c02aab20] smp_call_function_many_cond+0xe0/0xad0 [1.444069][ T96] LR [c00a5fc4] radix__tlb_flush+0xf4/0x190 [1.444133][ T96] Call Trace: [1.444172][ T96] [cc36f850] [] 0x (unreliable) [1.444250][ T96] [cc36f920] [c29f7fe0] __cpu_possible_mask+0x0/0x100 [1.444326][ T96] [cc36f950] [c04f346c] 
tlb_finish_mmu+0x16c/0x220 [1.02][ T96] [cc36f980] [c04ee894] exit_mmap+0x1b4/0x580 [1.74][ T96] [cc36faa0] [c014c140] __mmput+0x60/0x1c0 [1.444546][ T96] [cc36fae0] [c05cf014] begin_new_exec+0x5d4/0xec0 [1.444622][ T96] [cc36fb60] [c066c6e8] load_elf_binary+0x4a8/0x1cf0 [1.444697][ T96] [cc36fc60] [c05cc410] bprm_execve+0x3b0/0xa60 [1.444773][ T96] [cc36fd30] [c05ce3a0] do_execveat_common+0x1d0/0x300 [1.444852][ T96] [cc36fde0] [c05ce524] sys_execve+0x54/0x70 [1.444928][ T96] [cc36fe10] [c0031c24] system_call_exception+0x134/0x360 [1.445000][ T96] [cc36fe50] [c000d6a0] system_call_common+0x160/0x2c4 [1.445070][ T96] --- interrupt: c00 at 0x7fffb664cc98 [1.445119][ T96] NIP: 7fffb664cc98 LR: 1004bcb0 CTR: [1.445189][ T96] REGS: cc36fe80 TRAP: 0c00 Not tainted (6.5.0-rc2-g1954d181ea09) [1.445271][ T96] MSR: 8280f033 CR: 22004842 XER: [1.445390][ T96] IRQMASK: 0 [1.445390][ T96] GPR00: 000b 7fffd9d11ec0 7fffb6767300 2b3f06e8 [1.445390][ T96] GPR04: 2b3f0780 2b3f07b0 [1.445390][ T96] GPR08: 2b3f06e8 [1.445390][ T96] GPR12: 7fffb683a930 100f0ff8 [1.445390][ T96] GPR16: 7fffd9d12020 2b3f0780 [1.445390][ T96] GPR20: 2b3f0778 2b3f1330 100c6cb0 [1.445390][ T96] GPR24: [1.445390][ T96] GPR28: 100d34ae 100c6cf8 2b3f0780 2b3f06e8 [1.446042][ T96] NIP [7fffb664cc98] 0x7fffb664cc98 [1.446095][ T96] LR [1004bcb0] 0x1004bcb0 [1.446147][ T96] --- interrupt: c00 [1.446186][ T96] Code: 8149 394a0001 9149 e8ed0030 3d420097 394ae900 7cea382e 8149 2c07 394a 9149
[PATCH v4 6/6] mm/hotplug: Embed vmem_altmap details in memory block
With memmap on memory, some architecture needs more details w.r.t altmap such as base_pfn, end_pfn, etc to unmap vmemmap memory. Instead of computing them again when we remove a memory block embed vmem_altmap details in struct memory_block if we are using memmap on memory block feature. No functional change in this patch Signed-off-by: Aneesh Kumar K.V --- drivers/base/memory.c | 32 +++- include/linux/memory.h | 8 ++-- mm/memory_hotplug.c| 38 ++ 3 files changed, 43 insertions(+), 35 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index b456ac213610..cef6506f0209 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -106,6 +106,7 @@ static void memory_block_release(struct device *dev) { struct memory_block *mem = to_memory_block(dev); + kfree(mem->altmap); kfree(mem); } @@ -183,7 +184,7 @@ static int memory_block_online(struct memory_block *mem) { unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; - unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages; + unsigned long nr_vmemmap_pages = 0; struct zone *zone; int ret; @@ -200,6 +201,9 @@ static int memory_block_online(struct memory_block *mem) * stage helps to keep accounting easier to follow - e.g vmemmaps * belong to the same zone as the memory they backed. 
*/ + if (mem->altmap) + nr_vmemmap_pages = mem->altmap->alloc + mem->altmap->reserve; + if (nr_vmemmap_pages) { ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, zone); if (ret) @@ -230,7 +234,7 @@ static int memory_block_offline(struct memory_block *mem) { unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; - unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages; + unsigned long nr_vmemmap_pages = 0; int ret; if (!mem->zone) @@ -240,6 +244,9 @@ static int memory_block_offline(struct memory_block *mem) * Unaccount before offlining, such that unpopulated zone and kthreads * can properly be torn down in offline_pages(). */ + if (mem->altmap) + nr_vmemmap_pages = mem->altmap->alloc + mem->altmap->reserve; + if (nr_vmemmap_pages) adjust_present_page_count(pfn_to_page(start_pfn), mem->group, -nr_vmemmap_pages); @@ -726,7 +733,7 @@ void memory_block_add_nid(struct memory_block *mem, int nid, #endif static int add_memory_block(unsigned long block_id, unsigned long state, - unsigned long nr_vmemmap_pages, + struct vmem_altmap *altmap, struct memory_group *group) { struct memory_block *mem; @@ -744,7 +751,14 @@ static int add_memory_block(unsigned long block_id, unsigned long state, mem->start_section_nr = block_id * sections_per_block; mem->state = state; mem->nid = NUMA_NO_NODE; - mem->nr_vmemmap_pages = nr_vmemmap_pages; + if (altmap) { + mem->altmap = kmalloc(sizeof(struct vmem_altmap), GFP_KERNEL); + if (!mem->altmap) { + kfree(mem); + return -ENOMEM; + } + memcpy(mem->altmap, altmap, sizeof(*altmap)); + } INIT_LIST_HEAD(>group_next); #ifndef CONFIG_NUMA @@ -783,14 +797,14 @@ static int __init add_boot_memory_block(unsigned long base_section_nr) if (section_count == 0) return 0; return add_memory_block(memory_block_id(base_section_nr), - MEM_ONLINE, 0, NULL); + MEM_ONLINE, NULL, NULL); } static int add_hotplug_memory_block(unsigned long block_id, - unsigned long 
nr_vmemmap_pages, + struct vmem_altmap *altmap, struct memory_group *group) { - return add_memory_block(block_id, MEM_OFFLINE, nr_vmemmap_pages, group); + return add_memory_block(block_id, MEM_OFFLINE, altmap, group); } static void remove_memory_block(struct memory_block *memory) @@ -818,7 +832,7 @@ static void remove_memory_block(struct memory_block *memory) * Called under device_hotplug_lock. */ int create_memory_block_devices(unsigned long start, unsigned long size, - unsigned long vmemmap_pages, + struct vmem_altmap *altmap, struct memory_group *group) { const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start)); @@ -832,7 +846,7 @@ int
[PATCH v4 5/6] powerpc/book3s64/memhotplug: Enable memmap on memory for radix
Radix vmemmap mapping can map things correctly at the PMD level or PTE level based on different device boundary checks. Hence we skip the restrictions w.r.t vmemmap size to be multiple of PMD_SIZE. This also makes the feature widely useful because to use PMD_SIZE vmemmap area we require a memory block size of 2GiB We can also use MHP_RESERVE_PAGES_MEMMAP_ON_MEMORY to that the feature can work with a memory block size of 256MB. Using altmap.reserve feature to align things correctly at pageblock granularity. We can end up losing some pages in memory with this. For ex: with a 256MiB memory block size, we require 4 pages to map vmemmap pages, In order to align things correctly we end up adding a reserve of 28 pages. ie, for every 4096 pages 28 pages get reserved. Signed-off-by: Aneesh Kumar K.V --- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/pgtable.h| 24 +++ .../platforms/pseries/hotplug-memory.c| 3 ++- mm/memory_hotplug.c | 2 ++ 4 files changed, 29 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 116d6add0bb0..f890907e5bbf 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -157,6 +157,7 @@ config PPC select ARCH_HAS_UBSAN_SANITIZE_ALL select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_KEEP_MEMBLOCK + select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE if PPC_RADIX_MMU select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 68817ea7f994..3d35371395a9 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -169,6 +169,30 @@ static inline bool is_ioremap_addr(const void *x) int __meminit vmemmap_populated(unsigned long vmemmap_addr, int vmemmap_map_size); bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start, unsigned long page_size); +/* + * mm/memory_hotplug.c:mhp_supports_memmap_on_memory goes into 
details + * some of the restrictions. We don't check for PMD_SIZE because our + * vmemmap allocation code can fallback correctly. The pageblock + * alignment requirement is met using altmap->reserve blocks. + */ +#define arch_supports_memmap_on_memory arch_supports_memmap_on_memory +static inline bool arch_supports_memmap_on_memory(unsigned long size) +{ + unsigned long nr_pages = size >> PAGE_SHIFT; + unsigned long vmemmap_size = nr_pages * sizeof(struct page); + + if (!radix_enabled()) + return false; + + if (IS_ENABLED(CONFIG_PPC_4K_PAGES)) + return IS_ALIGNED(vmemmap_size, PMD_SIZE); + /* +* The pageblock alignment requirement is met by using +* reserve blocks in altmap. +*/ + return true; +} + #endif /* CONFIG_PPC64 */ #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index 9c62c2c3b3d0..1447509357a7 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -617,6 +617,7 @@ static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, u32 drc_index) static int dlpar_add_lmb(struct drmem_lmb *lmb) { + mhp_t mhp_flags = MHP_NONE | MHP_MEMMAP_ON_MEMORY; unsigned long block_sz; int nid, rc; @@ -637,7 +638,7 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb) nid = first_online_node; /* Add the memory */ - rc = __add_memory(nid, lmb->base_addr, block_sz, MHP_NONE); + rc = __add_memory(nid, lmb->base_addr, block_sz, mhp_flags); if (rc) { invalidate_lmb_associativity_index(lmb); return rc; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c409f5ff6a59..6da063c80733 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -2174,6 +2174,8 @@ static int __ref try_remove_memory(u64 start, u64 size) * right thing if we used vmem_altmap when hot-adding * the range. 
*/ + mhp_altmap.base_pfn = PHYS_PFN(start); + mhp_altmap.free = PHYS_PFN(size) - nr_vmemmap_pages; mhp_altmap.alloc = nr_vmemmap_pages; altmap = _altmap; } -- 2.41.0
[PATCH v4 4/6] mm/hotplug: Allow pageblock alignment via altmap reservation
Add a new kconfig option that can be selected if we want to allow pageblock alignment by reserving pages in the vmemmap altmap area. This implies we will be reserving some pages for every memoryblock This also allows the memmap on memory feature to be widely useful with different memory block size values. Signed-off-by: Aneesh Kumar K.V --- mm/memory_hotplug.c | 109 ++-- 1 file changed, 96 insertions(+), 13 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 5921c81fcb70..c409f5ff6a59 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -41,17 +41,85 @@ #include "internal.h" #include "shuffle.h" +enum { + MEMMAP_ON_MEMORY_DISABLE = 0, + MEMMAP_ON_MEMORY_ENABLE, + MEMMAP_ON_MEMORY_FORCE, +}; + +static int memmap_mode __read_mostly = MEMMAP_ON_MEMORY_DISABLE; + +static inline unsigned long memory_block_align_base(unsigned long size) +{ + if (memmap_mode == MEMMAP_ON_MEMORY_FORCE) { + unsigned long align; + unsigned long nr_vmemmap_pages = size >> PAGE_SHIFT; + unsigned long vmemmap_size; + + vmemmap_size = DIV_ROUND_UP(nr_vmemmap_pages * sizeof(struct page), PAGE_SIZE); + align = pageblock_align(vmemmap_size) - vmemmap_size; + return align; + } else + return 0; +} + #ifdef CONFIG_MHP_MEMMAP_ON_MEMORY /* * memory_hotplug.memmap_on_memory parameter */ -static bool memmap_on_memory __ro_after_init; -module_param(memmap_on_memory, bool, 0444); -MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug"); +static int set_memmap_mode(const char *val, const struct kernel_param *kp) +{ + int ret, mode; + bool enabled; + + if (sysfs_streq(val, "force") || sysfs_streq(val, "FORCE")) { + mode = MEMMAP_ON_MEMORY_FORCE; + goto matched; + } + + ret = kstrtobool(val, ); + if (ret < 0) + return ret; + if (enabled) + mode = MEMMAP_ON_MEMORY_ENABLE; + else + mode = MEMMAP_ON_MEMORY_DISABLE; + +matched: + *((int *)kp->arg) = mode; + if (mode == MEMMAP_ON_MEMORY_FORCE) { + pr_info("Memory hotplug will reserve %ld pages in each memory 
block\n", + memory_block_align_base(memory_block_size_bytes())); + } + return 0; +} + +static int get_memmap_mode(char *buffer, const struct kernel_param *kp) +{ + if (*((int *)kp->arg) == MEMMAP_ON_MEMORY_FORCE) + return sprintf(buffer, "force\n"); + if (*((int *)kp->arg) == MEMMAP_ON_MEMORY_ENABLE) + return sprintf(buffer, "y\n"); + + return sprintf(buffer, "n\n"); +} + +static const struct kernel_param_ops memmap_mode_ops = { + .set = set_memmap_mode, + .get = get_memmap_mode, +}; +module_param_cb(memmap_on_memory, _mode_ops, _mode, 0444); +MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug\n" + "With value \"force\" it could result in memory wastage due to memmap size limitations \n" + "For example, if the memmap for a memory block requires 1 MiB, but the pageblock \n" + "size is 2 MiB, 1 MiB of hotplugged memory will be wasted. Note that there are \n" + "still cases where the feature cannot be enforced: for example, if the memmap is \n" + "smaller than a single page, or if the architecture does not support the forced \n" + "mode in all configurations. (y/n/force)"); static inline bool mhp_memmap_on_memory(void) { - return memmap_on_memory; + return !!memmap_mode; } #else static inline bool mhp_memmap_on_memory(void) @@ -1264,7 +1332,6 @@ static inline bool arch_supports_memmap_on_memory(unsigned long size) static bool mhp_supports_memmap_on_memory(unsigned long size) { - unsigned long nr_vmemmap_pages = size >> PAGE_SHIFT; unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page); unsigned long remaining_size = size - vmemmap_size; @@ -1295,10 +1362,23 @@ static bool mhp_supports_memmap_on_memory(unsigned long size) * altmap as an alternative source of memory, and we do not exactly * populate a single PMD. 
*/ - return mhp_memmap_on_memory() && - size == memory_block_size_bytes() && - IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT)) && - arch_supports_memmap_on_memory(size); + if (!mhp_memmap_on_memory() || size != memory_block_size_bytes()) + return false; + + /* +* Make sure the vmemmap allocation is fully contained +* so that we always allocate vmemmap memory from altmap area. +*/ + if (!IS_ALIGNED(vmemmap_size, PAGE_SIZE)) + return false; +/* + * Without page reservation remaining pages should be pageblock
[PATCH v4 3/6] mm/hotplug: Allow architecture to override memmap on memory support check
Some architectures would want different restrictions. Hence add an architecture-specific override. Both the PMD_SIZE check and pageblock alignment check are moved there. Signed-off-by: Aneesh Kumar K.V --- mm/memory_hotplug.c | 22 +++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 1b19462f4e72..5921c81fcb70 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1247,9 +1247,25 @@ static int online_memory_block(struct memory_block *mem, void *arg) return device_online(>dev); } +#ifndef arch_supports_memmap_on_memory +static inline bool arch_supports_memmap_on_memory(unsigned long size) +{ + unsigned long nr_vmemmap_pages = size >> PAGE_SHIFT; + unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page); + + /* +* As default, we want the vmemmap to span a complete PMD such that we +* can map the vmemmap using a single PMD if supported by the +* architecture. +*/ + return IS_ALIGNED(vmemmap_size, PMD_SIZE); +} +#endif + static bool mhp_supports_memmap_on_memory(unsigned long size) { - unsigned long nr_vmemmap_pages = size / PAGE_SIZE; + + unsigned long nr_vmemmap_pages = size >> PAGE_SHIFT; unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page); unsigned long remaining_size = size - vmemmap_size; @@ -1281,8 +1297,8 @@ static bool mhp_supports_memmap_on_memory(unsigned long size) */ return mhp_memmap_on_memory() && size == memory_block_size_bytes() && - IS_ALIGNED(vmemmap_size, PMD_SIZE) && - IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT)); + IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT)) && + arch_supports_memmap_on_memory(size); } /* -- 2.41.0
[PATCH v4 2/6] mm/hotplug: Allow memmap on memory hotplug request to fallback
If not supported, fallback to not using memap on memmory. This avoids the need for callers to do the fallback. Signed-off-by: Aneesh Kumar K.V --- drivers/acpi/acpi_memhotplug.c | 3 +-- include/linux/memory_hotplug.h | 3 ++- mm/memory_hotplug.c| 13 ++--- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c index 24f662d8bd39..d0c1a71007d0 100644 --- a/drivers/acpi/acpi_memhotplug.c +++ b/drivers/acpi/acpi_memhotplug.c @@ -211,8 +211,7 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device) if (!info->length) continue; - if (mhp_supports_memmap_on_memory(info->length)) - mhp_flags |= MHP_MEMMAP_ON_MEMORY; + mhp_flags |= MHP_MEMMAP_ON_MEMORY; result = __add_memory(mgid, info->start_addr, info->length, mhp_flags); diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 013c69753c91..7d2076583494 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -97,6 +97,8 @@ typedef int __bitwise mhp_t; * To do so, we will use the beginning of the hot-added range to build * the page tables for the memmap array that describes the entire range. * Only selected architectures support it with SPARSE_VMEMMAP. + * This is only a hint, the core kernel can decide to not do this based on + * different alignment checks. 
*/ #define MHP_MEMMAP_ON_MEMORY ((__force mhp_t)BIT(1)) /* @@ -354,7 +356,6 @@ extern struct zone *zone_for_pfn_range(int online_type, int nid, extern int arch_create_linear_mapping(int nid, u64 start, u64 size, struct mhp_params *params); void arch_remove_linear_mapping(u64 start, u64 size); -extern bool mhp_supports_memmap_on_memory(unsigned long size); #endif /* CONFIG_MEMORY_HOTPLUG */ #endif /* __LINUX_MEMORY_HOTPLUG_H */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 3f231cf1b410..1b19462f4e72 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1247,7 +1247,7 @@ static int online_memory_block(struct memory_block *mem, void *arg) return device_online(>dev); } -bool mhp_supports_memmap_on_memory(unsigned long size) +static bool mhp_supports_memmap_on_memory(unsigned long size) { unsigned long nr_vmemmap_pages = size / PAGE_SIZE; unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page); @@ -1339,13 +1339,12 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) * Self hosted memmap array */ if (mhp_flags & MHP_MEMMAP_ON_MEMORY) { - if (!mhp_supports_memmap_on_memory(size)) { - ret = -EINVAL; - goto error; + if (mhp_supports_memmap_on_memory(size)) { + mhp_altmap.free = PHYS_PFN(size); + mhp_altmap.base_pfn = PHYS_PFN(start); + params.altmap = _altmap; } - mhp_altmap.free = PHYS_PFN(size); - mhp_altmap.base_pfn = PHYS_PFN(start); - params.altmap = _altmap; + /* fallback to not using altmap */ } /* call arch's memory hotadd */ -- 2.41.0
[PATCH v4 1/6] mm/hotplug: Simplify ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE kconfig
Instead of adding menu entry with all supported architectures, add mm/Kconfig variable and select the same from supported architectures. No functional change in this patch. Acked-by: David Hildenbrand Signed-off-by: Aneesh Kumar K.V --- arch/arm64/Kconfig | 4 +--- arch/x86/Kconfig | 4 +--- mm/Kconfig | 3 +++ 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index a2511b30d0f6..20245bd72b8f 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -78,6 +78,7 @@ config ARM64 select ARCH_INLINE_SPIN_UNLOCK_IRQ if !PREEMPTION select ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE if !PREEMPTION select ARCH_KEEP_MEMBLOCK + select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE select ARCH_USE_CMPXCHG_LOCKREF select ARCH_USE_GNU_PROPERTY select ARCH_USE_MEMTEST @@ -348,9 +349,6 @@ config GENERIC_CSUM config GENERIC_CALIBRATE_DELAY def_bool y -config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE - def_bool y - config SMP def_bool y diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 78224aa76409..d0258e92a8af 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -102,6 +102,7 @@ config X86 select ARCH_HAS_DEBUG_WX select ARCH_HAS_ZONE_DMA_SET if EXPERT select ARCH_HAVE_NMI_SAFE_CMPXCHG + select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO @@ -2610,9 +2611,6 @@ config ARCH_HAS_ADD_PAGES def_bool y depends on ARCH_ENABLE_MEMORY_HOTPLUG -config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE - def_bool y - menu "Power management and ACPI options" config ARCH_HIBERNATION_HEADER diff --git a/mm/Kconfig b/mm/Kconfig index 923bd35f81f2..932349271e28 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -572,6 +572,9 @@ config MHP_MEMMAP_ON_MEMORY endif # MEMORY_HOTPLUG +config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE + bool + # Heavily threaded applications may benefit from splitting the mm-wide # page_table_lock, so that faults on different parts of the user address # space can be handled with less 
contention: split it at this NR_CPUS. -- 2.41.0
[PATCH v4 0/6] Add support for memmap on memory feature on ppc64
This patch series update memmap on memory feature to fall back to memmap allocation outside the memory block if the alignment rules are not met. This makes the feature more useful on architectures like ppc64 where alignment rules are different with 64K page size. This patch series is dependent on dax vmemmap optimization series posted here https://lore.kernel.org/linux-mm/20230718022934.90447-1-aneesh.ku...@linux.ibm.com/ Changes from v3: * Extend the module parameter memmap_on_memory to force allocation even though we can waste hotplug memory. Changes from v2: * Rebase to latest linus tree * Redo the series based on review feedback. Multiple changes to the patchset. Changes from v1: * update the memblock to store vmemmap_altmap details. This is required so that when we remove the memory we can find the altmap details which is needed on some architectures. * rebase to latest linus tree Aneesh Kumar K.V (6): mm/hotplug: Simplify ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE kconfig mm/hotplug: Allow memmap on memory hotplug request to fallback mm/hotplug: Allow architecture to override memmap on memory support check mm/hotplug: Allow pageblock alignment via altmap reservation powerpc/book3s64/memhotplug: Enable memmap on memory for radix mm/hotplug: Embed vmem_altmap details in memory block arch/arm64/Kconfig| 4 +- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/pgtable.h| 24 +++ .../platforms/pseries/hotplug-memory.c| 3 +- arch/x86/Kconfig | 4 +- drivers/acpi/acpi_memhotplug.c| 3 +- drivers/base/memory.c | 32 +++- include/linux/memory.h| 8 +- include/linux/memory_hotplug.h| 3 +- mm/Kconfig| 3 + mm/memory_hotplug.c | 168 ++ 11 files changed, 193 insertions(+), 60 deletions(-) -- 2.41.0
[PATCH v5 13/13] powerpc/book3s64/radix: Add debug message to give more details of vmemmap allocation
Add some extra vmemmap pr_debug message that will indicate the type of vmemmap allocations. For ex: with DAX vmemmap optimization we can find the below details: [ 187.166580] radix-mmu: PAGE_SIZE vmemmap mapping [ 187.166587] radix-mmu: PAGE_SIZE vmemmap mapping [ 187.166591] radix-mmu: Tail page reuse vmemmap mapping [ 187.166594] radix-mmu: Tail page reuse vmemmap mapping [ 187.166598] radix-mmu: Tail page reuse vmemmap mapping [ 187.166601] radix-mmu: Tail page reuse vmemmap mapping [ 187.166604] radix-mmu: Tail page reuse vmemmap mapping [ 187.166608] radix-mmu: Tail page reuse vmemmap mapping [ 187.166611] radix-mmu: Tail page reuse vmemmap mapping [ 187.166614] radix-mmu: Tail page reuse vmemmap mapping [ 187.166617] radix-mmu: Tail page reuse vmemmap mapping [ 187.166620] radix-mmu: Tail page reuse vmemmap mapping [ 187.166623] radix-mmu: Tail page reuse vmemmap mapping [ 187.166626] radix-mmu: Tail page reuse vmemmap mapping [ 187.166629] radix-mmu: Tail page reuse vmemmap mapping [ 187.166632] radix-mmu: Tail page reuse vmemmap mapping And without vmemmap optimization [ 293.549931] radix-mmu: PMD_SIZE vmemmap mapping [ 293.549984] radix-mmu: PMD_SIZE vmemmap mapping [ 293.550032] radix-mmu: PMD_SIZE vmemmap mapping [ 293.550076] radix-mmu: PMD_SIZE vmemmap mapping [ 293.550117] radix-mmu: PMD_SIZE vmemmap mapping Signed-off-by: Aneesh Kumar K.V --- arch/powerpc/mm/book3s64/radix_pgtable.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 73d0987369ff..2828e7e0802c 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -1034,6 +1034,7 @@ static pte_t * __meminit radix__vmemmap_pte_populate(pmd_t *pmdp, unsigned long p = vmemmap_alloc_block_buf(PAGE_SIZE, node, NULL); if (!p) return NULL; + pr_debug("PAGE_SIZE vmemmap mapping\n"); } else { /* * When a PTE/PMD entry is freed from the init_mm @@ -1046,6 +1047,7 @@ static 
pte_t * __meminit radix__vmemmap_pte_populate(pmd_t *pmdp, unsigned long */ get_page(reuse); p = page_to_virt(reuse); + pr_debug("Tail page reuse vmemmap mapping\n"); } VM_BUG_ON(!PAGE_ALIGNED(addr)); @@ -1155,6 +1157,7 @@ int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, in p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap); if (p) { vmemmap_set_pmd(pmd, p, node, addr, next); + pr_debug("PMD_SIZE vmemmap mapping\n"); continue; } else if (altmap) { /* -- 2.41.0
[PATCH v5 11/13] powerpc/book3s64/radix: Add support for vmemmap optimization for radix
With 2M PMD-level mapping, we require 32 struct pages and a single vmemmap page can contain 1024 struct pages (PAGE_SIZE/sizeof(struct page)). Hence with 64K page size, we don't use vmemmap deduplication for PMD-level mapping. Signed-off-by: Aneesh Kumar K.V --- Documentation/mm/vmemmap_dedup.rst | 1 + Documentation/powerpc/index.rst| 1 + Documentation/powerpc/vmemmap_dedup.rst| 101 ++ arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/book3s/64/radix.h | 9 + arch/powerpc/mm/book3s64/radix_pgtable.c | 203 + 6 files changed, 316 insertions(+) create mode 100644 Documentation/powerpc/vmemmap_dedup.rst diff --git a/Documentation/mm/vmemmap_dedup.rst b/Documentation/mm/vmemmap_dedup.rst index a4b12ff906c4..c573e08b5043 100644 --- a/Documentation/mm/vmemmap_dedup.rst +++ b/Documentation/mm/vmemmap_dedup.rst @@ -210,6 +210,7 @@ the device (altmap). The following page sizes are supported in DAX: PAGE_SIZE (4K on x86_64), PMD_SIZE (2M on x86_64) and PUD_SIZE (1G on x86_64). +For powerpc equivalent details see Documentation/powerpc/vmemmap_dedup.rst The differences with HugeTLB are relatively minor. diff --git a/Documentation/powerpc/index.rst b/Documentation/powerpc/index.rst index d33b554ca7ba..a50834798454 100644 --- a/Documentation/powerpc/index.rst +++ b/Documentation/powerpc/index.rst @@ -36,6 +36,7 @@ powerpc ultravisor vas-api vcpudispatch_stats +vmemmap_dedup features diff --git a/Documentation/powerpc/vmemmap_dedup.rst b/Documentation/powerpc/vmemmap_dedup.rst new file mode 100644 index ..dc4db59fdf87 --- /dev/null +++ b/Documentation/powerpc/vmemmap_dedup.rst @@ -0,0 +1,101 @@ +.. SPDX-License-Identifier: GPL-2.0 + +== +Device DAX +== + +The device-dax interface uses the tail deduplication technique explained in +Documentation/mm/vmemmap_dedup.rst + +On powerpc, vmemmap deduplication is only used with radix MMU translation. Also +with a 64K page size, only the devdax namespace with 1G alignment uses vmemmap +deduplication. 
+ +With 2M PMD level mapping, we require 32 struct pages and a single 64K vmemmap +page can contain 1024 struct pages (64K/sizeof(struct page)). Hence there is no +vmemmap deduplication possible. + +With 1G PUD level mapping, we require 16384 struct pages and a single 64K +vmemmap page can contain 1024 struct pages (64K/sizeof(struct page)). Hence we +require 16 64K pages in vmemmap to map the struct page for 1G PUD level mapping. + +Here's how things look like on device-dax after the sections are populated:: + +---+ ---virt_to_page---> +---+ mapping to +---+ + | | | 0 | -> | 0 | + | | +---++---+ + | | | 1 | -> | 1 | + | | +---++---+ + | | | 2 | ^ ^ ^ ^ ^ ^ + | | +---+ | | | | | + | | | 3 | --+ | | | | + | | +---+ | | | | + | | | 4 | + | | | + |PUD| +---+ | | | + | level | | . | --+ | | + | mapping | +---+ | | + | | | . | + | + | | +---+ | + | | | 15| --+ + | | +---+ + | | + | | + | | + +---+ + + +With 4K page size, 2M PMD level mapping requires 512 struct pages and a single +4K vmemmap page contains 64 struct pages(4K/sizeof(struct page)). Hence we +require 8 4K pages in vmemmap to map the struct page for 2M pmd level mapping. + +Here's how things look like on device-dax after the sections are populated:: + + +---+ ---virt_to_page---> +---+ mapping to +---+ + | | | 0 | -> | 0 | + | | +---++---+ + | | | 1 | -> | 1 | + | | +---++---+ + | | | 2 | ^ ^ ^ ^ ^ ^ + | | +---+ | | | | | + | | | 3 | --+ | | | | + | |
[PATCH v5 12/13] powerpc/book3s64/radix: Remove mmu_vmemmap_psize
This is not used by radix anymore. Signed-off-by: Aneesh Kumar K.V --- arch/powerpc/mm/book3s64/radix_pgtable.c | 11 --- arch/powerpc/mm/init_64.c| 21 ++--- 2 files changed, 14 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index b492b67c0b7d..73d0987369ff 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -601,17 +601,6 @@ void __init radix__early_init_mmu(void) #else mmu_virtual_psize = MMU_PAGE_4K; #endif - -#ifdef CONFIG_SPARSEMEM_VMEMMAP - /* vmemmap mapping */ - if (mmu_psize_defs[MMU_PAGE_2M].shift) { - /* -* map vmemmap using 2M if available -*/ - mmu_vmemmap_psize = MMU_PAGE_2M; - } else - mmu_vmemmap_psize = mmu_virtual_psize; -#endif #endif /* * initialize page table size diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 5701faca39ef..6db7a063ba63 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -198,17 +198,12 @@ bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start, return false; } -int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, - struct vmem_altmap *altmap) +int __meminit __vmemmap_populate(unsigned long start, unsigned long end, int node, +struct vmem_altmap *altmap) { bool altmap_alloc; unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift; -#ifdef CONFIG_PPC_BOOK3S_64 - if (radix_enabled()) - return radix__vmemmap_populate(start, end, node, altmap); -#endif - /* Align to the page size of the linear mapping. 
*/ start = ALIGN_DOWN(start, page_size); @@ -277,6 +272,18 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, return 0; } +int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, + struct vmem_altmap *altmap) +{ + +#ifdef CONFIG_PPC_BOOK3S_64 + if (radix_enabled()) + return radix__vmemmap_populate(start, end, node, altmap); +#endif + + return __vmemmap_populate(start, end, node, altmap); +} + #ifdef CONFIG_MEMORY_HOTPLUG static unsigned long vmemmap_list_free(unsigned long start) { -- 2.41.0
[PATCH v5 10/13] powerpc/book3s64/vmemmap: Switch radix to use a different vmemmap handling function
This is in preparation to update radix to implement vmemmap optimization for devdax. Below are the rules w.r.t radix vmemmap mapping 1. First try to map things using PMD (2M) 2. With altmap if altmap cross-boundary check returns true, fall back to PAGE_SIZE 3. If we can't allocate PMD_SIZE backing memory for vmemmap, fallback to PAGE_SIZE On removing vmemmap mapping, check if every subsection that is using the vmemmap area is invalid. If found to be invalid, that implies we can safely free the vmemmap area. We don't use the PAGE_UNUSED pattern used by x86 because with 64K page size, we need to do the above check even at the PAGE_SIZE granularity. Signed-off-by: Aneesh Kumar K.V --- arch/powerpc/include/asm/book3s/64/radix.h | 2 + arch/powerpc/include/asm/pgtable.h | 4 + arch/powerpc/mm/book3s64/radix_pgtable.c | 326 +++-- arch/powerpc/mm/init_64.c | 26 +- 4 files changed, 327 insertions(+), 31 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index 2ef92f36340f..f1461289643a 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -331,6 +331,8 @@ extern int __meminit radix__vmemmap_create_mapping(unsigned long start, unsigned long phys); int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); +void __ref radix__vmemmap_free(unsigned long start, unsigned long end, + struct vmem_altmap *altmap); extern void radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size); diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 6a88bfdaa69b..68817ea7f994 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -165,6 +165,10 @@ static inline bool is_ioremap_addr(const void *x) return addr >= IOREMAP_BASE && addr < IOREMAP_END; } + +int __meminit vmemmap_populated(unsigned long vmemmap_addr, int vmemmap_map_size); 
+bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start, + unsigned long page_size); #endif /* CONFIG_PPC64 */ #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 227fea53c217..9a7f3707b6fb 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -744,8 +744,59 @@ static void free_pud_table(pud_t *pud_start, p4d_t *p4d) p4d_clear(p4d); } +#ifdef CONFIG_SPARSEMEM_VMEMMAP +static bool __meminit vmemmap_pmd_is_unused(unsigned long addr, unsigned long end) +{ + unsigned long start = ALIGN_DOWN(addr, PMD_SIZE); + + return !vmemmap_populated(start, PMD_SIZE); +} + +static bool __meminit vmemmap_page_is_unused(unsigned long addr, unsigned long end) +{ + unsigned long start = ALIGN_DOWN(addr, PAGE_SIZE); + + return !vmemmap_populated(start, PAGE_SIZE); + +} +#endif + +static void __meminit free_vmemmap_pages(struct page *page, +struct vmem_altmap *altmap, +int order) +{ + unsigned int nr_pages = 1 << order; + + if (altmap) { + unsigned long alt_start, alt_end; + unsigned long base_pfn = page_to_pfn(page); + + /* +* with 2M vmemmap mapping we can have things set up +* such that even though altmap is specified we never +* used altmap. 
+*/ + alt_start = altmap->base_pfn; + alt_end = altmap->base_pfn + altmap->reserve + + altmap->free + altmap->alloc + altmap->align; + + if (base_pfn >= alt_start && base_pfn < alt_end) { + vmem_altmap_free(altmap, nr_pages); + return; + } + } + + if (PageReserved(page)) { + /* allocated from memblock */ + while (nr_pages--) + free_reserved_page(page++); + } else + free_pages((unsigned long)page_address(page), order); +} + static void remove_pte_table(pte_t *pte_start, unsigned long addr, -unsigned long end, bool direct) +unsigned long end, bool direct, +struct vmem_altmap *altmap) { unsigned long next, pages = 0; pte_t *pte; @@ -759,24 +810,26 @@ static void remove_pte_table(pte_t *pte_start, unsigned long addr, if (!pte_present(*pte)) continue; - if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) { - /* -
[PATCH v5 09/13] powerpc/book3s64/mm: Enable transparent pud hugepage
This is enabled only with radix translation and 1G hugepage size. This will be used with devdax device memory with a namespace alignment of 1G. Anon transparent hugepage is not supported even though we do have helpers checking pud_trans_huge(). We should never find that return true. The only expected pte bit combination is _PAGE_PTE | _PAGE_DEVMAP. Some of the helpers are never expected to get called on hash translation and hence is marked to call BUG() in such a case. Signed-off-by: Aneesh Kumar K.V --- arch/powerpc/include/asm/book3s/64/hash.h | 9 + arch/powerpc/include/asm/book3s/64/pgtable.h | 155 -- arch/powerpc/include/asm/book3s/64/radix.h| 36 .../include/asm/book3s/64/tlbflush-radix.h| 2 + arch/powerpc/include/asm/book3s/64/tlbflush.h | 8 + arch/powerpc/mm/book3s64/pgtable.c| 78 + arch/powerpc/mm/book3s64/radix_pgtable.c | 28 arch/powerpc/mm/book3s64/radix_tlb.c | 7 + arch/powerpc/platforms/Kconfig.cputype| 1 + include/trace/events/thp.h| 10 ++ 10 files changed, 323 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index d4a19e6547ac..6e70ae511631 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -138,7 +138,16 @@ static inline int hash__pmd_same(pmd_t pmd_a, pmd_t pmd_b) } #definehash__pmd_bad(pmd) (pmd_val(pmd) & H_PMD_BAD_BITS) + +/* + * pud comparison that will work with both pte and page table pointer. 
+ */ +static inline int hash__pud_same(pud_t pud_a, pud_t pud_b) +{ + return (((pud_raw(pud_a) ^ pud_raw(pud_b)) & ~cpu_to_be64(_PAGE_HPTEFLAGS)) == 0); +} #definehash__pud_bad(pud) (pud_val(pud) & H_PUD_BAD_BITS) + static inline int hash__p4d_bad(p4d_t p4d) { return (p4d_val(p4d) == 0); diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 4acc9690f599..a8204566cfd0 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -921,8 +921,29 @@ static inline pud_t pte_pud(pte_t pte) { return __pud_raw(pte_raw(pte)); } + +static inline pte_t *pudp_ptep(pud_t *pud) +{ + return (pte_t *)pud; +} + +#define pud_pfn(pud) pte_pfn(pud_pte(pud)) +#define pud_dirty(pud) pte_dirty(pud_pte(pud)) +#define pud_young(pud) pte_young(pud_pte(pud)) +#define pud_mkold(pud) pte_pud(pte_mkold(pud_pte(pud))) +#define pud_wrprotect(pud) pte_pud(pte_wrprotect(pud_pte(pud))) +#define pud_mkdirty(pud) pte_pud(pte_mkdirty(pud_pte(pud))) +#define pud_mkclean(pud) pte_pud(pte_mkclean(pud_pte(pud))) +#define pud_mkyoung(pud) pte_pud(pte_mkyoung(pud_pte(pud))) +#define pud_mkwrite(pud) pte_pud(pte_mkwrite(pud_pte(pud))) #define pud_write(pud) pte_write(pud_pte(pud)) +#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY +#define pud_soft_dirty(pmd)pte_soft_dirty(pud_pte(pud)) +#define pud_mksoft_dirty(pmd) pte_pud(pte_mksoft_dirty(pud_pte(pud))) +#define pud_clear_soft_dirty(pmd) pte_pud(pte_clear_soft_dirty(pud_pte(pud))) +#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ + static inline int pud_bad(pud_t pud) { if (radix_enabled()) @@ -1115,15 +1136,24 @@ static inline bool pmd_access_permitted(pmd_t pmd, bool write) #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot); +extern pud_t pfn_pud(unsigned long pfn, pgprot_t pgprot); extern pmd_t mk_pmd(struct page *page, pgprot_t pgprot); extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot); extern void set_pmd_at(struct mm_struct 
*mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd); +extern void set_pud_at(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t pud); + static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd) { } +static inline void update_mmu_cache_pud(struct vm_area_struct *vma, + unsigned long addr, pud_t *pud) +{ +} + extern int hash__has_transparent_hugepage(void); static inline int has_transparent_hugepage(void) { @@ -1133,6 +1163,14 @@ static inline int has_transparent_hugepage(void) } #define has_transparent_hugepage has_transparent_hugepage +static inline int has_transparent_pud_hugepage(void) +{ + if (radix_enabled()) + return radix__has_transparent_pud_hugepage(); + return 0; +} +#define has_transparent_pud_hugepage has_transparent_pud_hugepage + static inline unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, unsigned long clr, unsigned long set) @@ -1142,6 +1180,16 @@ pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
[PATCH v5 08/13] powerpc/mm/trace: Convert trace event to trace event class
A follow-up patch will add a pud variant for this same event. Using event class makes that addition simpler. No functional change in this patch. Reviewed-by: Christophe Leroy Signed-off-by: Aneesh Kumar K.V --- arch/powerpc/mm/book3s64/hash_pgtable.c | 2 +- arch/powerpc/mm/book3s64/radix_pgtable.c | 2 +- include/trace/events/thp.h | 23 --- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c index 51f48984abca..988948d69bc1 100644 --- a/arch/powerpc/mm/book3s64/hash_pgtable.c +++ b/arch/powerpc/mm/book3s64/hash_pgtable.c @@ -214,7 +214,7 @@ unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr old = be64_to_cpu(old_be); - trace_hugepage_update(addr, old, clr, set); + trace_hugepage_update_pmd(addr, old, clr, set); if (old & H_PAGE_HASHPTE) hpte_do_hugepage_flush(mm, addr, pmdp, old); return old; diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index e7ea492ac510..02e185d2e4d6 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -962,7 +962,7 @@ unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long add #endif old = radix__pte_update(mm, addr, pmdp_ptep(pmdp), clr, set, 1); - trace_hugepage_update(addr, old, clr, set); + trace_hugepage_update_pmd(addr, old, clr, set); return old; } diff --git a/include/trace/events/thp.h b/include/trace/events/thp.h index 202b3e3e67ff..a95c78b10561 100644 --- a/include/trace/events/thp.h +++ b/include/trace/events/thp.h @@ -8,25 +8,29 @@ #include #include -TRACE_EVENT(hugepage_set_pmd, +DECLARE_EVENT_CLASS(hugepage_set, - TP_PROTO(unsigned long addr, unsigned long pmd), - TP_ARGS(addr, pmd), + TP_PROTO(unsigned long addr, unsigned long pte), + TP_ARGS(addr, pte), TP_STRUCT__entry( __field(unsigned long, addr) - __field(unsigned long, pmd) + __field(unsigned long, pte) ), TP_fast_assign( 
__entry->addr = addr; - __entry->pmd = pmd; + __entry->pte = pte; ), - TP_printk("Set pmd with 0x%lx with 0x%lx", __entry->addr, __entry->pmd) + TP_printk("Set page table entry with 0x%lx with 0x%lx", __entry->addr, __entry->pte) ); +DEFINE_EVENT(hugepage_set, hugepage_set_pmd, + TP_PROTO(unsigned long addr, unsigned long pmd), + TP_ARGS(addr, pmd) +); -TRACE_EVENT(hugepage_update, +DECLARE_EVENT_CLASS(hugepage_update, TP_PROTO(unsigned long addr, unsigned long pte, unsigned long clr, unsigned long set), TP_ARGS(addr, pte, clr, set), @@ -48,6 +52,11 @@ TRACE_EVENT(hugepage_update, TP_printk("hugepage update at addr 0x%lx and pte = 0x%lx clr = 0x%lx, set = 0x%lx", __entry->addr, __entry->pte, __entry->clr, __entry->set) ); +DEFINE_EVENT(hugepage_update, hugepage_update_pmd, + TP_PROTO(unsigned long addr, unsigned long pmd, unsigned long clr, unsigned long set), + TP_ARGS(addr, pmd, clr, set) +); + DECLARE_EVENT_CLASS(migration_pmd, TP_PROTO(unsigned long addr, unsigned long pmd), -- 2.41.0
[PATCH v5 07/13] mm/vmemmap optimization: Split hugetlb and devdax vmemmap optimization
Arm disabled hugetlb vmemmap optimization [1] because hugetlb vmemmap optimization includes an update of both the permissions (writeable to read-only) and the output address (pfn) of the vmemmap ptes. That is not supported without unmapping of pte(marking it invalid) by some architectures. With DAX vmemmap optimization we don't require such pte updates and architectures can enable DAX vmemmap optimization while having hugetlb vmemmap optimization disabled. Hence split DAX optimization support into a different config. s390, loongarch and riscv don't have devdax support. So the DAX config is not enabled for them. With this change, arm64 should be able to select DAX optimization [1] commit 060a2c92d1b6 ("arm64: mm: hugetlb: Disable HUGETLB_PAGE_OPTIMIZE_VMEMMAP") Signed-off-by: Aneesh Kumar K.V --- arch/loongarch/Kconfig | 2 +- arch/riscv/Kconfig | 2 +- arch/s390/Kconfig | 2 +- arch/x86/Kconfig | 3 ++- fs/Kconfig | 2 +- include/linux/mm.h | 2 +- mm/Kconfig | 5 - 7 files changed, 11 insertions(+), 7 deletions(-) diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index e55511af4c77..537ca2a4005a 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -59,7 +59,7 @@ config LOONGARCH select ARCH_USE_QUEUED_SPINLOCKS select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT select ARCH_WANT_LD_ORPHAN_WARN - select ARCH_WANT_OPTIMIZE_VMEMMAP + select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP select ARCH_WANTS_NO_INSTR select BUILDTIME_TABLE_SORT select COMMON_CLK diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 4c07b9189c86..6943d34c1ec1 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -53,7 +53,7 @@ config RISCV select ARCH_WANT_GENERAL_HUGETLB if !RISCV_ISA_SVNAPOT select ARCH_WANT_HUGE_PMD_SHARE if 64BIT select ARCH_WANT_LD_ORPHAN_WARN if !XIP_KERNEL - select ARCH_WANT_OPTIMIZE_VMEMMAP + select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP select ARCH_WANTS_THP_SWAP if HAVE_ARCH_TRANSPARENT_HUGEPAGE select BINFMT_FLAT_NO_DATA_START_OFFSET if !MMU select 
BUILDTIME_TABLE_SORT if MMU diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 5b39918b7042..975fd06e4f4d 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -127,7 +127,7 @@ config S390 select ARCH_WANTS_NO_INSTR select ARCH_WANT_DEFAULT_BPF_JIT select ARCH_WANT_IPC_PARSE_VERSION - select ARCH_WANT_OPTIMIZE_VMEMMAP + select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP select BUILDTIME_TABLE_SORT select CLONE_BACKWARDS2 select DMA_OPS if PCI diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 7422db409770..78224aa76409 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -128,7 +128,8 @@ config X86 select ARCH_WANT_GENERAL_HUGETLB select ARCH_WANT_HUGE_PMD_SHARE select ARCH_WANT_LD_ORPHAN_WARN - select ARCH_WANT_OPTIMIZE_VMEMMAP if X86_64 + select ARCH_WANT_OPTIMIZE_DAX_VMEMMAP if X86_64 + select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP if X86_64 select ARCH_WANTS_THP_SWAP if X86_64 select ARCH_HAS_PARANOID_L1D_FLUSH select BUILDTIME_TABLE_SORT diff --git a/fs/Kconfig b/fs/Kconfig index 18d034ec7953..9c104c130a6e 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -252,7 +252,7 @@ config HUGETLB_PAGE config HUGETLB_PAGE_OPTIMIZE_VMEMMAP def_bool HUGETLB_PAGE - depends on ARCH_WANT_OPTIMIZE_VMEMMAP + depends on ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP depends on SPARSEMEM_VMEMMAP config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON diff --git a/include/linux/mm.h b/include/linux/mm.h index 1a2234ee14d2..83f51ec0897d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3640,7 +3640,7 @@ void vmemmap_free(unsigned long start, unsigned long end, #endif #define VMEMMAP_RESERVE_NR 2 -#ifdef CONFIG_ARCH_WANT_OPTIMIZE_VMEMMAP +#ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP static inline bool __vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap) { diff --git a/mm/Kconfig b/mm/Kconfig index 09130434e30d..923bd35f81f2 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -487,7 +487,10 @@ config SPARSEMEM_VMEMMAP # Select this config option from the architecture Kconfig, 
if it is preferred # to enable the feature of HugeTLB/dev_dax vmemmap optimization. # -config ARCH_WANT_OPTIMIZE_VMEMMAP +config ARCH_WANT_OPTIMIZE_DAX_VMEMMAP + bool + +config ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP bool config HAVE_MEMBLOCK_PHYS_MAP -- 2.41.0
[PATCH v5 06/13] mm/huge pud: Use transparent huge pud helpers only with CONFIG_TRANSPARENT_HUGEPAGE
pudp_set_wrprotect and move_huge_pud helpers are only used when CONFIG_TRANSPARENT_HUGEPAGE is enabled. Similar to pmdp_set_wrprotect and move_huge_pmd_helpers use architecture override only if CONFIG_TRANSPARENT_HUGEPAGE is set Reviewed-by: Christophe Leroy Signed-off-by: Aneesh Kumar K.V --- include/linux/pgtable.h | 2 ++ mm/mremap.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index ca67ecbd9a66..bc9d6b681e25 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -558,6 +558,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, #endif #ifndef __HAVE_ARCH_PUDP_SET_WRPROTECT #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +#ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline void pudp_set_wrprotect(struct mm_struct *mm, unsigned long address, pud_t *pudp) { @@ -571,6 +572,7 @@ static inline void pudp_set_wrprotect(struct mm_struct *mm, { BUILD_BUG(); } +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ #endif diff --git a/mm/mremap.c b/mm/mremap.c index 11e06e4ab33b..056478c106ee 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -349,7 +349,7 @@ static inline bool move_normal_pud(struct vm_area_struct *vma, } #endif -#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, pud_t *old_pud, pud_t *new_pud) { -- 2.41.0
[PATCH v5 05/13] mm: Add pud_same similar to __HAVE_ARCH_P4D_SAME
This helps architectures to override pmd_same and pud_same independently. Signed-off-by: Aneesh Kumar K.V --- include/linux/pgtable.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 6fd9b2831338..ca67ecbd9a66 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -693,11 +693,14 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) { return pmd_val(pmd_a) == pmd_val(pmd_b); } +#endif +#ifndef pud_same static inline int pud_same(pud_t pud_a, pud_t pud_b) { return pud_val(pud_a) == pud_val(pud_b); } +#define pud_same pud_same #endif #ifndef __HAVE_ARCH_P4D_SAME -- 2.41.0
[PATCH v5 04/13] mm/vmemmap: Allow architectures to override how vmemmap optimization works
Architectures like powerpc would like to use different page table allocators and mapping mechanisms to implement vmemmap optimization. Similar to vmemmap_populate, allow architectures to implement vmemmap_populate_compound_pages Signed-off-by: Aneesh Kumar K.V --- mm/sparse-vmemmap.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index a044a130405b..a2cbe44c48e1 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -358,6 +358,7 @@ int __meminit vmemmap_populate_hugepages(unsigned long start, unsigned long end, return 0; } +#ifndef vmemmap_populate_compound_pages /* * For compound pages bigger than section size (e.g. x86 1G compound * pages with 2M subsection size) fill the rest of sections as tail @@ -446,6 +447,8 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn, return 0; } +#endif + struct page * __meminit __populate_section_memmap(unsigned long pfn, unsigned long nr_pages, int nid, struct vmem_altmap *altmap, struct dev_pagemap *pgmap) -- 2.41.0
[PATCH v5 03/13] mm/vmemmap: Improve vmemmap_can_optimize and allow architectures to override
dax vmemmap optimization requires a minimum of 2 PAGE_SIZE area within vmemmap such that tail page mapping can point to the second PAGE_SIZE area. Enforce that in vmemmap_can_optimize() function. Architectures like powerpc also want to enable vmemmap optimization conditionally (only with radix MMU translation). Hence allow architecture override. Reviewed-by: Christophe Leroy Signed-off-by: Aneesh Kumar K.V --- include/linux/mm.h | 27 +++ mm/mm_init.c | 2 +- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 2dd73e4f3d8e..1a2234ee14d2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3639,13 +3639,32 @@ void vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap); #endif +#define VMEMMAP_RESERVE_NR 2 #ifdef CONFIG_ARCH_WANT_OPTIMIZE_VMEMMAP -static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap, - struct dev_pagemap *pgmap) +static inline bool __vmemmap_can_optimize(struct vmem_altmap *altmap, + struct dev_pagemap *pgmap) { - return is_power_of_2(sizeof(struct page)) && - pgmap && (pgmap_vmemmap_nr(pgmap) > 1) && !altmap; + unsigned long nr_pages; + unsigned long nr_vmemmap_pages; + + if (!pgmap || !is_power_of_2(sizeof(struct page))) + return false; + + nr_pages = pgmap_vmemmap_nr(pgmap); + nr_vmemmap_pages = ((nr_pages * sizeof(struct page)) >> PAGE_SHIFT); + /* +* For vmemmap optimization with DAX we need minimum 2 vmemmap +* pages. 
See layout diagram in Documentation/mm/vmemmap_dedup.rst +*/ + return !altmap && (nr_vmemmap_pages > VMEMMAP_RESERVE_NR); } +/* + * If we don't have an architecture override, use the generic rule + */ +#ifndef vmemmap_can_optimize +#define vmemmap_can_optimize __vmemmap_can_optimize +#endif + #else static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap) diff --git a/mm/mm_init.c b/mm/mm_init.c index a1963c3322af..245ac69b66a5 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1020,7 +1020,7 @@ static inline unsigned long compound_nr_pages(struct vmem_altmap *altmap, if (!vmemmap_can_optimize(altmap, pgmap)) return pgmap_vmemmap_nr(pgmap); - return 2 * (PAGE_SIZE / sizeof(struct page)); + return VMEMMAP_RESERVE_NR * (PAGE_SIZE / sizeof(struct page)); } static void __ref memmap_init_compound(struct page *head, -- 2.41.0
[PATCH v5 02/13] mm: Change pudp_huge_get_and_clear_full take vm_area_struct as arg
We will use this in a later patch to do tlb flush when clearing pud entries on powerpc. This is similar to commit 93a98695f2f9 ("mm: change pmdp_huge_get_and_clear_full take vm_area_struct as arg") Reviewed-by: Christophe Leroy Signed-off-by: Aneesh Kumar K.V --- include/linux/pgtable.h | 4 ++-- mm/debug_vm_pgtable.c | 2 +- mm/huge_memory.c| 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index cf13f8d938a8..6fd9b2831338 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -450,11 +450,11 @@ static inline pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma, #endif #ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR_FULL -static inline pud_t pudp_huge_get_and_clear_full(struct mm_struct *mm, +static inline pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, int full) { - return pudp_huge_get_and_clear(mm, address, pudp); + return pudp_huge_get_and_clear(vma->vm_mm, address, pudp); } #endif #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index ee119e33fef1..ee2c4c1dcfc8 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -385,7 +385,7 @@ static void __init pud_advanced_tests(struct pgtable_debug_args *args) WARN_ON(!(pud_write(pud) && pud_dirty(pud))); #ifndef __PAGETABLE_PMD_FOLDED - pudp_huge_get_and_clear_full(args->mm, vaddr, args->pudp, 1); + pudp_huge_get_and_clear_full(args->vma, vaddr, args->pudp, 1); pud = READ_ONCE(*args->pudp); WARN_ON(!pud_none(pud)); #endif /* __PAGETABLE_PMD_FOLDED */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index eb3678360b97..ba20cef681a4 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1981,7 +1981,7 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, if (!ptl) return 0; - pudp_huge_get_and_clear_full(tlb->mm, addr, pud, tlb->fullmm); + pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm); 
tlb_remove_pud_tlb_entry(tlb, pud, addr); if (vma_is_special_huge(vma)) { spin_unlock(ptl); -- 2.41.0
[PATCH v5 00/13] Add support for DAX vmemmap optimization for ppc64
This patch series implements changes required to support DAX vmemmap optimization for ppc64. The vmemmap optimization is only enabled with radix MMU translation and 1GB PUD mapping with 64K page size. The patch series also splits hugetlb vmemmap optimization as a separate Kconfig variable so that architectures can enable DAX vmemmap optimization without enabling hugetlb vmemmap optimization. This should enable architectures like arm64 to enable DAX vmemmap optimization while they can't enable hugetlb vmemmap optimization. More details of the same are in patch "mm/vmemmap optimization: Split hugetlb and devdax vmemmap optimization" Changes from v4: * Address review feedback * Add the Reviewed-by: Changes from v3: * Rebase to latest linus tree * Build fix with SPARSEMEM_VMEMMAP disabled * Add hash_pud_same outside THP Kconfig Changes from v2: * Rebase to latest linus tree * Address review feedback Changes from V1: * Fix make htmldocs warning * Fix vmemmap allocation bugs with different alignment values. 
* Correctly check for section validity to before we free vmemmap area Aneesh Kumar K.V (13): mm/hugepage pud: Allow arch-specific helper function to check huge page pud support mm: Change pudp_huge_get_and_clear_full take vm_area_struct as arg mm/vmemmap: Improve vmemmap_can_optimize and allow architectures to override mm/vmemmap: Allow architectures to override how vmemmap optimization works mm: Add pud_same similar to __HAVE_ARCH_P4D_SAME mm/huge pud: Use transparent huge pud helpers only with CONFIG_TRANSPARENT_HUGEPAGE mm/vmemmap optimization: Split hugetlb and devdax vmemmap optimization powerpc/mm/trace: Convert trace event to trace event class powerpc/book3s64/mm: Enable transparent pud hugepage powerpc/book3s64/vmemmap: Switch radix to use a different vmemmap handling function powerpc/book3s64/radix: Add support for vmemmap optimization for radix powerpc/book3s64/radix: Remove mmu_vmemmap_psize powerpc/book3s64/radix: Add debug message to give more details of vmemmap allocation Documentation/mm/vmemmap_dedup.rst| 1 + Documentation/powerpc/index.rst | 1 + Documentation/powerpc/vmemmap_dedup.rst | 101 +++ arch/loongarch/Kconfig| 2 +- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/book3s/64/hash.h | 9 + arch/powerpc/include/asm/book3s/64/pgtable.h | 155 - arch/powerpc/include/asm/book3s/64/radix.h| 47 ++ .../include/asm/book3s/64/tlbflush-radix.h| 2 + arch/powerpc/include/asm/book3s/64/tlbflush.h | 8 + arch/powerpc/include/asm/pgtable.h| 4 + arch/powerpc/mm/book3s64/hash_pgtable.c | 2 +- arch/powerpc/mm/book3s64/pgtable.c| 78 +++ arch/powerpc/mm/book3s64/radix_pgtable.c | 573 -- arch/powerpc/mm/book3s64/radix_tlb.c | 7 + arch/powerpc/mm/init_64.c | 37 +- arch/powerpc/platforms/Kconfig.cputype| 1 + arch/riscv/Kconfig| 2 +- arch/s390/Kconfig | 2 +- arch/x86/Kconfig | 3 +- drivers/nvdimm/pfn_devs.c | 2 +- fs/Kconfig| 2 +- include/linux/mm.h| 29 +- include/linux/pgtable.h | 12 +- include/trace/events/thp.h| 33 +- mm/Kconfig| 5 +- mm/debug_vm_pgtable.c | 2 +- 
mm/huge_memory.c | 2 +- mm/mm_init.c | 2 +- mm/mremap.c | 2 +- mm/sparse-vmemmap.c | 3 + 31 files changed, 1048 insertions(+), 82 deletions(-) create mode 100644 Documentation/powerpc/vmemmap_dedup.rst -- 2.41.0
[PATCH v5 01/13] mm/hugepage pud: Allow arch-specific helper function to check huge page pud support
Architectures like powerpc would like to enable transparent huge page pud support only with radix translation. To support that add has_transparent_pud_hugepage() helper that architectures can override. Reviewed-by: Christophe Leroy Signed-off-by: Aneesh Kumar K.V --- drivers/nvdimm/pfn_devs.c | 2 +- include/linux/pgtable.h | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index af7d9301520c..18ad315581ca 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -100,7 +100,7 @@ static unsigned long *nd_pfn_supported_alignments(unsigned long *alignments) if (has_transparent_hugepage()) { alignments[1] = HPAGE_PMD_SIZE; - if (IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)) + if (has_transparent_pud_hugepage()) alignments[2] = HPAGE_PUD_SIZE; } diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 5063b482e34f..cf13f8d938a8 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1499,6 +1499,9 @@ typedef unsigned int pgtbl_mod_mask; #define has_transparent_hugepage() IS_BUILTIN(CONFIG_TRANSPARENT_HUGEPAGE) #endif +#ifndef has_transparent_pud_hugepage +#define has_transparent_pud_hugepage() IS_BUILTIN(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) +#endif /* * On some architectures it depends on the mm if the p4d/pud or pmd * layer of the page table hierarchy is folded or not. -- 2.41.0
[PATCH] ALSA: ps3: Fix errors in snd_ps3.h
The following checkpatch errors are removed: ERROR: "foo * bar" should be "foo *bar" Signed-off-by: Jie Shi --- sound/ppc/snd_ps3.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sound/ppc/snd_ps3.h b/sound/ppc/snd_ps3.h index 8b554a79bc14..63bdb8ee3953 100644 --- a/sound/ppc/snd_ps3.h +++ b/sound/ppc/snd_ps3.h @@ -69,11 +69,11 @@ struct snd_ps3_card_info { /* dma buffer management */ spinlock_t dma_lock; /* dma_lock start */ -void * dma_start_vaddr[2]; /* 0 for L, 1 for R */ +void *dma_start_vaddr[2]; /* 0 for L, 1 for R */ dma_addr_t dma_start_bus_addr[2]; size_t dma_buffer_size; -void * dma_last_transfer_vaddr[2]; -void * dma_next_transfer_vaddr[2]; +void *dma_last_transfer_vaddr[2]; +void *dma_next_transfer_vaddr[2]; intsilent; /* dma_lock end */
Re: linux-next: Tree for Jul 13 (drivers/video/fbdev/ps3fb.c)
Hi Thomas, On 7/14/23 13:46, Randy Dunlap wrote: > Thomas, > > On 7/13/23 09:11, Randy Dunlap wrote: >> >> >> On 7/12/23 19:37, Stephen Rothwell wrote: >>> Hi all, >>> > > I still see this build error on linux-next 20230714. I still see this build error on linux-next 20230717. > >>> Changes since 20230712: >>> >> >> on ppc64: >> >> In file included from ../include/linux/device.h:15, >> from ../arch/powerpc/include/asm/io.h:22, >> from ../include/linux/io.h:13, >> from ../include/linux/irq.h:20, >> from ../arch/powerpc/include/asm/hardirq.h:6, >> from ../include/linux/hardirq.h:11, >> from ../include/linux/interrupt.h:11, >> from ../drivers/video/fbdev/ps3fb.c:25: >> ../drivers/video/fbdev/ps3fb.c: In function 'ps3fb_probe': >> ../drivers/video/fbdev/ps3fb.c:1172:40: error: 'struct fb_info' has no >> member named 'dev' >> 1172 | dev_driver_string(info->dev), dev_name(info->dev), >> | ^~ >> ../include/linux/dev_printk.h:110:37: note: in definition of macro >> 'dev_printk_index_wrap' >> 110 | _p_func(dev, fmt, ##__VA_ARGS__); \ >> | ^~~ >> ../drivers/video/fbdev/ps3fb.c:1171:9: note: in expansion of macro 'dev_info' >> 1171 | dev_info(info->device, "%s %s, using %u KiB of video memory\n", >> | ^~~~ >> ../drivers/video/fbdev/ps3fb.c:1172:61: error: 'struct fb_info' has no >> member named 'dev' >> 1172 | dev_driver_string(info->dev), dev_name(info->dev), >> | ^~ >> ../include/linux/dev_printk.h:110:37: note: in definition of macro >> 'dev_printk_index_wrap' -- ~Randy [using gmail temporarily while infradead is down]
Re: [PATCH 0/2] eventfd: simplify signal helpers
On Mon, 17 Jul 2023 19:12:16 -0300 Jason Gunthorpe wrote: > On Mon, Jul 17, 2023 at 01:08:31PM -0600, Alex Williamson wrote: > > > What would that mechanism be? We've been iterating on getting the > > serialization and buffering correct, but I don't know of another means > > that combines the notification with a value, so we'd likely end up with > > an eventfd only for notification and a separate ring buffer for > > notification values. > > All FDs do this. You just have to make a FD with custom > file_operations that does what this wants. The uAPI shouldn't be able > to tell if the FD is backing it with an eventfd or otherwise. Have the > kernel return the FD instead of accepting it. Follow the basic design > of eg mlx5vf_save_fops Sure, userspace could poll on any fd and read a value from it, but at that point we're essentially duplicating a lot of what eventfd provides for a minor(?) semantic difference over how the counter value is interpreted. Using an actual eventfd allows the ACPI notification to work as just another interrupt index within the existing vfio IRQ uAPI. Thanks, Alex
Re: [PATCH 0/2] eventfd: simplify signal helpers
On Mon, Jul 17, 2023 at 01:08:31PM -0600, Alex Williamson wrote: > What would that mechanism be? We've been iterating on getting the > serialization and buffering correct, but I don't know of another means > that combines the notification with a value, so we'd likely end up with > an eventfd only for notification and a separate ring buffer for > notification values. All FDs do this. You just have to make a FD with custom file_operations that does what this wants. The uAPI shouldn't be able to tell if the FD is backing it with an eventfd or otherwise. Have the kernel return the FD instead of accepting it. Follow the basic design of eg mlx5vf_save_fops Jason
Re: [PATCH 0/2] Add support for rpmsg sound card on i.MX93 platform
On Fri, 14 Jul 2023 17:29:11 +0800, Chancel Liu wrote: > Support rpmsg sound card on i.MX93 platform. > > Chancel Liu (2): > ASoC: dt-bindings: fsl_rpmsg: Add compatible string for i.MX93 > ASoC: fsl_rpmsg: Add support for i.MX93 platform > > Documentation/devicetree/bindings/sound/fsl,rpmsg.yaml | 1 + > sound/soc/fsl/fsl_rpmsg.c | 8 > 2 files changed, 9 insertions(+) > > [...] Applied to https://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound.git for-next Thanks! [1/2] ASoC: dt-bindings: fsl_rpmsg: Add compatible string for i.MX93 commit: 143f8c69a27f3fa8ed30c7f6790ea039fff57cfe [2/2] ASoC: fsl_rpmsg: Add support for i.MX93 platform commit: 60f38a592efe08e5ced454e8a05f6814e6e221ec All being well this means that it will be integrated into the linux-next tree (usually sometime in the next 24 hours) and sent to Linus during the next merge window (or sooner if it is a bug fix), however if problems are discovered then the patch may be dropped or reverted. You may get further e-mails resulting from automated or manual testing and review of the tree, please engage with people reporting problems and send followup patches addressing any issues that are reported if needed. If any updates are required or you are submitting further changes they should be sent as incremental updates against current git, existing patches will not be replaced. Please add any relevant lists and maintainers to the CCs when replying to this mail. Thanks, Mark
Re: [PATCH 0/2] eventfd: simplify signal helpers
On Mon, 17 Jul 2023 10:29:34 +0200 Grzegorz Jaszczyk wrote: > pt., 14 lip 2023 o 09:05 Christian Brauner napisał(a): > > > > On Thu, Jul 13, 2023 at 11:10:54AM -0600, Alex Williamson wrote: > > > On Thu, 13 Jul 2023 12:05:36 +0200 > > > Christian Brauner wrote: > > > > > > > Hey everyone, > > > > > > > > This simplifies the eventfd_signal() and eventfd_signal_mask() helpers > > > > by removing the count argument which is effectively unused. > > > > > > We have a patch under review which does in fact make use of the > > > signaling value: > > > > > > https://lore.kernel.org/all/20230630155936.3015595-1-...@semihalf.com/ > > > > Huh, thanks for the link. > > > > Quoting from > > https://patchwork.kernel.org/project/kvm/patch/20230307220553.631069-1-...@semihalf.com/#25266856 > > > > > Reading an eventfd returns an 8-byte value, we generally only use it > > > as a counter, but it's been discussed previously and IIRC, it's possible > > > to use that value as a notification value. > > > > So the goal is to pipe a specific value through eventfd? But it is > > explicitly a counter. The whole thing is written around a counter and > > each write and signal adds to the counter. > > > > The consequences are pretty well described in the cover letter of > > v6 https://lore.kernel.org/all/20230630155936.3015595-1-...@semihalf.com/ > > > > > Since the eventfd counter is used as ACPI notification value > > > placeholder, the eventfd signaling needs to be serialized in order to > > > not end up with notification values being coalesced. Therefore ACPI > > > notification values are buffered and signalized one by one, when the > > > previous notification value has been consumed. > > > > But isn't this a good indication that you really don't want an eventfd > > but something that's explicitly designed to associate specific data with > > a notification? Using eventfd in that manner requires serialization, > > buffering, and enforces ordering. What would that mechanism be? 
We've been iterating on getting the serialization and buffering correct, but I don't know of another means that combines the notification with a value, so we'd likely end up with an eventfd only for notification and a separate ring buffer for notification values. As this series demonstrates, the current in-kernel users only increment the counter and most userspace likely discards the counter value, which makes the counter largely a waste. While perhaps unconventional, there's no requirement that the counter may only be incremented by one, nor any restriction that I see in how userspace must interpret the counter value. As I understand the ACPI notification proposal that Grzegorz links below, a notification with an interpreted value allows for a more direct userspace implementation when dealing with a series of discrete notification with value events. Thanks, Alex > > I have no skin in the game aside from having to drop this conversion > > which I'm fine to do if there are actually users for this btu really, > > that looks a lot like abusing an api that really wasn't designed for > > this. > > https://patchwork.kernel.org/project/kvm/patch/20230307220553.631069-1-...@semihalf.com/ > was posted at the beginig of March and one of the main things we've > discussed was the mechanism for propagating acpi notification value. > We've endup with eventfd as the best mechanism and have actually been > using it from v2. I really do not want to waste this effort, I think > we are quite advanced with v6 now. Additionally we didn't actually > modify any part of eventfd support that was in place, we only used it > in a specific (and discussed beforehand) way.
Re: [PATCH v9 01/42] mm: Rename arch pte_mkwrite()'s to pte_mkwrite_novma()
On Mon, Jul 17, 2023 at 03:55:50PM +, Edgecombe, Rick P wrote: > On Fri, 2023-07-14 at 23:57 +0100, Mark Brown wrote: > > The same issue seems to apply with the version that was in -next > > based > > on v6.4-rc4 too. > The version in your branch is not the same as the version in tip (which > had a squashed build fix). I was able to reproduce the build error with > your branch. But not with the one in tip rebased on v6.5-rc1. So can > you try this version: > https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/commit/?h=x86/shstk=899223d69ce9f338056f4c41ef870d70040fc860 Ah, I'd not seen that patch or that tip had been rebased - I'd actually been using literally the branch from tip as my base at whatever point I last noticed it changing up until I rebased onto -rc1. signature.asc Description: PGP signature
Re: [PATCH 2/2] PCI: layerscape: Add the workaround for lost link capabilities during reset
On Thu, Jun 15, 2023 at 12:41:12PM -0400, Frank Li wrote: > From: Xiaowei Bao > > A workaround for the issue where the PCI Express Endpoint (EP) controller > loses the values of the Maximum Link Width and Supported Link Speed from > the Link Capabilities Register, which initially configured by the Reset > Configuration Word (RCW) during a link-down or hot reset event. > If this fixes an issue, then there should be a Fixes tag. > Signed-off-by: Xiaowei Bao > Signed-off-by: Hou Zhiqiang > Signed-off-by: Frank Li > --- > drivers/pci/controller/dwc/pci-layerscape-ep.c | 13 + > 1 file changed, 13 insertions(+) > > diff --git a/drivers/pci/controller/dwc/pci-layerscape-ep.c > b/drivers/pci/controller/dwc/pci-layerscape-ep.c > index 4e4fdd1dfea7..2ef02d827eeb 100644 > --- a/drivers/pci/controller/dwc/pci-layerscape-ep.c > +++ b/drivers/pci/controller/dwc/pci-layerscape-ep.c > @@ -45,6 +45,7 @@ struct ls_pcie_ep { > struct pci_epc_features *ls_epc; > const struct ls_pcie_ep_drvdata *drvdata; > int irq; > + u32 lnkcap; > boolbig_endian; > }; > > @@ -73,6 +74,7 @@ static irqreturn_t ls_pcie_ep_event_handler(int irq, void > *dev_id) > struct ls_pcie_ep *pcie = dev_id; > struct dw_pcie *pci = pcie->pci; > u32 val, cfg; > + u8 offset; > > val = ls_lut_readl(pcie, PEX_PF0_PME_MES_DR); > ls_lut_writel(pcie, PEX_PF0_PME_MES_DR, val); > @@ -81,6 +83,13 @@ static irqreturn_t ls_pcie_ep_event_handler(int irq, void > *dev_id) > return IRQ_NONE; > > if (val & PEX_PF0_PME_MES_DR_LUD) { > + Please add a comment on why the LNKCAP is being restored here. > + offset = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP); > + > + dw_pcie_dbi_ro_wr_en(pci); > + dw_pcie_writew_dbi(pci, offset + PCI_EXP_LNKCAP, pcie->lnkcap); lnkcap is a 32-bit variable, so you should use dw_pcie_writel_dbi(). 
- Mani > + dw_pcie_dbi_ro_wr_dis(pci); > + > cfg = ls_lut_readl(pcie, PEX_PF0_CONFIG); > cfg |= PEX_PF0_CFG_READY; > ls_lut_writel(pcie, PEX_PF0_CONFIG, cfg); > @@ -216,6 +225,7 @@ static int __init ls_pcie_ep_probe(struct platform_device > *pdev) > struct ls_pcie_ep *pcie; > struct pci_epc_features *ls_epc; > struct resource *dbi_base; > + u8 offset; > int ret; > > pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL); > @@ -252,6 +262,9 @@ static int __init ls_pcie_ep_probe(struct platform_device > *pdev) > > platform_set_drvdata(pdev, pcie); > > + offset = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP); > + pcie->lnkcap = dw_pcie_readl_dbi(pci, offset + PCI_EXP_LNKCAP); > + > ret = dw_pcie_ep_init(>ep); > if (ret) > return ret; > -- > 2.34.1 > -- மணிவண்ணன் சதாசிவம்
Re: [PATCH v9 01/42] mm: Rename arch pte_mkwrite()'s to pte_mkwrite_novma()
On Fri, 2023-07-14 at 23:57 +0100, Mark Brown wrote: > On Mon, Jun 12, 2023 at 05:10:27PM -0700, Rick Edgecombe wrote: > > The x86 Shadow stack feature includes a new type of memory called > > shadow > > stack. This shadow stack memory has some unusual properties, which > > requires > > some core mm changes to function properly. > > This seems to break sparc64_defconfig when applied on top of v6.5- > rc1: > > In file included from /home/broonie/git/bisect/include/linux/mm.h:29, > from /home/broonie/git/bisect/net/core/skbuff.c:40: > /home/broonie/git/bisect/include/linux/pgtable.h: In function > 'pmd_mkwrite': > /home/broonie/git/bisect/include/linux/pgtable.h:528:9: error: > implicit declaration of function 'pmd_mkwrite_novma'; did you mean > 'pte_mkwrite_novma'? [-Werror=implicit-function-declaration] > return pmd_mkwrite_novma(pmd); > ^ > pte_mkwrite_novma > /home/broonie/git/bisect/include/linux/pgtable.h:528:9: error: > incompatible types when returning type 'int' but 'pmd_t' {aka 'struct > '} was expected > return pmd_mkwrite_novma(pmd); > ^~ > > The same issue seems to apply with the version that was in -next > based > on v6.4-rc4 too. The version in your branch is not the same as the version in tip (which had a squashed build fix). I was able to reproduce the build error with your branch. But not with the one in tip rebased on v6.5-rc1. So can you try this version: https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/commit/?h=x86/shstk=899223d69ce9f338056f4c41ef870d70040fc860
Re: [PATCH 1/2] PCI: layerscape: Add support for Link down notification
On Thu, Jun 15, 2023 at 12:41:11PM -0400, Frank Li wrote: > Add support to pass Link down notification to Endpoint function driver > so that the LINK_DOWN event can be processed by the function. > > Signed-off-by: Frank Li One nit below. With that, Acked-by: Manivannan Sadhasivam > --- > drivers/pci/controller/dwc/pci-layerscape-ep.c | 1 + > 1 file changed, 1 insertion(+) > > diff --git a/drivers/pci/controller/dwc/pci-layerscape-ep.c > b/drivers/pci/controller/dwc/pci-layerscape-ep.c > index de4c1758a6c3..4e4fdd1dfea7 100644 > --- a/drivers/pci/controller/dwc/pci-layerscape-ep.c > +++ b/drivers/pci/controller/dwc/pci-layerscape-ep.c > @@ -88,6 +88,7 @@ static irqreturn_t ls_pcie_ep_event_handler(int irq, void > *dev_id) > > dev_dbg(pci->dev, "Link up\n"); > } else if (val & PEX_PF0_PME_MES_DR_LDD) { > + pci_epc_linkdown(pci->ep.epc); It'd be good to move this call after dev_dbg(). - Mani > dev_dbg(pci->dev, "Link down\n"); > } else if (val & PEX_PF0_PME_MES_DR_HRD) { > dev_dbg(pci->dev, "Hot reset\n"); > -- > 2.34.1 > -- மணிவண்ணன் சதாசிவம்
[PATCH 1/1] sound:soc: fix return value check in imx_audmux_suspend
check the return value of clk_prepare_enable, and if clk_prepare_enable got an unexpected return value, imx_audmux_suspend should return the error value. Signed-off-by: Yuanjun Gong --- sound/soc/fsl/imx-audmux.c | 5 - 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sound/soc/fsl/imx-audmux.c b/sound/soc/fsl/imx-audmux.c index be003a117b39..962b6baf0a34 100644 --- a/sound/soc/fsl/imx-audmux.c +++ b/sound/soc/fsl/imx-audmux.c @@ -325,8 +325,11 @@ static void imx_audmux_remove(struct platform_device *pdev) static int imx_audmux_suspend(struct device *dev) { int i; + ssize_t ret; - clk_prepare_enable(audmux_clk); + ret = clk_prepare_enable(audmux_clk); + if (ret) + return ret; for (i = 0; i < reg_max; i++) regcache[i] = readl(audmux_base + i * 4); -- 2.17.1
[PATCH 1/1] sound:soc: fix return value check in imx_audmux_resume
check the return value of clk_prepare_enable, and if clk_prepare_enable got an unexpected return value, imx_audmux_resume should return the error value. Signed-off-by: Yuanjun Gong --- sound/soc/fsl/imx-audmux.c | 5 - 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sound/soc/fsl/imx-audmux.c b/sound/soc/fsl/imx-audmux.c index be003a117b39..e8a3a1baf18d 100644 --- a/sound/soc/fsl/imx-audmux.c +++ b/sound/soc/fsl/imx-audmux.c @@ -339,8 +339,11 @@ static int imx_audmux_suspend(struct device *dev) static int imx_audmux_resume(struct device *dev) { int i; + ssize_t ret; - clk_prepare_enable(audmux_clk); + ret = clk_prepare_enable(audmux_clk); + if (ret) + return ret; for (i = 0; i < reg_max; i++) writel(regcache[i], audmux_base + i * 4); -- 2.17.1
Re: [PATCH] powerpc/build: vdso linker warning for orphan sections
Hi Nicholas, On 2023-06-09, Nicholas Piggin wrote: > Add --orphan-handling for vdsos, and adjust vdso linker scripts to deal > with orphan sections. I'm reporting that I am getting a linker warning with 6.5-rc2. The warning message is: ld: warning: discarding dynamic section .rela.opd and bisects to: 8ad57add77d3 ("powerpc/build: vdso linker warning for orphan sections") Despite the warning, my ppc64 system seems to run fine. Let me know if you need any other information from me. I noticed [0] this with 6.5-rc1 but didn't contact the right people. John Ogness [0] https://lore.kernel.org/lkml/871qhf1q3j@jogness.linutronix.de
Re: [PATCH 2/2] PCI: layerscape: Add the workaround for lost link capabilities during reset
On Mon, Jul 17, 2023 at 09:29:10PM +0530, Manivannan Sadhasivam wrote: > On Thu, Jun 15, 2023 at 12:41:12PM -0400, Frank Li wrote: > > From: Xiaowei Bao > > > > A workaround for the issue where the PCI Express Endpoint (EP) controller > > loses the values of the Maximum Link Width and Supported Link Speed from > > the Link Capabilities Register, which initially configured by the Reset > > Configuration Word (RCW) during a link-down or hot reset event. > > > > If this fixes an issue, then there should be a Fixes tag. It is not fixed a exist software issue, just workaround a hardwre errata. > > > Signed-off-by: Xiaowei Bao > > Signed-off-by: Hou Zhiqiang > > Signed-off-by: Frank Li > > --- > > drivers/pci/controller/dwc/pci-layerscape-ep.c | 13 + > > 1 file changed, 13 insertions(+) > > > > diff --git a/drivers/pci/controller/dwc/pci-layerscape-ep.c > > b/drivers/pci/controller/dwc/pci-layerscape-ep.c > > index 4e4fdd1dfea7..2ef02d827eeb 100644 > > --- a/drivers/pci/controller/dwc/pci-layerscape-ep.c > > +++ b/drivers/pci/controller/dwc/pci-layerscape-ep.c > > @@ -45,6 +45,7 @@ struct ls_pcie_ep { > > struct pci_epc_features *ls_epc; > > const struct ls_pcie_ep_drvdata *drvdata; > > int irq; > > + u32 lnkcap; > > boolbig_endian; > > }; > > > > @@ -73,6 +74,7 @@ static irqreturn_t ls_pcie_ep_event_handler(int irq, void > > *dev_id) > > struct ls_pcie_ep *pcie = dev_id; > > struct dw_pcie *pci = pcie->pci; > > u32 val, cfg; > > + u8 offset; > > > > val = ls_lut_readl(pcie, PEX_PF0_PME_MES_DR); > > ls_lut_writel(pcie, PEX_PF0_PME_MES_DR, val); > > @@ -81,6 +83,13 @@ static irqreturn_t ls_pcie_ep_event_handler(int irq, > > void *dev_id) > > return IRQ_NONE; > > > > if (val & PEX_PF0_PME_MES_DR_LUD) { > > + > > Please add a comment on why the LNKCAP is being restored here. 
> > > + offset = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP); > > + > > + dw_pcie_dbi_ro_wr_en(pci); > > + dw_pcie_writew_dbi(pci, offset + PCI_EXP_LNKCAP, pcie->lnkcap); > > lnkcap is a 32-bit variable, so you should use dw_pcie_writel_dbi(). > > - Mani > > > + dw_pcie_dbi_ro_wr_dis(pci); > > + > > cfg = ls_lut_readl(pcie, PEX_PF0_CONFIG); > > cfg |= PEX_PF0_CFG_READY; > > ls_lut_writel(pcie, PEX_PF0_CONFIG, cfg); > > @@ -216,6 +225,7 @@ static int __init ls_pcie_ep_probe(struct > > platform_device *pdev) > > struct ls_pcie_ep *pcie; > > struct pci_epc_features *ls_epc; > > struct resource *dbi_base; > > + u8 offset; > > int ret; > > > > pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL); > > @@ -252,6 +262,9 @@ static int __init ls_pcie_ep_probe(struct > > platform_device *pdev) > > > > platform_set_drvdata(pdev, pcie); > > > > + offset = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP); > > + pcie->lnkcap = dw_pcie_readl_dbi(pci, offset + PCI_EXP_LNKCAP); > > + > > ret = dw_pcie_ep_init(>ep); > > if (ret) > > return ret; > > -- > > 2.34.1 > > > > -- > மணிவண்ணன் சதாசிவம்
Re: [PATCH] misc: Explicitly include correct DT includes
On Fri, Jul 14, 2023 at 11:47 AM Rob Herring wrote: > > The DT of_device.h and of_platform.h date back to the separate > of_platform_bus_type before it as merged into the regular platform bus. > As part of that merge prepping Arm DT support 13 years ago, they > "temporarily" include each other. They also include platform_device.h > and of.h. As a result, there's a pretty much random mix of those include > files used throughout the tree. In order to detangle these headers and > replace the implicit includes with struct declarations, users need to > explicitly include the correct includes. > > Signed-off-by: Rob Herring > --- > drivers/misc/cxl/base.c| 1 + > drivers/misc/fastrpc.c | 1 + > drivers/misc/lis3lv02d/lis3lv02d.c | 2 +- > drivers/misc/qcom-coincell.c | 1 - > drivers/misc/sram.c| 2 +- > drivers/misc/vcpu_stall_detector.c | 1 - > drivers/misc/xilinx_sdfec.c| 4 +++- > drivers/misc/xilinx_tmr_inject.c | 3 ++- > drivers/misc/xilinx_tmr_manager.c | 3 ++- > 9 files changed, 11 insertions(+), 7 deletions(-) > > diff --git a/drivers/misc/cxl/base.c b/drivers/misc/cxl/base.c > index cc0caf9192dc..b054562c046e 100644 > --- a/drivers/misc/cxl/base.c > +++ b/drivers/misc/cxl/base.c > @@ -7,6 +7,7 @@ > #include > #include > #include > +#include > #include > #include "cxl.h" > > diff --git a/drivers/misc/fastrpc.c b/drivers/misc/fastrpc.c > index 9666d28037e1..1c7c0532da6f 100644 > --- a/drivers/misc/fastrpc.c > +++ b/drivers/misc/fastrpc.c > @@ -13,6 +13,7 @@ > #include > #include > #include > +#include > #include > #include > #include > diff --git a/drivers/misc/lis3lv02d/lis3lv02d.c > b/drivers/misc/lis3lv02d/lis3lv02d.c > index 299d316f1bda..49868a45c0ad 100644 > --- a/drivers/misc/lis3lv02d/lis3lv02d.c > +++ b/drivers/misc/lis3lv02d/lis3lv02d.c > @@ -26,7 +26,7 @@ > #include > #include > #include > -#include > +#include > #include "lis3lv02d.h" > > #define DRIVER_NAME "lis3lv02d" > diff --git a/drivers/misc/qcom-coincell.c b/drivers/misc/qcom-coincell.c > index 
54d4f6ee..3c57f7429147 100644 > --- a/drivers/misc/qcom-coincell.c > +++ b/drivers/misc/qcom-coincell.c > @@ -8,7 +8,6 @@ > #include > #include > #include > -#include > #include > > struct qcom_coincell { > diff --git a/drivers/misc/sram.c b/drivers/misc/sram.c > index 5757adf418b1..a88f92cf35be 100644 > --- a/drivers/misc/sram.c > +++ b/drivers/misc/sram.c > @@ -10,8 +10,8 @@ > #include > #include > #include > +#include > #include > -#include > #include > #include > #include > diff --git a/drivers/misc/vcpu_stall_detector.c > b/drivers/misc/vcpu_stall_detector.c > index 53b5506080e1..6479c962da1a 100644 > --- a/drivers/misc/vcpu_stall_detector.c > +++ b/drivers/misc/vcpu_stall_detector.c > @@ -13,7 +13,6 @@ > #include > #include > #include > -#include > #include > #include > #include > diff --git a/drivers/misc/xilinx_sdfec.c b/drivers/misc/xilinx_sdfec.c > index 270ff4c5971a..35941c006552 100644 > --- a/drivers/misc/xilinx_sdfec.c > +++ b/drivers/misc/xilinx_sdfec.c > @@ -15,12 +15,14 @@ > #include > #include > #include > -#include > +#include > +#include > #include > #include > #include > #include > #include > +#include Double include of of.h. v2 coming.
Re: [PATCH] usb: Explicitly include correct DT includes
On Fri, Jul 14, 2023 at 11:50 AM Rob Herring wrote: > > The DT of_device.h and of_platform.h date back to the separate > of_platform_bus_type before it as merged into the regular platform bus. > As part of that merge prepping Arm DT support 13 years ago, they > "temporarily" include each other. They also include platform_device.h > and of.h. As a result, there's a pretty much random mix of those include > files used throughout the tree. In order to detangle these headers and > replace the implicit includes with struct declarations, users need to > explicitly include the correct includes. > > Signed-off-by: Rob Herring > --- [...] > diff --git a/drivers/usb/host/fsl-mph-dr-of.c > b/drivers/usb/host/fsl-mph-dr-of.c > index a9877f2569f4..2574bccc151b 100644 > --- a/drivers/usb/host/fsl-mph-dr-of.c > +++ b/drivers/usb/host/fsl-mph-dr-of.c > @@ -10,10 +10,12 @@ > #include > #include > #include > -#include > +#include > +#include > #include > #include > #include > +#include Double include of of.h here. v2 coming. Rob
Re: [PATCH v2 02/12] mm: introduce execmem_text_alloc() and jit_text_alloc()
On Mon, Jun 26, 2023, at 10:48 AM, Song Liu wrote: > On Mon, Jun 26, 2023 at 5:31 AM Mark Rutland wrote: >> > [...] >> > >> > So the idea was that jit_text_alloc() will have a cache of large pages >> > mapped ROX, will allocate memory from those caches and there will be >> > jit_update() that uses text poking for writing to that memory. >> > >> > Upon allocation of a large page to increase the cache, that large page will >> > be "invalidated" by filling it with breakpoint instructions (e.g int3 on >> > x86) >> >> Does that work on x86? >> >> That is in no way gauranteed for other architectures; on arm64 you need >> explicit cache maintenance (with I-cache maintenance at the VA to be executed >> from) followed by context-synchronization-events (e.g. via ISB instructions, >> or >> IPIs). > > I guess we need: > 1) Invalidate unused part of the huge ROX pages; > 2) Do not put two jit users (including module text, bpf, etc.) in the > same cache line; > 3) Explicit cache maintenance; > 4) context-synchronization-events. > > Would these (or a subset of them) be sufficient to protect us from torn read? Maybe? #4 is sufficiently vague that I can't really interpret it. I have a half-drafted email asking for official clarification on the rules that might help shed light on this. I find that this type of request works best when it's really well written :) > > Thanks, > Song
Re: Kernel Crash Dump (kdump) broken with 6.5
On 2023-07-17 20:15:53 Mon, Sachin Sant wrote: > Kdump seems to be broken with 6.5 for ppc64le. > > [ 14.200412] systemd[1]: Starting dracut pre-pivot and cleanup hook... > [[0;32m OK [0m] Started dracut pre-pivot and cleanup hook. > Starting Kdump Vmcore Save Service... > [ 14.231669] systemd[1]: Started dracut pre-pivot and cleanup hook. > [ 14.231801] systemd[1]: Starting Kdump Vmcore Save Service... > [ 14.341035] kdump.sh[297]: kdump: saving to > /sysroot//var/crash//127.0.0.1-2023-07-14-13:32:34/ > [ 14.350053] EXT4-fs (sda2): re-mounted e971a335-1ef8-4295-ab4e-3940f28e53fc > r/w. Quota mode: none. > [ 14.345979] kdump.sh[297]: kdump: saving vmcore-dmesg.txt to > /sysroot//var/crash//127.0.0.1-2023-07-14-13:32:34/ > [ 14.348742] kdump.sh[331]: Cannot open /proc/vmcore: No such file or > directory > [ 14.348845] kdump.sh[297]: kdump: saving vmcore-dmesg.txt failed > [ 14.349014] kdump.sh[297]: kdump: saving vmcore > [ 14.443422] kdump.sh[332]: open_dump_memory: Can't open the dump > memory(/proc/vmcore). No such file or directory > [ 14.456413] kdump.sh[332]: makedumpfile Failed. > [ 14.456662] kdump.sh[297]: kdump: saving vmcore failed, _exitcode:1 > [ 14.456822] kdump.sh[297]: kdump: saving the /run/initramfs/kexec-dmesg.log > to /sysroot//var/crash//127.0.0.1-2023-07-14-13:32:34/ > [ 14.487002] kdump.sh[297]: kdump: saving vmcore failed > [[0;1;31mFAILED[0m] Failed to start Kdump Vmcore Save Service. Thanks Sachin for catching this. > > 6.4 was good. Git bisect points to following patch > > commit 606787fed7268feb256957872586370b56af697a > powerpc/64s: Remove support for ELFv1 little endian userspace > > Reverting this patch allows a successful capture of vmcore. > > Does this change require any corresponding change to kdump > and/or kexec tools? Need to investigate that. It looks like vmcore_elf64_check_arch() check from fs/proc/vmcore.c is failing after above commit. static int __init parse_crash_elf64_headers(void) { [...] /* Do some basic Verification. 
*/ if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 || (ehdr.e_type != ET_CORE) || !vmcore_elf64_check_arch() || [...] It looks like ehdr->e_flags are not set properly while generating vmcore ELF header. I see that in kexec_file_load, ehdr->e_flags left set to 0 irrespective of IS_ENABLED(CONFIG_PPC64_ELF_ABI_V2) is true or false. -Mahesh
Re: [PATCH] dmaengine: Explicitly include correct DT includes
On Fri, Jul 14, 2023 at 11:44 AM Rob Herring wrote: > > The DT of_device.h and of_platform.h date back to the separate > of_platform_bus_type before it as merged into the regular platform bus. > As part of that merge prepping Arm DT support 13 years ago, they > "temporarily" include each other. They also include platform_device.h > and of.h. As a result, there's a pretty much random mix of those include > files used throughout the tree. In order to detangle these headers and > replace the implicit includes with struct declarations, users need to > explicitly include the correct includes. > > Signed-off-by: Rob Herring > --- > drivers/dma/apple-admac.c | 3 ++- > drivers/dma/at_hdmac.c | 2 +- > drivers/dma/bcm-sba-raid.c | 4 +++- > drivers/dma/bestcomm/bestcomm.c| 4 +--- v2 coming for this: >> drivers/dma/bestcomm/bestcomm. c:80:13: error: call to undeclared function 'irq_of_parse_and_map'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration] 80 | tsk->irq = irq_of_parse_and_map(bcom_eng->ofnode, tsk->tasknum); |^ >> drivers/dma/bestcomm/bestcomm.c:105:4: error: call to undeclared function >> 'irq_dispose_mapping'; ISO C99 and later do not support implicit function >> declarations [-Wimplicit-function-declaration] 105 | irq_dispose_mapping(tsk->irq); | ^ drivers/dma/bestcomm/bestcomm.c:128:2: error: call to undeclared function 'irq_dispose_mapping'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration] 128 | irq_dispose_mapping(tsk->irq); | ^ 3 errors generated.
Re: [PATCH v5] Revert "powerpc/bug: Provide better flexibility to WARN_ON/__WARN_FLAGS() with asm goto"
Le 17/07/2023 à 07:01, Michael Ellerman a écrit : > Christophe Leroy writes: >> Le 12/07/2023 à 15:45, Michael Ellerman a écrit : >>> From: Christophe Leroy >>> >>> This partly reverts commit 1e688dd2a3d6759d416616ff07afc4bb836c4213. >>> >>> That commit aimed at optimising the code around generation of >>> WARN_ON/BUG_ON but this leads to a lot of dead code erroneously >>> generated by GCC. >>> >>> That dead code becomes a problem when we start using objtool validation >>> because objtool will abort validation with a warning as soon as it >>> detects unreachable code. This is because unreachable code might >>> be the indication that objtool doesn't properly decode object text. >>> >>>textdata bss dec hex filename >>> 9551585 3627834 224376 13403795 cc8693 vmlinux.before >>> 9535281 3628358 224376 13388015 cc48ef vmlinux.after >>> >>> Once this change is reverted, in a standard configuration (pmac32 + >>> function tracer) the text is reduced by 16k which is around 1.7% >>> >>> We already had problem with it when starting to use objtool on powerpc >>> as a replacement for recordmcount, see commit 93e3f45a2631 ("powerpc: >>> Fix __WARN_FLAGS() for use with Objtool") >>> >>> There is also a problem with at least GCC 12, on ppc64_defconfig + >>> CONFIG_CC_OPTIMIZE_FOR_SIZE=y + CONFIG_DEBUG_SECTION_MISMATCH=y : >>> >>> LD .tmp_vmlinux.kallsyms1 >>> powerpc64-linux-ld: net/ipv4/tcp_input.o:(__ex_table+0xc4): undefined >>> reference to `.L2136' >>> make[2]: *** [scripts/Makefile.vmlinux:36: vmlinux] Error 1 >>> make[1]: *** [/home/chleroy/linux-powerpc/Makefile:1238: vmlinux] Error >>> 2 >>> >>> Taking into account that other problems are encountered with that >>> 'asm goto' in WARN_ON(), including build failures, keeping that >>> change is not worth it allthough it is primarily a compiler bug. >>> >>> Revert it for now. 
>>> >>> mpe: Retain EMIT_WARN_ENTRY as a synonym for EMIT_BUG_ENTRY to reduce >>> churn, as there are now nearly as many uses of EMIT_WARN_ENTRY as >>> EMIT_BUG_ENTRY. >> >> In that case, should we keep __EMIT_BUG_ENTRY and also keep the check >> that makes sure nobody uses EMIT_BUG_ENTRY with BUGFLAG_WARNING ? > > I didn't think it was worth it, now that it's not a correctness issue. > > I think the better option would be to have EMIT_WARN_ENTRY add > BUGFLAG_WARNING itself, rather than the caller having to pass it. > Ok that's fine for me. I'll do that in a follow-up patch one day. Christophe
Re: [PATCH 2/2] PCI: layerscape: Add the workaround for lost link capabilities during reset
On Mon, Jul 17, 2023 at 08:45:14AM -0600, Rob Herring wrote: > On Thu, Jun 15, 2023 at 10:41 AM Frank Li wrote: > > > > From: Xiaowei Bao > > > > A workaround for the issue where the PCI Express Endpoint (EP) controller > > loses the values of the Maximum Link Width and Supported Link Speed from > > the Link Capabilities Register, which initially configured by the Reset > > Configuration Word (RCW) during a link-down or hot reset event. > > What makes this Layerscape specific? Seems like something internal to DWC. layerscape designed behavor is that LINK speed and width controled by RCW. But design have been 'defect' when switch to dwc controller, may not correct connect some wire. So provide an errata, ask software recover such information when link up/down to align design spec. For example, RCW config max link is 2lan, after link down/up, DWC reset to max link to 4lan. So host side get a report, max link is 4 lan. It will not impact function, just information miss matched. Frank > > > > > Signed-off-by: Xiaowei Bao > > Signed-off-by: Hou Zhiqiang > > Signed-off-by: Frank Li > > --- > > drivers/pci/controller/dwc/pci-layerscape-ep.c | 13 + > > 1 file changed, 13 insertions(+) > > > > diff --git a/drivers/pci/controller/dwc/pci-layerscape-ep.c > > b/drivers/pci/controller/dwc/pci-layerscape-ep.c > > index 4e4fdd1dfea7..2ef02d827eeb 100644 > > --- a/drivers/pci/controller/dwc/pci-layerscape-ep.c > > +++ b/drivers/pci/controller/dwc/pci-layerscape-ep.c > > @@ -45,6 +45,7 @@ struct ls_pcie_ep { > > struct pci_epc_features *ls_epc; > > const struct ls_pcie_ep_drvdata *drvdata; > > int irq; > > + u32 lnkcap; > > boolbig_endian; > > }; > > > > @@ -73,6 +74,7 @@ static irqreturn_t ls_pcie_ep_event_handler(int irq, void > > *dev_id) > > struct ls_pcie_ep *pcie = dev_id; > > struct dw_pcie *pci = pcie->pci; > > u32 val, cfg; > > + u8 offset; > > > > val = ls_lut_readl(pcie, PEX_PF0_PME_MES_DR); > > ls_lut_writel(pcie, PEX_PF0_PME_MES_DR, val); > > @@ -81,6 +83,13 @@ static 
irqreturn_t ls_pcie_ep_event_handler(int irq, > > void *dev_id) > > return IRQ_NONE; > > > > if (val & PEX_PF0_PME_MES_DR_LUD) { > > + > > + offset = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP); > > + > > + dw_pcie_dbi_ro_wr_en(pci); > > + dw_pcie_writew_dbi(pci, offset + PCI_EXP_LNKCAP, > > pcie->lnkcap); > > + dw_pcie_dbi_ro_wr_dis(pci); > > + > > cfg = ls_lut_readl(pcie, PEX_PF0_CONFIG); > > cfg |= PEX_PF0_CFG_READY; > > ls_lut_writel(pcie, PEX_PF0_CONFIG, cfg); > > @@ -216,6 +225,7 @@ static int __init ls_pcie_ep_probe(struct > > platform_device *pdev) > > struct ls_pcie_ep *pcie; > > struct pci_epc_features *ls_epc; > > struct resource *dbi_base; > > + u8 offset; > > int ret; > > > > pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL); > > @@ -252,6 +262,9 @@ static int __init ls_pcie_ep_probe(struct > > platform_device *pdev) > > > > platform_set_drvdata(pdev, pcie); > > > > + offset = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP); > > + pcie->lnkcap = dw_pcie_readl_dbi(pci, offset + PCI_EXP_LNKCAP); > > + > > ret = dw_pcie_ep_init(>ep); > > if (ret) > > return ret; > > -- > > 2.34.1 > >
Kernel Crash Dump (kdump) broken with 6.5
Kdump seems to be broken with 6.5 for ppc64le. [ 14.200412] systemd[1]: Starting dracut pre-pivot and cleanup hook... [[0;32m OK [0m] Started dracut pre-pivot and cleanup hook. Starting Kdump Vmcore Save Service... [ 14.231669] systemd[1]: Started dracut pre-pivot and cleanup hook. [ 14.231801] systemd[1]: Starting Kdump Vmcore Save Service... [ 14.341035] kdump.sh[297]: kdump: saving to /sysroot//var/crash//127.0.0.1-2023-07-14-13:32:34/ [ 14.350053] EXT4-fs (sda2): re-mounted e971a335-1ef8-4295-ab4e-3940f28e53fc r/w. Quota mode: none. [ 14.345979] kdump.sh[297]: kdump: saving vmcore-dmesg.txt to /sysroot//var/crash//127.0.0.1-2023-07-14-13:32:34/ [ 14.348742] kdump.sh[331]: Cannot open /proc/vmcore: No such file or directory [ 14.348845] kdump.sh[297]: kdump: saving vmcore-dmesg.txt failed [ 14.349014] kdump.sh[297]: kdump: saving vmcore [ 14.443422] kdump.sh[332]: open_dump_memory: Can't open the dump memory(/proc/vmcore). No such file or directory [ 14.456413] kdump.sh[332]: makedumpfile Failed. [ 14.456662] kdump.sh[297]: kdump: saving vmcore failed, _exitcode:1 [ 14.456822] kdump.sh[297]: kdump: saving the /run/initramfs/kexec-dmesg.log to /sysroot//var/crash//127.0.0.1-2023-07-14-13:32:34/ [ 14.487002] kdump.sh[297]: kdump: saving vmcore failed [[0;1;31mFAILED[0m] Failed to start Kdump Vmcore Save Service. 6.4 was good. Git bisect points to following patch commit 606787fed7268feb256957872586370b56af697a powerpc/64s: Remove support for ELFv1 little endian userspace Reverting this patch allows a successful capture of vmcore. Does this change require any corresponding change to kdump and/or kexec tools? - Sachin
Re: [PATCH 2/2] PCI: layerscape: Add the workaround for lost link capabilities during reset
On Thu, Jun 15, 2023 at 10:41 AM Frank Li wrote: > > From: Xiaowei Bao > > A workaround for the issue where the PCI Express Endpoint (EP) controller > loses the values of the Maximum Link Width and Supported Link Speed from > the Link Capabilities Register, which initially configured by the Reset > Configuration Word (RCW) during a link-down or hot reset event. What makes this Layerscape specific? Seems like something internal to DWC. > > Signed-off-by: Xiaowei Bao > Signed-off-by: Hou Zhiqiang > Signed-off-by: Frank Li > --- > drivers/pci/controller/dwc/pci-layerscape-ep.c | 13 + > 1 file changed, 13 insertions(+) > > diff --git a/drivers/pci/controller/dwc/pci-layerscape-ep.c > b/drivers/pci/controller/dwc/pci-layerscape-ep.c > index 4e4fdd1dfea7..2ef02d827eeb 100644 > --- a/drivers/pci/controller/dwc/pci-layerscape-ep.c > +++ b/drivers/pci/controller/dwc/pci-layerscape-ep.c > @@ -45,6 +45,7 @@ struct ls_pcie_ep { > struct pci_epc_features *ls_epc; > const struct ls_pcie_ep_drvdata *drvdata; > int irq; > + u32 lnkcap; > boolbig_endian; > }; > > @@ -73,6 +74,7 @@ static irqreturn_t ls_pcie_ep_event_handler(int irq, void > *dev_id) > struct ls_pcie_ep *pcie = dev_id; > struct dw_pcie *pci = pcie->pci; > u32 val, cfg; > + u8 offset; > > val = ls_lut_readl(pcie, PEX_PF0_PME_MES_DR); > ls_lut_writel(pcie, PEX_PF0_PME_MES_DR, val); > @@ -81,6 +83,13 @@ static irqreturn_t ls_pcie_ep_event_handler(int irq, void > *dev_id) > return IRQ_NONE; > > if (val & PEX_PF0_PME_MES_DR_LUD) { > + > + offset = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP); > + > + dw_pcie_dbi_ro_wr_en(pci); > + dw_pcie_writew_dbi(pci, offset + PCI_EXP_LNKCAP, > pcie->lnkcap); > + dw_pcie_dbi_ro_wr_dis(pci); > + > cfg = ls_lut_readl(pcie, PEX_PF0_CONFIG); > cfg |= PEX_PF0_CFG_READY; > ls_lut_writel(pcie, PEX_PF0_CONFIG, cfg); > @@ -216,6 +225,7 @@ static int __init ls_pcie_ep_probe(struct platform_device > *pdev) > struct ls_pcie_ep *pcie; > struct pci_epc_features *ls_epc; > struct resource *dbi_base; 
> + u8 offset; > int ret; > > pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL); > @@ -252,6 +262,9 @@ static int __init ls_pcie_ep_probe(struct platform_device > *pdev) > > platform_set_drvdata(pdev, pcie); > > + offset = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP); > + pcie->lnkcap = dw_pcie_readl_dbi(pci, offset + PCI_EXP_LNKCAP); > + > ret = dw_pcie_ep_init(>ep); > if (ret) > return ret; > -- > 2.34.1 >
Re: [RFC][PATCH] sched: Rename DIE domain
On Mon, Jul 17, 2023 at 03:51:25PM +0200, Vincent Guittot wrote: > On Wed, 12 Jul 2023 at 16:11, Peter Zijlstra wrote: > > > > Hi > > > > Thomas just tripped over the x86 topology setup creating a 'DIE' domain > > for the package mask :-) > > May be a link to the change that triggers this patch could be useful Thomas should post soonish..
Re: [PATCH] char: Explicitly include correct DT includes
On Fri Jul 14, 2023 at 5:43 PM UTC, Rob Herring wrote: > The DT of_device.h and of_platform.h date back to the separate > of_platform_bus_type before it as merged into the regular platform bus. > As part of that merge prepping Arm DT support 13 years ago, they > "temporarily" include each other. They also include platform_device.h > and of.h. As a result, there's a pretty much random mix of those include > files used throughout the tree. In order to detangle these headers and > replace the implicit includes with struct declarations, users need to > explicitly include the correct includes. > > Signed-off-by: Rob Herring > --- > drivers/char/agp/uninorth-agp.c| 1 + > drivers/char/bsr.c | 3 +-- > drivers/char/hw_random/atmel-rng.c | 2 +- > drivers/char/hw_random/bcm2835-rng.c | 3 +-- > drivers/char/hw_random/ingenic-trng.c | 2 +- > drivers/char/hw_random/iproc-rng200.c | 3 +-- > drivers/char/hw_random/npcm-rng.c | 3 +-- > drivers/char/hw_random/omap-rng.c | 2 -- > drivers/char/hw_random/omap3-rom-rng.c | 1 - > drivers/char/hw_random/pasemi-rng.c| 3 +-- > drivers/char/hw_random/pic32-rng.c | 3 +-- > drivers/char/hw_random/stm32-rng.c | 3 ++- > drivers/char/hw_random/xgene-rng.c | 5 ++--- > drivers/char/hw_random/xiphera-trng.c | 1 - > drivers/char/ipmi/kcs_bmc_aspeed.c | 1 - > drivers/char/tpm/tpm_ftpm_tee.c| 1 - > drivers/char/tpm/tpm_tis.c | 1 - > drivers/char/tpm/tpm_tis_spi_main.c| 2 +- > drivers/char/tpm/tpm_tis_synquacer.c | 1 - > 19 files changed, 14 insertions(+), 27 deletions(-) > > diff --git a/drivers/char/agp/uninorth-agp.c b/drivers/char/agp/uninorth-agp.c > index 62de7f4ba864..84411b13c49f 100644 > --- a/drivers/char/agp/uninorth-agp.c > +++ b/drivers/char/agp/uninorth-agp.c > @@ -3,6 +3,7 @@ > * UniNorth AGPGART routines. 
> */ > #include > +#include > #include > #include > #include > diff --git a/drivers/char/bsr.c b/drivers/char/bsr.c > index 12143854aeac..70d31aed9011 100644 > --- a/drivers/char/bsr.c > +++ b/drivers/char/bsr.c > @@ -6,11 +6,10 @@ > * Author: Sonny Rao > */ > > +#include > #include > #include > #include > -#include > -#include > #include > #include > #include > diff --git a/drivers/char/hw_random/atmel-rng.c > b/drivers/char/hw_random/atmel-rng.c > index b8effe77d80f..a37367ebcbac 100644 > --- a/drivers/char/hw_random/atmel-rng.c > +++ b/drivers/char/hw_random/atmel-rng.c > @@ -15,7 +15,7 @@ > #include > #include > #include > -#include > +#include > #include > #include > > diff --git a/drivers/char/hw_random/bcm2835-rng.c > b/drivers/char/hw_random/bcm2835-rng.c > index e98fcac578d6..e19b0f9f48b9 100644 > --- a/drivers/char/hw_random/bcm2835-rng.c > +++ b/drivers/char/hw_random/bcm2835-rng.c > @@ -8,8 +8,7 @@ > #include > #include > #include > -#include > -#include > +#include > #include > #include > #include > diff --git a/drivers/char/hw_random/ingenic-trng.c > b/drivers/char/hw_random/ingenic-trng.c > index 0eb80f786f4d..759445d4f65a 100644 > --- a/drivers/char/hw_random/ingenic-trng.c > +++ b/drivers/char/hw_random/ingenic-trng.c > @@ -11,8 +11,8 @@ > #include > #include > #include > +#include > #include > -#include > #include > #include > > diff --git a/drivers/char/hw_random/iproc-rng200.c > b/drivers/char/hw_random/iproc-rng200.c > index 06bc060534d8..34df3f0d3e45 100644 > --- a/drivers/char/hw_random/iproc-rng200.c > +++ b/drivers/char/hw_random/iproc-rng200.c > @@ -12,8 +12,7 @@ > #include > #include > #include > -#include > -#include > +#include > #include > #include > > diff --git a/drivers/char/hw_random/npcm-rng.c > b/drivers/char/hw_random/npcm-rng.c > index 9903d0357e06..8a304b754217 100644 > --- a/drivers/char/hw_random/npcm-rng.c > +++ b/drivers/char/hw_random/npcm-rng.c > @@ -8,12 +8,11 @@ > #include > #include > #include > +#include > #include > 
#include > #include > -#include > #include > -#include > > #define NPCM_RNGCS_REG 0x00/* Control and status register > */ > #define NPCM_RNGD_REG0x04/* Data register */ > diff --git a/drivers/char/hw_random/omap-rng.c > b/drivers/char/hw_random/omap-rng.c > index 00ff96703dd2..be03f76a2a80 100644 > --- a/drivers/char/hw_random/omap-rng.c > +++ b/drivers/char/hw_random/omap-rng.c > @@ -26,8 +26,6 @@ > #include > #include > #include > -#include > -#include > #include > #include > #include > diff --git a/drivers/char/hw_random/omap3-rom-rng.c > b/drivers/char/hw_random/omap3-rom-rng.c > index f06e4f95114f..18dc46b1b58e 100644 > --- a/drivers/char/hw_random/omap3-rom-rng.c > +++ b/drivers/char/hw_random/omap3-rom-rng.c > @@ -20,7 +20,6 @@ > #include > #include > #include > -#include > #include > #include > > diff --git a/drivers/char/hw_random/pasemi-rng.c > b/drivers/char/hw_random/pasemi-rng.c > index
Re: [PATCH 2/2] PCI: layerscape: Add the workaround for lost link capabilities during reset
On Thu, Jun 15, 2023 at 12:41:12PM -0400, Frank Li wrote: > From: Xiaowei Bao > > A workaround for the issue where the PCI Express Endpoint (EP) controller > loses the values of the Maximum Link Width and Supported Link Speed from > the Link Capabilities Register, which initially configured by the Reset > Configuration Word (RCW) during a link-down or hot reset event. > > Signed-off-by: Xiaowei Bao > Signed-off-by: Hou Zhiqiang > Signed-off-by: Frank Li > --- @lorenzo: It is only for layerscape and workaround a small errata. Could you please pick this up? Frank > drivers/pci/controller/dwc/pci-layerscape-ep.c | 13 + > 1 file changed, 13 insertions(+) > > diff --git a/drivers/pci/controller/dwc/pci-layerscape-ep.c > b/drivers/pci/controller/dwc/pci-layerscape-ep.c > index 4e4fdd1dfea7..2ef02d827eeb 100644 > --- a/drivers/pci/controller/dwc/pci-layerscape-ep.c > +++ b/drivers/pci/controller/dwc/pci-layerscape-ep.c > @@ -45,6 +45,7 @@ struct ls_pcie_ep { > struct pci_epc_features *ls_epc; > const struct ls_pcie_ep_drvdata *drvdata; > int irq; > + u32 lnkcap; > boolbig_endian; > }; > > @@ -73,6 +74,7 @@ static irqreturn_t ls_pcie_ep_event_handler(int irq, void > *dev_id) > struct ls_pcie_ep *pcie = dev_id; > struct dw_pcie *pci = pcie->pci; > u32 val, cfg; > + u8 offset; > > val = ls_lut_readl(pcie, PEX_PF0_PME_MES_DR); > ls_lut_writel(pcie, PEX_PF0_PME_MES_DR, val); > @@ -81,6 +83,13 @@ static irqreturn_t ls_pcie_ep_event_handler(int irq, void > *dev_id) > return IRQ_NONE; > > if (val & PEX_PF0_PME_MES_DR_LUD) { > + > + offset = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP); > + > + dw_pcie_dbi_ro_wr_en(pci); > + dw_pcie_writew_dbi(pci, offset + PCI_EXP_LNKCAP, pcie->lnkcap); > + dw_pcie_dbi_ro_wr_dis(pci); > + > cfg = ls_lut_readl(pcie, PEX_PF0_CONFIG); > cfg |= PEX_PF0_CFG_READY; > ls_lut_writel(pcie, PEX_PF0_CONFIG, cfg); > @@ -216,6 +225,7 @@ static int __init ls_pcie_ep_probe(struct platform_device > *pdev) > struct ls_pcie_ep *pcie; > struct pci_epc_features 
*ls_epc; > struct resource *dbi_base; > + u8 offset; > int ret; > > pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL); > @@ -252,6 +262,9 @@ static int __init ls_pcie_ep_probe(struct platform_device > *pdev) > > platform_set_drvdata(pdev, pcie); > > + offset = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP); > + pcie->lnkcap = dw_pcie_readl_dbi(pci, offset + PCI_EXP_LNKCAP); > + > ret = dw_pcie_ep_init(>ep); > if (ret) > return ret; > -- > 2.34.1 >
Re: [PATCH 1/2] PCI: layerscape: Add support for Link down notification
O Thu, Jun 15, 2023 at 12:41:11PM -0400, Frank Li wrote: > Add support to pass Link down notification to Endpoint function driver > so that the LINK_DOWN event can be processed by the function. > > Signed-off-by: Frank Li > --- @Lorenzo: No comment over 1 months. Just change layerscape and 1 line code change. Could you please consider pick this up? Frank Li > drivers/pci/controller/dwc/pci-layerscape-ep.c | 1 + > 1 file changed, 1 insertion(+) > > diff --git a/drivers/pci/controller/dwc/pci-layerscape-ep.c > b/drivers/pci/controller/dwc/pci-layerscape-ep.c > index de4c1758a6c3..4e4fdd1dfea7 100644 > --- a/drivers/pci/controller/dwc/pci-layerscape-ep.c > +++ b/drivers/pci/controller/dwc/pci-layerscape-ep.c > @@ -88,6 +88,7 @@ static irqreturn_t ls_pcie_ep_event_handler(int irq, void > *dev_id) > > dev_dbg(pci->dev, "Link up\n"); > } else if (val & PEX_PF0_PME_MES_DR_LDD) { > + pci_epc_linkdown(pci->ep.epc); > dev_dbg(pci->dev, "Link down\n"); > } else if (val & PEX_PF0_PME_MES_DR_HRD) { > dev_dbg(pci->dev, "Hot reset\n"); > -- > 2.34.1 >
Re: [RFC][PATCH] sched: Rename DIE domain
On Wed, 12 Jul 2023 at 16:11, Peter Zijlstra wrote: > > Hi > > Thomas just tripped over the x86 topology setup creating a 'DIE' domain > for the package mask :-) May be a link to the change that triggers this patch could be useful > > Since these names are SCHED_DEBUG only, rename them. > I don't think anybody *should* be relying on this, but who knows. Apart the remaining reference to DIE already mentioned by others, looks good to me > > Signed-off-by: Peter Zijlstra (Intel) > --- > arch/powerpc/kernel/smp.c | 2 +- > arch/s390/kernel/topology.c | 2 +- > arch/x86/kernel/smpboot.c | 2 +- > kernel/sched/topology.c | 2 +- > 4 files changed, 4 insertions(+), 4 deletions(-) > > diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c > index fbbb695bae3d..5ed6b9fe5094 100644 > --- a/arch/powerpc/kernel/smp.c > +++ b/arch/powerpc/kernel/smp.c > @@ -1050,7 +1050,7 @@ static struct sched_domain_topology_level > powerpc_topology[] = { > #endif > { shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) > }, > { cpu_mc_mask, SD_INIT_NAME(MC) }, > - { cpu_cpu_mask, SD_INIT_NAME(DIE) }, > + { cpu_cpu_mask, SD_INIT_NAME(PKG) }, > { NULL, }, > }; > > diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c > index 68adf1de..c803f5e6ab46 100644 > --- a/arch/s390/kernel/topology.c > +++ b/arch/s390/kernel/topology.c > @@ -522,7 +522,7 @@ static struct sched_domain_topology_level s390_topology[] > = { > { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, > { cpu_book_mask, SD_INIT_NAME(BOOK) }, > { cpu_drawer_mask, SD_INIT_NAME(DRAWER) }, > - { cpu_cpu_mask, SD_INIT_NAME(DIE) }, > + { cpu_cpu_mask, SD_INIT_NAME(PKG) }, > { NULL, }, > }; > > diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c > index e1aa2cd7734b..09cc9d0aa358 100644 > --- a/arch/x86/kernel/smpboot.c > +++ b/arch/x86/kernel/smpboot.c > @@ -653,7 +653,7 @@ static void __init build_sched_topology(void) > */ > if (!x86_has_numa_in_package) { > x86_topology[i++] = 
(struct sched_domain_topology_level){ > - cpu_cpu_mask, SD_INIT_NAME(DIE) > + cpu_cpu_mask, SD_INIT_NAME(PKG) > }; > } > > diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c > index d3a3b2646ec4..e9d9cf776b7a 100644 > --- a/kernel/sched/topology.c > +++ b/kernel/sched/topology.c > @@ -1670,7 +1670,7 @@ static struct sched_domain_topology_level > default_topology[] = { > #ifdef CONFIG_SCHED_MC > { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, > #endif > - { cpu_cpu_mask, SD_INIT_NAME(DIE) }, > + { cpu_cpu_mask, SD_INIT_NAME(PKG) }, > { NULL, }, > }; > >
Re: [PATCH v2 1/2] powerpc/tpm: Create linux,sml-base/size as big endian
On Wed Jul 12, 2023 at 12:39 PM UTC, Michael Ellerman wrote: > Jarkko Sakkinen writes: > > On Tue, 2023-07-11 at 08:47 -0400, Stefan Berger wrote: > >> On 7/10/23 17:23, Jarkko Sakkinen wrote: > >> > On Thu, 2023-06-15 at 22:37 +1000, Michael Ellerman wrote: > >> > > There's code in prom_instantiate_sml() to do a "SML handover" (Stored > >> > > Measurement Log) from OF to Linux, before Linux shuts down Open > >> > > Firmware. > >> > > > >> > > This involves creating a buffer to hold the SML, and creating two > >> > > device > >> > > tree properties to record its base address and size. The kernel then > >> > > later reads those properties from the device tree to find the SML. > >> > > > >> > > When the code was initially added in commit 4a727429abec ("PPC64: Add > >> > > support for instantiating SML from Open Firmware") the powerpc kernel > >> > > was always built big endian, so the properties were created big endian > >> > > by default. > >> > > > >> > > However since then little endian support was added to powerpc, and now > >> > > the code lacks conversions to big endian when creating the properties. > >> > > > >> > > This means on little endian kernels the device tree properties are > >> > > little endian, which is contrary to the device tree spec, and in > >> > > contrast to all other device tree properties. > >> > > > >> > > To cope with that a workaround was added in tpm_read_log_of() to skip > >> > > the endian conversion if the properties were created via the SML > >> > > handover. > >> > > > >> > > A better solution is to encode the properties as big endian as they > >> > > should be, and remove the workaround. > >> > > > >> > > Typically changing the encoding of a property like this would present > >> > > problems for kexec. However the SML is not propagated across kexec, so > >> > > changing the encoding of the properties is a non-issue. 
> >> > > > >> > > Fixes: e46e22f12b19 ("tpm: enhance read_log_of() to support Physical > >> > > TPM event log") > >> > > Signed-off-by: Michael Ellerman > >> > > Reviewed-by: Stefan Berger > >> > > --- > >> > > arch/powerpc/kernel/prom_init.c | 8 ++-- > >> > > drivers/char/tpm/eventlog/of.c | 23 --- > >> > > 2 files changed, 10 insertions(+), 21 deletions(-) > >> > > >> > Split into two patches (producer and consumer). > >> > >> I think this wouldn't be right since it would break the system when only > >> one patch is applied since it would be reading the fields in the wrong > >> endianess. > > > > I think it would help if the commit message would better explain > > what is going on. It is somewhat difficult to decipher, if you > > don't have deep knowledge of the powerpc architecture. > > I mean, it's already 8 paragraphs ¯\_(ツ)_/¯ > > But I'm happy to expand it. I just don't really know what extra detail > is needed to make it clearer. Adding more text is not the right way to clarify things. I'd start by explaining shortly SML and then move to the handover. It can't be that hard, right? Just adding new paragraphs would probably just make it even more confusing. BR, Jarkko
[PATCH v11 1/4] mm/tlbbatch: Introduce arch_tlbbatch_should_defer()
From: Anshuman Khandual The entire scheme of deferred TLB flush in reclaim path rests on the fact that the cost to refill TLB entries is less than flushing out individual entries by sending IPI to remote CPUs. But architecture can have different ways to evaluate that. Hence apart from checking TTU_BATCH_FLUSH in the TTU flags, rest of the decision should be architecture specific. Signed-off-by: Anshuman Khandual [https://lore.kernel.org/linuxppc-dev/20171101101735.2318-2-khand...@linux.vnet.ibm.com/] Signed-off-by: Yicong Yang [Rebase and fix incorrect return value type] Reviewed-by: Kefeng Wang Reviewed-by: Anshuman Khandual Reviewed-by: Barry Song Reviewed-by: Xin Hao Tested-by: Punit Agrawal --- arch/x86/include/asm/tlbflush.h | 12 mm/rmap.c | 9 + 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 80450e1d5385..cf2a1de5d388 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -253,6 +253,18 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a) flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT, false); } +static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm) +{ + bool should_defer = false; + + /* If remote CPUs need to be flushed then defer batch the flush */ + if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids) + should_defer = true; + put_cpu(); + + return should_defer; +} + static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) { /* diff --git a/mm/rmap.c b/mm/rmap.c index 0c0d8857dfce..6480e526c154 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -688,17 +688,10 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval) */ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) { - bool should_defer = false; - if (!(flags & TTU_BATCH_FLUSH)) return false; - /* If remote CPUs need to be flushed then defer batch the flush */ - if 
(cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids) - should_defer = true; - put_cpu(); - - return should_defer; + return arch_tlbbatch_should_defer(mm); } /* -- 2.24.0
[PATCH v11 4/4] arm64: support batched/deferred tlb shootdown during page reclamation/migration
From: Barry Song on x86, batched and deferred tlb shootdown has lead to 90% performance increase on tlb shootdown. on arm64, HW can do tlb shootdown without software IPI. But sync tlbi is still quite expensive. Even running a simplest program which requires swapout can prove this is true, #include #include #include #include int main() { #define SIZE (1 * 1024 * 1024) volatile unsigned char *p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); memset(p, 0x88, SIZE); for (int k = 0; k < 1; k++) { /* swap in */ for (int i = 0; i < SIZE; i += 4096) { (void)p[i]; } /* swap out */ madvise(p, SIZE, MADV_PAGEOUT); } } Perf result on snapdragon 888 with 8 cores by using zRAM as the swap block device. ~ # perf record taskset -c 4 ./a.out [ perf record: Woken up 10 times to write data ] [ perf record: Captured and wrote 2.297 MB perf.data (60084 samples) ] ~ # perf report # To display the perf.data header info, please use --header/--header-only options. # To display the perf.data header info, please use --header/--header-only options. # # # Total Lost Samples: 0 # # Samples: 60K of event 'cycles' # Event count (approx.): 35706225414 # # Overhead Command Shared Object Symbol # ... . .. # 21.07% a.out[kernel.kallsyms] [k] _raw_spin_unlock_irq 8.23% a.out[kernel.kallsyms] [k] _raw_spin_unlock_irqrestore 6.67% a.out[kernel.kallsyms] [k] filemap_map_pages 6.16% a.out[kernel.kallsyms] [k] __zram_bvec_write 5.36% a.out[kernel.kallsyms] [k] ptep_clear_flush 3.71% a.out[kernel.kallsyms] [k] _raw_spin_lock 3.49% a.out[kernel.kallsyms] [k] memset64 1.63% a.out[kernel.kallsyms] [k] clear_page 1.42% a.out[kernel.kallsyms] [k] _raw_spin_unlock 1.26% a.out[kernel.kallsyms] [k] mod_zone_state.llvm.8525150236079521930 1.23% a.out[kernel.kallsyms] [k] xas_load 1.15% a.out[kernel.kallsyms] [k] zram_slot_lock ptep_clear_flush() takes 5.36% CPU in the micro-benchmark swapping in/out a page mapped by only one process. 
If the page is mapped by multiple processes, typically, like more than 100 on a phone, the overhead would be much higher as we have to run tlb flush 100 times for one single page. Plus, tlb flush overhead will increase with the number of CPU cores due to the bad scalability of tlb shootdown in HW, so those ARM64 servers should expect much higher overhead. Further perf annotate shows 95% cpu time of ptep_clear_flush is actually used by the final dsb() to wait for the completion of tlb flush. This provides us a very good chance to leverage the existing batched tlb in kernel. The minimum modification is that we only send async tlbi in the first stage and we send dsb while we have to sync in the second stage. With the above simplest micro benchmark, collapsed time to finish the program decreases around 5%. Typical collapsed time w/o patch: ~ # time taskset -c 4 ./a.out 0.21user 14.34system 0:14.69elapsed w/ patch: ~ # time taskset -c 4 ./a.out 0.22user 13.45system 0:13.80elapsed Also tested with benchmark in the commit on Kunpeng920 arm64 server and observed an improvement around 12.5% with command `time ./swap_bench`. w/o w/ real0m13.460s 0m11.771s user0m0.248s0m0.279s sys 0m12.039s 0m11.458s Originally it's noticed a 16.99% overhead of ptep_clear_flush() which has been eliminated by this patch: [root@localhost yang]# perf record -- ./swap_bench && perf report [...] 16.99% swap_bench [kernel.kallsyms] [k] ptep_clear_flush It is tested on 4,8,128 CPU platforms and shows to be beneficial on large systems but may not have improvement on small systems like on a 4 CPU platform. Also this patch improves the performance of page migration. Using pmbench and tries to migrate the pages of pmbench between node 0 and node 1 for 100 times for 1G memory, this patch decreases the time used around 20% (prev 18.338318910 sec after 13.981866350 sec) and saved the time used by ptep_clear_flush(). 
Cc: Anshuman Khandual Cc: Jonathan Corbet Cc: Nadav Amit Cc: Mel Gorman Tested-by: Yicong Yang Tested-by: Xin Hao Tested-by: Punit Agrawal Signed-off-by: Barry Song Signed-off-by: Yicong Yang Reviewed-by: Kefeng Wang Reviewed-by: Xin Hao Reviewed-by: Anshuman Khandual --- .../features/vm/TLB/arch-support.txt | 2 +- arch/arm64/Kconfig| 1 + arch/arm64/include/asm/tlbbatch.h | 12 + arch/arm64/include/asm/tlbflush.h | 44 +-- 4 files changed, 55 insertions(+), 4 deletions(-) create mode 100644 arch/arm64/include/asm/tlbbatch.h diff --git
[PATCH v11 3/4] mm/tlbbatch: Introduce arch_flush_tlb_batched_pending()
From: Yicong Yang Currently we'll flush the mm in flush_tlb_batched_pending() to avoid race between reclaim unmaps pages by batched TLB flush and mprotect/munmap/etc. Other architectures like arm64 may only need a synchronization barrier(dsb) here rather than a full mm flush. So add arch_flush_tlb_batched_pending() to allow an arch-specific implementation here. This intends no functional changes on x86 since still a full mm flush for x86. Signed-off-by: Yicong Yang --- arch/x86/include/asm/tlbflush.h | 5 + mm/rmap.c | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 1c7d3a36e16c..837e4a50281a 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -284,6 +284,11 @@ static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *b cpumask_or(>cpumask, >cpumask, mm_cpumask(mm)); } +static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm) +{ + flush_tlb_mm(mm); +} + extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch); static inline bool pte_flags_need_flush(unsigned long oldflags, diff --git a/mm/rmap.c b/mm/rmap.c index 9699c6011b0e..3a16c91be7e2 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -717,7 +717,7 @@ void flush_tlb_batched_pending(struct mm_struct *mm) int flushed = batch >> TLB_FLUSH_BATCH_FLUSHED_SHIFT; if (pending != flushed) { - flush_tlb_mm(mm); + arch_flush_tlb_batched_pending(mm); /* * If the new TLB flushing is pending during flushing, leave * mm->tlb_flush_batched as is, to avoid losing flushing. -- 2.24.0
[PATCH v11 2/4] mm/tlbbatch: Rename and extend some functions
From: Barry Song This patch does some preparation works to extend batched TLB flush to arm64. Including: - Extend set_tlb_ubc_flush_pending() and arch_tlbbatch_add_mm() to accept an additional argument for address, architectures like arm64 may need this for tlbi. - Rename arch_tlbbatch_add_mm() to arch_tlbbatch_add_pending() to match its current function since we don't need to handle mm on architectures like arm64 and add_mm is not proper, add_pending will make sense to both as on x86 we're pending the TLB flush operations while on arm64 we're pending the synchronize operations. This intends no functional changes on x86. Cc: Anshuman Khandual Cc: Jonathan Corbet Cc: Nadav Amit Cc: Mel Gorman Tested-by: Yicong Yang Tested-by: Xin Hao Tested-by: Punit Agrawal Signed-off-by: Barry Song Signed-off-by: Yicong Yang Reviewed-by: Kefeng Wang Reviewed-by: Xin Hao Reviewed-by: Anshuman Khandual --- arch/x86/include/asm/tlbflush.h | 5 +++-- include/linux/mm_types_task.h | 4 ++-- mm/rmap.c | 12 +++- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index cf2a1de5d388..1c7d3a36e16c 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -276,8 +276,9 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) return atomic64_inc_return(>context.tlb_gen); } -static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch, - struct mm_struct *mm) +static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch, +struct mm_struct *mm, +unsigned long uaddr) { inc_mm_tlb_gen(mm); cpumask_or(>cpumask, >cpumask, mm_cpumask(mm)); diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h index 5414b5c6a103..aa44fff8bb9d 100644 --- a/include/linux/mm_types_task.h +++ b/include/linux/mm_types_task.h @@ -52,8 +52,8 @@ struct tlbflush_unmap_batch { #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH /* * The arch code makes the 
following promise: generic code can modify a -* PTE, then call arch_tlbbatch_add_mm() (which internally provides all -* needed barriers), then call arch_tlbbatch_flush(), and the entries +* PTE, then call arch_tlbbatch_add_pending() (which internally provides +* all needed barriers), then call arch_tlbbatch_flush(), and the entries * will be flushed on all CPUs by the time that arch_tlbbatch_flush() * returns. */ diff --git a/mm/rmap.c b/mm/rmap.c index 6480e526c154..9699c6011b0e 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -642,7 +642,8 @@ void try_to_unmap_flush_dirty(void) #define TLB_FLUSH_BATCH_PENDING_LARGE \ (TLB_FLUSH_BATCH_PENDING_MASK / 2) -static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval) +static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval, + unsigned long uaddr) { struct tlbflush_unmap_batch *tlb_ubc = >tlb_ubc; int batch; @@ -651,7 +652,7 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval) if (!pte_accessible(mm, pteval)) return; - arch_tlbbatch_add_mm(_ubc->arch, mm); + arch_tlbbatch_add_pending(_ubc->arch, mm, uaddr); tlb_ubc->flush_required = true; /* @@ -726,7 +727,8 @@ void flush_tlb_batched_pending(struct mm_struct *mm) } } #else -static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval) +static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval, + unsigned long uaddr) { } @@ -1579,7 +1581,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, */ pteval = ptep_get_and_clear(mm, address, pvmw.pte); - set_tlb_ubc_flush_pending(mm, pteval); + set_tlb_ubc_flush_pending(mm, pteval, address); } else { pteval = ptep_clear_flush(vma, address, pvmw.pte); } @@ -1962,7 +1964,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, */ pteval = ptep_get_and_clear(mm, address, pvmw.pte); - set_tlb_ubc_flush_pending(mm, pteval); + set_tlb_ubc_flush_pending(mm, pteval, address); } else { pteval = 
ptep_clear_flush(vma, address, pvmw.pte); } -- 2.24.0
[PATCH v11 0/4] arm64: support batched/deferred tlb shootdown during page reclamation/migration
From: Yicong Yang Though ARM64 has the hardware to do tlb shootdown, the hardware broadcasting is not free. A simplest micro benchmark shows even on snapdragon 888 with only 8 cores, the overhead for ptep_clear_flush is huge even for paging out one page mapped by only one process: 5.36% a.out[kernel.kallsyms] [k] ptep_clear_flush While pages are mapped by multiple processes or HW has more CPUs, the cost should become even higher due to the bad scalability of tlb shootdown. The same benchmark can result in 16.99% CPU consumption on ARM64 server with around 100 cores according to the test on patch 4/4. This patchset leverages the existing BATCHED_UNMAP_TLB_FLUSH by 1. only send tlbi instructions in the first stage - arch_tlbbatch_add_mm() 2. wait for the completion of tlbi by dsb while doing tlbbatch sync in arch_tlbbatch_flush() Testing on snapdragon shows the overhead of ptep_clear_flush is removed by the patchset. The micro benchmark becomes 5% faster even for one page mapped by single process on snapdragon 888. Since BATCHED_UNMAP_TLB_FLUSH is implemented only on x86, the patchset does some renaming/extension for the current implementation first (Patch 1-3), then add the support on arm64 (Patch 4). -v11: - Enable ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH config unconditionally on arm64. Link: https://lore.kernel.org/linux-mm/20230710083914.18336-1-yangyic...@huawei.com/T/#mc343b7e7c4a090392ef43b620af85a3eea76abad -v10: 1. Enable BATCHED_UNMAP_TLB_FLUSH regardless of CPU numbers, per Catalin. 2. Split the renaming/extension works in a separate PATCH 2, per Catalin. Since it's split from PATCH 2/2 in v9, so inherit the tags. 3. Add arch_flush_tlb_batched_pending() to allow arch-specific implementation, per Catalin. Since it's some kind of an optimization on arm64 so a separate Patch 3/4. Link: https://lore.kernel.org/linux-mm/20230518065934.12877-1-yangyic...@huawei.com/ -v9: 1. Using a runtime tunable to control batched TLB flush, per Catalin in v7. 
Sorry for missing this on v8. Link: https://lore.kernel.org/all/20230329035512.57392-1-yangyic...@huawei.com/ -v8: 1. Rebase on 6.3-rc4 2. Tested the optimization on page migration and mentioned it in the commit 3. Thanks the review from Anshuman. Link: https://lore.kernel.org/linux-mm/20221117082648.47526-1-yangyic...@huawei.com/ -v7: 1. rename arch_tlbbatch_add_mm() to arch_tlbbatch_add_pending() as suggested, since it takes an extra address for arm64, per Nadav and Anshuman. Also mentioned in the commit. 2. add tags from Xin Hao, thanks. Link: https://lore.kernel.org/lkml/20221115031425.44640-1-yangyic...@huawei.com/ -v6: 1. comment we don't defer TLB flush on platforms affected by ARM64_WORKAROUND_REPEAT_TLBI 2. use cpus_have_const_cap() instead of this_cpu_has_cap() 3. add tags from Punit, Thanks. 4. default enable the feature when cpus >= 8 rather than > 8, since the original improvement is observed on snapdragon 888 with 8 cores. Link: https://lore.kernel.org/lkml/20221028081255.19157-1-yangyic...@huawei.com/ -v5: 1. Make ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH depends on EXPERT for this stage on arm64. 2. Make a threshold of CPU numbers for enabling batched TLP flush on arm64 Link: https://lore.kernel.org/linux-arm-kernel/20220921084302.43631-1-yangyic...@huawei.com/T/ -v4: 1. Add tags from Kefeng and Anshuman, Thanks. 2. Limit the TLB batch/defer on systems with >4 CPUs, per Anshuman 3. Merge previous Patch 1,2-3 into one, per Anshuman Link: https://lore.kernel.org/linux-mm/20220822082120.8347-1-yangyic...@huawei.com/ -v3: 1. Declare arch's tlbbatch defer support by arch_tlbbatch_should_defer() instead of ARCH_HAS_MM_CPUMASK, per Barry and Kefeng 2. Add Tested-by from Xin Hao Link: https://lore.kernel.org/linux-mm/20220711034615.482895-1-21cn...@gmail.com/ -v2: 1. Collected Yicong's test result on kunpeng920 ARM64 server; 2. Removed the redundant vma parameter in arch_tlbbatch_add_mm() according to the comments of Peter Zijlstra and Dave Hansen 3. 
Added ARCH_HAS_MM_CPUMASK rather than checking if mm_cpumask is empty according to the comments of Nadav Amit Thanks, Peter, Dave and Nadav for your testing or reviewing , and comments. -v1: https://lore.kernel.org/lkml/20220707125242.425242-1-21cn...@gmail.com/ Anshuman Khandual (1): mm/tlbbatch: Introduce arch_tlbbatch_should_defer() Barry Song (2): mm/tlbbatch: Rename and extend some functions arm64: support batched/deferred tlb shootdown during page reclamation/migration Yicong Yang (1): mm/tlbbatch: Introduce arch_flush_tlb_batched_pending() .../features/vm/TLB/arch-support.txt | 2 +- arch/arm64/Kconfig| 1 + arch/arm64/include/asm/tlbbatch.h | 12 + arch/arm64/include/asm/tlbflush.h | 44 +-- arch/x86/include/asm/tlbflush.h | 22 +- include/linux/mm_types_task.h | 4 +- mm/rmap.c
Re: [PATCH] ASoC: fsl_sai: Disable bit clock with transmitter
On Mon, Jul 17, 2023 at 9:55 AM Mark Brown wrote: > I'll just put a non-specific Cc stable tag on it, that should be enough > to get it backported. Sounds good. Thanks, Mark.
Re: [PATCH] ASoC: fsl_sai: Disable bit clock with transmitter
On Mon, Jul 17, 2023 at 09:31:38AM -0300, Fabio Estevam wrote: > On Wed, Jul 12, 2023 at 9:53 AM Matus Gajdos wrote: > > > > Otherwise bit clock remains running writing invalid data to the DAC. > > > > Signed-off-by: Matus Gajdos > > Should this contain a Fixes tag so that it could be backported to > stable kernels? I'll just put a non-specific Cc stable tag on it, that should be enough to get it backported. signature.asc Description: PGP signature
Re: [PATCH] ASoC: fsl_sai: Disable bit clock with transmitter
On Wed, Jul 12, 2023 at 9:53 AM Matus Gajdos wrote: > > Otherwise bit clock remains running writing invalid data to the DAC. > > Signed-off-by: Matus Gajdos Should this contain a Fixes tag so that it could be backported to stable kernels?
Re: [PATCH v10 4/4] arm64: support batched/deferred tlb shootdown during page reclamation/migration
On 2023/7/16 23:11, Catalin Marinas wrote: > On Mon, Jul 10, 2023 at 04:39:14PM +0800, Yicong Yang wrote: >> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig >> index 7856c3a3e35a..f0ce8208c57f 100644 >> --- a/arch/arm64/Kconfig >> +++ b/arch/arm64/Kconfig >> @@ -96,6 +96,7 @@ config ARM64 >> select ARCH_SUPPORTS_NUMA_BALANCING >> select ARCH_SUPPORTS_PAGE_TABLE_CHECK >> select ARCH_SUPPORTS_PER_VMA_LOCK >> +select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if EXPERT > > I don't want EXPERT to turn on a feature that's not selectable by the > user. This would lead to different performance behaviour based on > EXPERT. Just select it unconditionally. Got it. will drop it and address the comment below. Thanks. > >> diff --git a/arch/arm64/include/asm/tlbflush.h >> b/arch/arm64/include/asm/tlbflush.h >> index 412a3b9a3c25..4bb9cec62e26 100644 >> --- a/arch/arm64/include/asm/tlbflush.h >> +++ b/arch/arm64/include/asm/tlbflush.h >> @@ -254,17 +254,23 @@ static inline void flush_tlb_mm(struct mm_struct *mm) >> dsb(ish); >> } >> >> -static inline void flush_tlb_page_nosync(struct vm_area_struct *vma, >> - unsigned long uaddr) >> +static inline void __flush_tlb_page_nosync(struct mm_struct *mm, >> + unsigned long uaddr) >> { >> unsigned long addr; >> >> dsb(ishst); >> -addr = __TLBI_VADDR(uaddr, ASID(vma->vm_mm)); >> +addr = __TLBI_VADDR(uaddr, ASID(mm)); >> __tlbi(vale1is, addr); >> __tlbi_user(vale1is, addr); >> } >> >> +static inline void flush_tlb_page_nosync(struct vm_area_struct *vma, >> + unsigned long uaddr) >> +{ >> +return __flush_tlb_page_nosync(vma->vm_mm, uaddr); >> +} >> + >> static inline void flush_tlb_page(struct vm_area_struct *vma, >>unsigned long uaddr) >> { >> @@ -272,6 +278,42 @@ static inline void flush_tlb_page(struct vm_area_struct >> *vma, >> dsb(ish); >> } >> >> +#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH > > If it's selected unconditionally, we won't need this #ifdef here. 
> >> + >> +static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm) >> +{ >> +#ifdef CONFIG_ARM64_WORKAROUND_REPEAT_TLBI >> +/* >> + * TLB flush deferral is not required on systems, which are affected >> with > > "affected by" and drop the comma before "which". >
Re: [PATCH] ASoC: fsl_sai: Disable bit clock with transmitter
On Wed, Jul 12, 2023 at 8:53 PM Matus Gajdos wrote: > Otherwise bit clock remains running writing invalid data to the DAC. > > Signed-off-by: Matus Gajdos > Acked-by: Shengjiu Wang Best regards Wang Shengjiu > --- > sound/soc/fsl/fsl_sai.c | 2 +- > sound/soc/fsl/fsl_sai.h | 1 + > 2 files changed, 2 insertions(+), 1 deletion(-) > > diff --git a/sound/soc/fsl/fsl_sai.c b/sound/soc/fsl/fsl_sai.c > index 5e09f634c61b..dcc7fbe7acac 100644 > --- a/sound/soc/fsl/fsl_sai.c > +++ b/sound/soc/fsl/fsl_sai.c > @@ -719,7 +719,7 @@ static void fsl_sai_config_disable(struct fsl_sai > *sai, int dir) > u32 xcsr, count = 100; > > regmap_update_bits(sai->regmap, FSL_SAI_xCSR(tx, ofs), > - FSL_SAI_CSR_TERE, 0); > + FSL_SAI_CSR_TERE | FSL_SAI_CSR_BCE, 0); > > /* TERE will remain set till the end of current frame */ > do { > diff --git a/sound/soc/fsl/fsl_sai.h b/sound/soc/fsl/fsl_sai.h > index 8254c3547b87..550df87b6a06 100644 > --- a/sound/soc/fsl/fsl_sai.h > +++ b/sound/soc/fsl/fsl_sai.h > @@ -91,6 +91,7 @@ > /* SAI Transmit/Receive Control Register */ > #define FSL_SAI_CSR_TERE BIT(31) > #define FSL_SAI_CSR_SE BIT(30) > +#define FSL_SAI_CSR_BCEBIT(28) > #define FSL_SAI_CSR_FR BIT(25) > #define FSL_SAI_CSR_SR BIT(24) > #define FSL_SAI_CSR_xF_SHIFT 16 > -- > 2.25.1 > >
[PATCH v1] powerpc/pseries: use kfree_sensitive() in plpks_gen_password()
password might contain private information, so better use kfree_sensitive to free it. In plpks_gen_password() use kfree_sensitive(). Signed-off-by: Minjie Du --- arch/powerpc/platforms/pseries/plpks.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/pseries/plpks.c b/arch/powerpc/platforms/pseries/plpks.c index b0658ea3e..3441e616e 100644 --- a/arch/powerpc/platforms/pseries/plpks.c +++ b/arch/powerpc/platforms/pseries/plpks.c @@ -150,7 +150,7 @@ static int plpks_gen_password(void) ospasswordlength = maxpwsize; ospassword = kzalloc(maxpwsize, GFP_KERNEL); if (!ospassword) { - kfree(password); + kfree_sensitive(password); return -ENOMEM; } memcpy(ospassword, password, ospasswordlength); @@ -163,7 +163,7 @@ static int plpks_gen_password(void) } } out: - kfree(password); + kfree_sensitive(password); return pseries_status_to_err(rc); } -- 2.39.0
Re: [PATCH 2/2] ASoC: fsl_rpmsg: Add support for i.MX93 platform
On Fri, Jul 14, 2023 at 5:30 PM Chancel Liu wrote: > Add compatible string and specific soc data to support rpmsg sound card > on i.MX93 platform. > > Signed-off-by: Chancel Liu > Acked-by: Shengjiu Wang Best regards wang shengjiu > --- > sound/soc/fsl/fsl_rpmsg.c | 8 > 1 file changed, 8 insertions(+) > > diff --git a/sound/soc/fsl/fsl_rpmsg.c b/sound/soc/fsl/fsl_rpmsg.c > index 15b48b5ea856..abe19a8a7aa7 100644 > --- a/sound/soc/fsl/fsl_rpmsg.c > +++ b/sound/soc/fsl/fsl_rpmsg.c > @@ -170,12 +170,20 @@ static const struct fsl_rpmsg_soc_data imx8mp_data = > { >SNDRV_PCM_FMTBIT_S32_LE, > }; > > +static const struct fsl_rpmsg_soc_data imx93_data = { > + .rates = SNDRV_PCM_RATE_16000 | SNDRV_PCM_RATE_32000 | > +SNDRV_PCM_RATE_48000 | SNDRV_PCM_RATE_96000, > + .formats = SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S24_LE | > + SNDRV_PCM_FMTBIT_S32_LE, > +}; > + > static const struct of_device_id fsl_rpmsg_ids[] = { > { .compatible = "fsl,imx7ulp-rpmsg-audio", .data = _data}, > { .compatible = "fsl,imx8mm-rpmsg-audio", .data = _data}, > { .compatible = "fsl,imx8mn-rpmsg-audio", .data = _data}, > { .compatible = "fsl,imx8mp-rpmsg-audio", .data = _data}, > { .compatible = "fsl,imx8ulp-rpmsg-audio", .data = _data}, > + { .compatible = "fsl,imx93-rpmsg-audio", .data = _data}, > { /* sentinel */ } > }; > MODULE_DEVICE_TABLE(of, fsl_rpmsg_ids); > -- > 2.25.1 > >
[PATCH] powerpc/64: Enable accelerated crypto algorithms in defconfig
Enable all the acclerated crypto algorithms as modules in the 64-bit defconfig, to get more test coverage. Signed-off-by: Michael Ellerman --- arch/powerpc/configs/ppc64_defconfig | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig index 268fa361a06d..40a1f4a4274c 100644 --- a/arch/powerpc/configs/ppc64_defconfig +++ b/arch/powerpc/configs/ppc64_defconfig @@ -390,8 +390,11 @@ CONFIG_CRYPTO_SHA256=y CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_LZO=m CONFIG_CRYPTO_CRC32C_VPMSUM=m +CONFIG_CRYPTO_CRCT10DIF_VPMSUM=m +CONFIG_CRYPTO_VPMSUM_TESTER=m CONFIG_CRYPTO_MD5_PPC=m CONFIG_CRYPTO_SHA1_PPC=m +CONFIG_CRYPTO_AES_GCM_P10=m CONFIG_CRYPTO_DEV_NX=y CONFIG_CRYPTO_DEV_NX_ENCRYPT=m CONFIG_CRYPTO_DEV_VMX=y -- 2.41.0
Re: [PATCH] misc: Explicitly include correct DT includes
On Fri, 2023-07-14 at 11:47 -0600, Rob Herring wrote: > The DT of_device.h and of_platform.h date back to the separate > of_platform_bus_type before it as merged into the regular platform > bus. > As part of that merge prepping Arm DT support 13 years ago, they > "temporarily" include each other. They also include platform_device.h > and of.h. As a result, there's a pretty much random mix of those > include > files used throughout the tree. In order to detangle these headers > and > replace the implicit includes with struct declarations, users need to > explicitly include the correct includes. > > Signed-off-by: Rob Herring Acked-by: Andrew Donnellan # cxl -- Andrew DonnellanOzLabs, ADL Canberra a...@linux.ibm.com IBM Australia Limited
Fwd: [PATCH] drivers: macintosh: space required after that ','
This patch fixes the checkpatch.pl error: ./drivers/macintosh/adbhid.c:1091: ERROR: space required after that ',' (ctx:VxV) Signed-off-by: maqimei <2433033...@qq.com> --- drivers/macintosh/adbhid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/macintosh/adbhid.c b/drivers/macintosh/adbhid.c index b2fe7a3..293e72a 100644 --- a/drivers/macintosh/adbhid.c +++ b/drivers/macintosh/adbhid.c @@ -1088,7 +1088,7 @@ static void adbhid_input_unregister(int id) unsigned char r1_buffer[8]; adb_request(, NULL, ADBREQ_SYNC | ADBREQ_REPLY, 1, -ADB_READREG(id,1)); +ADB_READREG(id, 1)); if (req.reply_len < 8) pr_err("%s: bad length for reg. 1\n", __func__); else
[PATCH] powerpc: xmon: Remove space after '(' and before ')'
The patch fixes the following errors detected by checkpatch: ./arch/powerpc/xmon/xmon.c:2426: ERROR: space prohibited after that open parenthesis '(' ./arch/powerpc/xmon/xmon.c:2426: ERROR: space prohibited before that close parenthesis ')' ./arch/powerpc/xmon/xmon.c:2426: ERROR: space required before the open parenthesis '(' Signed-off-by: ztt <1549089...@qq.com> --- arch/powerpc/xmon/xmon.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 013b63eb4cd9..c10d9ff02af1 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -1057,7 +1057,7 @@ cmds(struct pt_regs *excp) flush_input(); termch = 0; cmd = skipbl(); -if(cmd == '\n' ) { +if (cmd == '\n') { if (last_cmd == NULL) continue; take_input(last_cmd); @@ -2423,7 +2423,7 @@ memex(void) } last_cmd = "m\n"; while ((cmd = skipbl()) != '\n') { -switch( cmd ){ +switch (cmd) { case 'b':size = 1;break; case 'w':size = 2;break; case 'l':size = 4;break;
[PATCH] drivers: macintosh: space required after that ','
This patch fixes the checkpatch.pl error: ./drivers/macintosh/adbhid.c:1091: ERROR: space required after that ',' (ctx:VxV) Signed-off-by: maqimei <2433033...@qq.com> --- drivers/macintosh/adbhid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/macintosh/adbhid.c b/drivers/macintosh/adbhid.c index b2fe7a3..293e72a 100644 --- a/drivers/macintosh/adbhid.c +++ b/drivers/macintosh/adbhid.c @@ -1088,7 +1088,7 @@ static void adbhid_input_unregister(int id) unsigned char r1_buffer[8]; adb_request(, NULL, ADBREQ_SYNC | ADBREQ_REPLY, 1, -ADB_READREG(id,1)); +ADB_READREG(id, 1)); if (req.reply_len < 8) pr_err("%s: bad length for reg. 1\n", __func__); else
[PATCH] powerpc: xmon: insert space before the open parenthesis '('
Fixes checkpatch error: ./arch/powerpc/xmon/xmon.c:1052: ERROR: space required before the open parenthesis '(' Signed-off-by: ztt <1549089...@qq.com> --- arch/powerpc/xmon/xmon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 70c4c59a1a8f..6a1a2f0b9084 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -1049,7 +1049,7 @@ cmds(struct pt_regs *excp) xmon_show_stack(excp->gpr[1], excp->link, excp->nip); -for(;;) { +for (;;) { #ifdef CONFIG_SMP printf("%x:", smp_processor_id()); #endif /* CONFIG_SMP */
[PATCH] drivers: macintosh: add spaces required around that ':' and '?'
This patch adds spaces required around that ':' and '?'. ./drivers/macintosh/macio-adb.c:143: ERROR: spaces required around that '?' (ctx:VxW) ./drivers/macintosh/macio-adb.c:143: ERROR: spaces required around that ':' (ctx:VxW) Signed-off-by: maqimei <2433033...@qq.com> --- drivers/macintosh/macio-adb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/macintosh/macio-adb.c b/drivers/macintosh/macio-adb.c index 55a9f8c..4de4883 100644 --- a/drivers/macintosh/macio-adb.c +++ b/drivers/macintosh/macio-adb.c @@ -140,7 +140,7 @@ static int macio_adb_autopoll(int devs) spin_lock_irqsave(_lock, flags); out_8(>active_hi.r, devs >> 8); out_8(>active_lo.r, devs); -out_8(>autopoll.r, devs? APE: 0); +out_8(>autopoll.r, devs ? APE : 0); spin_unlock_irqrestore(_lock, flags); return 0; }
Re: [PATCH 0/2] eventfd: simplify signal helpers
pt., 14 lip 2023 o 09:05 Christian Brauner napisał(a): > > On Thu, Jul 13, 2023 at 11:10:54AM -0600, Alex Williamson wrote: > > On Thu, 13 Jul 2023 12:05:36 +0200 > > Christian Brauner wrote: > > > > > Hey everyone, > > > > > > This simplifies the eventfd_signal() and eventfd_signal_mask() helpers > > > by removing the count argument which is effectively unused. > > > > We have a patch under review which does in fact make use of the > > signaling value: > > > > https://lore.kernel.org/all/20230630155936.3015595-1-...@semihalf.com/ > > Huh, thanks for the link. > > Quoting from > https://patchwork.kernel.org/project/kvm/patch/20230307220553.631069-1-...@semihalf.com/#25266856 > > > Reading an eventfd returns an 8-byte value, we generally only use it > > as a counter, but it's been discussed previously and IIRC, it's possible > > to use that value as a notification value. > > So the goal is to pipe a specific value through eventfd? But it is > explicitly a counter. The whole thing is written around a counter and > each write and signal adds to the counter. > > The consequences are pretty well described in the cover letter of > v6 https://lore.kernel.org/all/20230630155936.3015595-1-...@semihalf.com/ > > > Since the eventfd counter is used as ACPI notification value > > placeholder, the eventfd signaling needs to be serialized in order to > > not end up with notification values being coalesced. Therefore ACPI > > notification values are buffered and signalized one by one, when the > > previous notification value has been consumed. > > But isn't this a good indication that you really don't want an eventfd > but something that's explicitly designed to associate specific data with > a notification? Using eventfd in that manner requires serialization, > buffering, and enforces ordering. 
> > I have no skin in the game aside from having to drop this conversion > which I'm fine to do if there are actually users for this but really, > that looks a lot like abusing an api that really wasn't designed for > this. https://patchwork.kernel.org/project/kvm/patch/20230307220553.631069-1-...@semihalf.com/ was posted at the beginning of March and one of the main things we've discussed was the mechanism for propagating acpi notification value. We've ended up with eventfd as the best mechanism and have actually been using it from v2. I really do not want to waste this effort, I think we are quite advanced with v6 now. Additionally we didn't actually modify any part of eventfd support that was in place, we only used it in a specific (and discussed beforehand) way.
Re: [RFC 0/3] Asynchronous EEH recovery
On 6/13/23 8:06 AM, Oliver O'Halloran wrote: On Tue, Jun 13, 2023 at 11:44 AM Ganesh Goudar wrote: Hi, EEH recovery is currently serialized and these patches shorten the time taken for EEH recovery by making the recovery to run in parallel. The original author of these patches is Sam Bobroff, I have rebased and tested these patches. On powervm with 64 VFs from same PHB, I see approximately 48% reduction in time taken in EEH recovery. On powernv with 9 network cards, Where 2 cards installed on one PHB and 1 card on each of the rest of the PHBs, Providing 20 PFs in total. I see approximately 33% reduction in time taken in EEH recovery. These patches were originally posted as separate RFCs by Sam, And I rebased and posted these patches almost a year back, I stopped pursuing these patches as I was not able test this on powernv, Due to the issues in drivers of cards I was testing this on, Which are now resolved. Since I am re-posting this after long time, Posting this as a fresh RFC, Please comment. What changes have you made since the last time you posted this series? If the patches are the same then the comments I posted last time still apply. Hi Oliver, You asked about the way we are testing this on powervm, You expressed concerns about having this on powernv, suggested to have this feature just for powervm for now, and also expressed concerns on having two locks. On powervm using two port card we are instantiating 64 VFS, for an lpar and injecting the error on the bus from phyp, to observe the behavior. I was able to test this on powernv with 16 PFs from 8 cards installed on separate PHBs, Where I saw considerable performance improvement. Regarding two locks idea, I may not have tested it for all scenarios, So far I have not faced any issue, Are you suggesting a different approach. Thanks
Re: [PATCH] net: Explicitly include correct DT includes
On Fri Jul 14 2023, Rob Herring wrote: > The DT of_device.h and of_platform.h date back to the separate > of_platform_bus_type before it as merged into the regular platform bus. > As part of that merge prepping Arm DT support 13 years ago, they > "temporarily" include each other. They also include platform_device.h > and of.h. As a result, there's a pretty much random mix of those include > files used throughout the tree. In order to detangle these headers and > replace the implicit includes with struct declarations, users need to > explicitly include the correct includes. > > Signed-off-by: Rob Herring > --- [...] > drivers/net/dsa/hirschmann/hellcreek.c | 1 - > drivers/net/dsa/hirschmann/hellcreek_ptp.c | 1 + Acked-by: Kurt Kanzenbach # hellcreek signature.asc Description: PGP signature
Re: [PATCH] net: Explicitly include correct DT includes
On Friday 14 July 2023 19:48:00 CEST Rob Herring wrote: > > The DT of_device.h and of_platform.h date back to the separate > of_platform_bus_type before it as merged into the regular platform bus. > As part of that merge prepping Arm DT support 13 years ago, they > "temporarily" include each other. They also include platform_device.h > and of.h. As a result, there's a pretty much random mix of those include > files used throughout the tree. In order to detangle these headers and > replace the implicit includes with struct declarations, users need to > explicitly include the correct includes. > > Signed-off-by: Rob Herring > --- > drivers/net/can/bxcan.c | 1 - > drivers/net/can/ifi_canfd/ifi_canfd.c | 1 - > drivers/net/can/m_can/m_can.c | 1 - > drivers/net/can/m_can/m_can.h | 1 - > drivers/net/can/rcar/rcar_canfd.c | 1 - > drivers/net/can/sja1000/sja1000_platform.c | 1 - > drivers/net/can/sun4i_can.c | 1 - > drivers/net/can/ti_hecc.c | 1 - > drivers/net/dsa/b53/b53_mdio.c | 1 + > drivers/net/dsa/b53/b53_mmap.c | 1 + > drivers/net/dsa/hirschmann/hellcreek.c | 1 - > drivers/net/dsa/hirschmann/hellcreek_ptp.c | 1 + > drivers/net/dsa/lan9303-core.c | 1 + > drivers/net/dsa/microchip/ksz8863_smi.c | 3 +++ > drivers/net/dsa/microchip/ksz_common.c | 2 +- > drivers/net/dsa/mt7530-mmio.c | 3 ++- > drivers/net/dsa/mv88e6xxx/chip.c| 2 +- > drivers/net/dsa/ocelot/felix_vsc9959.c | 1 + > drivers/net/dsa/ocelot/seville_vsc9953.c| 3 ++- > drivers/net/dsa/qca/qca8k-leds.c| 1 + > drivers/net/dsa/realtek/realtek-mdio.c | 2 +- > drivers/net/dsa/realtek/realtek-smi.c | 1 - > drivers/net/dsa/sja1105/sja1105_main.c | 1 - > drivers/net/dsa/vitesse-vsc73xx-core.c | 1 - > drivers/net/dsa/xrs700x/xrs700x.c | 2 +- > drivers/net/ethernet/aeroflex/greth.c | 4 ++-- > drivers/net/ethernet/amd/sunlance.c | 2 +- > drivers/net/ethernet/apm/xgene-v2/main.h| 1 + > drivers/net/ethernet/arc/emac_main.c| 2 +- > drivers/net/ethernet/atheros/ag71xx.c | 3 ++- > drivers/net/ethernet/cadence/macb_main.c| 1 - > 
drivers/net/ethernet/cirrus/cs89x0.c| 1 - > drivers/net/ethernet/ezchip/nps_enet.c | 5 ++--- > drivers/net/ethernet/freescale/dpaa/dpaa_eth.c | 3 ++- > drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c | 2 ++ > drivers/net/ethernet/freescale/enetc/enetc_ierb.c | 2 +- > drivers/net/ethernet/freescale/fec_mpc52xx.c| 4 ++-- > drivers/net/ethernet/freescale/fec_mpc52xx_phy.c| 3 ++- > drivers/net/ethernet/freescale/fec_ptp.c| 1 - > drivers/net/ethernet/freescale/fman/fman.c | 1 + > drivers/net/ethernet/freescale/fman/fman_port.c | 1 + > drivers/net/ethernet/freescale/fman/mac.c | 2 ++ > drivers/net/ethernet/freescale/fs_enet/mac-fcc.c| 1 - > drivers/net/ethernet/freescale/fs_enet/mac-fec.c| 1 - > drivers/net/ethernet/freescale/fs_enet/mac-scc.c| 1 - > drivers/net/ethernet/freescale/fsl_pq_mdio.c| 1 + > drivers/net/ethernet/freescale/gianfar.c| 2 +- > drivers/net/ethernet/freescale/gianfar_ethtool.c| 2 ++ > drivers/net/ethernet/freescale/ucc_geth.c | 3 ++- > drivers/net/ethernet/freescale/xgmac_mdio.c | 4 ++-- > drivers/net/ethernet/hisilicon/hns/hns_dsaf_ppe.c | 3 --- > drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.c | 4 > drivers/net/ethernet/ibm/ehea/ehea_main.c | 1 + > drivers/net/ethernet/ibm/emac/core.c| 1 + > drivers/net/ethernet/ibm/emac/core.h| 1 - > drivers/net/ethernet/ibm/emac/mal.c | 2 ++ > drivers/net/ethernet/ibm/emac/rgmii.c | 2 ++ > drivers/net/ethernet/ibm/emac/tah.c | 2 ++ > drivers/net/ethernet/ibm/emac/zmii.c| 2 ++ > drivers/net/ethernet/korina.c | 2 +- > drivers/net/ethernet/marvell/mvmdio.c | 2 +- > drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 1 - > drivers/net/ethernet/marvell/prestera/prestera_rxtx.c | 3 --- > drivers/net/ethernet/marvell/sky2.c | 1 - > drivers/net/ethernet/mediatek/mtk_eth_soc.c | 3 ++- > drivers/net/ethernet/mediatek/mtk_star_emac.c | 1 - >
Re: [PATCH v2] KVM: ppc64: Enable ring-based dirty memory tracking on ppc64: enable config options and implement relevant functions
Kautuk Consul writes: > - Enable CONFIG_HAVE_KVM_DIRTY_RING_ACQ_REL as ppc64 is weakly > ordered. > - Enable CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP because the > kvmppc_xive_native_set_attr is called in the context of an ioctl > syscall and will call kvmppc_xive_native_eq_sync for setting the > KVM_DEV_XIVE_EQ_SYNC attribute which will call mark_dirty_page() > when there isn't a running vcpu. Implemented the > kvm_arch_allow_write_without_running_vcpu to always return true > to allow mark_page_dirty_in_slot to mark the page dirty in the > memslot->dirty_bitmap in this case. > - Set KVM_DIRTY_LOG_PAGE_OFFSET for the ring buffer's physical page > offset. > - Implement the kvm_arch_mmu_enable_log_dirty_pt_masked function required > for the generic KVM code to call. > - Add a check to kvmppc_vcpu_run_hv for checking whether the dirty > ring is soft full. > - Implement the kvm_arch_flush_remote_tlbs_memslot function to support > the CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT config option. > > Test Results > > On testing with live migration it was found that there is around > 150-180 ms improvement in overall migration time with this patch. 
> > Bare Metal P9 testing with patch: > > (qemu) info migrate > globals: > store-global-state: on > only-migratable: off > send-configuration: on > send-section-footer: on > decompress-error-check: on > clear-bitmap-shift: 18 > Migration status: completed > total time: 20694 ms > downtime: 73 ms > setup: 23 ms > transferred ram: 2604370 kbytes > throughput: 1033.55 mbps > remaining ram: 0 kbytes > total ram: 16777216 kbytes > duplicate: 3555398 pages > skipped: 0 pages > normal: 642026 pages > normal bytes: 2568104 kbytes > dirty sync count: 3 > page size: 4 kbytes > multifd bytes: 0 kbytes > pages-per-second: 32455 > precopy ram: 2581549 kbytes > downtime ram: 22820 kbytes > > Bare Metal P9 testing without patch: > --- > (qemu) info migrate > globals: > store-global-state: on > only-migratable: off > send-configuration: on > send-section-footer: on > decompress-error-check: on > clear-bitmap-shift: 18 > Migration status: completed > total time: 20873 ms > downtime: 62 ms > setup: 19 ms > transferred ram: 2612900 kbytes > throughput: 1027.83 mbps > remaining ram: 0 kbytes > total ram: 16777216 kbytes > duplicate: 3553329 pages > skipped: 0 pages > normal: 644159 pages > normal bytes: 2576636 kbytes > dirty sync count: 4 > page size: 4 kbytes > multifd bytes: 0 kbytes > pages-per-second: 88297 > precopy ram: 2603645 kbytes > downtime ram: 9254 kbytes > > Signed-off-by: Kautuk Consul > --- > Documentation/virt/kvm/api.rst | 2 +- > arch/powerpc/include/uapi/asm/kvm.h | 2 ++ > arch/powerpc/kvm/Kconfig| 2 ++ > arch/powerpc/kvm/book3s.c | 46 + > arch/powerpc/kvm/book3s_hv.c| 3 ++ > include/linux/kvm_dirty_ring.h | 5 > virt/kvm/dirty_ring.c | 1 + > 7 files changed, 60 insertions(+), 1 deletion(-) > > diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst > index c0ddd3035462..84c180ccd178 100644 > --- a/Documentation/virt/kvm/api.rst > +++ b/Documentation/virt/kvm/api.rst > @@ -8114,7 +8114,7 @@ regardless of what has actually been exposed through > 
the CPUID leaf. > 8.29 KVM_CAP_DIRTY_LOG_RING/KVM_CAP_DIRTY_LOG_RING_ACQ_REL > -- > > -:Architectures: x86, arm64 > +:Architectures: x86, arm64, ppc64 > :Parameters: args[0] - size of the dirty log ring > > KVM is capable of tracking dirty memory using ring buffers that are > diff --git a/arch/powerpc/include/uapi/asm/kvm.h > b/arch/powerpc/include/uapi/asm/kvm.h > index 9f18fa090f1f..f722309ed7fb 100644 > --- a/arch/powerpc/include/uapi/asm/kvm.h > +++ b/arch/powerpc/include/uapi/asm/kvm.h > @@ -33,6 +33,8 @@ > /* Not always available, but if it is, this is the correct offset. */ > #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 > > +#define KVM_DIRTY_LOG_PAGE_OFFSET 64 > + > struct kvm_regs { > __u64 pc; > __u64 cr; > diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig > index 902611954200..c93354ec3bd5 100644 > --- a/arch/powerpc/kvm/Kconfig > +++ b/arch/powerpc/kvm/Kconfig > @@ -26,6 +26,8 @@ config KVM > select IRQ_BYPASS_MANAGER > select HAVE_KVM_IRQ_BYPASS > select INTERVAL_TREE > + select HAVE_KVM_DIRTY_RING_ACQ_REL > + select NEED_KVM_DIRTY_RING_WITH_BITMAP > > config KVM_BOOK3S_HANDLER > bool > diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c > index 686d8d9eda3e..01aa4fe2c424 100644 > --- a/arch/powerpc/kvm/book3s.c > +++ b/arch/powerpc/kvm/book3s.c > @@ -32,6 +32,7 @@ > #include > #include > #include > +#include > > #include "book3s.h" > #include "trace.h" > @@ -1070,6 +1071,51 @@ int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned > irqchip, unsigned
[PATCH v2] KVM: ppc64: Enable ring-based dirty memory tracking on ppc64: enable config options and implement relevant functions
- Enable CONFIG_HAVE_KVM_DIRTY_RING_ACQ_REL as ppc64 is weakly ordered. - Enable CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP because the kvmppc_xive_native_set_attr is called in the context of an ioctl syscall and will call kvmppc_xive_native_eq_sync for setting the KVM_DEV_XIVE_EQ_SYNC attribute which will call mark_dirty_page() when there isn't a running vcpu. Implemented the kvm_arch_allow_write_without_running_vcpu to always return true to allow mark_page_dirty_in_slot to mark the page dirty in the memslot->dirty_bitmap in this case. - Set KVM_DIRTY_LOG_PAGE_OFFSET for the ring buffer's physical page offset. - Implement the kvm_arch_mmu_enable_log_dirty_pt_masked function required for the generic KVM code to call. - Add a check to kvmppc_vcpu_run_hv for checking whether the dirty ring is soft full. - Implement the kvm_arch_flush_remote_tlbs_memslot function to support the CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT config option. Test Results On testing with live migration it was found that there is around 150-180 ms improvement in overall migration time with this patch. 
Bare Metal P9 testing with patch: (qemu) info migrate globals: store-global-state: on only-migratable: off send-configuration: on send-section-footer: on decompress-error-check: on clear-bitmap-shift: 18 Migration status: completed total time: 20694 ms downtime: 73 ms setup: 23 ms transferred ram: 2604370 kbytes throughput: 1033.55 mbps remaining ram: 0 kbytes total ram: 16777216 kbytes duplicate: 3555398 pages skipped: 0 pages normal: 642026 pages normal bytes: 2568104 kbytes dirty sync count: 3 page size: 4 kbytes multifd bytes: 0 kbytes pages-per-second: 32455 precopy ram: 2581549 kbytes downtime ram: 22820 kbytes Bare Metal P9 testing without patch: --- (qemu) info migrate globals: store-global-state: on only-migratable: off send-configuration: on send-section-footer: on decompress-error-check: on clear-bitmap-shift: 18 Migration status: completed total time: 20873 ms downtime: 62 ms setup: 19 ms transferred ram: 2612900 kbytes throughput: 1027.83 mbps remaining ram: 0 kbytes total ram: 16777216 kbytes duplicate: 3553329 pages skipped: 0 pages normal: 644159 pages normal bytes: 2576636 kbytes dirty sync count: 4 page size: 4 kbytes multifd bytes: 0 kbytes pages-per-second: 88297 precopy ram: 2603645 kbytes downtime ram: 9254 kbytes Signed-off-by: Kautuk Consul --- Documentation/virt/kvm/api.rst | 2 +- arch/powerpc/include/uapi/asm/kvm.h | 2 ++ arch/powerpc/kvm/Kconfig| 2 ++ arch/powerpc/kvm/book3s.c | 46 + arch/powerpc/kvm/book3s_hv.c| 3 ++ include/linux/kvm_dirty_ring.h | 5 virt/kvm/dirty_ring.c | 1 + 7 files changed, 60 insertions(+), 1 deletion(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index c0ddd3035462..84c180ccd178 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -8114,7 +8114,7 @@ regardless of what has actually been exposed through the CPUID leaf. 
8.29 KVM_CAP_DIRTY_LOG_RING/KVM_CAP_DIRTY_LOG_RING_ACQ_REL -- -:Architectures: x86, arm64 +:Architectures: x86, arm64, ppc64 :Parameters: args[0] - size of the dirty log ring KVM is capable of tracking dirty memory using ring buffers that are diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 9f18fa090f1f..f722309ed7fb 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -33,6 +33,8 @@ /* Not always available, but if it is, this is the correct offset. */ #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 +#define KVM_DIRTY_LOG_PAGE_OFFSET 64 + struct kvm_regs { __u64 pc; __u64 cr; diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 902611954200..c93354ec3bd5 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -26,6 +26,8 @@ config KVM select IRQ_BYPASS_MANAGER select HAVE_KVM_IRQ_BYPASS select INTERVAL_TREE + select HAVE_KVM_DIRTY_RING_ACQ_REL + select NEED_KVM_DIRTY_RING_WITH_BITMAP config KVM_BOOK3S_HANDLER bool diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 686d8d9eda3e..01aa4fe2c424 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -32,6 +32,7 @@ #include #include #include +#include #include "book3s.h" #include "trace.h" @@ -1070,6 +1071,51 @@ int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin) #endif /* CONFIG_KVM_XICS */ +/* + * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected + * dirty pages. + * + * It write protects selected pages to enable dirty logging for them. + */ +void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, +