Re: [PATCH 2/2] PCI: layerscape: Add the workaround for lost link capabilities during reset

2023-07-17 Thread Manivannan Sadhasivam
On Mon, Jul 17, 2023 at 02:45:23PM -0400, Frank Li wrote:
> On Mon, Jul 17, 2023 at 09:29:10PM +0530, Manivannan Sadhasivam wrote:
> > On Thu, Jun 15, 2023 at 12:41:12PM -0400, Frank Li wrote:
> > > From: Xiaowei Bao 
> > > 
> > > A workaround for the issue where the PCI Express Endpoint (EP) controller
> > > loses the values of the Maximum Link Width and Supported Link Speed from
> > > the Link Capabilities Register, which were initially configured by the Reset
> > > Configuration Word (RCW) during a link-down or hot reset event.
> > > 
> > 
> > If this fixes an issue, then there should be a Fixes tag.
> 
> It does not fix an existing software issue, it just works around a hardware erratum.
> 

But the hardware errata is there from the start, right? So technically this
driver doesn't address that so far and so this patch looks like a fix to me.

Plus adding a fixes tag and CCing stable list will allow this patch to be
backported to stable kernels.

- Mani

> > 
> > > Signed-off-by: Xiaowei Bao 
> > > Signed-off-by: Hou Zhiqiang 
> > > Signed-off-by: Frank Li 
> > > ---
> > >  drivers/pci/controller/dwc/pci-layerscape-ep.c | 13 +
> > >  1 file changed, 13 insertions(+)
> > > 
> > > diff --git a/drivers/pci/controller/dwc/pci-layerscape-ep.c 
> > > b/drivers/pci/controller/dwc/pci-layerscape-ep.c
> > > index 4e4fdd1dfea7..2ef02d827eeb 100644
> > > --- a/drivers/pci/controller/dwc/pci-layerscape-ep.c
> > > +++ b/drivers/pci/controller/dwc/pci-layerscape-ep.c
> > > @@ -45,6 +45,7 @@ struct ls_pcie_ep {
> > >   struct pci_epc_features *ls_epc;
> > >   const struct ls_pcie_ep_drvdata *drvdata;
> > >   int irq;
> > > + u32 lnkcap;
> > >   boolbig_endian;
> > >  };
> > >  
> > > @@ -73,6 +74,7 @@ static irqreturn_t ls_pcie_ep_event_handler(int irq, 
> > > void *dev_id)
> > >   struct ls_pcie_ep *pcie = dev_id;
> > >   struct dw_pcie *pci = pcie->pci;
> > >   u32 val, cfg;
> > > + u8 offset;
> > >  
> > >   val = ls_lut_readl(pcie, PEX_PF0_PME_MES_DR);
> > >   ls_lut_writel(pcie, PEX_PF0_PME_MES_DR, val);
> > > @@ -81,6 +83,13 @@ static irqreturn_t ls_pcie_ep_event_handler(int irq, 
> > > void *dev_id)
> > >   return IRQ_NONE;
> > >  
> > >   if (val & PEX_PF0_PME_MES_DR_LUD) {
> > > +
> > 
> > Please add a comment on why the LNKCAP is being restored here.
> > 
> > > + offset = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP);
> > > +
> > > + dw_pcie_dbi_ro_wr_en(pci);
> > > + dw_pcie_writew_dbi(pci, offset + PCI_EXP_LNKCAP, pcie->lnkcap);
> > 
> > lnkcap is a 32-bit variable, so you should use dw_pcie_writel_dbi().
> > 
> > - Mani
> > 
> > > + dw_pcie_dbi_ro_wr_dis(pci);
> > > +
> > >   cfg = ls_lut_readl(pcie, PEX_PF0_CONFIG);
> > >   cfg |= PEX_PF0_CFG_READY;
> > >   ls_lut_writel(pcie, PEX_PF0_CONFIG, cfg);
> > > @@ -216,6 +225,7 @@ static int __init ls_pcie_ep_probe(struct 
> > > platform_device *pdev)
> > >   struct ls_pcie_ep *pcie;
> > >   struct pci_epc_features *ls_epc;
> > >   struct resource *dbi_base;
> > > + u8 offset;
> > >   int ret;
> > >  
> > >   pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL);
> > > @@ -252,6 +262,9 @@ static int __init ls_pcie_ep_probe(struct 
> > > platform_device *pdev)
> > >  
> > >   platform_set_drvdata(pdev, pcie);
> > >  
> > > + offset = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP);
> > > + pcie->lnkcap = dw_pcie_readl_dbi(pci, offset + PCI_EXP_LNKCAP);
> > > +
> > >   ret = dw_pcie_ep_init(&pcie->ep);
> > >   if (ret)
> > >   return ret;
> > > -- 
> > > 2.34.1
> > > 
> > 
> > -- 
> > மணிவண்ணன் சதாசிவம்

-- 
மணிவண்ணன் சதாசிவம்


Re: linux-next: Tree for Jul 13 (drivers/video/fbdev/ps3fb.c)

2023-07-17 Thread Bagas Sanjaya
On Thu, Jul 13, 2023 at 09:11:10AM -0700, Randy Dunlap wrote:
> 
> 
> On 7/12/23 19:37, Stephen Rothwell wrote:
> > Hi all,
> > 
> > Changes since 20230712:
> > 
> 
> on ppc64:
> 
> In file included from ../include/linux/device.h:15,
>  from ../arch/powerpc/include/asm/io.h:22,
>  from ../include/linux/io.h:13,
>  from ../include/linux/irq.h:20,
>  from ../arch/powerpc/include/asm/hardirq.h:6,
>  from ../include/linux/hardirq.h:11,
>  from ../include/linux/interrupt.h:11,
>  from ../drivers/video/fbdev/ps3fb.c:25:
> ../drivers/video/fbdev/ps3fb.c: In function 'ps3fb_probe':
> ../drivers/video/fbdev/ps3fb.c:1172:40: error: 'struct fb_info' has no member 
> named 'dev'
>  1172 |  dev_driver_string(info->dev), dev_name(info->dev),
>   |^~
> ../include/linux/dev_printk.h:110:37: note: in definition of macro 
> 'dev_printk_index_wrap'
>   110 | _p_func(dev, fmt, ##__VA_ARGS__); 
>   \
>   | ^~~
> ../drivers/video/fbdev/ps3fb.c:1171:9: note: in expansion of macro 'dev_info'
>  1171 | dev_info(info->device, "%s %s, using %u KiB of video 
> memory\n",
>   | ^~~~
> ../drivers/video/fbdev/ps3fb.c:1172:61: error: 'struct fb_info' has no member 
> named 'dev'
>  1172 |  dev_driver_string(info->dev), dev_name(info->dev),
>   | ^~
> ../include/linux/dev_printk.h:110:37: note: in definition of macro 
> 'dev_printk_index_wrap'
>   110 | _p_func(dev, fmt, ##__VA_ARGS__); 
>   \
>   | ^~~
> ../drivers/video/fbdev/ps3fb.c:1171:9: note: in expansion of macro 'dev_info'
>  1171 | dev_info(info->device, "%s %s, using %u KiB of video 
> memory\n",
>   | ^~~~
> 
> 

Hmm, there is no response from Thomas yet. I guess we should go with
reverting bdb616479eff419, right? Regardless, I'm adding this build regression
to regzbot so that parties involved are aware of it:

#regzbot ^introduced: bdb616479eff419
#regzbot title: build regression in PS3 framebuffer

Thanks.

-- 
An old man doll... just what I always wanted! - Clara


[powerpc:fixes-test] BUILD SUCCESS ccb381e1af1ace292153c88eb1fffa5683d16a20

2023-07-17 Thread kernel test robot
tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git 
fixes-test
branch HEAD: ccb381e1af1ace292153c88eb1fffa5683d16a20  powerpc/kasan: Disable 
KCOV in KASAN code

elapsed time: 796m

configs tested: 55
configs skipped: 145

The following configs have been built successfully.
More configs may be tested in the coming days.

tested configs:
arc  allyesconfig   gcc  
arc  axs101_defconfig   gcc  
archsdk_defconfig   gcc  
arm  allmodconfig   gcc  
arm  allyesconfig   gcc  
arm defconfig   gcc  
arm  pxa910_defconfig   gcc  
armrealview_defconfig   gcc  
arm64allyesconfig   gcc  
arm64   defconfig   gcc  
m68k allmodconfig   gcc  
m68k allyesconfig   gcc  
m68kdefconfig   gcc  
m68km5272c3_defconfig   gcc  
m68kstmark2_defconfig   gcc  
mips allmodconfig   gcc  
mips  fuloong2e_defconfig   gcc  
mipsmalta_qemu_32r6_defconfig   clang
mipsqi_lb60_defconfig   clang
powerpc  allmodconfig   gcc  
powerpc   allnoconfig   gcc  
powerpc asp8347_defconfig   gcc  
powerpc   eiger_defconfig   gcc  
powerpc  katmai_defconfig   clang
powerpc   maple_defconfig   gcc  
powerpc  ppc40x_defconfig   gcc  
powerpc ps3_defconfig   gcc  
powerpc  randconfig-r026-20230717   gcc  
riscvallmodconfig   gcc  
riscv allnoconfig   gcc  
riscvallyesconfig   gcc  
riscv   defconfig   gcc  
riscv  rv32_defconfig   gcc  
sh  kfr2r09_defconfig   gcc  
sh  lboxre2_defconfig   gcc  
sh   rts7751r2dplus_defconfig   gcc  
sh  sdk7786_defconfig   gcc  
sh   sh7724_generic_defconfig   gcc  
um   allmodconfig   clang
umallnoconfig   clang
um   allyesconfig   clang
x86_64   allyesconfig   gcc  
x86_64  defconfig   gcc  
x86_64   randconfig-x002-20230717   gcc  
x86_64   randconfig-x003-20230717   gcc  
x86_64   randconfig-x004-20230717   gcc  
x86_64   randconfig-x005-20230717   gcc  
x86_64   randconfig-x006-20230717   gcc  
x86_64   randconfig-x011-20230717   clang
x86_64   randconfig-x012-20230717   clang
x86_64   randconfig-x013-20230717   clang
x86_64   randconfig-x014-20230717   clang
x86_64   randconfig-x015-20230717   clang
x86_64   randconfig-x016-20230717   clang
x86_64   rhel-8.3   gcc  

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki


[powerpc:merge] BUILD SUCCESS 7c5878b16f9cd959e232169b967be5b2a0897afa

2023-07-17 Thread kernel test robot
tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git 
merge
branch HEAD: 7c5878b16f9cd959e232169b967be5b2a0897afa  Automatic merge of 
'master' into merge (2023-07-17 08:46)

elapsed time: 1545m

configs tested: 175
configs skipped: 4

The following configs have been built successfully.
More configs may be tested in the coming days.

tested configs:
alphaallyesconfig   gcc  
alpha   defconfig   gcc  
arc  allyesconfig   gcc  
arc  axs101_defconfig   gcc  
arc defconfig   gcc  
archsdk_defconfig   gcc  
arc  randconfig-r012-20230717   gcc  
arc  randconfig-r043-20230717   gcc  
arm  allmodconfig   gcc  
arm  allyesconfig   gcc  
arm   aspeed_g5_defconfig   gcc  
arm defconfig   gcc  
arm  ep93xx_defconfig   clang
arm orion5x_defconfig   clang
arm  pxa910_defconfig   gcc  
arm  randconfig-r006-20230717   gcc  
arm  randconfig-r046-20230717   clang
armrealview_defconfig   gcc  
arm   sama5_defconfig   gcc  
arm64allyesconfig   gcc  
arm64   defconfig   gcc  
arm64randconfig-r001-20230717   clang
arm64randconfig-r002-20230717   clang
arm64randconfig-r005-20230717   clang
arm64randconfig-r014-20230717   gcc  
arm64randconfig-r034-20230717   clang
arm64randconfig-r035-20230717   clang
cskydefconfig   gcc  
csky randconfig-r004-20230717   gcc  
csky randconfig-r013-20230717   gcc  
csky randconfig-r023-20230717   gcc  
hexagon  randconfig-r041-20230717   clang
hexagon  randconfig-r045-20230717   clang
i386 allyesconfig   gcc  
i386 buildonly-randconfig-r004-20230717   clang
i386 buildonly-randconfig-r005-20230717   clang
i386 buildonly-randconfig-r006-20230717   clang
i386  debian-10.3   gcc  
i386defconfig   gcc  
i386 randconfig-i001-20230717   clang
i386 randconfig-i002-20230717   clang
i386 randconfig-i003-20230717   clang
i386 randconfig-i004-20230717   clang
i386 randconfig-i005-20230717   clang
i386 randconfig-i006-20230717   clang
i386 randconfig-i011-20230717   gcc  
i386 randconfig-i012-20230717   gcc  
i386 randconfig-i013-20230717   gcc  
i386 randconfig-i014-20230717   gcc  
i386 randconfig-i015-20230717   gcc  
i386 randconfig-i016-20230717   gcc  
i386 randconfig-r016-20230717   gcc  
i386 randconfig-r025-20230717   gcc  
i386 randconfig-r034-20230717   clang
i386 randconfig-r035-20230717   clang
loongarchallmodconfig   gcc  
loongarch allnoconfig   gcc  
loongarch   defconfig   gcc  
loongarchrandconfig-r006-20230717   gcc  
loongarchrandconfig-r036-20230717   gcc  
m68k allmodconfig   gcc  
m68k allyesconfig   gcc  
m68kdefconfig   gcc  
m68km5272c3_defconfig   gcc  
m68k randconfig-r015-20230717   gcc  
m68kstmark2_defconfig   gcc  
mips allmodconfig   gcc  
mips allyesconfig   gcc  
mipsbcm47xx_defconfig   gcc  
mips  fuloong2e_defconfig   gcc  
mips  malta_defconfig   clang
mipsmalta_qemu_32r6_defconfig   clang
mips  maltasmvp_eva_defconfig   gcc  
mipsomega2p_defconfig   clang
mipsqi_lb60_defconfig   clang
mips randconfig-r021-20230717   clang
mips randconfig-r033-20230717   gcc  
nios2   defconfig   gcc  
openriscdefconfig   gcc  
openrisc randconfig-r005-20230717   gcc  
openrisc randconfig-r012-20230717   gcc  
openrisc randconfig-r021-20230717   gcc  
openrisc randconfig-r024-20230717   gcc  
openrisc randconfig-r031-20230717   gcc  
openrisc randconfig-r032-20230717   gcc  
parisc

[powerpc:next-test] BUILD SUCCESS b059dfc41139ee194c9127b89dbea02afa409443

2023-07-17 Thread kernel test robot
tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git 
next-test
branch HEAD: b059dfc41139ee194c9127b89dbea02afa409443  powerpc/64: Enable 
accelerated crypto algorithms in defconfig

elapsed time: 796m

configs tested: 38
configs skipped: 145

The following configs have been built successfully.
More configs may be tested in the coming days.

tested configs:
arc  allyesconfig   gcc  
arm  pxa910_defconfig   gcc  
armrealview_defconfig   gcc  
m68k allmodconfig   gcc  
m68k allyesconfig   gcc  
m68kdefconfig   gcc  
m68km5272c3_defconfig   gcc  
m68kstmark2_defconfig   gcc  
mips allmodconfig   gcc  
powerpc akebono_defconfig   clang
powerpc  allmodconfig   gcc  
powerpc   allnoconfig   gcc  
powerpc asp8347_defconfig   gcc  
powerpc   eiger_defconfig   gcc  
powerpc   maple_defconfig   gcc  
powerpc  ppc40x_defconfig   gcc  
powerpc ps3_defconfig   gcc  
powerpc  randconfig-r023-20230717   gcc  
powerpc  randconfig-r026-20230717   gcc  
sh  lboxre2_defconfig   gcc  
sh   rts7751r2dplus_defconfig   gcc  
sh  sdk7786_defconfig   gcc  
sh   sh7724_generic_defconfig   gcc  
um   allmodconfig   clang
umallnoconfig   clang
um   allyesconfig   clang
x86_64   allyesconfig   gcc  
x86_64   randconfig-x002-20230717   gcc  
x86_64   randconfig-x003-20230717   gcc  
x86_64   randconfig-x004-20230717   gcc  
x86_64   randconfig-x005-20230717   gcc  
x86_64   randconfig-x006-20230717   gcc  
x86_64   randconfig-x011-20230717   clang
x86_64   randconfig-x012-20230717   clang
x86_64   randconfig-x013-20230717   clang
x86_64   randconfig-x014-20230717   clang
x86_64   randconfig-x015-20230717   clang
x86_64   randconfig-x016-20230717   clang

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki


Re: [PATCH 4/4] powerpc/64s/radix: combine final TLB flush and lazy tlb mm shootdown IPIs

2023-07-17 Thread Michael Ellerman
Nicholas Piggin  writes:
> This performs lazy tlb mm shootdown when doing the exit TLB flush when
> all mm users go away and user mappings are removed, which avoids having
> to do the lazy tlb mm shootdown IPIs on the final mmput when all kernel
> references disappear.
>
> powerpc/64s uses a broadcast TLBIE for the exit TLB flush if remote CPUs
> need to be invalidated (unless TLBIE is disabled), so this doesn't
> necessarily save IPIs but it does avoid a broadcast TLBIE which is quite
> expensive.
>
> Signed-off-by: Nicholas Piggin 
> ---
>  arch/powerpc/mm/book3s64/radix_tlb.c | 26 +-
>  1 file changed, 25 insertions(+), 1 deletion(-)

This gives me:

[1.438910][T1] Run /init as init process
[1.442759][   T96] [ cut here ]
[1.442836][   T96] WARNING: CPU: 0 PID: 96 at kernel/smp.c:748 
smp_call_function_many_cond+0xe0/0xad0
[1.442920][   T96] Modules linked in:
[1.442960][   T96] CPU: 0 PID: 96 Comm: init Not tainted 
6.5.0-rc2-g1954d181ea09 #168
[1.443028][   T96] Hardware name: IBM pSeries (emulated by qemu) POWER9 
(raw) 0x4e1202 0xf05 of:SLOF,git-6b6c16 hv:linux,kvm pSeries
[1.443126][   T96] NIP:  c02aab20 LR: c00a5fc4 CTR: 

[1.443199][   T96] REGS: cc36f5b0 TRAP: 0700   Not tainted  
(6.5.0-rc2-g1954d181ea09)
[1.443280][   T96] MSR:  80029033   CR: 
44008244  XER: 2004
[1.443382][   T96] CFAR: c02ab524 IRQMASK: 0
[1.443382][   T96] GPR00: c00a5fc4 cc36f850 
c17f9000 c617c580
[1.443382][   T96] GPR04: c00a55b0 c617bd00 
0001 0001
[1.443382][   T96] GPR08: c29fc88c cc25aa00 
 44008244
[1.443382][   T96] GPR12: fd78 c36c 
 c4042a00
[1.443382][   T96] GPR16: 0001  
 
[1.443382][   T96] GPR20:  c00a5fc4 
 c29f85d0
[1.443382][   T96] GPR24: cc25b518  
c617be60 c617bd00
[1.443382][   T96] GPR28: c617c580 c00a55b0 
 
[1.443994][   T96] NIP [c02aab20] 
smp_call_function_many_cond+0xe0/0xad0
[1.444069][   T96] LR [c00a5fc4] radix__tlb_flush+0xf4/0x190
[1.444133][   T96] Call Trace:
[1.444172][   T96] [cc36f850] [] 0x 
(unreliable)
[1.444250][   T96] [cc36f920] [c29f7fe0] 
__cpu_possible_mask+0x0/0x100
[1.444326][   T96] [cc36f950] [c04f346c] 
tlb_finish_mmu+0x16c/0x220
[1.02][   T96] [cc36f980] [c04ee894] 
exit_mmap+0x1b4/0x580
[1.74][   T96] [cc36faa0] [c014c140] __mmput+0x60/0x1c0
[1.444546][   T96] [cc36fae0] [c05cf014] 
begin_new_exec+0x5d4/0xec0
[1.444622][   T96] [cc36fb60] [c066c6e8] 
load_elf_binary+0x4a8/0x1cf0
[1.444697][   T96] [cc36fc60] [c05cc410] 
bprm_execve+0x3b0/0xa60
[1.444773][   T96] [cc36fd30] [c05ce3a0] 
do_execveat_common+0x1d0/0x300
[1.444852][   T96] [cc36fde0] [c05ce524] 
sys_execve+0x54/0x70
[1.444928][   T96] [cc36fe10] [c0031c24] 
system_call_exception+0x134/0x360
[1.445000][   T96] [cc36fe50] [c000d6a0] 
system_call_common+0x160/0x2c4
[1.445070][   T96] --- interrupt: c00 at 0x7fffb664cc98
[1.445119][   T96] NIP:  7fffb664cc98 LR: 1004bcb0 CTR: 

[1.445189][   T96] REGS: cc36fe80 TRAP: 0c00   Not tainted  
(6.5.0-rc2-g1954d181ea09)
[1.445271][   T96] MSR:  8280f033 
  CR: 22004842  XER: 
[1.445390][   T96] IRQMASK: 0
[1.445390][   T96] GPR00: 000b 7fffd9d11ec0 
7fffb6767300 2b3f06e8
[1.445390][   T96] GPR04: 2b3f0780 2b3f07b0 
 
[1.445390][   T96] GPR08: 2b3f06e8  
 
[1.445390][   T96] GPR12:  7fffb683a930 
100f0ff8 
[1.445390][   T96] GPR16:  7fffd9d12020 
2b3f0780 
[1.445390][   T96] GPR20: 2b3f0778 2b3f1330 
 100c6cb0
[1.445390][   T96] GPR24:   
 
[1.445390][   T96] GPR28: 100d34ae 100c6cf8 
2b3f0780 2b3f06e8
[1.446042][   T96] NIP [7fffb664cc98] 0x7fffb664cc98
[1.446095][   T96] LR [1004bcb0] 0x1004bcb0
[1.446147][   T96] --- interrupt: c00
[1.446186][   T96] Code: 8149 394a0001 9149 e8ed0030 3d420097 
394ae900 7cea382e 8149 2c07 394a 9149 

[PATCH v4 6/6] mm/hotplug: Embed vmem_altmap details in memory block

2023-07-17 Thread Aneesh Kumar K.V
With memmap on memory, some architectures need more details w.r.t altmap
such as base_pfn, end_pfn, etc to unmap vmemmap memory. Instead of
computing them again when we remove a memory block embed vmem_altmap
details in struct memory_block if we are using memmap on memory block
feature.

No functional change in this patch

Signed-off-by: Aneesh Kumar K.V 
---
 drivers/base/memory.c  | 32 +++-
 include/linux/memory.h |  8 ++--
 mm/memory_hotplug.c| 38 ++
 3 files changed, 43 insertions(+), 35 deletions(-)

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index b456ac213610..cef6506f0209 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -106,6 +106,7 @@ static void memory_block_release(struct device *dev)
 {
struct memory_block *mem = to_memory_block(dev);
 
+   kfree(mem->altmap);
kfree(mem);
 }
 
@@ -183,7 +184,7 @@ static int memory_block_online(struct memory_block *mem)
 {
unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
-   unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
+   unsigned long nr_vmemmap_pages = 0;
struct zone *zone;
int ret;
 
@@ -200,6 +201,9 @@ static int memory_block_online(struct memory_block *mem)
 * stage helps to keep accounting easier to follow - e.g vmemmaps
 * belong to the same zone as the memory they backed.
 */
+   if (mem->altmap)
+   nr_vmemmap_pages = mem->altmap->alloc + mem->altmap->reserve;
+
if (nr_vmemmap_pages) {
ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, 
zone);
if (ret)
@@ -230,7 +234,7 @@ static int memory_block_offline(struct memory_block *mem)
 {
unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
-   unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
+   unsigned long nr_vmemmap_pages = 0;
int ret;
 
if (!mem->zone)
@@ -240,6 +244,9 @@ static int memory_block_offline(struct memory_block *mem)
 * Unaccount before offlining, such that unpopulated zone and kthreads
 * can properly be torn down in offline_pages().
 */
+   if (mem->altmap)
+   nr_vmemmap_pages = mem->altmap->alloc + mem->altmap->reserve;
+
if (nr_vmemmap_pages)
adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
  -nr_vmemmap_pages);
@@ -726,7 +733,7 @@ void memory_block_add_nid(struct memory_block *mem, int nid,
 #endif
 
 static int add_memory_block(unsigned long block_id, unsigned long state,
-   unsigned long nr_vmemmap_pages,
+   struct vmem_altmap *altmap,
struct memory_group *group)
 {
struct memory_block *mem;
@@ -744,7 +751,14 @@ static int add_memory_block(unsigned long block_id, 
unsigned long state,
mem->start_section_nr = block_id * sections_per_block;
mem->state = state;
mem->nid = NUMA_NO_NODE;
-   mem->nr_vmemmap_pages = nr_vmemmap_pages;
+   if (altmap) {
+   mem->altmap = kmalloc(sizeof(struct vmem_altmap), GFP_KERNEL);
+   if (!mem->altmap) {
+   kfree(mem);
+   return -ENOMEM;
+   }
+   memcpy(mem->altmap, altmap, sizeof(*altmap));
+   }
INIT_LIST_HEAD(&mem->group_next);
 
 #ifndef CONFIG_NUMA
@@ -783,14 +797,14 @@ static int __init add_boot_memory_block(unsigned long 
base_section_nr)
if (section_count == 0)
return 0;
return add_memory_block(memory_block_id(base_section_nr),
-   MEM_ONLINE, 0,  NULL);
+   MEM_ONLINE, NULL,  NULL);
 }
 
 static int add_hotplug_memory_block(unsigned long block_id,
-   unsigned long nr_vmemmap_pages,
+   struct vmem_altmap *altmap,
struct memory_group *group)
 {
-   return add_memory_block(block_id, MEM_OFFLINE, nr_vmemmap_pages, group);
+   return add_memory_block(block_id, MEM_OFFLINE, altmap, group);
 }
 
 static void remove_memory_block(struct memory_block *memory)
@@ -818,7 +832,7 @@ static void remove_memory_block(struct memory_block *memory)
  * Called under device_hotplug_lock.
  */
 int create_memory_block_devices(unsigned long start, unsigned long size,
-   unsigned long vmemmap_pages,
+   struct vmem_altmap *altmap,
struct memory_group *group)
 {
const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
@@ -832,7 +846,7 @@ int 

[PATCH v4 5/6] powerpc/book3s64/memhotplug: Enable memmap on memory for radix

2023-07-17 Thread Aneesh Kumar K.V
Radix vmemmap mapping can map things correctly at the PMD level or PTE
level based on different device boundary checks. Hence we skip the
restrictions w.r.t vmemmap size to be multiple of PMD_SIZE. This also
makes the feature widely useful because to use PMD_SIZE vmemmap area we
require a memory block size of 2GiB

We can also use MHP_RESERVE_PAGES_MEMMAP_ON_MEMORY so that the feature
can work with a memory block size of 256MB. Using altmap.reserve feature
to align things correctly at pageblock granularity. We can end up
losing some pages in memory with this. For ex: with a 256MiB memory block
size, we require 4 pages to map vmemmap pages, In order to align things
correctly we end up adding a reserve of 28 pages. ie, for every 4096
pages 28 pages get reserved.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/Kconfig  |  1 +
 arch/powerpc/include/asm/pgtable.h| 24 +++
 .../platforms/pseries/hotplug-memory.c|  3 ++-
 mm/memory_hotplug.c   |  2 ++
 4 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 116d6add0bb0..f890907e5bbf 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -157,6 +157,7 @@ config PPC
select ARCH_HAS_UBSAN_SANITIZE_ALL
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select ARCH_KEEP_MEMBLOCK
+   select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE if PPC_RADIX_MMU
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO
select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX
diff --git a/arch/powerpc/include/asm/pgtable.h 
b/arch/powerpc/include/asm/pgtable.h
index 68817ea7f994..3d35371395a9 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -169,6 +169,30 @@ static inline bool is_ioremap_addr(const void *x)
 int __meminit vmemmap_populated(unsigned long vmemmap_addr, int 
vmemmap_map_size);
 bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start,
   unsigned long page_size);
+/*
+ * mm/memory_hotplug.c:mhp_supports_memmap_on_memory goes into details
+ * some of the restrictions. We don't check for PMD_SIZE because our
+ * vmemmap allocation code can fallback correctly. The pageblock
+ * alignment requirement is met using altmap->reserve blocks.
+ */
+#define arch_supports_memmap_on_memory arch_supports_memmap_on_memory
+static inline bool arch_supports_memmap_on_memory(unsigned long size)
+{
+   unsigned long nr_pages = size >> PAGE_SHIFT;
+   unsigned long vmemmap_size = nr_pages * sizeof(struct page);
+
+   if (!radix_enabled())
+   return false;
+
+   if (IS_ENABLED(CONFIG_PPC_4K_PAGES))
+   return IS_ALIGNED(vmemmap_size, PMD_SIZE);
+   /*
+* The pageblock alignment requirement is met by using
+* reserve blocks in altmap.
+*/
+   return true;
+}
+
 #endif /* CONFIG_PPC64 */
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 9c62c2c3b3d0..1447509357a7 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -617,6 +617,7 @@ static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, 
u32 drc_index)
 
 static int dlpar_add_lmb(struct drmem_lmb *lmb)
 {
+   mhp_t mhp_flags = MHP_NONE | MHP_MEMMAP_ON_MEMORY;
unsigned long block_sz;
int nid, rc;
 
@@ -637,7 +638,7 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb)
nid = first_online_node;
 
/* Add the memory */
-   rc = __add_memory(nid, lmb->base_addr, block_sz, MHP_NONE);
+   rc = __add_memory(nid, lmb->base_addr, block_sz, mhp_flags);
if (rc) {
invalidate_lmb_associativity_index(lmb);
return rc;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index c409f5ff6a59..6da063c80733 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -2174,6 +2174,8 @@ static int __ref try_remove_memory(u64 start, u64 size)
 * right thing if we used vmem_altmap when hot-adding
 * the range.
 */
+   mhp_altmap.base_pfn = PHYS_PFN(start);
+   mhp_altmap.free = PHYS_PFN(size) - nr_vmemmap_pages;
mhp_altmap.alloc = nr_vmemmap_pages;
altmap = &mhp_altmap;
}
-- 
2.41.0



[PATCH v4 4/6] mm/hotplug: Allow pageblock alignment via altmap reservation

2023-07-17 Thread Aneesh Kumar K.V
Add a new kconfig option that can be selected if we want to allow
pageblock alignment by reserving pages in the vmemmap altmap area.
This implies we will be reserving some pages for every memoryblock
This also allows the memmap on memory feature to be widely useful
with different memory block size values.

Signed-off-by: Aneesh Kumar K.V 
---
 mm/memory_hotplug.c | 109 ++--
 1 file changed, 96 insertions(+), 13 deletions(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 5921c81fcb70..c409f5ff6a59 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -41,17 +41,85 @@
 #include "internal.h"
 #include "shuffle.h"
 
+enum {
+   MEMMAP_ON_MEMORY_DISABLE = 0,
+   MEMMAP_ON_MEMORY_ENABLE,
+   MEMMAP_ON_MEMORY_FORCE,
+};
+
+static int memmap_mode __read_mostly = MEMMAP_ON_MEMORY_DISABLE;
+
+static inline unsigned long memory_block_align_base(unsigned long size)
+{
+   if (memmap_mode == MEMMAP_ON_MEMORY_FORCE) {
+   unsigned long align;
+   unsigned long nr_vmemmap_pages = size >> PAGE_SHIFT;
+   unsigned long vmemmap_size;
+
+   vmemmap_size = DIV_ROUND_UP(nr_vmemmap_pages * sizeof(struct 
page), PAGE_SIZE);
+   align = pageblock_align(vmemmap_size) - vmemmap_size;
+   return align;
+   } else
+   return 0;
+}
+
 #ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
 /*
  * memory_hotplug.memmap_on_memory parameter
  */
-static bool memmap_on_memory __ro_after_init;
-module_param(memmap_on_memory, bool, 0444);
-MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory 
hotplug");
+static int set_memmap_mode(const char *val, const struct kernel_param *kp)
+{
+   int ret, mode;
+   bool enabled;
+
+   if (sysfs_streq(val, "force") ||  sysfs_streq(val, "FORCE")) {
+   mode =  MEMMAP_ON_MEMORY_FORCE;
+   goto matched;
+   }
+
+   ret = kstrtobool(val, );
+   if (ret < 0)
+   return ret;
+   if (enabled)
+   mode =  MEMMAP_ON_MEMORY_ENABLE;
+   else
+   mode =  MEMMAP_ON_MEMORY_DISABLE;
+
+matched:
+   *((int *)kp->arg) =  mode;
+   if (mode == MEMMAP_ON_MEMORY_FORCE) {
+   pr_info("Memory hotplug will reserve %ld pages in each memory 
block\n",
+   memory_block_align_base(memory_block_size_bytes()));
+   }
+   return 0;
+}
+
+static int get_memmap_mode(char *buffer, const struct kernel_param *kp)
+{
+   if (*((int *)kp->arg) == MEMMAP_ON_MEMORY_FORCE)
+   return sprintf(buffer,  "force\n");
+   if (*((int *)kp->arg) == MEMMAP_ON_MEMORY_ENABLE)
+   return sprintf(buffer,  "y\n");
+
+   return sprintf(buffer,  "n\n");
+}
+
+static const struct kernel_param_ops memmap_mode_ops = {
+   .set = set_memmap_mode,
+   .get = get_memmap_mode,
+};
module_param_cb(memmap_on_memory, &memmap_mode_ops, &memmap_mode, 0444);
+MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory 
hotplug\n"
+   "With value \"force\" it could result in memory wastage due to memmap 
size limitations \n"
+   "For example, if the memmap for a memory block requires 1 MiB, but the 
pageblock \n"
+   "size is 2 MiB, 1 MiB of hotplugged memory will be wasted. Note that 
there are \n"
+   "still cases where the feature cannot be enforced: for example, if the 
memmap is \n"
+   "smaller than a single page, or if the architecture does not support 
the forced \n"
+   "mode in all configurations. (y/n/force)");
 
 static inline bool mhp_memmap_on_memory(void)
 {
-   return memmap_on_memory;
+   return !!memmap_mode;
 }
 #else
 static inline bool mhp_memmap_on_memory(void)
@@ -1264,7 +1332,6 @@ static inline bool 
arch_supports_memmap_on_memory(unsigned long size)
 
 static bool mhp_supports_memmap_on_memory(unsigned long size)
 {
-
unsigned long nr_vmemmap_pages = size >> PAGE_SHIFT;
unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
unsigned long remaining_size = size - vmemmap_size;
@@ -1295,10 +1362,23 @@ static bool mhp_supports_memmap_on_memory(unsigned long 
size)
 *   altmap as an alternative source of memory, and we do not 
exactly
 *   populate a single PMD.
 */
-   return mhp_memmap_on_memory() &&
-  size == memory_block_size_bytes() &&
-  IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT)) &&
-  arch_supports_memmap_on_memory(size);
+   if (!mhp_memmap_on_memory() || size != memory_block_size_bytes())
+   return false;
+
+   /*
+* Make sure the vmemmap allocation is fully contained
+* so that we always allocate vmemmap memory from altmap area.
+*/
+   if (!IS_ALIGNED(vmemmap_size,  PAGE_SIZE))
+   return false;
+/*
+ * Without page reservation remaining pages should be pageblock 

[PATCH v4 3/6] mm/hotplug: Allow architecture to override memmap on memory support check

2023-07-17 Thread Aneesh Kumar K.V
Some architectures would want different restrictions. Hence add an
architecture-specific override.

Both the PMD_SIZE check and pageblock alignment check are moved there.

Signed-off-by: Aneesh Kumar K.V 
---
 mm/memory_hotplug.c | 22 +++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 1b19462f4e72..5921c81fcb70 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1247,9 +1247,25 @@ static int online_memory_block(struct memory_block *mem, 
void *arg)
	return device_online(&mem->dev);
 }
 
+#ifndef arch_supports_memmap_on_memory
+static inline bool arch_supports_memmap_on_memory(unsigned long size)
+{
+   unsigned long nr_vmemmap_pages = size >> PAGE_SHIFT;
+   unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
+
+   /*
+* As default, we want the vmemmap to span a complete PMD such that we
+* can map the vmemmap using a single PMD if supported by the
+* architecture.
+*/
+   return IS_ALIGNED(vmemmap_size, PMD_SIZE);
+}
+#endif
+
 static bool mhp_supports_memmap_on_memory(unsigned long size)
 {
-   unsigned long nr_vmemmap_pages = size / PAGE_SIZE;
+
+   unsigned long nr_vmemmap_pages = size >> PAGE_SHIFT;
unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
unsigned long remaining_size = size - vmemmap_size;
 
@@ -1281,8 +1297,8 @@ static bool mhp_supports_memmap_on_memory(unsigned long 
size)
 */
return mhp_memmap_on_memory() &&
   size == memory_block_size_bytes() &&
-  IS_ALIGNED(vmemmap_size, PMD_SIZE) &&
-  IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT));
+  IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT)) &&
+  arch_supports_memmap_on_memory(size);
 }
 
 /*
-- 
2.41.0



[PATCH v4 2/6] mm/hotplug: Allow memmap on memory hotplug request to fallback

2023-07-17 Thread Aneesh Kumar K.V
If not supported, fall back to not using memmap on memory. This avoids
the need for callers to do the fallback.

Signed-off-by: Aneesh Kumar K.V 
---
 drivers/acpi/acpi_memhotplug.c |  3 +--
 include/linux/memory_hotplug.h |  3 ++-
 mm/memory_hotplug.c| 13 ++---
 3 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index 24f662d8bd39..d0c1a71007d0 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -211,8 +211,7 @@ static int acpi_memory_enable_device(struct 
acpi_memory_device *mem_device)
if (!info->length)
continue;
 
-   if (mhp_supports_memmap_on_memory(info->length))
-   mhp_flags |= MHP_MEMMAP_ON_MEMORY;
+   mhp_flags |= MHP_MEMMAP_ON_MEMORY;
result = __add_memory(mgid, info->start_addr, info->length,
  mhp_flags);
 
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 013c69753c91..7d2076583494 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -97,6 +97,8 @@ typedef int __bitwise mhp_t;
  * To do so, we will use the beginning of the hot-added range to build
  * the page tables for the memmap array that describes the entire range.
  * Only selected architectures support it with SPARSE_VMEMMAP.
+ * This is only a hint, the core kernel can decide to not do this based on
+ * different alignment checks.
  */
 #define MHP_MEMMAP_ON_MEMORY   ((__force mhp_t)BIT(1))
 /*
@@ -354,7 +356,6 @@ extern struct zone *zone_for_pfn_range(int online_type, int 
nid,
 extern int arch_create_linear_mapping(int nid, u64 start, u64 size,
  struct mhp_params *params);
 void arch_remove_linear_mapping(u64 start, u64 size);
-extern bool mhp_supports_memmap_on_memory(unsigned long size);
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
 #endif /* __LINUX_MEMORY_HOTPLUG_H */
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 3f231cf1b410..1b19462f4e72 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1247,7 +1247,7 @@ static int online_memory_block(struct memory_block *mem, 
void *arg)
return device_online(>dev);
 }
 
-bool mhp_supports_memmap_on_memory(unsigned long size)
+static bool mhp_supports_memmap_on_memory(unsigned long size)
 {
unsigned long nr_vmemmap_pages = size / PAGE_SIZE;
unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
@@ -1339,13 +1339,12 @@ int __ref add_memory_resource(int nid, struct resource 
*res, mhp_t mhp_flags)
 * Self hosted memmap array
 */
if (mhp_flags & MHP_MEMMAP_ON_MEMORY) {
-   if (!mhp_supports_memmap_on_memory(size)) {
-   ret = -EINVAL;
-   goto error;
+   if (mhp_supports_memmap_on_memory(size)) {
+   mhp_altmap.free = PHYS_PFN(size);
+   mhp_altmap.base_pfn = PHYS_PFN(start);
+   params.altmap = _altmap;
}
-   mhp_altmap.free = PHYS_PFN(size);
-   mhp_altmap.base_pfn = PHYS_PFN(start);
-   params.altmap = _altmap;
+   /* fallback to not using altmap  */
}
 
/* call arch's memory hotadd */
-- 
2.41.0



[PATCH v4 1/6] mm/hotplug: Simplify ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE kconfig

2023-07-17 Thread Aneesh Kumar K.V
Instead of adding a menu entry with all supported architectures, add an
mm/Kconfig variable and select the same from supported architectures.

No functional change in this patch.

Acked-by: David Hildenbrand 
Signed-off-by: Aneesh Kumar K.V 
---
 arch/arm64/Kconfig | 4 +---
 arch/x86/Kconfig   | 4 +---
 mm/Kconfig | 3 +++
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index a2511b30d0f6..20245bd72b8f 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -78,6 +78,7 @@ config ARM64
select ARCH_INLINE_SPIN_UNLOCK_IRQ if !PREEMPTION
select ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE if !PREEMPTION
select ARCH_KEEP_MEMBLOCK
+   select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
select ARCH_USE_CMPXCHG_LOCKREF
select ARCH_USE_GNU_PROPERTY
select ARCH_USE_MEMTEST
@@ -348,9 +349,6 @@ config GENERIC_CSUM
 config GENERIC_CALIBRATE_DELAY
def_bool y
 
-config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
-   def_bool y
-
 config SMP
def_bool y
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 78224aa76409..d0258e92a8af 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -102,6 +102,7 @@ config X86
select ARCH_HAS_DEBUG_WX
select ARCH_HAS_ZONE_DMA_SET if EXPERT
select ARCH_HAVE_NMI_SAFE_CMPXCHG
+   select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO
@@ -2610,9 +2611,6 @@ config ARCH_HAS_ADD_PAGES
def_bool y
depends on ARCH_ENABLE_MEMORY_HOTPLUG
 
-config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
-   def_bool y
-
 menu "Power management and ACPI options"
 
 config ARCH_HIBERNATION_HEADER
diff --git a/mm/Kconfig b/mm/Kconfig
index 923bd35f81f2..932349271e28 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -572,6 +572,9 @@ config MHP_MEMMAP_ON_MEMORY
 
 endif # MEMORY_HOTPLUG
 
+config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
+   bool
+
 # Heavily threaded applications may benefit from splitting the mm-wide
 # page_table_lock, so that faults on different parts of the user address
 # space can be handled with less contention: split it at this NR_CPUS.
-- 
2.41.0



[PATCH v4 0/6] Add support for memmap on memory feature on ppc64

2023-07-17 Thread Aneesh Kumar K.V
This patch series update memmap on memory feature to fall back to
memmap allocation outside the memory block if the alignment rules are
not met. This makes the feature more useful on architectures like
ppc64 where alignment rules are different with 64K page size.

This patch series is dependent on dax vmemmap optimization series
posted here
https://lore.kernel.org/linux-mm/20230718022934.90447-1-aneesh.ku...@linux.ibm.com/

Changes from v3:
* Extend the module parameter memmap_on_memory to force allocation even
  though we can waste hotplug memory.

Changes from v2:
* Rebase to latest linus tree
* Redo the series based on review feedback. Multiple changes to the patchset.

Changes from v1:
* update the memblock to store vmemmap_altmap details. This is required
so that when we remove the memory we can find the altmap details which
is needed on some architectures.
* rebase to latest linus tree

Aneesh Kumar K.V (6):
  mm/hotplug: Simplify ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE kconfig
  mm/hotplug: Allow memmap on memory hotplug request to fallback
  mm/hotplug: Allow architecture to override memmap on memory support
check
  mm/hotplug: Allow pageblock alignment via altmap reservation
  powerpc/book3s64/memhotplug: Enable memmap on memory for radix
  mm/hotplug: Embed vmem_altmap details in memory block

 arch/arm64/Kconfig|   4 +-
 arch/powerpc/Kconfig  |   1 +
 arch/powerpc/include/asm/pgtable.h|  24 +++
 .../platforms/pseries/hotplug-memory.c|   3 +-
 arch/x86/Kconfig  |   4 +-
 drivers/acpi/acpi_memhotplug.c|   3 +-
 drivers/base/memory.c |  32 +++-
 include/linux/memory.h|   8 +-
 include/linux/memory_hotplug.h|   3 +-
 mm/Kconfig|   3 +
 mm/memory_hotplug.c   | 168 ++
 11 files changed, 193 insertions(+), 60 deletions(-)

-- 
2.41.0



[PATCH v5 13/13] powerpc/book3s64/radix: Add debug message to give more details of vmemmap allocation

2023-07-17 Thread Aneesh Kumar K.V
Add some extra vmemmap pr_debug messages that will indicate the type of
vmemmap allocations.

For ex: with DAX vmemmap optimization we can find the below details:
[  187.166580] radix-mmu: PAGE_SIZE vmemmap mapping
[  187.166587] radix-mmu: PAGE_SIZE vmemmap mapping
[  187.166591] radix-mmu: Tail page reuse vmemmap mapping
[  187.166594] radix-mmu: Tail page reuse vmemmap mapping
[  187.166598] radix-mmu: Tail page reuse vmemmap mapping
[  187.166601] radix-mmu: Tail page reuse vmemmap mapping
[  187.166604] radix-mmu: Tail page reuse vmemmap mapping
[  187.166608] radix-mmu: Tail page reuse vmemmap mapping
[  187.166611] radix-mmu: Tail page reuse vmemmap mapping
[  187.166614] radix-mmu: Tail page reuse vmemmap mapping
[  187.166617] radix-mmu: Tail page reuse vmemmap mapping
[  187.166620] radix-mmu: Tail page reuse vmemmap mapping
[  187.166623] radix-mmu: Tail page reuse vmemmap mapping
[  187.166626] radix-mmu: Tail page reuse vmemmap mapping
[  187.166629] radix-mmu: Tail page reuse vmemmap mapping
[  187.166632] radix-mmu: Tail page reuse vmemmap mapping

And without vmemmap optimization
[  293.549931] radix-mmu: PMD_SIZE vmemmap mapping
[  293.549984] radix-mmu: PMD_SIZE vmemmap mapping
[  293.550032] radix-mmu: PMD_SIZE vmemmap mapping
[  293.550076] radix-mmu: PMD_SIZE vmemmap mapping
[  293.550117] radix-mmu: PMD_SIZE vmemmap mapping

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/book3s64/radix_pgtable.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c 
b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 73d0987369ff..2828e7e0802c 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -1034,6 +1034,7 @@ static pte_t * __meminit 
radix__vmemmap_pte_populate(pmd_t *pmdp, unsigned long
p = vmemmap_alloc_block_buf(PAGE_SIZE, node, 
NULL);
if (!p)
return NULL;
+   pr_debug("PAGE_SIZE vmemmap mapping\n");
} else {
/*
 * When a PTE/PMD entry is freed from the init_mm
@@ -1046,6 +1047,7 @@ static pte_t * __meminit 
radix__vmemmap_pte_populate(pmd_t *pmdp, unsigned long
 */
get_page(reuse);
p = page_to_virt(reuse);
+   pr_debug("Tail page reuse vmemmap mapping\n");
}
 
VM_BUG_ON(!PAGE_ALIGNED(addr));
@@ -1155,6 +1157,7 @@ int __meminit radix__vmemmap_populate(unsigned long 
start, unsigned long end, in
p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
if (p) {
vmemmap_set_pmd(pmd, p, node, addr, next);
+   pr_debug("PMD_SIZE vmemmap mapping\n");
continue;
} else if (altmap) {
/*
-- 
2.41.0



[PATCH v5 11/13] powerpc/book3s64/radix: Add support for vmemmap optimization for radix

2023-07-17 Thread Aneesh Kumar K.V
With 2M PMD-level mapping, we require 32 struct pages and a single vmemmap
page can contain 1024 struct pages (PAGE_SIZE/sizeof(struct page)). Hence
with 64K page size, we don't use vmemmap deduplication for PMD-level
mapping.

Signed-off-by: Aneesh Kumar K.V 
---
 Documentation/mm/vmemmap_dedup.rst |   1 +
 Documentation/powerpc/index.rst|   1 +
 Documentation/powerpc/vmemmap_dedup.rst| 101 ++
 arch/powerpc/Kconfig   |   1 +
 arch/powerpc/include/asm/book3s/64/radix.h |   9 +
 arch/powerpc/mm/book3s64/radix_pgtable.c   | 203 +
 6 files changed, 316 insertions(+)
 create mode 100644 Documentation/powerpc/vmemmap_dedup.rst

diff --git a/Documentation/mm/vmemmap_dedup.rst 
b/Documentation/mm/vmemmap_dedup.rst
index a4b12ff906c4..c573e08b5043 100644
--- a/Documentation/mm/vmemmap_dedup.rst
+++ b/Documentation/mm/vmemmap_dedup.rst
@@ -210,6 +210,7 @@ the device (altmap).
 
 The following page sizes are supported in DAX: PAGE_SIZE (4K on x86_64),
 PMD_SIZE (2M on x86_64) and PUD_SIZE (1G on x86_64).
+For powerpc equivalent details see Documentation/powerpc/vmemmap_dedup.rst
 
 The differences with HugeTLB are relatively minor.
 
diff --git a/Documentation/powerpc/index.rst b/Documentation/powerpc/index.rst
index d33b554ca7ba..a50834798454 100644
--- a/Documentation/powerpc/index.rst
+++ b/Documentation/powerpc/index.rst
@@ -36,6 +36,7 @@ powerpc
 ultravisor
 vas-api
 vcpudispatch_stats
+vmemmap_dedup
 
 features
 
diff --git a/Documentation/powerpc/vmemmap_dedup.rst 
b/Documentation/powerpc/vmemmap_dedup.rst
new file mode 100644
index ..dc4db59fdf87
--- /dev/null
+++ b/Documentation/powerpc/vmemmap_dedup.rst
@@ -0,0 +1,101 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==
+Device DAX
+==
+
+The device-dax interface uses the tail deduplication technique explained in
+Documentation/mm/vmemmap_dedup.rst
+
+On powerpc, vmemmap deduplication is only used with radix MMU translation. Also
+with a 64K page size, only the devdax namespace with 1G alignment uses vmemmap
+deduplication.
+
+With 2M PMD level mapping, we require 32 struct pages and a single 64K vmemmap
+page can contain 1024 struct pages (64K/sizeof(struct page)). Hence there is no
+vmemmap deduplication possible.
+
+With 1G PUD level mapping, we require 16384 struct pages and a single 64K
+vmemmap page can contain 1024 struct pages (64K/sizeof(struct page)). Hence we
+require 16 64K pages in vmemmap to map the struct page for 1G PUD level 
mapping.
+
+Here's how things look like on device-dax after the sections are populated::
+ +---+ ---virt_to_page---> +---+   mapping to   +---+
+ |   | | 0 | -> | 0 |
+ |   | +---++---+
+ |   | | 1 | -> | 1 |
+ |   | +---++---+
+ |   | | 2 | ^ ^ ^ ^ ^ ^
+ |   | +---+   | | | | |
+ |   | | 3 | --+ | | | |
+ |   | +---+ | | | |
+ |   | | 4 | + | | |
+ |PUD| +---+   | | |
+ |   level   | | . | --+ | |
+ |  mapping  | +---+ | |
+ |   | | . | + |
+ |   | +---+   |
+ |   | | 15| --+
+ |   | +---+
+ |   |
+ |   |
+ |   |
+ +---+
+
+
+With 4K page size, 2M PMD level mapping requires 512 struct pages and a single
+4K vmemmap page contains 64 struct pages(4K/sizeof(struct page)). Hence we
+require 8 4K pages in vmemmap to map the struct page for 2M pmd level mapping.
+
+Here's how things look like on device-dax after the sections are populated::
+
+ +---+ ---virt_to_page---> +---+   mapping to   +---+
+ |   | | 0 | -> | 0 |
+ |   | +---++---+
+ |   | | 1 | -> | 1 |
+ |   | +---++---+
+ |   | | 2 | ^ ^ ^ ^ ^ ^
+ |   | +---+   | | | | |
+ |   | | 3 | --+ | | | |
+ |   | 

[PATCH v5 12/13] powerpc/book3s64/radix: Remove mmu_vmemmap_psize

2023-07-17 Thread Aneesh Kumar K.V
This is not used by radix anymore.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/book3s64/radix_pgtable.c | 11 ---
 arch/powerpc/mm/init_64.c| 21 ++---
 2 files changed, 14 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c 
b/arch/powerpc/mm/book3s64/radix_pgtable.c
index b492b67c0b7d..73d0987369ff 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -601,17 +601,6 @@ void __init radix__early_init_mmu(void)
 #else
mmu_virtual_psize = MMU_PAGE_4K;
 #endif
-
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
-   /* vmemmap mapping */
-   if (mmu_psize_defs[MMU_PAGE_2M].shift) {
-   /*
-* map vmemmap using 2M if available
-*/
-   mmu_vmemmap_psize = MMU_PAGE_2M;
-   } else
-   mmu_vmemmap_psize = mmu_virtual_psize;
-#endif
 #endif
/*
 * initialize page table size
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 5701faca39ef..6db7a063ba63 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -198,17 +198,12 @@ bool altmap_cross_boundary(struct vmem_altmap *altmap, 
unsigned long start,
return false;
 }
 
-int __meminit vmemmap_populate(unsigned long start, unsigned long end, int 
node,
-   struct vmem_altmap *altmap)
+int __meminit __vmemmap_populate(unsigned long start, unsigned long end, int 
node,
+struct vmem_altmap *altmap)
 {
bool altmap_alloc;
unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
 
-#ifdef CONFIG_PPC_BOOK3S_64
-   if (radix_enabled())
-   return radix__vmemmap_populate(start, end, node, altmap);
-#endif
-
/* Align to the page size of the linear mapping. */
start = ALIGN_DOWN(start, page_size);
 
@@ -277,6 +272,18 @@ int __meminit vmemmap_populate(unsigned long start, 
unsigned long end, int node,
return 0;
 }
 
+int __meminit vmemmap_populate(unsigned long start, unsigned long end, int 
node,
+  struct vmem_altmap *altmap)
+{
+
+#ifdef CONFIG_PPC_BOOK3S_64
+   if (radix_enabled())
+   return radix__vmemmap_populate(start, end, node, altmap);
+#endif
+
+   return __vmemmap_populate(start, end, node, altmap);
+}
+
 #ifdef CONFIG_MEMORY_HOTPLUG
 static unsigned long vmemmap_list_free(unsigned long start)
 {
-- 
2.41.0



[PATCH v5 10/13] powerpc/book3s64/vmemmap: Switch radix to use a different vmemmap handling function

2023-07-17 Thread Aneesh Kumar K.V
This is in preparation to update radix to implement vmemmap optimization
for devdax. Below are the rules w.r.t radix vmemmap mapping

1. First try to map things using PMD (2M)
2. With altmap if altmap cross-boundary check returns true, fall back to
   PAGE_SIZE
3. If we can't allocate PMD_SIZE backing memory for vmemmap, fallback to
   PAGE_SIZE

On removing vmemmap mapping, check if every subsection that is using the
vmemmap area is invalid. If found to be invalid, that implies we can safely
free the vmemmap area. We don't use the PAGE_UNUSED pattern used by x86
because with 64K page size, we need to do the above check even at the
PAGE_SIZE granularity.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/book3s/64/radix.h |   2 +
 arch/powerpc/include/asm/pgtable.h |   4 +
 arch/powerpc/mm/book3s64/radix_pgtable.c   | 326 +++--
 arch/powerpc/mm/init_64.c  |  26 +-
 4 files changed, 327 insertions(+), 31 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/radix.h 
b/arch/powerpc/include/asm/book3s/64/radix.h
index 2ef92f36340f..f1461289643a 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -331,6 +331,8 @@ extern int __meminit radix__vmemmap_create_mapping(unsigned 
long start,
 unsigned long phys);
 int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end,
  int node, struct vmem_altmap *altmap);
+void __ref radix__vmemmap_free(unsigned long start, unsigned long end,
+  struct vmem_altmap *altmap);
 extern void radix__vmemmap_remove_mapping(unsigned long start,
unsigned long page_size);
 
diff --git a/arch/powerpc/include/asm/pgtable.h 
b/arch/powerpc/include/asm/pgtable.h
index 6a88bfdaa69b..68817ea7f994 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -165,6 +165,10 @@ static inline bool is_ioremap_addr(const void *x)
 
return addr >= IOREMAP_BASE && addr < IOREMAP_END;
 }
+
+int __meminit vmemmap_populated(unsigned long vmemmap_addr, int 
vmemmap_map_size);
+bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start,
+  unsigned long page_size);
 #endif /* CONFIG_PPC64 */
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c 
b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 227fea53c217..9a7f3707b6fb 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -744,8 +744,59 @@ static void free_pud_table(pud_t *pud_start, p4d_t *p4d)
p4d_clear(p4d);
 }
 
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+static bool __meminit vmemmap_pmd_is_unused(unsigned long addr, unsigned long 
end)
+{
+   unsigned long start = ALIGN_DOWN(addr, PMD_SIZE);
+
+   return !vmemmap_populated(start, PMD_SIZE);
+}
+
+static bool __meminit vmemmap_page_is_unused(unsigned long addr, unsigned long 
end)
+{
+   unsigned long start = ALIGN_DOWN(addr, PAGE_SIZE);
+
+   return !vmemmap_populated(start, PAGE_SIZE);
+
+}
+#endif
+
+static void __meminit free_vmemmap_pages(struct page *page,
+struct vmem_altmap *altmap,
+int order)
+{
+   unsigned int nr_pages = 1 << order;
+
+   if (altmap) {
+   unsigned long alt_start, alt_end;
+   unsigned long base_pfn = page_to_pfn(page);
+
+   /*
+* with 2M vmemmap mmaping we can have things setup
+* such that even though atlmap is specified we never
+* used altmap.
+*/
+   alt_start = altmap->base_pfn;
+   alt_end = altmap->base_pfn + altmap->reserve +
+   altmap->free + altmap->alloc + altmap->align;
+
+   if (base_pfn >= alt_start && base_pfn < alt_end) {
+   vmem_altmap_free(altmap, nr_pages);
+   return;
+   }
+   }
+
+   if (PageReserved(page)) {
+   /* allocated from memblock */
+   while (nr_pages--)
+   free_reserved_page(page++);
+   } else
+   free_pages((unsigned long)page_address(page), order);
+}
+
 static void remove_pte_table(pte_t *pte_start, unsigned long addr,
-unsigned long end, bool direct)
+unsigned long end, bool direct,
+struct vmem_altmap *altmap)
 {
unsigned long next, pages = 0;
pte_t *pte;
@@ -759,24 +810,26 @@ static void remove_pte_table(pte_t *pte_start, unsigned 
long addr,
if (!pte_present(*pte))
continue;
 
-   if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) {
-   /*
-   

[PATCH v5 09/13] powerpc/book3s64/mm: Enable transparent pud hugepage

2023-07-17 Thread Aneesh Kumar K.V
This is enabled only with radix translation and 1G hugepage size. This will
be used with devdax device memory with a namespace alignment of 1G.

Anon transparent hugepage is not supported even though we do have helpers
checking pud_trans_huge(). We should never find it returning true. The only
expected pte bit combination is _PAGE_PTE | _PAGE_DEVMAP.

Some of the helpers are never expected to get called on hash translation
and hence is marked to call BUG() in such a case.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/book3s/64/hash.h |   9 +
 arch/powerpc/include/asm/book3s/64/pgtable.h  | 155 --
 arch/powerpc/include/asm/book3s/64/radix.h|  36 
 .../include/asm/book3s/64/tlbflush-radix.h|   2 +
 arch/powerpc/include/asm/book3s/64/tlbflush.h |   8 +
 arch/powerpc/mm/book3s64/pgtable.c|  78 +
 arch/powerpc/mm/book3s64/radix_pgtable.c  |  28 
 arch/powerpc/mm/book3s64/radix_tlb.c  |   7 +
 arch/powerpc/platforms/Kconfig.cputype|   1 +
 include/trace/events/thp.h|  10 ++
 10 files changed, 323 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/hash.h 
b/arch/powerpc/include/asm/book3s/64/hash.h
index d4a19e6547ac..6e70ae511631 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -138,7 +138,16 @@ static inline int hash__pmd_same(pmd_t pmd_a, pmd_t pmd_b)
 }
 
 #definehash__pmd_bad(pmd)  (pmd_val(pmd) & H_PMD_BAD_BITS)
+
+/*
+ * pud comparison that will work with both pte and page table pointer.
+ */
+static inline int hash__pud_same(pud_t pud_a, pud_t pud_b)
+{
+   return (((pud_raw(pud_a) ^ pud_raw(pud_b)) & 
~cpu_to_be64(_PAGE_HPTEFLAGS)) == 0);
+}
 #definehash__pud_bad(pud)  (pud_val(pud) & H_PUD_BAD_BITS)
+
 static inline int hash__p4d_bad(p4d_t p4d)
 {
return (p4d_val(p4d) == 0);
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 4acc9690f599..a8204566cfd0 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -921,8 +921,29 @@ static inline pud_t pte_pud(pte_t pte)
 {
return __pud_raw(pte_raw(pte));
 }
+
+static inline pte_t *pudp_ptep(pud_t *pud)
+{
+   return (pte_t *)pud;
+}
+
+#define pud_pfn(pud)   pte_pfn(pud_pte(pud))
+#define pud_dirty(pud) pte_dirty(pud_pte(pud))
+#define pud_young(pud) pte_young(pud_pte(pud))
+#define pud_mkold(pud) pte_pud(pte_mkold(pud_pte(pud)))
+#define pud_wrprotect(pud) pte_pud(pte_wrprotect(pud_pte(pud)))
+#define pud_mkdirty(pud)   pte_pud(pte_mkdirty(pud_pte(pud)))
+#define pud_mkclean(pud)   pte_pud(pte_mkclean(pud_pte(pud)))
+#define pud_mkyoung(pud)   pte_pud(pte_mkyoung(pud_pte(pud)))
+#define pud_mkwrite(pud)   pte_pud(pte_mkwrite(pud_pte(pud)))
 #define pud_write(pud) pte_write(pud_pte(pud))
 
+#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
+#define pud_soft_dirty(pmd)pte_soft_dirty(pud_pte(pud))
+#define pud_mksoft_dirty(pmd)  pte_pud(pte_mksoft_dirty(pud_pte(pud)))
+#define pud_clear_soft_dirty(pmd) pte_pud(pte_clear_soft_dirty(pud_pte(pud)))
+#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
+
 static inline int pud_bad(pud_t pud)
 {
if (radix_enabled())
@@ -1115,15 +1136,24 @@ static inline bool pmd_access_permitted(pmd_t pmd, bool 
write)
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 extern pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot);
+extern pud_t pfn_pud(unsigned long pfn, pgprot_t pgprot);
 extern pmd_t mk_pmd(struct page *page, pgprot_t pgprot);
 extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot);
 extern void set_pmd_at(struct mm_struct *mm, unsigned long addr,
   pmd_t *pmdp, pmd_t pmd);
+extern void set_pud_at(struct mm_struct *mm, unsigned long addr,
+  pud_t *pudp, pud_t pud);
+
 static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmd)
 {
 }
 
+static inline void update_mmu_cache_pud(struct vm_area_struct *vma,
+   unsigned long addr, pud_t *pud)
+{
+}
+
 extern int hash__has_transparent_hugepage(void);
 static inline int has_transparent_hugepage(void)
 {
@@ -1133,6 +1163,14 @@ static inline int has_transparent_hugepage(void)
 }
 #define has_transparent_hugepage has_transparent_hugepage
 
+static inline int has_transparent_pud_hugepage(void)
+{
+   if (radix_enabled())
+   return radix__has_transparent_pud_hugepage();
+   return 0;
+}
+#define has_transparent_pud_hugepage has_transparent_pud_hugepage
+
 static inline unsigned long
 pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp,
unsigned long clr, unsigned long set)
@@ -1142,6 +1180,16 @@ pmd_hugepage_update(struct mm_struct *mm, unsigned long 
addr, 

[PATCH v5 08/13] powerpc/mm/trace: Convert trace event to trace event class

2023-07-17 Thread Aneesh Kumar K.V
A follow-up patch will add a pud variant for this same event.
Using event class makes that addition simpler.

No functional change in this patch.

Reviewed-by: Christophe Leroy 
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/book3s64/hash_pgtable.c  |  2 +-
 arch/powerpc/mm/book3s64/radix_pgtable.c |  2 +-
 include/trace/events/thp.h   | 23 ---
 3 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c 
b/arch/powerpc/mm/book3s64/hash_pgtable.c
index 51f48984abca..988948d69bc1 100644
--- a/arch/powerpc/mm/book3s64/hash_pgtable.c
+++ b/arch/powerpc/mm/book3s64/hash_pgtable.c
@@ -214,7 +214,7 @@ unsigned long hash__pmd_hugepage_update(struct mm_struct 
*mm, unsigned long addr
 
old = be64_to_cpu(old_be);
 
-   trace_hugepage_update(addr, old, clr, set);
+   trace_hugepage_update_pmd(addr, old, clr, set);
if (old & H_PAGE_HASHPTE)
hpte_do_hugepage_flush(mm, addr, pmdp, old);
return old;
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c 
b/arch/powerpc/mm/book3s64/radix_pgtable.c
index e7ea492ac510..02e185d2e4d6 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -962,7 +962,7 @@ unsigned long radix__pmd_hugepage_update(struct mm_struct 
*mm, unsigned long add
 #endif
 
old = radix__pte_update(mm, addr, pmdp_ptep(pmdp), clr, set, 1);
-   trace_hugepage_update(addr, old, clr, set);
+   trace_hugepage_update_pmd(addr, old, clr, set);
 
return old;
 }
diff --git a/include/trace/events/thp.h b/include/trace/events/thp.h
index 202b3e3e67ff..a95c78b10561 100644
--- a/include/trace/events/thp.h
+++ b/include/trace/events/thp.h
@@ -8,25 +8,29 @@
 #include 
 #include 
 
-TRACE_EVENT(hugepage_set_pmd,
+DECLARE_EVENT_CLASS(hugepage_set,
 
-   TP_PROTO(unsigned long addr, unsigned long pmd),
-   TP_ARGS(addr, pmd),
+   TP_PROTO(unsigned long addr, unsigned long pte),
+   TP_ARGS(addr, pte),
TP_STRUCT__entry(
__field(unsigned long, addr)
-   __field(unsigned long, pmd)
+   __field(unsigned long, pte)
),
 
TP_fast_assign(
__entry->addr = addr;
-   __entry->pmd = pmd;
+   __entry->pte = pte;
),
 
-   TP_printk("Set pmd with 0x%lx with 0x%lx", __entry->addr, 
__entry->pmd)
+   TP_printk("Set page table entry with 0x%lx with 0x%lx", 
__entry->addr, __entry->pte)
 );
 
+DEFINE_EVENT(hugepage_set, hugepage_set_pmd,
+   TP_PROTO(unsigned long addr, unsigned long pmd),
+   TP_ARGS(addr, pmd)
+);
 
-TRACE_EVENT(hugepage_update,
+DECLARE_EVENT_CLASS(hugepage_update,
 
TP_PROTO(unsigned long addr, unsigned long pte, unsigned long clr, 
unsigned long set),
TP_ARGS(addr, pte, clr, set),
@@ -48,6 +52,11 @@ TRACE_EVENT(hugepage_update,
TP_printk("hugepage update at addr 0x%lx and pte = 0x%lx clr = 
0x%lx, set = 0x%lx", __entry->addr, __entry->pte, __entry->clr, __entry->set)
 );
 
+DEFINE_EVENT(hugepage_update, hugepage_update_pmd,
+   TP_PROTO(unsigned long addr, unsigned long pmd, unsigned long clr, 
unsigned long set),
+   TP_ARGS(addr, pmd, clr, set)
+);
+
 DECLARE_EVENT_CLASS(migration_pmd,
 
TP_PROTO(unsigned long addr, unsigned long pmd),
-- 
2.41.0



[PATCH v5 07/13] mm/vmemmap optimization: Split hugetlb and devdax vmemmap optimization

2023-07-17 Thread Aneesh Kumar K.V
Arm disabled hugetlb vmemmap optimization [1] because hugetlb vmemmap
optimization includes an update of both the permissions (writeable to
read-only) and the output address (pfn) of the vmemmap ptes. That is not
supported without unmapping of the pte (marking it invalid) by some
architectures.

With DAX vmemmap optimization we don't require such pte updates and
architectures can enable DAX vmemmap optimization while having hugetlb
vmemmap optimization disabled. Hence split DAX optimization support into a
different config.

s390, loongarch and riscv don't have devdax support. So the DAX config is not
enabled for them. With this change, arm64 should be able to select DAX
optimization

[1] commit 060a2c92d1b6 ("arm64: mm: hugetlb: Disable 
HUGETLB_PAGE_OPTIMIZE_VMEMMAP")

Signed-off-by: Aneesh Kumar K.V 
---
 arch/loongarch/Kconfig | 2 +-
 arch/riscv/Kconfig | 2 +-
 arch/s390/Kconfig  | 2 +-
 arch/x86/Kconfig   | 3 ++-
 fs/Kconfig | 2 +-
 include/linux/mm.h | 2 +-
 mm/Kconfig | 5 -
 7 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index e55511af4c77..537ca2a4005a 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -59,7 +59,7 @@ config LOONGARCH
select ARCH_USE_QUEUED_SPINLOCKS
select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
select ARCH_WANT_LD_ORPHAN_WARN
-   select ARCH_WANT_OPTIMIZE_VMEMMAP
+   select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP
select ARCH_WANTS_NO_INSTR
select BUILDTIME_TABLE_SORT
select COMMON_CLK
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 4c07b9189c86..6943d34c1ec1 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -53,7 +53,7 @@ config RISCV
select ARCH_WANT_GENERAL_HUGETLB if !RISCV_ISA_SVNAPOT
select ARCH_WANT_HUGE_PMD_SHARE if 64BIT
select ARCH_WANT_LD_ORPHAN_WARN if !XIP_KERNEL
-   select ARCH_WANT_OPTIMIZE_VMEMMAP
+   select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP
select ARCH_WANTS_THP_SWAP if HAVE_ARCH_TRANSPARENT_HUGEPAGE
select BINFMT_FLAT_NO_DATA_START_OFFSET if !MMU
select BUILDTIME_TABLE_SORT if MMU
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 5b39918b7042..975fd06e4f4d 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -127,7 +127,7 @@ config S390
select ARCH_WANTS_NO_INSTR
select ARCH_WANT_DEFAULT_BPF_JIT
select ARCH_WANT_IPC_PARSE_VERSION
-   select ARCH_WANT_OPTIMIZE_VMEMMAP
+   select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP
select BUILDTIME_TABLE_SORT
select CLONE_BACKWARDS2
select DMA_OPS if PCI
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 7422db409770..78224aa76409 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -128,7 +128,8 @@ config X86
select ARCH_WANT_GENERAL_HUGETLB
select ARCH_WANT_HUGE_PMD_SHARE
select ARCH_WANT_LD_ORPHAN_WARN
-   select ARCH_WANT_OPTIMIZE_VMEMMAP   if X86_64
+   select ARCH_WANT_OPTIMIZE_DAX_VMEMMAP   if X86_64
+   select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP   if X86_64
select ARCH_WANTS_THP_SWAP  if X86_64
select ARCH_HAS_PARANOID_L1D_FLUSH
select BUILDTIME_TABLE_SORT
diff --git a/fs/Kconfig b/fs/Kconfig
index 18d034ec7953..9c104c130a6e 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -252,7 +252,7 @@ config HUGETLB_PAGE
 
 config HUGETLB_PAGE_OPTIMIZE_VMEMMAP
def_bool HUGETLB_PAGE
-   depends on ARCH_WANT_OPTIMIZE_VMEMMAP
+   depends on ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP
depends on SPARSEMEM_VMEMMAP
 
 config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1a2234ee14d2..83f51ec0897d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3640,7 +3640,7 @@ void vmemmap_free(unsigned long start, unsigned long end,
 #endif
 
 #define VMEMMAP_RESERVE_NR 2
-#ifdef CONFIG_ARCH_WANT_OPTIMIZE_VMEMMAP
+#ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP
 static inline bool __vmemmap_can_optimize(struct vmem_altmap *altmap,
  struct dev_pagemap *pgmap)
 {
diff --git a/mm/Kconfig b/mm/Kconfig
index 09130434e30d..923bd35f81f2 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -487,7 +487,10 @@ config SPARSEMEM_VMEMMAP
 # Select this config option from the architecture Kconfig, if it is preferred
 # to enable the feature of HugeTLB/dev_dax vmemmap optimization.
 #
-config ARCH_WANT_OPTIMIZE_VMEMMAP
+config ARCH_WANT_OPTIMIZE_DAX_VMEMMAP
+   bool
+
+config ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP
bool
 
 config HAVE_MEMBLOCK_PHYS_MAP
-- 
2.41.0



[PATCH v5 06/13] mm/huge pud: Use transparent huge pud helpers only with CONFIG_TRANSPARENT_HUGEPAGE

2023-07-17 Thread Aneesh Kumar K.V
pudp_set_wrprotect and move_huge_pud helpers are only used when
CONFIG_TRANSPARENT_HUGEPAGE is enabled. Similar to the pmdp_set_wrprotect and
move_huge_pmd helpers, use the architecture override only if
CONFIG_TRANSPARENT_HUGEPAGE is set.

Reviewed-by: Christophe Leroy 
Signed-off-by: Aneesh Kumar K.V 
---
 include/linux/pgtable.h | 2 ++
 mm/mremap.c | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index ca67ecbd9a66..bc9d6b681e25 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -558,6 +558,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
 #endif
 #ifndef __HAVE_ARCH_PUDP_SET_WRPROTECT
 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 static inline void pudp_set_wrprotect(struct mm_struct *mm,
  unsigned long address, pud_t *pudp)
 {
@@ -571,6 +572,7 @@ static inline void pudp_set_wrprotect(struct mm_struct *mm,
 {
BUILD_BUG();
 }
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
 #endif
 
diff --git a/mm/mremap.c b/mm/mremap.c
index 11e06e4ab33b..056478c106ee 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -349,7 +349,7 @@ static inline bool move_normal_pud(struct vm_area_struct 
*vma,
 }
 #endif
 
-#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && 
defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
 static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
  unsigned long new_addr, pud_t *old_pud, pud_t 
*new_pud)
 {
-- 
2.41.0



[PATCH v5 05/13] mm: Add pud_same similar to __HAVE_ARCH_P4D_SAME

2023-07-17 Thread Aneesh Kumar K.V
This helps architectures to override pmd_same and pud_same independently.

Signed-off-by: Aneesh Kumar K.V 
---
 include/linux/pgtable.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 6fd9b2831338..ca67ecbd9a66 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -693,11 +693,14 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
 {
return pmd_val(pmd_a) == pmd_val(pmd_b);
 }
+#endif
 
+#ifndef pud_same
 static inline int pud_same(pud_t pud_a, pud_t pud_b)
 {
return pud_val(pud_a) == pud_val(pud_b);
 }
+#define pud_same pud_same
 #endif
 
 #ifndef __HAVE_ARCH_P4D_SAME
-- 
2.41.0



[PATCH v5 04/13] mm/vmemmap: Allow architectures to override how vmemmap optimization works

2023-07-17 Thread Aneesh Kumar K.V
Architectures like powerpc will like to use different page table allocators
and mapping mechanisms to implement vmemmap optimization. Similar to
vmemmap_populate allow architectures to implement
vmemmap_populate_compound_pages

Signed-off-by: Aneesh Kumar K.V 
---
 mm/sparse-vmemmap.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index a044a130405b..a2cbe44c48e1 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -358,6 +358,7 @@ int __meminit vmemmap_populate_hugepages(unsigned long 
start, unsigned long end,
return 0;
 }
 
+#ifndef vmemmap_populate_compound_pages
 /*
  * For compound pages bigger than section size (e.g. x86 1G compound
  * pages with 2M subsection size) fill the rest of sections as tail
@@ -446,6 +447,8 @@ static int __meminit 
vmemmap_populate_compound_pages(unsigned long start_pfn,
return 0;
 }
 
+#endif
+
 struct page * __meminit __populate_section_memmap(unsigned long pfn,
unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
struct dev_pagemap *pgmap)
-- 
2.41.0



[PATCH v5 03/13] mm/vmemmap: Improve vmemmap_can_optimize and allow architectures to override

2023-07-17 Thread Aneesh Kumar K.V
dax vmemmap optimization requires a minimum of 2 PAGE_SIZE area within
vmemmap such that tail page mapping can point to the second PAGE_SIZE area.
Enforce that in vmemmap_can_optimize() function.

Architectures like powerpc also want to enable vmemmap optimization
conditionally (only with radix MMU translation). Hence allow architecture
override.

Reviewed-by: Christophe Leroy 
Signed-off-by: Aneesh Kumar K.V 
---
 include/linux/mm.h | 27 +++
 mm/mm_init.c   |  2 +-
 2 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2dd73e4f3d8e..1a2234ee14d2 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3639,13 +3639,32 @@ void vmemmap_free(unsigned long start, unsigned long 
end,
struct vmem_altmap *altmap);
 #endif
 
+#define VMEMMAP_RESERVE_NR 2
 #ifdef CONFIG_ARCH_WANT_OPTIMIZE_VMEMMAP
-static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
-  struct dev_pagemap *pgmap)
+static inline bool __vmemmap_can_optimize(struct vmem_altmap *altmap,
+ struct dev_pagemap *pgmap)
 {
-   return is_power_of_2(sizeof(struct page)) &&
-   pgmap && (pgmap_vmemmap_nr(pgmap) > 1) && !altmap;
+   unsigned long nr_pages;
+   unsigned long nr_vmemmap_pages;
+
+   if (!pgmap || !is_power_of_2(sizeof(struct page)))
+   return false;
+
+   nr_pages = pgmap_vmemmap_nr(pgmap);
+   nr_vmemmap_pages = ((nr_pages * sizeof(struct page)) >> PAGE_SHIFT);
+   /*
+* For vmemmap optimization with DAX we need minimum 2 vmemmap
+* pages. See layout diagram in Documentation/mm/vmemmap_dedup.rst
+*/
+   return !altmap && (nr_vmemmap_pages > VMEMMAP_RESERVE_NR);
 }
+/*
+ * If we don't have an architecture override, use the generic rule
+ */
+#ifndef vmemmap_can_optimize
+#define vmemmap_can_optimize __vmemmap_can_optimize
+#endif
+
 #else
 static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
   struct dev_pagemap *pgmap)
diff --git a/mm/mm_init.c b/mm/mm_init.c
index a1963c3322af..245ac69b66a5 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1020,7 +1020,7 @@ static inline unsigned long compound_nr_pages(struct 
vmem_altmap *altmap,
if (!vmemmap_can_optimize(altmap, pgmap))
return pgmap_vmemmap_nr(pgmap);
 
-   return 2 * (PAGE_SIZE / sizeof(struct page));
+   return VMEMMAP_RESERVE_NR * (PAGE_SIZE / sizeof(struct page));
 }
 
 static void __ref memmap_init_compound(struct page *head,
-- 
2.41.0



[PATCH v5 02/13] mm: Change pudp_huge_get_and_clear_full take vm_area_struct as arg

2023-07-17 Thread Aneesh Kumar K.V
We will use this in a later patch to do tlb flush when clearing pud entries
on powerpc. This is similar to commit 93a98695f2f9 ("mm: change
pmdp_huge_get_and_clear_full take vm_area_struct as arg")

Reviewed-by: Christophe Leroy 
Signed-off-by: Aneesh Kumar K.V 
---
 include/linux/pgtable.h | 4 ++--
 mm/debug_vm_pgtable.c   | 2 +-
 mm/huge_memory.c| 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index cf13f8d938a8..6fd9b2831338 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -450,11 +450,11 @@ static inline pmd_t pmdp_huge_get_and_clear_full(struct 
vm_area_struct *vma,
 #endif
 
 #ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR_FULL
-static inline pud_t pudp_huge_get_and_clear_full(struct mm_struct *mm,
+static inline pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma,
unsigned long address, pud_t *pudp,
int full)
 {
-   return pudp_huge_get_and_clear(mm, address, pudp);
+   return pudp_huge_get_and_clear(vma->vm_mm, address, pudp);
 }
 #endif
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index ee119e33fef1..ee2c4c1dcfc8 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -385,7 +385,7 @@ static void __init pud_advanced_tests(struct 
pgtable_debug_args *args)
WARN_ON(!(pud_write(pud) && pud_dirty(pud)));
 
 #ifndef __PAGETABLE_PMD_FOLDED
-   pudp_huge_get_and_clear_full(args->mm, vaddr, args->pudp, 1);
+   pudp_huge_get_and_clear_full(args->vma, vaddr, args->pudp, 1);
pud = READ_ONCE(*args->pudp);
WARN_ON(!pud_none(pud));
 #endif /* __PAGETABLE_PMD_FOLDED */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index eb3678360b97..ba20cef681a4 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1981,7 +1981,7 @@ int zap_huge_pud(struct mmu_gather *tlb, struct 
vm_area_struct *vma,
if (!ptl)
return 0;
 
-   pudp_huge_get_and_clear_full(tlb->mm, addr, pud, tlb->fullmm);
+   pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm);
tlb_remove_pud_tlb_entry(tlb, pud, addr);
if (vma_is_special_huge(vma)) {
spin_unlock(ptl);
-- 
2.41.0



[PATCH v5 00/13] Add support for DAX vmemmap optimization for ppc64

2023-07-17 Thread Aneesh Kumar K.V
This patch series implements changes required to support DAX vmemmap
optimization for ppc64. The vmemmap optimization is only enabled with radix MMU
translation and 1GB PUD mapping with 64K page size. The patch series also split
hugetlb vmemmap optimization as a separate Kconfig variable so that
architectures can enable DAX vmemmap optimization without enabling hugetlb
vmemmap optimization. This should enable architectures like arm64 to enable DAX
vmemmap optimization while they can't enable hugetlb vmemmap optimization. More
details of the same are in patch "mm/vmemmap optimization: Split hugetlb and
devdax vmemmap optimization"

Changes from v4:
* Address review feedback
* Add the Reviewed-by:

Changes from v3:
* Rebase to latest linus tree
* Build fix with SPARSEMEM_VMEMMP disabled
* Add hash_pud_same outside THP Kconfig

Changes from v2:
* Rebase to latest linus tree
* Address review feedback

Changes from V1:
* Fix make htmldocs warning
* Fix vmemmap allocation bugs with different alignment values.
* Correctly check for section validity before we free vmemmap area


Aneesh Kumar K.V (13):
  mm/hugepage pud: Allow arch-specific helper function to check huge
page pud support
  mm: Change pudp_huge_get_and_clear_full take vm_area_struct as arg
  mm/vmemmap: Improve vmemmap_can_optimize and allow architectures to
override
  mm/vmemmap: Allow architectures to override how vmemmap optimization
works
  mm: Add pud_same similar to __HAVE_ARCH_P4D_SAME
  mm/huge pud: Use transparent huge pud helpers only with
CONFIG_TRANSPARENT_HUGEPAGE
  mm/vmemmap optimization: Split hugetlb and devdax vmemmap optimization
  powerpc/mm/trace: Convert trace event to trace event class
  powerpc/book3s64/mm: Enable transparent pud hugepage
  powerpc/book3s64/vmemmap: Switch radix to use a different vmemmap
handling function
  powerpc/book3s64/radix: Add support for vmemmap optimization for radix
  powerpc/book3s64/radix: Remove mmu_vmemmap_psize
  powerpc/book3s64/radix: Add debug message to give more details of
vmemmap allocation

 Documentation/mm/vmemmap_dedup.rst|   1 +
 Documentation/powerpc/index.rst   |   1 +
 Documentation/powerpc/vmemmap_dedup.rst   | 101 +++
 arch/loongarch/Kconfig|   2 +-
 arch/powerpc/Kconfig  |   1 +
 arch/powerpc/include/asm/book3s/64/hash.h |   9 +
 arch/powerpc/include/asm/book3s/64/pgtable.h  | 155 -
 arch/powerpc/include/asm/book3s/64/radix.h|  47 ++
 .../include/asm/book3s/64/tlbflush-radix.h|   2 +
 arch/powerpc/include/asm/book3s/64/tlbflush.h |   8 +
 arch/powerpc/include/asm/pgtable.h|   4 +
 arch/powerpc/mm/book3s64/hash_pgtable.c   |   2 +-
 arch/powerpc/mm/book3s64/pgtable.c|  78 +++
 arch/powerpc/mm/book3s64/radix_pgtable.c  | 573 --
 arch/powerpc/mm/book3s64/radix_tlb.c  |   7 +
 arch/powerpc/mm/init_64.c |  37 +-
 arch/powerpc/platforms/Kconfig.cputype|   1 +
 arch/riscv/Kconfig|   2 +-
 arch/s390/Kconfig |   2 +-
 arch/x86/Kconfig  |   3 +-
 drivers/nvdimm/pfn_devs.c |   2 +-
 fs/Kconfig|   2 +-
 include/linux/mm.h|  29 +-
 include/linux/pgtable.h   |  12 +-
 include/trace/events/thp.h|  33 +-
 mm/Kconfig|   5 +-
 mm/debug_vm_pgtable.c |   2 +-
 mm/huge_memory.c  |   2 +-
 mm/mm_init.c  |   2 +-
 mm/mremap.c   |   2 +-
 mm/sparse-vmemmap.c   |   3 +
 31 files changed, 1048 insertions(+), 82 deletions(-)
 create mode 100644 Documentation/powerpc/vmemmap_dedup.rst

-- 
2.41.0



[PATCH v5 01/13] mm/hugepage pud: Allow arch-specific helper function to check huge page pud support

2023-07-17 Thread Aneesh Kumar K.V
Architectures like powerpc would like to enable transparent huge page pud
support only with radix translation. To support that add
has_transparent_pud_hugepage() helper that architectures can override.

Reviewed-by: Christophe Leroy 
Signed-off-by: Aneesh Kumar K.V 
---
 drivers/nvdimm/pfn_devs.c | 2 +-
 include/linux/pgtable.h   | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index af7d9301520c..18ad315581ca 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -100,7 +100,7 @@ static unsigned long *nd_pfn_supported_alignments(unsigned 
long *alignments)
 
if (has_transparent_hugepage()) {
alignments[1] = HPAGE_PMD_SIZE;
-   if (IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD))
+   if (has_transparent_pud_hugepage())
alignments[2] = HPAGE_PUD_SIZE;
}
 
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 5063b482e34f..cf13f8d938a8 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1499,6 +1499,9 @@ typedef unsigned int pgtbl_mod_mask;
 #define has_transparent_hugepage() IS_BUILTIN(CONFIG_TRANSPARENT_HUGEPAGE)
 #endif
 
+#ifndef has_transparent_pud_hugepage
+#define has_transparent_pud_hugepage() 
IS_BUILTIN(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+#endif
 /*
  * On some architectures it depends on the mm if the p4d/pud or pmd
  * layer of the page table hierarchy is folded or not.
-- 
2.41.0



[PATCH] ALSA: ps3: Fix errors in snd_ps3.h

2023-07-17 Thread shijie001

The following checkpatch errors are removed:
ERROR: "foo * bar" should be "foo *bar"

Signed-off-by: Jie Shi 
---
 sound/ppc/snd_ps3.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sound/ppc/snd_ps3.h b/sound/ppc/snd_ps3.h
index 8b554a79bc14..63bdb8ee3953 100644
--- a/sound/ppc/snd_ps3.h
+++ b/sound/ppc/snd_ps3.h
@@ -69,11 +69,11 @@ struct snd_ps3_card_info {
 /* dma buffer management */
 spinlock_t dma_lock;
 /* dma_lock start */
-void * dma_start_vaddr[2]; /* 0 for L, 1 for R */
+void *dma_start_vaddr[2]; /* 0 for L, 1 for R */
 dma_addr_t dma_start_bus_addr[2];
 size_t dma_buffer_size;
-void * dma_last_transfer_vaddr[2];
-void * dma_next_transfer_vaddr[2];
+void *dma_last_transfer_vaddr[2];
+void *dma_next_transfer_vaddr[2];
 intsilent;
 /* dma_lock end */


Re: linux-next: Tree for Jul 13 (drivers/video/fbdev/ps3fb.c)

2023-07-17 Thread Randy Dunlap
Hi Thomas,
On 7/14/23 13:46, Randy Dunlap wrote:
> Thomas,
>
> On 7/13/23 09:11, Randy Dunlap wrote:
>>
>>
>> On 7/12/23 19:37, Stephen Rothwell wrote:
>>> Hi all,
>>>
>
> I still see this build error on linux-next 20230714.

I still see this build error on linux-next 20230717.

>
>>> Changes since 20230712:
>>>
>>
>> on ppc64:
>>
>> In file included from ../include/linux/device.h:15,
>> from ../arch/powerpc/include/asm/io.h:22,
>> from ../include/linux/io.h:13,
>> from ../include/linux/irq.h:20,
>> from ../arch/powerpc/include/asm/hardirq.h:6,
>> from ../include/linux/hardirq.h:11,
>> from ../include/linux/interrupt.h:11,
>> from ../drivers/video/fbdev/ps3fb.c:25:
>> ../drivers/video/fbdev/ps3fb.c: In function 'ps3fb_probe':
>> ../drivers/video/fbdev/ps3fb.c:1172:40: error: 'struct fb_info' has no 
>> member named 'dev'
>> 1172 | dev_driver_string(info->dev), dev_name(info->dev),
>> | ^~
>> ../include/linux/dev_printk.h:110:37: note: in definition of macro 
>> 'dev_printk_index_wrap'
>> 110 | _p_func(dev, fmt, ##__VA_ARGS__); \
>> | ^~~
>> ../drivers/video/fbdev/ps3fb.c:1171:9: note: in expansion of macro 'dev_info'
>> 1171 | dev_info(info->device, "%s %s, using %u KiB of video memory\n",
>> | ^~~~
>> ../drivers/video/fbdev/ps3fb.c:1172:61: error: 'struct fb_info' has no 
>> member named 'dev'
>> 1172 | dev_driver_string(info->dev), dev_name(info->dev),
>> | ^~
>> ../include/linux/dev_printk.h:110:37: note: in definition of macro 
>> 'dev_printk_index_wrap'

-- 
~Randy [using gmail temporarily while infradead is down]


Re: [PATCH 0/2] eventfd: simplify signal helpers

2023-07-17 Thread Alex Williamson
On Mon, 17 Jul 2023 19:12:16 -0300
Jason Gunthorpe  wrote:

> On Mon, Jul 17, 2023 at 01:08:31PM -0600, Alex Williamson wrote:
> 
> > What would that mechanism be?  We've been iterating on getting the
> > serialization and buffering correct, but I don't know of another means
> > that combines the notification with a value, so we'd likely end up with
> > an eventfd only for notification and a separate ring buffer for
> > notification values.  
> 
> All FDs do this. You just have to make a FD with custom
> file_operations that does what this wants. The uAPI shouldn't be able
> to tell if the FD is backing it with an eventfd or otherwise. Have the
> kernel return the FD instead of accepting it. Follow the basic design
> of eg mlx5vf_save_fops

Sure, userspace could poll on any fd and read a value from it, but at
that point we're essentially duplicating a lot of what eventfd provides
for a minor(?) semantic difference over how the counter value is
interpreted.  Using an actual eventfd allows the ACPI notification to
work as just another interrupt index within the existing vfio IRQ uAPI.
Thanks,

Alex



Re: [PATCH 0/2] eventfd: simplify signal helpers

2023-07-17 Thread Jason Gunthorpe
On Mon, Jul 17, 2023 at 01:08:31PM -0600, Alex Williamson wrote:

> What would that mechanism be?  We've been iterating on getting the
> serialization and buffering correct, but I don't know of another means
> that combines the notification with a value, so we'd likely end up with
> an eventfd only for notification and a separate ring buffer for
> notification values.

All FDs do this. You just have to make a FD with custom
file_operations that does what this wants. The uAPI shouldn't be able
to tell if the FD is backing it with an eventfd or otherwise. Have the
kernel return the FD instead of accepting it. Follow the basic design
of eg mlx5vf_save_fops

Jason


Re: [PATCH 0/2] Add support for rpmsg sound card on i.MX93 platform

2023-07-17 Thread Mark Brown
On Fri, 14 Jul 2023 17:29:11 +0800, Chancel Liu wrote:
> Support rpmsg sound card on i.MX93 platform.
> 
> Chancel Liu (2):
>   ASoC: dt-bindings: fsl_rpmsg: Add compatible string for i.MX93
>   ASoC: fsl_rpmsg: Add support for i.MX93 platform
> 
>  Documentation/devicetree/bindings/sound/fsl,rpmsg.yaml | 1 +
>  sound/soc/fsl/fsl_rpmsg.c  | 8 
>  2 files changed, 9 insertions(+)
> 
> [...]

Applied to

   https://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound.git for-next

Thanks!

[1/2] ASoC: dt-bindings: fsl_rpmsg: Add compatible string for i.MX93
  commit: 143f8c69a27f3fa8ed30c7f6790ea039fff57cfe
[2/2] ASoC: fsl_rpmsg: Add support for i.MX93 platform
  commit: 60f38a592efe08e5ced454e8a05f6814e6e221ec

All being well this means that it will be integrated into the linux-next
tree (usually sometime in the next 24 hours) and sent to Linus during
the next merge window (or sooner if it is a bug fix), however if
problems are discovered then the patch may be dropped or reverted.

You may get further e-mails resulting from automated or manual testing
and review of the tree, please engage with people reporting problems and
send followup patches addressing any issues that are reported if needed.

If any updates are required or you are submitting further changes they
should be sent as incremental updates against current git, existing
patches will not be replaced.

Please add any relevant lists and maintainers to the CCs when replying
to this mail.

Thanks,
Mark



Re: [PATCH 0/2] eventfd: simplify signal helpers

2023-07-17 Thread Alex Williamson
On Mon, 17 Jul 2023 10:29:34 +0200
Grzegorz Jaszczyk  wrote:

> pt., 14 lip 2023 o 09:05 Christian Brauner  napisał(a):
> >
> > On Thu, Jul 13, 2023 at 11:10:54AM -0600, Alex Williamson wrote:  
> > > On Thu, 13 Jul 2023 12:05:36 +0200
> > > Christian Brauner  wrote:
> > >  
> > > > Hey everyone,
> > > >
> > > > This simplifies the eventfd_signal() and eventfd_signal_mask() helpers
> > > > by removing the count argument which is effectively unused.  
> > >
> > > We have a patch under review which does in fact make use of the
> > > signaling value:
> > >
> > > https://lore.kernel.org/all/20230630155936.3015595-1-...@semihalf.com/  
> >
> > Huh, thanks for the link.
> >
> > Quoting from
> > https://patchwork.kernel.org/project/kvm/patch/20230307220553.631069-1-...@semihalf.com/#25266856
> >  
> > > Reading an eventfd returns an 8-byte value, we generally only use it
> > > as a counter, but it's been discussed previously and IIRC, it's possible
> > > to use that value as a notification value.  
> >
> > So the goal is to pipe a specific value through eventfd? But it is
> > explicitly a counter. The whole thing is written around a counter and
> > each write and signal adds to the counter.
> >
> > The consequences are pretty well described in the cover letter of
> > v6 https://lore.kernel.org/all/20230630155936.3015595-1-...@semihalf.com/
> >  
> > > Since the eventfd counter is used as ACPI notification value
> > > placeholder, the eventfd signaling needs to be serialized in order to
> > > not end up with notification values being coalesced. Therefore ACPI
> > > notification values are buffered and signalized one by one, when the
> > > previous notification value has been consumed.  
> >
> > But isn't this a good indication that you really don't want an eventfd
> > but something that's explicitly designed to associate specific data with
> > a notification? Using eventfd in that manner requires serialization,
> > buffering, and enforces ordering.

What would that mechanism be?  We've been iterating on getting the
serialization and buffering correct, but I don't know of another means
that combines the notification with a value, so we'd likely end up with
an eventfd only for notification and a separate ring buffer for
notification values.

As this series demonstrates, the current in-kernel users only increment
the counter and most userspace likely discards the counter value, which
makes the counter largely a waste.  While perhaps unconventional,
there's no requirement that the counter may only be incremented by one,
nor any restriction that I see in how userspace must interpret the
counter value.

As I understand the ACPI notification proposal that Grzegorz links
below, a notification with an interpreted value allows for a more
direct userspace implementation when dealing with a series of discrete
notification with value events.  Thanks,

Alex

> > I have no skin in the game aside from having to drop this conversion
> > which I'm fine to do if there are actually users for this btu really,
> > that looks a lot like abusing an api that really wasn't designed for
> > this.  
> 
> https://patchwork.kernel.org/project/kvm/patch/20230307220553.631069-1-...@semihalf.com/
> was posted at the beginig of March and one of the main things we've
> discussed was the mechanism for propagating acpi notification value.
> We've endup with eventfd as the best mechanism and have actually been
> using it from v2. I really do not want to waste this effort, I think
> we are quite advanced with v6 now. Additionally we didn't actually
> modify any part of eventfd support that was in place, we only used it
> in a specific (and discussed beforehand) way.



Re: [PATCH v9 01/42] mm: Rename arch pte_mkwrite()'s to pte_mkwrite_novma()

2023-07-17 Thread Mark Brown
On Mon, Jul 17, 2023 at 03:55:50PM +, Edgecombe, Rick P wrote:
> On Fri, 2023-07-14 at 23:57 +0100, Mark Brown wrote:

> > The same issue seems to apply with the version that was in -next
> > based
> > on v6.4-rc4 too.

> The version in your branch is not the same as the version in tip (which
> had a squashed build fix). I was able to reproduce the build error with
> your branch. But not with the one in tip rebased on v6.5-rc1. So can
> you try this version:
> https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/commit/?h=x86/shstk=899223d69ce9f338056f4c41ef870d70040fc860

Ah, I'd not seen that patch or that tip had been rebased - I'd actually
been using literally the branch from tip as my base at whatever point I
last noticed it changing up until I rebased onto -rc1.


signature.asc
Description: PGP signature


Re: [PATCH 2/2] PCI: layerscape: Add the workaround for lost link capablities during reset

2023-07-17 Thread Manivannan Sadhasivam
On Thu, Jun 15, 2023 at 12:41:12PM -0400, Frank Li wrote:
> From: Xiaowei Bao 
> 
> A workaround for the issue where the PCI Express Endpoint (EP) controller
> loses the values of the Maximum Link Width and Supported Link Speed from
> the Link Capabilities Register, which initially configured by the Reset
> Configuration Word (RCW) during a link-down or hot reset event.
> 

If this fixes an issue, then there should be a Fixes tag.

> Signed-off-by: Xiaowei Bao 
> Signed-off-by: Hou Zhiqiang 
> Signed-off-by: Frank Li 
> ---
>  drivers/pci/controller/dwc/pci-layerscape-ep.c | 13 +
>  1 file changed, 13 insertions(+)
> 
> diff --git a/drivers/pci/controller/dwc/pci-layerscape-ep.c 
> b/drivers/pci/controller/dwc/pci-layerscape-ep.c
> index 4e4fdd1dfea7..2ef02d827eeb 100644
> --- a/drivers/pci/controller/dwc/pci-layerscape-ep.c
> +++ b/drivers/pci/controller/dwc/pci-layerscape-ep.c
> @@ -45,6 +45,7 @@ struct ls_pcie_ep {
>   struct pci_epc_features *ls_epc;
>   const struct ls_pcie_ep_drvdata *drvdata;
>   int irq;
> + u32 lnkcap;
>   boolbig_endian;
>  };
>  
> @@ -73,6 +74,7 @@ static irqreturn_t ls_pcie_ep_event_handler(int irq, void 
> *dev_id)
>   struct ls_pcie_ep *pcie = dev_id;
>   struct dw_pcie *pci = pcie->pci;
>   u32 val, cfg;
> + u8 offset;
>  
>   val = ls_lut_readl(pcie, PEX_PF0_PME_MES_DR);
>   ls_lut_writel(pcie, PEX_PF0_PME_MES_DR, val);
> @@ -81,6 +83,13 @@ static irqreturn_t ls_pcie_ep_event_handler(int irq, void 
> *dev_id)
>   return IRQ_NONE;
>  
>   if (val & PEX_PF0_PME_MES_DR_LUD) {
> +

Please add a comment on why the LNKCAP is being restored here.

> + offset = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP);
> +
> + dw_pcie_dbi_ro_wr_en(pci);
> + dw_pcie_writew_dbi(pci, offset + PCI_EXP_LNKCAP, pcie->lnkcap);

lnkcap is a 32-bit variable, so you should use dw_pcie_writel_dbi().

- Mani

> + dw_pcie_dbi_ro_wr_dis(pci);
> +
>   cfg = ls_lut_readl(pcie, PEX_PF0_CONFIG);
>   cfg |= PEX_PF0_CFG_READY;
>   ls_lut_writel(pcie, PEX_PF0_CONFIG, cfg);
> @@ -216,6 +225,7 @@ static int __init ls_pcie_ep_probe(struct platform_device 
> *pdev)
>   struct ls_pcie_ep *pcie;
>   struct pci_epc_features *ls_epc;
>   struct resource *dbi_base;
> + u8 offset;
>   int ret;
>  
>   pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL);
> @@ -252,6 +262,9 @@ static int __init ls_pcie_ep_probe(struct platform_device 
> *pdev)
>  
>   platform_set_drvdata(pdev, pcie);
>  
> + offset = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP);
> + pcie->lnkcap = dw_pcie_readl_dbi(pci, offset + PCI_EXP_LNKCAP);
> +
>   ret = dw_pcie_ep_init(>ep);
>   if (ret)
>   return ret;
> -- 
> 2.34.1
> 

-- 
மணிவண்ணன் சதாசிவம்


Re: [PATCH v9 01/42] mm: Rename arch pte_mkwrite()'s to pte_mkwrite_novma()

2023-07-17 Thread Edgecombe, Rick P
On Fri, 2023-07-14 at 23:57 +0100, Mark Brown wrote:
> On Mon, Jun 12, 2023 at 05:10:27PM -0700, Rick Edgecombe wrote:
> > The x86 Shadow stack feature includes a new type of memory called
> > shadow
> > stack. This shadow stack memory has some unusual properties, which
> > requires
> > some core mm changes to function properly.
> 
> This seems to break sparc64_defconfig when applied on top of v6.5-
> rc1:
> 
> In file included from /home/broonie/git/bisect/include/linux/mm.h:29,
>  from /home/broonie/git/bisect/net/core/skbuff.c:40:
> /home/broonie/git/bisect/include/linux/pgtable.h: In function
> 'pmd_mkwrite':
> /home/broonie/git/bisect/include/linux/pgtable.h:528:9: error:
> implicit declaration of function 'pmd_mkwrite_novma'; did you mean
> 'pte_mkwrite_novma'? [-Werror=implicit-function-declaration]
>   return pmd_mkwrite_novma(pmd);
>  ^
>  pte_mkwrite_novma
> /home/broonie/git/bisect/include/linux/pgtable.h:528:9: error:
> incompatible types when returning type 'int' but 'pmd_t' {aka 'struct
> '} was expected
>   return pmd_mkwrite_novma(pmd);
>  ^~
> 
> The same issue seems to apply with the version that was in -next
> based
> on v6.4-rc4 too.

The version in your branch is not the same as the version in tip (which
had a squashed build fix). I was able to reproduce the build error with
your branch. But not with the one in tip rebased on v6.5-rc1. So can
you try this version:
https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/commit/?h=x86/shstk=899223d69ce9f338056f4c41ef870d70040fc860




Re: [PATCH 1/2] PCI: layerscape: Add support for Link down notification

2023-07-17 Thread Manivannan Sadhasivam
On Thu, Jun 15, 2023 at 12:41:11PM -0400, Frank Li wrote:
> Add support to pass Link down notification to Endpoint function driver
> so that the LINK_DOWN event can be processed by the function.
> 
> Signed-off-by: Frank Li 

One nit below. With that,

Acked-by: Manivannan Sadhasivam 

> ---
>  drivers/pci/controller/dwc/pci-layerscape-ep.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/drivers/pci/controller/dwc/pci-layerscape-ep.c 
> b/drivers/pci/controller/dwc/pci-layerscape-ep.c
> index de4c1758a6c3..4e4fdd1dfea7 100644
> --- a/drivers/pci/controller/dwc/pci-layerscape-ep.c
> +++ b/drivers/pci/controller/dwc/pci-layerscape-ep.c
> @@ -88,6 +88,7 @@ static irqreturn_t ls_pcie_ep_event_handler(int irq, void 
> *dev_id)
>  
>   dev_dbg(pci->dev, "Link up\n");
>   } else if (val & PEX_PF0_PME_MES_DR_LDD) {
> + pci_epc_linkdown(pci->ep.epc);

It'd be good to move this call after dev_dbg().

- Mani

>   dev_dbg(pci->dev, "Link down\n");
>   } else if (val & PEX_PF0_PME_MES_DR_HRD) {
>   dev_dbg(pci->dev, "Hot reset\n");
> -- 
> 2.34.1
> 

-- 
மணிவண்ணன் சதாசிவம்


[PATCH 1/1] sound:soc: fix return value check in imx_audmux_suspend

2023-07-17 Thread Yuanjun Gong
check the return value of clk_prepare_enable, and if
clk_prepare_enable got an unexpected return value,
imx_audmux_suspend should return the error value.

Signed-off-by: Yuanjun Gong 
---
 sound/soc/fsl/imx-audmux.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/sound/soc/fsl/imx-audmux.c b/sound/soc/fsl/imx-audmux.c
index be003a117b39..962b6baf0a34 100644
--- a/sound/soc/fsl/imx-audmux.c
+++ b/sound/soc/fsl/imx-audmux.c
@@ -325,8 +325,11 @@ static void imx_audmux_remove(struct platform_device *pdev)
 static int imx_audmux_suspend(struct device *dev)
 {
int i;
+   ssize_t ret;
 
-   clk_prepare_enable(audmux_clk);
+   ret = clk_prepare_enable(audmux_clk);
+   if (ret)
+   return ret;
 
for (i = 0; i < reg_max; i++)
regcache[i] = readl(audmux_base + i * 4);
-- 
2.17.1



[PATCH 1/1] sound:soc: fix return value check in imx_audmux_resume

2023-07-17 Thread Yuanjun Gong
check the return value of clk_prepare_enable, and if
clk_prepare_enable got an unexpected return value,
imx_audmux_resume should return the error value.

Signed-off-by: Yuanjun Gong 
---
 sound/soc/fsl/imx-audmux.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/sound/soc/fsl/imx-audmux.c b/sound/soc/fsl/imx-audmux.c
index be003a117b39..e8a3a1baf18d 100644
--- a/sound/soc/fsl/imx-audmux.c
+++ b/sound/soc/fsl/imx-audmux.c
@@ -339,8 +339,11 @@ static int imx_audmux_suspend(struct device *dev)
 static int imx_audmux_resume(struct device *dev)
 {
int i;
+   ssize_t ret;
 
-   clk_prepare_enable(audmux_clk);
+   ret = clk_prepare_enable(audmux_clk);
+   if (ret)
+   return ret;
 
for (i = 0; i < reg_max; i++)
writel(regcache[i], audmux_base + i * 4);
-- 
2.17.1



Re: [PATCH] powerpc/build: vdso linker warning for orphan sections

2023-07-17 Thread John Ogness
Hi Nicholas,

On 2023-06-09, Nicholas Piggin  wrote:
> Add --orphan-handling for vdsos, and adjust vdso linker scripts to deal
> with orphan sections.

I'm reporting that I am getting a linker warning with 6.5-rc2. The
warning message is:

ld: warning: discarding dynamic section .rela.opd

and bisects to:

8ad57add77d3 ("powerpc/build: vdso linker warning for orphan sections")

Despite the warning, my ppc64 system seems to run fine. Let me know if
you need any other information from me.

I noticed [0] this with 6.5-rc1 but didn't contact the right people.

John Ogness

[0] https://lore.kernel.org/lkml/871qhf1q3j@jogness.linutronix.de


Re: [PATCH 2/2] PCI: layerscape: Add the workaround for lost link capabilities during reset

2023-07-17 Thread Frank Li
On Mon, Jul 17, 2023 at 09:29:10PM +0530, Manivannan Sadhasivam wrote:
> On Thu, Jun 15, 2023 at 12:41:12PM -0400, Frank Li wrote:
> > From: Xiaowei Bao 
> > 
> > A workaround for the issue where the PCI Express Endpoint (EP) controller
> > loses the values of the Maximum Link Width and Supported Link Speed from
> > the Link Capabilities Register, which initially configured by the Reset
> > Configuration Word (RCW) during a link-down or hot reset event.
> > 
> 
> If this fixes an issue, then there should be a Fixes tag.

It does not fix an existing software issue; it just works around a hardware errata.

> 
> > Signed-off-by: Xiaowei Bao 
> > Signed-off-by: Hou Zhiqiang 
> > Signed-off-by: Frank Li 
> > ---
> >  drivers/pci/controller/dwc/pci-layerscape-ep.c | 13 +
> >  1 file changed, 13 insertions(+)
> > 
> > diff --git a/drivers/pci/controller/dwc/pci-layerscape-ep.c 
> > b/drivers/pci/controller/dwc/pci-layerscape-ep.c
> > index 4e4fdd1dfea7..2ef02d827eeb 100644
> > --- a/drivers/pci/controller/dwc/pci-layerscape-ep.c
> > +++ b/drivers/pci/controller/dwc/pci-layerscape-ep.c
> > @@ -45,6 +45,7 @@ struct ls_pcie_ep {
> > struct pci_epc_features *ls_epc;
> > const struct ls_pcie_ep_drvdata *drvdata;
> > int irq;
> > +   u32 lnkcap;
> > boolbig_endian;
> >  };
> >  
> > @@ -73,6 +74,7 @@ static irqreturn_t ls_pcie_ep_event_handler(int irq, void 
> > *dev_id)
> > struct ls_pcie_ep *pcie = dev_id;
> > struct dw_pcie *pci = pcie->pci;
> > u32 val, cfg;
> > +   u8 offset;
> >  
> > val = ls_lut_readl(pcie, PEX_PF0_PME_MES_DR);
> > ls_lut_writel(pcie, PEX_PF0_PME_MES_DR, val);
> > @@ -81,6 +83,13 @@ static irqreturn_t ls_pcie_ep_event_handler(int irq, 
> > void *dev_id)
> > return IRQ_NONE;
> >  
> > if (val & PEX_PF0_PME_MES_DR_LUD) {
> > +
> 
> Please add a comment on why the LNKCAP is being restored here.
> 
> > +   offset = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP);
> > +
> > +   dw_pcie_dbi_ro_wr_en(pci);
> > +   dw_pcie_writew_dbi(pci, offset + PCI_EXP_LNKCAP, pcie->lnkcap);
> 
> lnkcap is a 32-bit variable, so you should use dw_pcie_writel_dbi().
> 
> - Mani
> 
> > +   dw_pcie_dbi_ro_wr_dis(pci);
> > +
> > cfg = ls_lut_readl(pcie, PEX_PF0_CONFIG);
> > cfg |= PEX_PF0_CFG_READY;
> > ls_lut_writel(pcie, PEX_PF0_CONFIG, cfg);
> > @@ -216,6 +225,7 @@ static int __init ls_pcie_ep_probe(struct 
> > platform_device *pdev)
> > struct ls_pcie_ep *pcie;
> > struct pci_epc_features *ls_epc;
> > struct resource *dbi_base;
> > +   u8 offset;
> > int ret;
> >  
> > pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL);
> > @@ -252,6 +262,9 @@ static int __init ls_pcie_ep_probe(struct 
> > platform_device *pdev)
> >  
> > platform_set_drvdata(pdev, pcie);
> >  
> > +   offset = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP);
> > +   pcie->lnkcap = dw_pcie_readl_dbi(pci, offset + PCI_EXP_LNKCAP);
> > +
> > ret = dw_pcie_ep_init(>ep);
> > if (ret)
> > return ret;
> > -- 
> > 2.34.1
> > 
> 
> -- 
> மணிவண்ணன் சதாசிவம்


Re: [PATCH] misc: Explicitly include correct DT includes

2023-07-17 Thread Rob Herring
On Fri, Jul 14, 2023 at 11:47 AM Rob Herring  wrote:
>
> The DT of_device.h and of_platform.h date back to the separate
> of_platform_bus_type before it as merged into the regular platform bus.
> As part of that merge prepping Arm DT support 13 years ago, they
> "temporarily" include each other. They also include platform_device.h
> and of.h. As a result, there's a pretty much random mix of those include
> files used throughout the tree. In order to detangle these headers and
> replace the implicit includes with struct declarations, users need to
> explicitly include the correct includes.
>
> Signed-off-by: Rob Herring 
> ---
>  drivers/misc/cxl/base.c| 1 +
>  drivers/misc/fastrpc.c | 1 +
>  drivers/misc/lis3lv02d/lis3lv02d.c | 2 +-
>  drivers/misc/qcom-coincell.c   | 1 -
>  drivers/misc/sram.c| 2 +-
>  drivers/misc/vcpu_stall_detector.c | 1 -
>  drivers/misc/xilinx_sdfec.c| 4 +++-
>  drivers/misc/xilinx_tmr_inject.c   | 3 ++-
>  drivers/misc/xilinx_tmr_manager.c  | 3 ++-
>  9 files changed, 11 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/misc/cxl/base.c b/drivers/misc/cxl/base.c
> index cc0caf9192dc..b054562c046e 100644
> --- a/drivers/misc/cxl/base.c
> +++ b/drivers/misc/cxl/base.c
> @@ -7,6 +7,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include "cxl.h"
>
> diff --git a/drivers/misc/fastrpc.c b/drivers/misc/fastrpc.c
> index 9666d28037e1..1c7c0532da6f 100644
> --- a/drivers/misc/fastrpc.c
> +++ b/drivers/misc/fastrpc.c
> @@ -13,6 +13,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> diff --git a/drivers/misc/lis3lv02d/lis3lv02d.c 
> b/drivers/misc/lis3lv02d/lis3lv02d.c
> index 299d316f1bda..49868a45c0ad 100644
> --- a/drivers/misc/lis3lv02d/lis3lv02d.c
> +++ b/drivers/misc/lis3lv02d/lis3lv02d.c
> @@ -26,7 +26,7 @@
>  #include 
>  #include 
>  #include 
> -#include 
> +#include 
>  #include "lis3lv02d.h"
>
>  #define DRIVER_NAME "lis3lv02d"
> diff --git a/drivers/misc/qcom-coincell.c b/drivers/misc/qcom-coincell.c
> index 54d4f6ee..3c57f7429147 100644
> --- a/drivers/misc/qcom-coincell.c
> +++ b/drivers/misc/qcom-coincell.c
> @@ -8,7 +8,6 @@
>  #include 
>  #include 
>  #include 
> -#include 
>  #include 
>
>  struct qcom_coincell {
> diff --git a/drivers/misc/sram.c b/drivers/misc/sram.c
> index 5757adf418b1..a88f92cf35be 100644
> --- a/drivers/misc/sram.c
> +++ b/drivers/misc/sram.c
> @@ -10,8 +10,8 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
> -#include 
>  #include 
>  #include 
>  #include 
> diff --git a/drivers/misc/vcpu_stall_detector.c 
> b/drivers/misc/vcpu_stall_detector.c
> index 53b5506080e1..6479c962da1a 100644
> --- a/drivers/misc/vcpu_stall_detector.c
> +++ b/drivers/misc/vcpu_stall_detector.c
> @@ -13,7 +13,6 @@
>  #include 
>  #include 
>  #include 
> -#include 
>  #include 
>  #include 
>  #include 
> diff --git a/drivers/misc/xilinx_sdfec.c b/drivers/misc/xilinx_sdfec.c
> index 270ff4c5971a..35941c006552 100644
> --- a/drivers/misc/xilinx_sdfec.c
> +++ b/drivers/misc/xilinx_sdfec.c
> @@ -15,12 +15,14 @@
>  #include 
>  #include 
>  #include 
> -#include 
> +#include 
> +#include 
>  #include 
>  #include 
>  #include 
>  #include 
>  #include 
> +#include 

Double include of of.h. v2 coming.


Re: [PATCH] usb: Explicitly include correct DT includes

2023-07-17 Thread Rob Herring
On Fri, Jul 14, 2023 at 11:50 AM Rob Herring  wrote:
>
> The DT of_device.h and of_platform.h date back to the separate
> of_platform_bus_type before it as merged into the regular platform bus.
> As part of that merge prepping Arm DT support 13 years ago, they
> "temporarily" include each other. They also include platform_device.h
> and of.h. As a result, there's a pretty much random mix of those include
> files used throughout the tree. In order to detangle these headers and
> replace the implicit includes with struct declarations, users need to
> explicitly include the correct includes.
>
> Signed-off-by: Rob Herring 
> ---

[...]

> diff --git a/drivers/usb/host/fsl-mph-dr-of.c 
> b/drivers/usb/host/fsl-mph-dr-of.c
> index a9877f2569f4..2574bccc151b 100644
> --- a/drivers/usb/host/fsl-mph-dr-of.c
> +++ b/drivers/usb/host/fsl-mph-dr-of.c
> @@ -10,10 +10,12 @@
>  #include 
>  #include 
>  #include 
> -#include 
> +#include 
> +#include 
>  #include 
>  #include 
>  #include 
> +#include 

Double include of of.h here. v2 coming.

Rob


Re: [PATCH v2 02/12] mm: introduce execmem_text_alloc() and jit_text_alloc()

2023-07-17 Thread Andy Lutomirski



On Mon, Jun 26, 2023, at 10:48 AM, Song Liu wrote:
> On Mon, Jun 26, 2023 at 5:31 AM Mark Rutland  wrote:
>>
> [...]
>> >
>> > So the idea was that jit_text_alloc() will have a cache of large pages
>> > mapped ROX, will allocate memory from those caches and there will be
>> > jit_update() that uses text poking for writing to that memory.
>> >
>> > Upon allocation of a large page to increase the cache, that large page will
>> > be "invalidated" by filling it with breakpoint instructions (e.g int3 on
>> > x86)
>>
>> Does that work on x86?
>>
>> That is in no way gauranteed for other architectures; on arm64 you need
>> explicit cache maintenance (with I-cache maintenance at the VA to be executed
>> from) followed by context-synchronization-events (e.g. via ISB instructions, 
>> or
>> IPIs).
>
> I guess we need:
> 1) Invalidate unused part of the huge ROX pages;
> 2) Do not put two jit users (including module text, bpf, etc.) in the
> same cache line;
> 3) Explicit cache maintenance;
> 4) context-synchronization-events.
>
> Would these (or a subset of them) be sufficient to protect us from torn read?

Maybe?  #4 is sufficiently vague that I can't really interpret it.

I have a half-drafted email asking for official clarification on the rules that 
might help shed light on this.  I find that this type of request works best 
when it's really well written :)

>
> Thanks,
> Song


Re: Kernel Crash Dump (kdump) broken with 6.5

2023-07-17 Thread Mahesh J Salgaonkar
On 2023-07-17 20:15:53 Mon, Sachin Sant wrote:
> Kdump seems to be broken with 6.5 for ppc64le.
> 
> [ 14.200412] systemd[1]: Starting dracut pre-pivot and cleanup hook...
> [[0;32m OK [0m] Started dracut pre-pivot and cleanup hook.
> Starting Kdump Vmcore Save Service...
> [ 14.231669] systemd[1]: Started dracut pre-pivot and cleanup hook.
> [ 14.231801] systemd[1]: Starting Kdump Vmcore Save Service...
> [ 14.341035] kdump.sh[297]: kdump: saving to 
> /sysroot//var/crash//127.0.0.1-2023-07-14-13:32:34/
> [ 14.350053] EXT4-fs (sda2): re-mounted e971a335-1ef8-4295-ab4e-3940f28e53fc 
> r/w. Quota mode: none.
> [ 14.345979] kdump.sh[297]: kdump: saving vmcore-dmesg.txt to 
> /sysroot//var/crash//127.0.0.1-2023-07-14-13:32:34/
> [ 14.348742] kdump.sh[331]: Cannot open /proc/vmcore: No such file or 
> directory
> [ 14.348845] kdump.sh[297]: kdump: saving vmcore-dmesg.txt failed
> [ 14.349014] kdump.sh[297]: kdump: saving vmcore
> [ 14.443422] kdump.sh[332]: open_dump_memory: Can't open the dump 
> memory(/proc/vmcore). No such file or directory
> [ 14.456413] kdump.sh[332]: makedumpfile Failed.
> [ 14.456662] kdump.sh[297]: kdump: saving vmcore failed, _exitcode:1
> [ 14.456822] kdump.sh[297]: kdump: saving the /run/initramfs/kexec-dmesg.log 
> to /sysroot//var/crash//127.0.0.1-2023-07-14-13:32:34/
> [ 14.487002] kdump.sh[297]: kdump: saving vmcore failed
> [[0;1;31mFAILED[0m] Failed to start Kdump Vmcore Save Service.

Thanks Sachin for catching this.

> 
> 6.4 was good. Git bisect points to following patch
> 
> commit 606787fed7268feb256957872586370b56af697a
> powerpc/64s: Remove support for ELFv1 little endian userspace
> 
> Reverting this patch allows a successful capture of vmcore.
> 
> Does this change require any corresponding change to kdump
> and/or kexec tools?

Need to investigate that. It looks like vmcore_elf64_check_arch()
check from fs/proc/vmcore.c is failing after above commit.

static int __init parse_crash_elf64_headers(void)
{
[...]

/* Do some basic Verification. */
if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 ||
(ehdr.e_type != ET_CORE) ||
!vmcore_elf64_check_arch() ||
[...]

It looks like ehdr->e_flags are not set properly while generating vmcore
ELF header. I see that in kexec_file_load, ehdr->e_flags left set to 0
irrespective of IS_ENABLED(CONFIG_PPC64_ELF_ABI_V2) is true or false.

-Mahesh


Re: [PATCH] dmaengine: Explicitly include correct DT includes

2023-07-17 Thread Rob Herring
On Fri, Jul 14, 2023 at 11:44 AM Rob Herring  wrote:
>
> The DT of_device.h and of_platform.h date back to the separate
> of_platform_bus_type before it as merged into the regular platform bus.
> As part of that merge prepping Arm DT support 13 years ago, they
> "temporarily" include each other. They also include platform_device.h
> and of.h. As a result, there's a pretty much random mix of those include
> files used throughout the tree. In order to detangle these headers and
> replace the implicit includes with struct declarations, users need to
> explicitly include the correct includes.
>
> Signed-off-by: Rob Herring 
> ---
>  drivers/dma/apple-admac.c  | 3 ++-
>  drivers/dma/at_hdmac.c | 2 +-
>  drivers/dma/bcm-sba-raid.c | 4 +++-
>  drivers/dma/bestcomm/bestcomm.c| 4 +---

v2 coming for this:

>> drivers/dma/bestcomm/bestcomm.
c:80:13: error: call to undeclared function 'irq_of_parse_and_map';
ISO C99 and later do not support implicit function declarations
[-Wimplicit-function-declaration]
  80 | tsk->irq = irq_of_parse_and_map(bcom_eng->ofnode,
tsk->tasknum);
 |^
>> drivers/dma/bestcomm/bestcomm.c:105:4: error: call to undeclared function 
>> 'irq_dispose_mapping'; ISO C99 and later do not support implicit function 
>> declarations [-Wimplicit-function-declaration]
 105 | irq_dispose_mapping(tsk->irq);
 | ^
   drivers/dma/bestcomm/bestcomm.c:128:2: error: call to undeclared
function 'irq_dispose_mapping'; ISO C99 and later do not support
implicit function declarations [-Wimplicit-function-declaration]
 128 | irq_dispose_mapping(tsk->irq);
 | ^
   3 errors generated.


Re: [PATCH v5] Revert "powerpc/bug: Provide better flexibility to WARN_ON/__WARN_FLAGS() with asm goto"

2023-07-17 Thread Christophe Leroy


Le 17/07/2023 à 07:01, Michael Ellerman a écrit :
> Christophe Leroy  writes:
>> Le 12/07/2023 à 15:45, Michael Ellerman a écrit :
>>> From: Christophe Leroy 
>>>
>>> This partly reverts commit 1e688dd2a3d6759d416616ff07afc4bb836c4213.
>>>
>>> That commit aimed at optimising the code around generation of
>>> WARN_ON/BUG_ON but this leads to a lot of dead code erroneously
>>> generated by GCC.
>>>
>>> That dead code becomes a problem when we start using objtool validation
>>> because objtool will abort validation with a warning as soon as it
>>> detects unreachable code. This is because unreachable code might
>>> be the indication that objtool doesn't properly decode object text.
>>>
>>>textdata bss dec hex filename
>>> 9551585 3627834  224376 13403795 cc8693 vmlinux.before
>>> 9535281 3628358  224376 13388015 cc48ef vmlinux.after
>>>
>>> Once this change is reverted, in a standard configuration (pmac32 +
>>> function tracer) the text is reduced by 16k which is around 1.7%
>>>
>>> We already had problem with it when starting to use objtool on powerpc
>>> as a replacement for recordmcount, see commit 93e3f45a2631 ("powerpc:
>>> Fix __WARN_FLAGS() for use with Objtool")
>>>
>>> There is also a problem with at least GCC 12, on ppc64_defconfig +
>>> CONFIG_CC_OPTIMIZE_FOR_SIZE=y + CONFIG_DEBUG_SECTION_MISMATCH=y :
>>>
>>>   LD  .tmp_vmlinux.kallsyms1
>>> powerpc64-linux-ld: net/ipv4/tcp_input.o:(__ex_table+0xc4): undefined 
>>> reference to `.L2136'
>>> make[2]: *** [scripts/Makefile.vmlinux:36: vmlinux] Error 1
>>> make[1]: *** [/home/chleroy/linux-powerpc/Makefile:1238: vmlinux] Error 
>>> 2
>>>
>>> Taking into account that other problems are encountered with that
>>> 'asm goto' in WARN_ON(), including build failures, keeping that
>>> change is not worth it allthough it is primarily a compiler bug.
>>>
>>> Revert it for now.
>>>
>>> mpe: Retain EMIT_WARN_ENTRY as a synonym for EMIT_BUG_ENTRY to reduce
>>> churn, as there are now nearly as many uses of EMIT_WARN_ENTRY as
>>> EMIT_BUG_ENTRY.
>>
>> In that case, should we keep __EMIT_BUG_ENTRY and also keep the check
>> that makes sure nobody uses EMIT_BUG_ENTRY with BUGFLAG_WARNING ?
> 
> I didn't think it was worth it, now that it's not a correctness issue.
> 
> I think the better option would be to have EMIT_WARN_ENTRY add
> BUGFLAG_WARNING itself, rather than the caller having to pass it.
> 

Ok that's fine for me.

I'll do that in a follow-up patch one day.

Christophe


Re: [PATCH 2/2] PCI: layerscape: Add the workaround for lost link capabilities during reset

2023-07-17 Thread Frank Li
On Mon, Jul 17, 2023 at 08:45:14AM -0600, Rob Herring wrote:
> On Thu, Jun 15, 2023 at 10:41 AM Frank Li  wrote:
> >
> > From: Xiaowei Bao 
> >
> > A workaround for the issue where the PCI Express Endpoint (EP) controller
> > loses the values of the Maximum Link Width and Supported Link Speed from
> > the Link Capabilities Register, which initially configured by the Reset
> > Configuration Word (RCW) during a link-down or hot reset event.
> 
> What makes this Layerscape specific? Seems like something internal to DWC.

Layerscape's designed behavior is that link speed and width are controlled
by the RCW. But the design has a 'defect' when switching to the DWC
controller; some wires may not be connected correctly. So an errata was
provided, asking software to restore this information on link up/down to
align with the design spec.

For example, the RCW configures the max link as 2-lane; after a link
down/up, the DWC resets the max link to 4-lane. So the host side gets a
report that the max link is 4-lane.

It will not impact function; it is just mismatched information.

Frank

> 
> >
> > Signed-off-by: Xiaowei Bao 
> > Signed-off-by: Hou Zhiqiang 
> > Signed-off-by: Frank Li 
> > ---
> >  drivers/pci/controller/dwc/pci-layerscape-ep.c | 13 +
> >  1 file changed, 13 insertions(+)
> >
> > diff --git a/drivers/pci/controller/dwc/pci-layerscape-ep.c 
> > b/drivers/pci/controller/dwc/pci-layerscape-ep.c
> > index 4e4fdd1dfea7..2ef02d827eeb 100644
> > --- a/drivers/pci/controller/dwc/pci-layerscape-ep.c
> > +++ b/drivers/pci/controller/dwc/pci-layerscape-ep.c
> > @@ -45,6 +45,7 @@ struct ls_pcie_ep {
> > struct pci_epc_features *ls_epc;
> > const struct ls_pcie_ep_drvdata *drvdata;
> > int irq;
> > +   u32 lnkcap;
> > boolbig_endian;
> >  };
> >
> > @@ -73,6 +74,7 @@ static irqreturn_t ls_pcie_ep_event_handler(int irq, void 
> > *dev_id)
> > struct ls_pcie_ep *pcie = dev_id;
> > struct dw_pcie *pci = pcie->pci;
> > u32 val, cfg;
> > +   u8 offset;
> >
> > val = ls_lut_readl(pcie, PEX_PF0_PME_MES_DR);
> > ls_lut_writel(pcie, PEX_PF0_PME_MES_DR, val);
> > @@ -81,6 +83,13 @@ static irqreturn_t ls_pcie_ep_event_handler(int irq, 
> > void *dev_id)
> > return IRQ_NONE;
> >
> > if (val & PEX_PF0_PME_MES_DR_LUD) {
> > +
> > +   offset = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP);
> > +
> > +   dw_pcie_dbi_ro_wr_en(pci);
> > +   dw_pcie_writew_dbi(pci, offset + PCI_EXP_LNKCAP, 
> > pcie->lnkcap);
> > +   dw_pcie_dbi_ro_wr_dis(pci);
> > +
> > cfg = ls_lut_readl(pcie, PEX_PF0_CONFIG);
> > cfg |= PEX_PF0_CFG_READY;
> > ls_lut_writel(pcie, PEX_PF0_CONFIG, cfg);
> > @@ -216,6 +225,7 @@ static int __init ls_pcie_ep_probe(struct 
> > platform_device *pdev)
> > struct ls_pcie_ep *pcie;
> > struct pci_epc_features *ls_epc;
> > struct resource *dbi_base;
> > +   u8 offset;
> > int ret;
> >
> > pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL);
> > @@ -252,6 +262,9 @@ static int __init ls_pcie_ep_probe(struct 
> > platform_device *pdev)
> >
> > platform_set_drvdata(pdev, pcie);
> >
> > +   offset = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP);
> > +   pcie->lnkcap = dw_pcie_readl_dbi(pci, offset + PCI_EXP_LNKCAP);
> > +
> > ret = dw_pcie_ep_init(>ep);
> > if (ret)
> > return ret;
> > --
> > 2.34.1
> >


Kernel Crash Dump (kdump) broken with 6.5

2023-07-17 Thread Sachin Sant
Kdump seems to be broken with 6.5 for ppc64le.

[ 14.200412] systemd[1]: Starting dracut pre-pivot and cleanup hook...
[[0;32m OK [0m] Started dracut pre-pivot and cleanup hook.
Starting Kdump Vmcore Save Service...
[ 14.231669] systemd[1]: Started dracut pre-pivot and cleanup hook.
[ 14.231801] systemd[1]: Starting Kdump Vmcore Save Service...
[ 14.341035] kdump.sh[297]: kdump: saving to 
/sysroot//var/crash//127.0.0.1-2023-07-14-13:32:34/
[ 14.350053] EXT4-fs (sda2): re-mounted e971a335-1ef8-4295-ab4e-3940f28e53fc 
r/w. Quota mode: none.
[ 14.345979] kdump.sh[297]: kdump: saving vmcore-dmesg.txt to 
/sysroot//var/crash//127.0.0.1-2023-07-14-13:32:34/
[ 14.348742] kdump.sh[331]: Cannot open /proc/vmcore: No such file or directory
[ 14.348845] kdump.sh[297]: kdump: saving vmcore-dmesg.txt failed
[ 14.349014] kdump.sh[297]: kdump: saving vmcore
[ 14.443422] kdump.sh[332]: open_dump_memory: Can't open the dump 
memory(/proc/vmcore). No such file or directory
[ 14.456413] kdump.sh[332]: makedumpfile Failed.
[ 14.456662] kdump.sh[297]: kdump: saving vmcore failed, _exitcode:1
[ 14.456822] kdump.sh[297]: kdump: saving the /run/initramfs/kexec-dmesg.log to 
/sysroot//var/crash//127.0.0.1-2023-07-14-13:32:34/
[ 14.487002] kdump.sh[297]: kdump: saving vmcore failed
[[0;1;31mFAILED[0m] Failed to start Kdump Vmcore Save Service.

6.4 was good. Git bisect points to following patch

commit 606787fed7268feb256957872586370b56af697a
powerpc/64s: Remove support for ELFv1 little endian userspace

Reverting this patch allows a successful capture of vmcore.

Does this change require any corresponding change to kdump
and/or kexec tools?

- Sachin



Re: [PATCH 2/2] PCI: layerscape: Add the workaround for lost link capabilities during reset

2023-07-17 Thread Rob Herring
On Thu, Jun 15, 2023 at 10:41 AM Frank Li  wrote:
>
> From: Xiaowei Bao 
>
> A workaround for the issue where the PCI Express Endpoint (EP) controller
> loses the values of the Maximum Link Width and Supported Link Speed from
> the Link Capabilities Register, which initially configured by the Reset
> Configuration Word (RCW) during a link-down or hot reset event.

What makes this Layerscape specific? Seems like something internal to DWC.

>
> Signed-off-by: Xiaowei Bao 
> Signed-off-by: Hou Zhiqiang 
> Signed-off-by: Frank Li 
> ---
>  drivers/pci/controller/dwc/pci-layerscape-ep.c | 13 +
>  1 file changed, 13 insertions(+)
>
> diff --git a/drivers/pci/controller/dwc/pci-layerscape-ep.c 
> b/drivers/pci/controller/dwc/pci-layerscape-ep.c
> index 4e4fdd1dfea7..2ef02d827eeb 100644
> --- a/drivers/pci/controller/dwc/pci-layerscape-ep.c
> +++ b/drivers/pci/controller/dwc/pci-layerscape-ep.c
> @@ -45,6 +45,7 @@ struct ls_pcie_ep {
> struct pci_epc_features *ls_epc;
> const struct ls_pcie_ep_drvdata *drvdata;
> int irq;
> +   u32 lnkcap;
> boolbig_endian;
>  };
>
> @@ -73,6 +74,7 @@ static irqreturn_t ls_pcie_ep_event_handler(int irq, void 
> *dev_id)
> struct ls_pcie_ep *pcie = dev_id;
> struct dw_pcie *pci = pcie->pci;
> u32 val, cfg;
> +   u8 offset;
>
> val = ls_lut_readl(pcie, PEX_PF0_PME_MES_DR);
> ls_lut_writel(pcie, PEX_PF0_PME_MES_DR, val);
> @@ -81,6 +83,13 @@ static irqreturn_t ls_pcie_ep_event_handler(int irq, void 
> *dev_id)
> return IRQ_NONE;
>
> if (val & PEX_PF0_PME_MES_DR_LUD) {
> +
> +   offset = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP);
> +
> +   dw_pcie_dbi_ro_wr_en(pci);
> +   dw_pcie_writew_dbi(pci, offset + PCI_EXP_LNKCAP, 
> pcie->lnkcap);
> +   dw_pcie_dbi_ro_wr_dis(pci);
> +
> cfg = ls_lut_readl(pcie, PEX_PF0_CONFIG);
> cfg |= PEX_PF0_CFG_READY;
> ls_lut_writel(pcie, PEX_PF0_CONFIG, cfg);
> @@ -216,6 +225,7 @@ static int __init ls_pcie_ep_probe(struct platform_device 
> *pdev)
> struct ls_pcie_ep *pcie;
> struct pci_epc_features *ls_epc;
> struct resource *dbi_base;
> +   u8 offset;
> int ret;
>
> pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL);
> @@ -252,6 +262,9 @@ static int __init ls_pcie_ep_probe(struct platform_device 
> *pdev)
>
> platform_set_drvdata(pdev, pcie);
>
> +   offset = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP);
> +   pcie->lnkcap = dw_pcie_readl_dbi(pci, offset + PCI_EXP_LNKCAP);
> +
> ret = dw_pcie_ep_init(>ep);
> if (ret)
> return ret;
> --
> 2.34.1
>


Re: [RFC][PATCH] sched: Rename DIE domain

2023-07-17 Thread Peter Zijlstra
On Mon, Jul 17, 2023 at 03:51:25PM +0200, Vincent Guittot wrote:
> On Wed, 12 Jul 2023 at 16:11, Peter Zijlstra  wrote:
> >
> > Hi
> >
> > Thomas just tripped over the x86 topology setup creating a 'DIE' domain
> > for the package mask :-)
> 
> May be a link to the change that triggers this patch could be useful

Thomas should post soonish..


Re: [PATCH] char: Explicitly include correct DT includes

2023-07-17 Thread Jarkko Sakkinen
On Fri Jul 14, 2023 at 5:43 PM UTC, Rob Herring wrote:
> The DT of_device.h and of_platform.h date back to the separate
> of_platform_bus_type before it as merged into the regular platform bus.
> As part of that merge prepping Arm DT support 13 years ago, they
> "temporarily" include each other. They also include platform_device.h
> and of.h. As a result, there's a pretty much random mix of those include
> files used throughout the tree. In order to detangle these headers and
> replace the implicit includes with struct declarations, users need to
> explicitly include the correct includes.
>
> Signed-off-by: Rob Herring 
> ---
>  drivers/char/agp/uninorth-agp.c| 1 +
>  drivers/char/bsr.c | 3 +--
>  drivers/char/hw_random/atmel-rng.c | 2 +-
>  drivers/char/hw_random/bcm2835-rng.c   | 3 +--
>  drivers/char/hw_random/ingenic-trng.c  | 2 +-
>  drivers/char/hw_random/iproc-rng200.c  | 3 +--
>  drivers/char/hw_random/npcm-rng.c  | 3 +--
>  drivers/char/hw_random/omap-rng.c  | 2 --
>  drivers/char/hw_random/omap3-rom-rng.c | 1 -
>  drivers/char/hw_random/pasemi-rng.c| 3 +--
>  drivers/char/hw_random/pic32-rng.c | 3 +--
>  drivers/char/hw_random/stm32-rng.c | 3 ++-
>  drivers/char/hw_random/xgene-rng.c | 5 ++---
>  drivers/char/hw_random/xiphera-trng.c  | 1 -
>  drivers/char/ipmi/kcs_bmc_aspeed.c | 1 -
>  drivers/char/tpm/tpm_ftpm_tee.c| 1 -
>  drivers/char/tpm/tpm_tis.c | 1 -
>  drivers/char/tpm/tpm_tis_spi_main.c| 2 +-
>  drivers/char/tpm/tpm_tis_synquacer.c   | 1 -
>  19 files changed, 14 insertions(+), 27 deletions(-)
>
> diff --git a/drivers/char/agp/uninorth-agp.c b/drivers/char/agp/uninorth-agp.c
> index 62de7f4ba864..84411b13c49f 100644
> --- a/drivers/char/agp/uninorth-agp.c
> +++ b/drivers/char/agp/uninorth-agp.c
> @@ -3,6 +3,7 @@
>   * UniNorth AGPGART routines.
>   */
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> diff --git a/drivers/char/bsr.c b/drivers/char/bsr.c
> index 12143854aeac..70d31aed9011 100644
> --- a/drivers/char/bsr.c
> +++ b/drivers/char/bsr.c
> @@ -6,11 +6,10 @@
>   * Author: Sonny Rao 
>   */
>  
> +#include 
>  #include 
>  #include 
>  #include 
> -#include 
> -#include 
>  #include 
>  #include 
>  #include 
> diff --git a/drivers/char/hw_random/atmel-rng.c 
> b/drivers/char/hw_random/atmel-rng.c
> index b8effe77d80f..a37367ebcbac 100644
> --- a/drivers/char/hw_random/atmel-rng.c
> +++ b/drivers/char/hw_random/atmel-rng.c
> @@ -15,7 +15,7 @@
>  #include 
>  #include 
>  #include 
> -#include 
> +#include 
>  #include 
>  #include 
>  
> diff --git a/drivers/char/hw_random/bcm2835-rng.c 
> b/drivers/char/hw_random/bcm2835-rng.c
> index e98fcac578d6..e19b0f9f48b9 100644
> --- a/drivers/char/hw_random/bcm2835-rng.c
> +++ b/drivers/char/hw_random/bcm2835-rng.c
> @@ -8,8 +8,7 @@
>  #include 
>  #include 
>  #include 
> -#include 
> -#include 
> +#include 
>  #include 
>  #include 
>  #include 
> diff --git a/drivers/char/hw_random/ingenic-trng.c 
> b/drivers/char/hw_random/ingenic-trng.c
> index 0eb80f786f4d..759445d4f65a 100644
> --- a/drivers/char/hw_random/ingenic-trng.c
> +++ b/drivers/char/hw_random/ingenic-trng.c
> @@ -11,8 +11,8 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
> -#include 
>  #include 
>  #include 
>  
> diff --git a/drivers/char/hw_random/iproc-rng200.c 
> b/drivers/char/hw_random/iproc-rng200.c
> index 06bc060534d8..34df3f0d3e45 100644
> --- a/drivers/char/hw_random/iproc-rng200.c
> +++ b/drivers/char/hw_random/iproc-rng200.c
> @@ -12,8 +12,7 @@
>  #include 
>  #include 
>  #include 
> -#include 
> -#include 
> +#include 
>  #include 
>  #include 
>  
> diff --git a/drivers/char/hw_random/npcm-rng.c 
> b/drivers/char/hw_random/npcm-rng.c
> index 9903d0357e06..8a304b754217 100644
> --- a/drivers/char/hw_random/npcm-rng.c
> +++ b/drivers/char/hw_random/npcm-rng.c
> @@ -8,12 +8,11 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> -#include 
>  #include 
> -#include 
>  
>  #define NPCM_RNGCS_REG   0x00/* Control and status register 
> */
>  #define NPCM_RNGD_REG0x04/* Data register */
> diff --git a/drivers/char/hw_random/omap-rng.c 
> b/drivers/char/hw_random/omap-rng.c
> index 00ff96703dd2..be03f76a2a80 100644
> --- a/drivers/char/hw_random/omap-rng.c
> +++ b/drivers/char/hw_random/omap-rng.c
> @@ -26,8 +26,6 @@
>  #include 
>  #include 
>  #include 
> -#include 
> -#include 
>  #include 
>  #include 
>  #include 
> diff --git a/drivers/char/hw_random/omap3-rom-rng.c 
> b/drivers/char/hw_random/omap3-rom-rng.c
> index f06e4f95114f..18dc46b1b58e 100644
> --- a/drivers/char/hw_random/omap3-rom-rng.c
> +++ b/drivers/char/hw_random/omap3-rom-rng.c
> @@ -20,7 +20,6 @@
>  #include 
>  #include 
>  #include 
> -#include 
>  #include 
>  #include 
>  
> diff --git a/drivers/char/hw_random/pasemi-rng.c 
> b/drivers/char/hw_random/pasemi-rng.c
> index 

Re: [PATCH 2/2] PCI: layerscape: Add the workaround for lost link capabilities during reset

2023-07-17 Thread Frank Li
On Thu, Jun 15, 2023 at 12:41:12PM -0400, Frank Li wrote:
> From: Xiaowei Bao 
> 
> A workaround for the issue where the PCI Express Endpoint (EP) controller
> loses the values of the Maximum Link Width and Supported Link Speed from
> the Link Capabilities Register, which initially configured by the Reset
> Configuration Word (RCW) during a link-down or hot reset event.
> 
> Signed-off-by: Xiaowei Bao 
> Signed-off-by: Hou Zhiqiang 
> Signed-off-by: Frank Li 
> ---

@lorenzo:
It is only for layerscape and works around a small errata.
Could you please pick this up?

Frank

>  drivers/pci/controller/dwc/pci-layerscape-ep.c | 13 +
>  1 file changed, 13 insertions(+)
> 
> diff --git a/drivers/pci/controller/dwc/pci-layerscape-ep.c 
> b/drivers/pci/controller/dwc/pci-layerscape-ep.c
> index 4e4fdd1dfea7..2ef02d827eeb 100644
> --- a/drivers/pci/controller/dwc/pci-layerscape-ep.c
> +++ b/drivers/pci/controller/dwc/pci-layerscape-ep.c
> @@ -45,6 +45,7 @@ struct ls_pcie_ep {
>   struct pci_epc_features *ls_epc;
>   const struct ls_pcie_ep_drvdata *drvdata;
>   int irq;
> + u32 lnkcap;
>   boolbig_endian;
>  };
>  
> @@ -73,6 +74,7 @@ static irqreturn_t ls_pcie_ep_event_handler(int irq, void 
> *dev_id)
>   struct ls_pcie_ep *pcie = dev_id;
>   struct dw_pcie *pci = pcie->pci;
>   u32 val, cfg;
> + u8 offset;
>  
>   val = ls_lut_readl(pcie, PEX_PF0_PME_MES_DR);
>   ls_lut_writel(pcie, PEX_PF0_PME_MES_DR, val);
> @@ -81,6 +83,13 @@ static irqreturn_t ls_pcie_ep_event_handler(int irq, void 
> *dev_id)
>   return IRQ_NONE;
>  
>   if (val & PEX_PF0_PME_MES_DR_LUD) {
> +
> + offset = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP);
> +
> + dw_pcie_dbi_ro_wr_en(pci);
> + dw_pcie_writew_dbi(pci, offset + PCI_EXP_LNKCAP, pcie->lnkcap);
> + dw_pcie_dbi_ro_wr_dis(pci);
> +
>   cfg = ls_lut_readl(pcie, PEX_PF0_CONFIG);
>   cfg |= PEX_PF0_CFG_READY;
>   ls_lut_writel(pcie, PEX_PF0_CONFIG, cfg);
> @@ -216,6 +225,7 @@ static int __init ls_pcie_ep_probe(struct platform_device 
> *pdev)
>   struct ls_pcie_ep *pcie;
>   struct pci_epc_features *ls_epc;
>   struct resource *dbi_base;
> + u8 offset;
>   int ret;
>  
>   pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL);
> @@ -252,6 +262,9 @@ static int __init ls_pcie_ep_probe(struct platform_device 
> *pdev)
>  
>   platform_set_drvdata(pdev, pcie);
>  
> + offset = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP);
> + pcie->lnkcap = dw_pcie_readl_dbi(pci, offset + PCI_EXP_LNKCAP);
> +
>   ret = dw_pcie_ep_init(>ep);
>   if (ret)
>   return ret;
> -- 
> 2.34.1
> 


Re: [PATCH 1/2] PCI: layerscape: Add support for Link down notification

2023-07-17 Thread Frank Li
On Thu, Jun 15, 2023 at 12:41:11PM -0400, Frank Li wrote:
> Add support to pass Link down notification to Endpoint function driver
> so that the LINK_DOWN event can be processed by the function.
> 
> Signed-off-by: Frank Li 
> ---

@Lorenzo:
No comments in over 1 month. It only changes layerscape
and is a 1-line code change.

Could you please consider pick this up?

Frank Li

>  drivers/pci/controller/dwc/pci-layerscape-ep.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/drivers/pci/controller/dwc/pci-layerscape-ep.c 
> b/drivers/pci/controller/dwc/pci-layerscape-ep.c
> index de4c1758a6c3..4e4fdd1dfea7 100644
> --- a/drivers/pci/controller/dwc/pci-layerscape-ep.c
> +++ b/drivers/pci/controller/dwc/pci-layerscape-ep.c
> @@ -88,6 +88,7 @@ static irqreturn_t ls_pcie_ep_event_handler(int irq, void 
> *dev_id)
>  
>   dev_dbg(pci->dev, "Link up\n");
>   } else if (val & PEX_PF0_PME_MES_DR_LDD) {
> + pci_epc_linkdown(pci->ep.epc);
>   dev_dbg(pci->dev, "Link down\n");
>   } else if (val & PEX_PF0_PME_MES_DR_HRD) {
>   dev_dbg(pci->dev, "Hot reset\n");
> -- 
> 2.34.1
> 


Re: [RFC][PATCH] sched: Rename DIE domain

2023-07-17 Thread Vincent Guittot
On Wed, 12 Jul 2023 at 16:11, Peter Zijlstra  wrote:
>
> Hi
>
> Thomas just tripped over the x86 topology setup creating a 'DIE' domain
> for the package mask :-)

May be a link to the change that triggers this patch could be useful

>
> Since these names are SCHED_DEBUG only, rename them.
> I don't think anybody *should* be relying on this, but who knows.

Apart the remaining reference to DIE already mentioned by others,
looks good to me

>
> Signed-off-by: Peter Zijlstra (Intel) 
> ---
>  arch/powerpc/kernel/smp.c   | 2 +-
>  arch/s390/kernel/topology.c | 2 +-
>  arch/x86/kernel/smpboot.c   | 2 +-
>  kernel/sched/topology.c | 2 +-
>  4 files changed, 4 insertions(+), 4 deletions(-)
>
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index fbbb695bae3d..5ed6b9fe5094 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -1050,7 +1050,7 @@ static struct sched_domain_topology_level 
> powerpc_topology[] = {
>  #endif
> { shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) 
> },
> { cpu_mc_mask, SD_INIT_NAME(MC) },
> -   { cpu_cpu_mask, SD_INIT_NAME(DIE) },
> +   { cpu_cpu_mask, SD_INIT_NAME(PKG) },
> { NULL, },
>  };
>
> diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c
> index 68adf1de..c803f5e6ab46 100644
> --- a/arch/s390/kernel/topology.c
> +++ b/arch/s390/kernel/topology.c
> @@ -522,7 +522,7 @@ static struct sched_domain_topology_level s390_topology[] 
> = {
> { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
> { cpu_book_mask, SD_INIT_NAME(BOOK) },
> { cpu_drawer_mask, SD_INIT_NAME(DRAWER) },
> -   { cpu_cpu_mask, SD_INIT_NAME(DIE) },
> +   { cpu_cpu_mask, SD_INIT_NAME(PKG) },
> { NULL, },
>  };
>
> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
> index e1aa2cd7734b..09cc9d0aa358 100644
> --- a/arch/x86/kernel/smpboot.c
> +++ b/arch/x86/kernel/smpboot.c
> @@ -653,7 +653,7 @@ static void __init build_sched_topology(void)
>  */
> if (!x86_has_numa_in_package) {
> x86_topology[i++] = (struct sched_domain_topology_level){
> -   cpu_cpu_mask, SD_INIT_NAME(DIE)
> +   cpu_cpu_mask, SD_INIT_NAME(PKG)
> };
> }
>
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index d3a3b2646ec4..e9d9cf776b7a 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -1670,7 +1670,7 @@ static struct sched_domain_topology_level 
> default_topology[] = {
>  #ifdef CONFIG_SCHED_MC
> { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
>  #endif
> -   { cpu_cpu_mask, SD_INIT_NAME(DIE) },
> +   { cpu_cpu_mask, SD_INIT_NAME(PKG) },
> { NULL, },
>  };
>
>


Re: [PATCH v2 1/2] powerpc/tpm: Create linux,sml-base/size as big endian

2023-07-17 Thread Jarkko Sakkinen
On Wed Jul 12, 2023 at 12:39 PM UTC, Michael Ellerman wrote:
> Jarkko Sakkinen  writes:
> > On Tue, 2023-07-11 at 08:47 -0400, Stefan Berger wrote:
> >> On 7/10/23 17:23, Jarkko Sakkinen wrote:
> >> > On Thu, 2023-06-15 at 22:37 +1000, Michael Ellerman wrote:
> >> > > There's code in prom_instantiate_sml() to do a "SML handover" (Stored
> >> > > Measurement Log) from OF to Linux, before Linux shuts down Open
> >> > > Firmware.
> >> > > 
> >> > > This involves creating a buffer to hold the SML, and creating two 
> >> > > device
> >> > > tree properties to record its base address and size. The kernel then
> >> > > later reads those properties from the device tree to find the SML.
> >> > > 
> >> > > When the code was initially added in commit 4a727429abec ("PPC64: Add
> >> > > support for instantiating SML from Open Firmware") the powerpc kernel
> >> > > was always built big endian, so the properties were created big endian
> >> > > by default.
> >> > > 
> >> > > However since then little endian support was added to powerpc, and now
> >> > > the code lacks conversions to big endian when creating the properties.
> >> > > 
> >> > > This means on little endian kernels the device tree properties are
> >> > > little endian, which is contrary to the device tree spec, and in
> >> > > contrast to all other device tree properties.
> >> > > 
> >> > > To cope with that a workaround was added in tpm_read_log_of() to skip
> >> > > the endian conversion if the properties were created via the SML
> >> > > handover.
> >> > > 
> >> > > A better solution is to encode the properties as big endian as they
> >> > > should be, and remove the workaround.
> >> > > 
> >> > > Typically changing the encoding of a property like this would present
> >> > > problems for kexec. However the SML is not propagated across kexec, so
> >> > > changing the encoding of the properties is a non-issue.
> >> > > 
> >> > > Fixes: e46e22f12b19 ("tpm: enhance read_log_of() to support Physical 
> >> > > TPM event log")
> >> > > Signed-off-by: Michael Ellerman 
> >> > > Reviewed-by: Stefan Berger 
> >> > > ---
> >> > >   arch/powerpc/kernel/prom_init.c |  8 ++--
> >> > >   drivers/char/tpm/eventlog/of.c  | 23 ---
> >> > >   2 files changed, 10 insertions(+), 21 deletions(-)
> >> > 
> >> > Split into two patches (producer and consumer).
> >> 
> >> I think this wouldn't be right since it would break the system when only 
> >> one patch is applied since it would be reading the fields in the wrong 
> >> endianess.
> >
> > I think it would help if the commit message would better explain
> > what is going on. It is somewhat difficult to decipher, if you
> > don't have deep knowledge of the powerpc architecture.
>
> I mean, it's already 8 paragraphs ¯\_(ツ)_/¯
>
> But I'm happy to expand it. I just don't really know what extra detail
> is needed to make it clearer.

Adding more text is not the right way to clarify things. I'd start
by briefly explaining SML and then moving on to the handover. It can't
be that hard, right?

Just adding new paragraphs would probably just make it even more
confusing.

BR, Jarkko


[PATCH v11 1/4] mm/tlbbatch: Introduce arch_tlbbatch_should_defer()

2023-07-17 Thread Yicong Yang
From: Anshuman Khandual 

The entire scheme of deferred TLB flush in reclaim path rests on the
fact that the cost to refill TLB entries is less than flushing out
individual entries by sending IPI to remote CPUs. But architecture
can have different ways to evaluate that. Hence apart from checking
TTU_BATCH_FLUSH in the TTU flags, rest of the decision should be
architecture specific.

Signed-off-by: Anshuman Khandual 
[https://lore.kernel.org/linuxppc-dev/20171101101735.2318-2-khand...@linux.vnet.ibm.com/]
Signed-off-by: Yicong Yang 
[Rebase and fix incorrect return value type]
Reviewed-by: Kefeng Wang 
Reviewed-by: Anshuman Khandual 
Reviewed-by: Barry Song 
Reviewed-by: Xin Hao 
Tested-by: Punit Agrawal 
---
 arch/x86/include/asm/tlbflush.h | 12 
 mm/rmap.c   |  9 +
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 80450e1d5385..cf2a1de5d388 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -253,6 +253,18 @@ static inline void flush_tlb_page(struct vm_area_struct 
*vma, unsigned long a)
flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT, false);
 }
 
+static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm)
+{
+   bool should_defer = false;
+
+   /* If remote CPUs need to be flushed then defer batch the flush */
+   if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids)
+   should_defer = true;
+   put_cpu();
+
+   return should_defer;
+}
+
 static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
 {
/*
diff --git a/mm/rmap.c b/mm/rmap.c
index 0c0d8857dfce..6480e526c154 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -688,17 +688,10 @@ static void set_tlb_ubc_flush_pending(struct mm_struct 
*mm, pte_t pteval)
  */
 static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
 {
-   bool should_defer = false;
-
if (!(flags & TTU_BATCH_FLUSH))
return false;
 
-   /* If remote CPUs need to be flushed then defer batch the flush */
-   if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids)
-   should_defer = true;
-   put_cpu();
-
-   return should_defer;
+   return arch_tlbbatch_should_defer(mm);
 }
 
 /*
-- 
2.24.0



[PATCH v11 4/4] arm64: support batched/deferred tlb shootdown during page reclamation/migration

2023-07-17 Thread Yicong Yang
From: Barry Song 

on x86, batched and deferred tlb shootdown has led to 90%
performance increase on tlb shootdown. on arm64, HW can do
tlb shootdown without software IPI. But sync tlbi is still
quite expensive.

Even running a simplest program which requires swapout can
prove this is true,
 #include 
 #include 
 #include 
 #include 

 int main()
 {
 #define SIZE (1 * 1024 * 1024)
 volatile unsigned char *p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
  MAP_SHARED | MAP_ANONYMOUS, -1, 0);

 memset(p, 0x88, SIZE);

 for (int k = 0; k < 1; k++) {
 /* swap in */
 for (int i = 0; i < SIZE; i += 4096) {
 (void)p[i];
 }

 /* swap out */
 madvise(p, SIZE, MADV_PAGEOUT);
 }
 }

Perf result on snapdragon 888 with 8 cores by using zRAM
as the swap block device.

 ~ # perf record taskset -c 4 ./a.out
 [ perf record: Woken up 10 times to write data ]
 [ perf record: Captured and wrote 2.297 MB perf.data (60084 samples) ]
 ~ # perf report
 # To display the perf.data header info, please use --header/--header-only 
options.
 # To display the perf.data header info, please use --header/--header-only 
options.
 #
 #
 # Total Lost Samples: 0
 #
 # Samples: 60K of event 'cycles'
 # Event count (approx.): 35706225414
 #
 # Overhead  Command  Shared Object  Symbol
 #   ...  .  ..
 #
21.07%  a.out[kernel.kallsyms]  [k] _raw_spin_unlock_irq
 8.23%  a.out[kernel.kallsyms]  [k] _raw_spin_unlock_irqrestore
 6.67%  a.out[kernel.kallsyms]  [k] filemap_map_pages
 6.16%  a.out[kernel.kallsyms]  [k] __zram_bvec_write
 5.36%  a.out[kernel.kallsyms]  [k] ptep_clear_flush
 3.71%  a.out[kernel.kallsyms]  [k] _raw_spin_lock
 3.49%  a.out[kernel.kallsyms]  [k] memset64
 1.63%  a.out[kernel.kallsyms]  [k] clear_page
 1.42%  a.out[kernel.kallsyms]  [k] _raw_spin_unlock
 1.26%  a.out[kernel.kallsyms]  [k] 
mod_zone_state.llvm.8525150236079521930
 1.23%  a.out[kernel.kallsyms]  [k] xas_load
 1.15%  a.out[kernel.kallsyms]  [k] zram_slot_lock

ptep_clear_flush() takes 5.36% CPU in the micro-benchmark
swapping in/out a page mapped by only one process. If the
page is mapped by multiple processes, typically, like more
than 100 on a phone, the overhead would be much higher as
we have to run tlb flush 100 times for one single page.
Plus, tlb flush overhead will increase with the number
of CPU cores due to the bad scalability of tlb shootdown
in HW, so those ARM64 servers should expect much higher
overhead.

Further perf annotate shows 95% cpu time of ptep_clear_flush
is actually used by the final dsb() to wait for the completion
of tlb flush. This provides us a very good chance to leverage
the existing batched tlb in kernel. The minimum modification
is that we only send async tlbi in the first stage and we send
dsb while we have to sync in the second stage.

With the above simplest micro benchmark, collapsed time to
finish the program decreases around 5%.

Typical collapsed time w/o patch:
 ~ # time taskset -c 4 ./a.out
 0.21user 14.34system 0:14.69elapsed
w/ patch:
 ~ # time taskset -c 4 ./a.out
 0.22user 13.45system 0:13.80elapsed

Also tested with benchmark in the commit on Kunpeng920 arm64 server
and observed an improvement around 12.5% with command
`time ./swap_bench`.
w/o w/
real0m13.460s   0m11.771s
user0m0.248s0m0.279s
sys 0m12.039s   0m11.458s

Originally it's noticed a 16.99% overhead of ptep_clear_flush()
which has been eliminated by this patch:

[root@localhost yang]# perf record -- ./swap_bench && perf report
[...]
16.99%  swap_bench  [kernel.kallsyms]  [k] ptep_clear_flush

It is tested on 4,8,128 CPU platforms and shows to be beneficial on
large systems but may not have improvement on small systems like on
a 4 CPU platform.

Also this patch improves the performance of page migration. Using pmbench
and trying to migrate the pages of pmbench between node 0 and node 1 for
100 times for 1G memory, this patch decreases the time used by around 20%
(prev 18.338318910 sec after 13.981866350 sec) and saves the time used
by ptep_clear_flush().

Cc: Anshuman Khandual 
Cc: Jonathan Corbet 
Cc: Nadav Amit 
Cc: Mel Gorman 
Tested-by: Yicong Yang 
Tested-by: Xin Hao 
Tested-by: Punit Agrawal 
Signed-off-by: Barry Song 
Signed-off-by: Yicong Yang 
Reviewed-by: Kefeng Wang 
Reviewed-by: Xin Hao 
Reviewed-by: Anshuman Khandual 
---
 .../features/vm/TLB/arch-support.txt  |  2 +-
 arch/arm64/Kconfig|  1 +
 arch/arm64/include/asm/tlbbatch.h | 12 +
 arch/arm64/include/asm/tlbflush.h | 44 +--
 4 files changed, 55 insertions(+), 4 deletions(-)
 create mode 100644 arch/arm64/include/asm/tlbbatch.h

diff --git 

[PATCH v11 3/4] mm/tlbbatch: Introduce arch_flush_tlb_batched_pending()

2023-07-17 Thread Yicong Yang
From: Yicong Yang 

Currently we'll flush the mm in flush_tlb_batched_pending() to
avoid race between reclaim unmaps pages by batched TLB flush
and mprotect/munmap/etc. Other architectures like arm64 may
only need a synchronization barrier(dsb) here rather than
a full mm flush. So add arch_flush_tlb_batched_pending() to
allow an arch-specific implementation here. This intends no
functional changes on x86 since still a full mm flush for
x86.

Signed-off-by: Yicong Yang 
---
 arch/x86/include/asm/tlbflush.h | 5 +
 mm/rmap.c   | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 1c7d3a36e16c..837e4a50281a 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -284,6 +284,11 @@ static inline void arch_tlbbatch_add_pending(struct 
arch_tlbflush_unmap_batch *b
	cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
 }
 
+static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm)
+{
+   flush_tlb_mm(mm);
+}
+
 extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
 
 static inline bool pte_flags_need_flush(unsigned long oldflags,
diff --git a/mm/rmap.c b/mm/rmap.c
index 9699c6011b0e..3a16c91be7e2 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -717,7 +717,7 @@ void flush_tlb_batched_pending(struct mm_struct *mm)
int flushed = batch >> TLB_FLUSH_BATCH_FLUSHED_SHIFT;
 
if (pending != flushed) {
-   flush_tlb_mm(mm);
+   arch_flush_tlb_batched_pending(mm);
/*
 * If the new TLB flushing is pending during flushing, leave
 * mm->tlb_flush_batched as is, to avoid losing flushing.
-- 
2.24.0



[PATCH v11 2/4] mm/tlbbatch: Rename and extend some functions

2023-07-17 Thread Yicong Yang
From: Barry Song 

This patch does some preparation works to extend batched TLB flush to
arm64. Including:
- Extend set_tlb_ubc_flush_pending() and arch_tlbbatch_add_mm()
  to accept an additional argument for address, architectures
  like arm64 may need this for tlbi.
- Rename arch_tlbbatch_add_mm() to arch_tlbbatch_add_pending()
  to match its current function since we don't need to handle
  mm on architectures like arm64 and add_mm is not proper,
  add_pending will make sense to both as on x86 we're pending the
  TLB flush operations while on arm64 we're pending the synchronize
  operations.

This intends no functional changes on x86.

Cc: Anshuman Khandual 
Cc: Jonathan Corbet 
Cc: Nadav Amit 
Cc: Mel Gorman 
Tested-by: Yicong Yang 
Tested-by: Xin Hao 
Tested-by: Punit Agrawal 
Signed-off-by: Barry Song 
Signed-off-by: Yicong Yang 
Reviewed-by: Kefeng Wang 
Reviewed-by: Xin Hao 
Reviewed-by: Anshuman Khandual 
---
 arch/x86/include/asm/tlbflush.h |  5 +++--
 include/linux/mm_types_task.h   |  4 ++--
 mm/rmap.c   | 12 +++-
 3 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index cf2a1de5d388..1c7d3a36e16c 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -276,8 +276,9 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
	return atomic64_inc_return(&mm->context.tlb_gen);
 }
 
-static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch 
*batch,
-   struct mm_struct *mm)
+static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch 
*batch,
+struct mm_struct *mm,
+unsigned long uaddr)
 {
inc_mm_tlb_gen(mm);
	cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
index 5414b5c6a103..aa44fff8bb9d 100644
--- a/include/linux/mm_types_task.h
+++ b/include/linux/mm_types_task.h
@@ -52,8 +52,8 @@ struct tlbflush_unmap_batch {
 #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
/*
 * The arch code makes the following promise: generic code can modify a
-* PTE, then call arch_tlbbatch_add_mm() (which internally provides all
-* needed barriers), then call arch_tlbbatch_flush(), and the entries
+* PTE, then call arch_tlbbatch_add_pending() (which internally provides
+* all needed barriers), then call arch_tlbbatch_flush(), and the 
entries
 * will be flushed on all CPUs by the time that arch_tlbbatch_flush()
 * returns.
 */
diff --git a/mm/rmap.c b/mm/rmap.c
index 6480e526c154..9699c6011b0e 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -642,7 +642,8 @@ void try_to_unmap_flush_dirty(void)
 #define TLB_FLUSH_BATCH_PENDING_LARGE  \
(TLB_FLUSH_BATCH_PENDING_MASK / 2)
 
-static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval)
+static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval,
+ unsigned long uaddr)
 {
	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
int batch;
@@ -651,7 +652,7 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, 
pte_t pteval)
if (!pte_accessible(mm, pteval))
return;
 
-   arch_tlbbatch_add_mm(&tlb_ubc->arch, mm);
+   arch_tlbbatch_add_pending(&tlb_ubc->arch, mm, uaddr);
tlb_ubc->flush_required = true;
 
/*
@@ -726,7 +727,8 @@ void flush_tlb_batched_pending(struct mm_struct *mm)
}
 }
 #else
-static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval)
+static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval,
+ unsigned long uaddr)
 {
 }
 
@@ -1579,7 +1581,7 @@ static bool try_to_unmap_one(struct folio *folio, struct 
vm_area_struct *vma,
 */
pteval = ptep_get_and_clear(mm, address, 
pvmw.pte);
 
-   set_tlb_ubc_flush_pending(mm, pteval);
+   set_tlb_ubc_flush_pending(mm, pteval, address);
} else {
pteval = ptep_clear_flush(vma, address, 
pvmw.pte);
}
@@ -1962,7 +1964,7 @@ static bool try_to_migrate_one(struct folio *folio, 
struct vm_area_struct *vma,
 */
pteval = ptep_get_and_clear(mm, address, 
pvmw.pte);
 
-   set_tlb_ubc_flush_pending(mm, pteval);
+   set_tlb_ubc_flush_pending(mm, pteval, address);
} else {
pteval = ptep_clear_flush(vma, address, 
pvmw.pte);
}
-- 
2.24.0



[PATCH v11 0/4] arm64: support batched/deferred tlb shootdown during page reclamation/migration

2023-07-17 Thread Yicong Yang
From: Yicong Yang 

Though ARM64 has the hardware to do tlb shootdown, the hardware broadcasting is
not free. A simplest micro benchmark shows even on snapdragon 888 with only
8 cores, the overhead for ptep_clear_flush is huge even for paging out one page
mapped by only one process:
5.36%  a.out[kernel.kallsyms]  [k] ptep_clear_flush

While pages are mapped by multiple processes or HW has more CPUs, the cost 
should
become even higher due to the bad scalability of tlb shootdown. The same 
benchmark
can result in 16.99% CPU consumption on ARM64 server with around 100 cores
according to the test on patch 4/4.

This patchset leverages the existing BATCHED_UNMAP_TLB_FLUSH by
1. only send tlbi instructions in the first stage -
arch_tlbbatch_add_mm()
2. wait for the completion of tlbi by dsb while doing tlbbatch
sync in arch_tlbbatch_flush()

Testing on snapdragon shows the overhead of ptep_clear_flush is removed by the
patchset. The micro benchmark becomes 5% faster even for one page mapped by
single process on snapdragon 888.

Since BATCHED_UNMAP_TLB_FLUSH is implemented only on x86, the patchset does some
renaming/extension for the current implementation first (Patch 1-3), then add 
the
support on arm64 (Patch 4).

-v11:
- Enable ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH config unconditionally on arm64.
Link: 
https://lore.kernel.org/linux-mm/20230710083914.18336-1-yangyic...@huawei.com/T/#mc343b7e7c4a090392ef43b620af85a3eea76abad

-v10:
1. Enable BATCHED_UNMAP_TLB_FLUSH regardless of CPU numbers, per Catalin.
2. Split the renaming/extension works in a separate PATCH 2, per Catalin. Since
   it's split from PATCH 2/2 in v9, so inherit the tags.
3. Add arch_flush_tlb_batched_pending() to allow arch-specific implementation,
   per Catalin. Since it's some kind of an optimization on arm64 so a separate
   Patch 3/4.
Link: 
https://lore.kernel.org/linux-mm/20230518065934.12877-1-yangyic...@huawei.com/

-v9:
1. Using a runtime tunable to control batched TLB flush, per Catalin in v7.
   Sorry for missing this on v8.
Link: https://lore.kernel.org/all/20230329035512.57392-1-yangyic...@huawei.com/

-v8:
1. Rebase on 6.3-rc4
2. Tested the optimization on page migration and mentioned it in the commit
3. Thanks the review from Anshuman.
Link: 
https://lore.kernel.org/linux-mm/20221117082648.47526-1-yangyic...@huawei.com/

-v7:
1. rename arch_tlbbatch_add_mm() to arch_tlbbatch_add_pending() as suggested, 
since it
   takes an extra address for arm64, per Nadav and Anshuman. Also mentioned in 
the commit.
2. add tags from Xin Hao, thanks.
Link: https://lore.kernel.org/lkml/20221115031425.44640-1-yangyic...@huawei.com/

-v6:
1. comment we don't defer TLB flush on platforms affected by 
ARM64_WORKAROUND_REPEAT_TLBI
2. use cpus_have_const_cap() instead of this_cpu_has_cap()
3. add tags from Punit, Thanks.
4. default enable the feature when cpus >= 8 rather than > 8, since the original
   improvement is observed on snapdragon 888 with 8 cores.
Link: https://lore.kernel.org/lkml/20221028081255.19157-1-yangyic...@huawei.com/

-v5:
1. Make ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH depends on EXPERT for this stage on 
arm64.
2. Make a threshold of CPU numbers for enabling batched TLP flush on arm64
Link: 
https://lore.kernel.org/linux-arm-kernel/20220921084302.43631-1-yangyic...@huawei.com/T/

-v4:
1. Add tags from Kefeng and Anshuman, Thanks.
2. Limit the TLB batch/defer on systems with >4 CPUs, per Anshuman
3. Merge previous Patch 1,2-3 into one, per Anshuman
Link: 
https://lore.kernel.org/linux-mm/20220822082120.8347-1-yangyic...@huawei.com/

-v3:
1. Declare arch's tlbbatch defer support by arch_tlbbatch_should_defer() instead
   of ARCH_HAS_MM_CPUMASK, per Barry and Kefeng
2. Add Tested-by from Xin Hao
Link: 
https://lore.kernel.org/linux-mm/20220711034615.482895-1-21cn...@gmail.com/

-v2:
1. Collected Yicong's test result on kunpeng920 ARM64 server;
2. Removed the redundant vma parameter in arch_tlbbatch_add_mm()
   according to the comments of Peter Zijlstra and Dave Hansen
3. Added ARCH_HAS_MM_CPUMASK rather than checking if mm_cpumask
   is empty according to the comments of Nadav Amit

Thanks, Peter, Dave and Nadav for your testing or reviewing
, and comments.

-v1:
https://lore.kernel.org/lkml/20220707125242.425242-1-21cn...@gmail.com/

Anshuman Khandual (1):
  mm/tlbbatch: Introduce arch_tlbbatch_should_defer()

Barry Song (2):
  mm/tlbbatch: Rename and extend some functions
  arm64: support batched/deferred tlb shootdown during page
reclamation/migration

Yicong Yang (1):
  mm/tlbbatch: Introduce arch_flush_tlb_batched_pending()

 .../features/vm/TLB/arch-support.txt  |  2 +-
 arch/arm64/Kconfig|  1 +
 arch/arm64/include/asm/tlbbatch.h | 12 +
 arch/arm64/include/asm/tlbflush.h | 44 +--
 arch/x86/include/asm/tlbflush.h   | 22 +-
 include/linux/mm_types_task.h |  4 +-
 mm/rmap.c 

Re: [PATCH] ASoC: fsl_sai: Disable bit clock with transmitter

2023-07-17 Thread Fabio Estevam
On Mon, Jul 17, 2023 at 9:55 AM Mark Brown  wrote:

> I'll just put a non-specific Cc stable tag on it, that should be enough
> to get it backported.

Sounds good. Thanks, Mark.


Re: [PATCH] ASoC: fsl_sai: Disable bit clock with transmitter

2023-07-17 Thread Mark Brown
On Mon, Jul 17, 2023 at 09:31:38AM -0300, Fabio Estevam wrote:
> On Wed, Jul 12, 2023 at 9:53 AM Matus Gajdos  wrote:
> >
> > Otherwise bit clock remains running writing invalid data to the DAC.
> >
> > Signed-off-by: Matus Gajdos 
> 
> Should this contain a Fixes tag so that it could be backported to
> stable kernels?

I'll just put a non-specific Cc stable tag on it, that should be enough
to get it backported.


signature.asc
Description: PGP signature


Re: [PATCH] ASoC: fsl_sai: Disable bit clock with transmitter

2023-07-17 Thread Fabio Estevam
On Wed, Jul 12, 2023 at 9:53 AM Matus Gajdos  wrote:
>
> Otherwise bit clock remains running writing invalid data to the DAC.
>
> Signed-off-by: Matus Gajdos 

Should this contain a Fixes tag so that it could be backported to
stable kernels?


Re: [PATCH v10 4/4] arm64: support batched/deferred tlb shootdown during page reclamation/migration

2023-07-17 Thread Yicong Yang
On 2023/7/16 23:11, Catalin Marinas wrote:
> On Mon, Jul 10, 2023 at 04:39:14PM +0800, Yicong Yang wrote:
>> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
>> index 7856c3a3e35a..f0ce8208c57f 100644
>> --- a/arch/arm64/Kconfig
>> +++ b/arch/arm64/Kconfig
>> @@ -96,6 +96,7 @@ config ARM64
>>  select ARCH_SUPPORTS_NUMA_BALANCING
>>  select ARCH_SUPPORTS_PAGE_TABLE_CHECK
>>  select ARCH_SUPPORTS_PER_VMA_LOCK
>> +select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if EXPERT
> 
> I don't want EXPERT to turn on a feature that's not selectable by the
> user. This would lead to different performance behaviour based on
> EXPERT. Just select it unconditionally.

Got it. will drop it and address the comment below.

Thanks.

> 
>> diff --git a/arch/arm64/include/asm/tlbflush.h 
>> b/arch/arm64/include/asm/tlbflush.h
>> index 412a3b9a3c25..4bb9cec62e26 100644
>> --- a/arch/arm64/include/asm/tlbflush.h
>> +++ b/arch/arm64/include/asm/tlbflush.h
>> @@ -254,17 +254,23 @@ static inline void flush_tlb_mm(struct mm_struct *mm)
>>  dsb(ish);
>>  }
>>  
>> -static inline void flush_tlb_page_nosync(struct vm_area_struct *vma,
>> - unsigned long uaddr)
>> +static inline void __flush_tlb_page_nosync(struct mm_struct *mm,
>> +   unsigned long uaddr)
>>  {
>>  unsigned long addr;
>>  
>>  dsb(ishst);
>> -addr = __TLBI_VADDR(uaddr, ASID(vma->vm_mm));
>> +addr = __TLBI_VADDR(uaddr, ASID(mm));
>>  __tlbi(vale1is, addr);
>>  __tlbi_user(vale1is, addr);
>>  }
>>  
>> +static inline void flush_tlb_page_nosync(struct vm_area_struct *vma,
>> + unsigned long uaddr)
>> +{
>> +return __flush_tlb_page_nosync(vma->vm_mm, uaddr);
>> +}
>> +
>>  static inline void flush_tlb_page(struct vm_area_struct *vma,
>>unsigned long uaddr)
>>  {
>> @@ -272,6 +278,42 @@ static inline void flush_tlb_page(struct vm_area_struct 
>> *vma,
>>  dsb(ish);
>>  }
>>  
>> +#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
> 
> If it's selected unconditionally, we won't need this #ifdef here.
> 
>> +
>> +static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm)
>> +{
>> +#ifdef CONFIG_ARM64_WORKAROUND_REPEAT_TLBI
>> +/*
>> + * TLB flush deferral is not required on systems, which are affected 
>> with
> 
> "affected by" and drop the comma before "which".
> 


Re: [PATCH] ASoC: fsl_sai: Disable bit clock with transmitter

2023-07-17 Thread Shengjiu Wang
On Wed, Jul 12, 2023 at 8:53 PM Matus Gajdos  wrote:

> Otherwise bit clock remains running writing invalid data to the DAC.
>
> Signed-off-by: Matus Gajdos 
>

Acked-by: Shengjiu Wang 

Best regards
Wang Shengjiu


> ---
>  sound/soc/fsl/fsl_sai.c | 2 +-
>  sound/soc/fsl/fsl_sai.h | 1 +
>  2 files changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/sound/soc/fsl/fsl_sai.c b/sound/soc/fsl/fsl_sai.c
> index 5e09f634c61b..dcc7fbe7acac 100644
> --- a/sound/soc/fsl/fsl_sai.c
> +++ b/sound/soc/fsl/fsl_sai.c
> @@ -719,7 +719,7 @@ static void fsl_sai_config_disable(struct fsl_sai
> *sai, int dir)
> u32 xcsr, count = 100;
>
> regmap_update_bits(sai->regmap, FSL_SAI_xCSR(tx, ofs),
> -  FSL_SAI_CSR_TERE, 0);
> +  FSL_SAI_CSR_TERE | FSL_SAI_CSR_BCE, 0);
>
> /* TERE will remain set till the end of current frame */
> do {
> diff --git a/sound/soc/fsl/fsl_sai.h b/sound/soc/fsl/fsl_sai.h
> index 8254c3547b87..550df87b6a06 100644
> --- a/sound/soc/fsl/fsl_sai.h
> +++ b/sound/soc/fsl/fsl_sai.h
> @@ -91,6 +91,7 @@
>  /* SAI Transmit/Receive Control Register */
>  #define FSL_SAI_CSR_TERE   BIT(31)
>  #define FSL_SAI_CSR_SE BIT(30)
> +#define FSL_SAI_CSR_BCEBIT(28)
>  #define FSL_SAI_CSR_FR BIT(25)
>  #define FSL_SAI_CSR_SR BIT(24)
>  #define FSL_SAI_CSR_xF_SHIFT   16
> --
> 2.25.1
>
>


[PATCH v1] powerpc/pseries: use kfree_sensitive() in plpks_gen_password()

2023-07-17 Thread Minjie Du
password might contain private information, so it is better to use
kfree_sensitive() to free it.
Use kfree_sensitive() in plpks_gen_password().

Signed-off-by: Minjie Du 
---
 arch/powerpc/platforms/pseries/plpks.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/plpks.c 
b/arch/powerpc/platforms/pseries/plpks.c
index b0658ea3e..3441e616e 100644
--- a/arch/powerpc/platforms/pseries/plpks.c
+++ b/arch/powerpc/platforms/pseries/plpks.c
@@ -150,7 +150,7 @@ static int plpks_gen_password(void)
ospasswordlength = maxpwsize;
ospassword = kzalloc(maxpwsize, GFP_KERNEL);
if (!ospassword) {
-   kfree(password);
+   kfree_sensitive(password);
return -ENOMEM;
}
memcpy(ospassword, password, ospasswordlength);
@@ -163,7 +163,7 @@ static int plpks_gen_password(void)
}
}
 out:
-   kfree(password);
+   kfree_sensitive(password);
 
return pseries_status_to_err(rc);
 }
-- 
2.39.0



Re: [PATCH 2/2] ASoC: fsl_rpmsg: Add support for i.MX93 platform

2023-07-17 Thread Shengjiu Wang
On Fri, Jul 14, 2023 at 5:30 PM Chancel Liu  wrote:

> Add compatible string and specific soc data to support rpmsg sound card
> on i.MX93 platform.
>
> Signed-off-by: Chancel Liu 
>

Acked-by: Shengjiu Wang 

Best regards
wang shengjiu

> ---
>  sound/soc/fsl/fsl_rpmsg.c | 8 
>  1 file changed, 8 insertions(+)
>
> diff --git a/sound/soc/fsl/fsl_rpmsg.c b/sound/soc/fsl/fsl_rpmsg.c
> index 15b48b5ea856..abe19a8a7aa7 100644
> --- a/sound/soc/fsl/fsl_rpmsg.c
> +++ b/sound/soc/fsl/fsl_rpmsg.c
> @@ -170,12 +170,20 @@ static const struct fsl_rpmsg_soc_data imx8mp_data =
> {
>SNDRV_PCM_FMTBIT_S32_LE,
>  };
>
> +static const struct fsl_rpmsg_soc_data imx93_data = {
> +   .rates = SNDRV_PCM_RATE_16000 | SNDRV_PCM_RATE_32000 |
> +SNDRV_PCM_RATE_48000 | SNDRV_PCM_RATE_96000,
> +   .formats = SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S24_LE |
> +  SNDRV_PCM_FMTBIT_S32_LE,
> +};
> +
>  static const struct of_device_id fsl_rpmsg_ids[] = {
> { .compatible = "fsl,imx7ulp-rpmsg-audio", .data = &imx7ulp_data},
> { .compatible = "fsl,imx8mm-rpmsg-audio", .data = &imx8mm_data},
> { .compatible = "fsl,imx8mn-rpmsg-audio", .data = &imx8mn_data},
> { .compatible = "fsl,imx8mp-rpmsg-audio", .data = &imx8mp_data},
> { .compatible = "fsl,imx8ulp-rpmsg-audio", .data = &imx8ulp_data},
> { .compatible = "fsl,imx93-rpmsg-audio", .data = &imx93_data},
> { /* sentinel */ }
>  };
>  MODULE_DEVICE_TABLE(of, fsl_rpmsg_ids);
> --
> 2.25.1
>
>


[PATCH] powerpc/64: Enable accelerated crypto algorithms in defconfig

2023-07-17 Thread Michael Ellerman
Enable all the accelerated crypto algorithms as modules in the 64-bit
defconfig, to get more test coverage.

Signed-off-by: Michael Ellerman 
---
 arch/powerpc/configs/ppc64_defconfig | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/powerpc/configs/ppc64_defconfig 
b/arch/powerpc/configs/ppc64_defconfig
index 268fa361a06d..40a1f4a4274c 100644
--- a/arch/powerpc/configs/ppc64_defconfig
+++ b/arch/powerpc/configs/ppc64_defconfig
@@ -390,8 +390,11 @@ CONFIG_CRYPTO_SHA256=y
 CONFIG_CRYPTO_WP512=m
 CONFIG_CRYPTO_LZO=m
 CONFIG_CRYPTO_CRC32C_VPMSUM=m
+CONFIG_CRYPTO_CRCT10DIF_VPMSUM=m
+CONFIG_CRYPTO_VPMSUM_TESTER=m
 CONFIG_CRYPTO_MD5_PPC=m
 CONFIG_CRYPTO_SHA1_PPC=m
+CONFIG_CRYPTO_AES_GCM_P10=m
 CONFIG_CRYPTO_DEV_NX=y
 CONFIG_CRYPTO_DEV_NX_ENCRYPT=m
 CONFIG_CRYPTO_DEV_VMX=y
-- 
2.41.0



Re: [PATCH] misc: Explicitly include correct DT includes

2023-07-17 Thread Andrew Donnellan
On Fri, 2023-07-14 at 11:47 -0600, Rob Herring wrote:
> The DT of_device.h and of_platform.h date back to the separate
> of_platform_bus_type before it as merged into the regular platform
> bus.
> As part of that merge prepping Arm DT support 13 years ago, they
> "temporarily" include each other. They also include platform_device.h
> and of.h. As a result, there's a pretty much random mix of those
> include
> files used throughout the tree. In order to detangle these headers
> and
> replace the implicit includes with struct declarations, users need to
> explicitly include the correct includes.
> 
> Signed-off-by: Rob Herring 

Acked-by: Andrew Donnellan  # cxl

-- 
Andrew DonnellanOzLabs, ADL Canberra
a...@linux.ibm.com   IBM Australia Limited


Fwd: [PATCH] drivers: macintosh: space required after that ','

2023-07-17 Thread hanyu001

This patch fixes the checkpatch.pl error:

./drivers/macintosh/adbhid.c:1091: ERROR: space required after that ',' 
(ctx:VxV)


Signed-off-by: maqimei <2433033...@qq.com>
---
 drivers/macintosh/adbhid.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/macintosh/adbhid.c b/drivers/macintosh/adbhid.c
index b2fe7a3..293e72a 100644
--- a/drivers/macintosh/adbhid.c
+++ b/drivers/macintosh/adbhid.c
@@ -1088,7 +1088,7 @@ static void adbhid_input_unregister(int id)
 unsigned char r1_buffer[8];

 adb_request(, NULL, ADBREQ_SYNC | ADBREQ_REPLY, 1,
-ADB_READREG(id,1));
+ADB_READREG(id, 1));
 if (req.reply_len < 8)
 pr_err("%s: bad length for reg. 1\n", __func__);
 else


[PATCH] powerpc: xmon: Remove space after '(' and before ')'

2023-07-17 Thread hanyu001

The patch fixes the following errors detected by checkpatch:

./arch/powerpc/xmon/xmon.c:2426: ERROR: space prohibited after that open 
parenthesis '('
./arch/powerpc/xmon/xmon.c:2426: ERROR: space prohibited before that 
close parenthesis ')'
./arch/powerpc/xmon/xmon.c:2426: ERROR: space required before the open 
parenthesis '('


Signed-off-by: ztt <1549089...@qq.com>
---
 arch/powerpc/xmon/xmon.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 013b63eb4cd9..c10d9ff02af1 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -1057,7 +1057,7 @@ cmds(struct pt_regs *excp)
 flush_input();
 termch = 0;
 cmd = skipbl();
-if(cmd == '\n' ) {
+if (cmd == '\n') {
 if (last_cmd == NULL)
 continue;
 take_input(last_cmd);
@@ -2423,7 +2423,7 @@ memex(void)
 }
 last_cmd = "m\n";
 while ((cmd = skipbl()) != '\n') {
-switch( cmd ){
+switch (cmd) {
 case 'b':size = 1;break;
 case 'w':size = 2;break;
 case 'l':size = 4;break;


[PATCH] drivers: macintosh: space required after that ','

2023-07-17 Thread hanyu001

This patch fixes the checkpatch.pl error:

./drivers/macintosh/adbhid.c:1091: ERROR: space required after that ',' 
(ctx:VxV)


Signed-off-by: maqimei <2433033...@qq.com>
---
 drivers/macintosh/adbhid.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/macintosh/adbhid.c b/drivers/macintosh/adbhid.c
index b2fe7a3..293e72a 100644
--- a/drivers/macintosh/adbhid.c
+++ b/drivers/macintosh/adbhid.c
@@ -1088,7 +1088,7 @@ static void adbhid_input_unregister(int id)
 unsigned char r1_buffer[8];

 adb_request(, NULL, ADBREQ_SYNC | ADBREQ_REPLY, 1,
-ADB_READREG(id,1));
+ADB_READREG(id, 1));
 if (req.reply_len < 8)
 pr_err("%s: bad length for reg. 1\n", __func__);
 else


[PATCH] powerpc: xmon: insert space before the open parenthesis '('

2023-07-17 Thread hanyu001

Fixes checkpatch error:

./arch/powerpc/xmon/xmon.c:1052: ERROR: space required before the open 
parenthesis '('


Signed-off-by: ztt <1549089...@qq.com>
---
 arch/powerpc/xmon/xmon.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 70c4c59a1a8f..6a1a2f0b9084 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -1049,7 +1049,7 @@ cmds(struct pt_regs *excp)

 xmon_show_stack(excp->gpr[1], excp->link, excp->nip);

-for(;;) {
+for (;;) {
 #ifdef CONFIG_SMP
 printf("%x:", smp_processor_id());
 #endif /* CONFIG_SMP */


[PATCH] drivers: macintosh: add spaces required around that ':' and '?'

2023-07-17 Thread hanyu001

This patch adds spaces required around that ':' and '?'.

./drivers/macintosh/macio-adb.c:143: ERROR: spaces required around that 
'?' (ctx:VxW)
./drivers/macintosh/macio-adb.c:143: ERROR: spaces required around that 
':' (ctx:VxW)


Signed-off-by: maqimei <2433033...@qq.com>
---
 drivers/macintosh/macio-adb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/macintosh/macio-adb.c 
b/drivers/macintosh/macio-adb.c

index 55a9f8c..4de4883 100644
--- a/drivers/macintosh/macio-adb.c
+++ b/drivers/macintosh/macio-adb.c
@@ -140,7 +140,7 @@ static int macio_adb_autopoll(int devs)
 spin_lock_irqsave(_lock, flags);
 out_8(>active_hi.r, devs >> 8);
 out_8(>active_lo.r, devs);
-out_8(>autopoll.r, devs? APE: 0);
+out_8(>autopoll.r, devs ? APE : 0);
 spin_unlock_irqrestore(_lock, flags);
 return 0;
 }


Re: [PATCH 0/2] eventfd: simplify signal helpers

2023-07-17 Thread Grzegorz Jaszczyk
pt., 14 lip 2023 o 09:05 Christian Brauner  napisał(a):
>
> On Thu, Jul 13, 2023 at 11:10:54AM -0600, Alex Williamson wrote:
> > On Thu, 13 Jul 2023 12:05:36 +0200
> > Christian Brauner  wrote:
> >
> > > Hey everyone,
> > >
> > > This simplifies the eventfd_signal() and eventfd_signal_mask() helpers
> > > by removing the count argument which is effectively unused.
> >
> > We have a patch under review which does in fact make use of the
> > signaling value:
> >
> > https://lore.kernel.org/all/20230630155936.3015595-1-...@semihalf.com/
>
> Huh, thanks for the link.
>
> Quoting from
> https://patchwork.kernel.org/project/kvm/patch/20230307220553.631069-1-...@semihalf.com/#25266856
>
> > Reading an eventfd returns an 8-byte value, we generally only use it
> > as a counter, but it's been discussed previously and IIRC, it's possible
> > to use that value as a notification value.
>
> So the goal is to pipe a specific value through eventfd? But it is
> explicitly a counter. The whole thing is written around a counter and
> each write and signal adds to the counter.
>
> The consequences are pretty well described in the cover letter of
> v6 https://lore.kernel.org/all/20230630155936.3015595-1-...@semihalf.com/
>
> > Since the eventfd counter is used as ACPI notification value
> > placeholder, the eventfd signaling needs to be serialized in order to
> > not end up with notification values being coalesced. Therefore ACPI
> > notification values are buffered and signalized one by one, when the
> > previous notification value has been consumed.
>
> But isn't this a good indication that you really don't want an eventfd
> but something that's explicitly designed to associate specific data with
> a notification? Using eventfd in that manner requires serialization,
> buffering, and enforces ordering.
>
> I have no skin in the game aside from having to drop this conversion
> which I'm fine to do if there are actually users for this btu really,
> that looks a lot like abusing an api that really wasn't designed for
> this.

https://patchwork.kernel.org/project/kvm/patch/20230307220553.631069-1-...@semihalf.com/
was posted at the beginning of March and one of the main things we've
discussed was the mechanism for propagating acpi notification value.
We've ended up with eventfd as the best mechanism and have actually been
using it from v2. I really do not want to waste this effort, I think
we are quite advanced with v6 now. Additionally we didn't actually
modify any part of eventfd support that was in place, we only used it
in a specific (and discussed beforehand) way.


Re: [RFC 0/3] Asynchronous EEH recovery

2023-07-17 Thread Ganesh G R


On 6/13/23 8:06 AM, Oliver O'Halloran wrote:


On Tue, Jun 13, 2023 at 11:44 AM Ganesh Goudar  wrote:

Hi,

EEH recovery is currently serialized and these patches shorten
the time taken for EEH recovery by making the recovery run
in parallel. The original author of these patches is Sam Bobroff,
I have rebased and tested these patches.

On powervm with 64 VFs from same PHB,  I see approximately 48%
reduction in time taken in EEH recovery.

On powernv with 9 network cards, Where 2 cards installed on one
PHB and 1 card on each of the rest of the PHBs, Providing 20 PFs
in total. I see approximately 33% reduction in time taken in EEH
recovery.

These patches were originally posted as separate RFCs by Sam, And
I rebased and posted these patches almost a year back, I stopped
pursuing these patches as I was not able to test this on powernv, Due
to the issues in drivers of cards I was testing this on, Which are
now resolved. Since I am re-posting this after long time, Posting
this as a fresh RFC, Please comment.

What changes have you made since the last time you posted this series?
If the patches are the same then the comments I posted last time still
apply.


Hi Oliver, You asked about the way we are testing this on powervm, You expressed
concerns about having this on powernv, suggested to have this feature just for
powervm for now, and also expressed concerns on having two locks.

On powervm, using a two-port card, we are instantiating 64 VFs for an LPAR and 
injecting
the error on the bus from PHYP to observe the behavior.
I was able to test this on powernv with 16 PFs from 8 cards installed on 
separate PHBs,
Where I saw considerable performance improvement.
Regarding the two-locks idea, I may not have tested it for all scenarios. So far I 
have not
faced any issues. Are you suggesting a different approach?

Thanks


Re: [PATCH] net: Explicitly include correct DT includes

2023-07-17 Thread Kurt Kanzenbach
On Fri Jul 14 2023, Rob Herring wrote:
> The DT of_device.h and of_platform.h date back to the separate
> of_platform_bus_type before it was merged into the regular platform bus.
> As part of that merge prepping Arm DT support 13 years ago, they
> "temporarily" include each other. They also include platform_device.h
> and of.h. As a result, there's a pretty much random mix of those include
> files used throughout the tree. In order to detangle these headers and
> replace the implicit includes with struct declarations, users need to
> explicitly include the correct includes.
>
> Signed-off-by: Rob Herring 
> ---

[...]

>  drivers/net/dsa/hirschmann/hellcreek.c  | 1 -
>  drivers/net/dsa/hirschmann/hellcreek_ptp.c  | 1 +

Acked-by: Kurt Kanzenbach  # hellcreek


signature.asc
Description: PGP signature


Re: [PATCH] net: Explicitly include correct DT includes

2023-07-17 Thread Jérôme Pouiller
On Friday 14 July 2023 19:48:00 CEST Rob Herring wrote:
> 
> The DT of_device.h and of_platform.h date back to the separate
> of_platform_bus_type before it was merged into the regular platform bus.
> As part of that merge prepping Arm DT support 13 years ago, they
> "temporarily" include each other. They also include platform_device.h
> and of.h. As a result, there's a pretty much random mix of those include
> files used throughout the tree. In order to detangle these headers and
> replace the implicit includes with struct declarations, users need to
> explicitly include the correct includes.
> 
> Signed-off-by: Rob Herring 
> ---
>  drivers/net/can/bxcan.c | 1 -
>  drivers/net/can/ifi_canfd/ifi_canfd.c   | 1 -
>  drivers/net/can/m_can/m_can.c   | 1 -
>  drivers/net/can/m_can/m_can.h   | 1 -
>  drivers/net/can/rcar/rcar_canfd.c   | 1 -
>  drivers/net/can/sja1000/sja1000_platform.c  | 1 -
>  drivers/net/can/sun4i_can.c | 1 -
>  drivers/net/can/ti_hecc.c   | 1 -
>  drivers/net/dsa/b53/b53_mdio.c  | 1 +
>  drivers/net/dsa/b53/b53_mmap.c  | 1 +
>  drivers/net/dsa/hirschmann/hellcreek.c  | 1 -
>  drivers/net/dsa/hirschmann/hellcreek_ptp.c  | 1 +
>  drivers/net/dsa/lan9303-core.c  | 1 +
>  drivers/net/dsa/microchip/ksz8863_smi.c | 3 +++
>  drivers/net/dsa/microchip/ksz_common.c  | 2 +-
>  drivers/net/dsa/mt7530-mmio.c   | 3 ++-
>  drivers/net/dsa/mv88e6xxx/chip.c| 2 +-
>  drivers/net/dsa/ocelot/felix_vsc9959.c  | 1 +
>  drivers/net/dsa/ocelot/seville_vsc9953.c| 3 ++-
>  drivers/net/dsa/qca/qca8k-leds.c| 1 +
>  drivers/net/dsa/realtek/realtek-mdio.c  | 2 +-
>  drivers/net/dsa/realtek/realtek-smi.c   | 1 -
>  drivers/net/dsa/sja1105/sja1105_main.c  | 1 -
>  drivers/net/dsa/vitesse-vsc73xx-core.c  | 1 -
>  drivers/net/dsa/xrs700x/xrs700x.c   | 2 +-
>  drivers/net/ethernet/aeroflex/greth.c   | 4 ++--
>  drivers/net/ethernet/amd/sunlance.c | 2 +-
>  drivers/net/ethernet/apm/xgene-v2/main.h| 1 +
>  drivers/net/ethernet/arc/emac_main.c| 2 +-
>  drivers/net/ethernet/atheros/ag71xx.c   | 3 ++-
>  drivers/net/ethernet/cadence/macb_main.c| 1 -
>  drivers/net/ethernet/cirrus/cs89x0.c| 1 -
>  drivers/net/ethernet/ezchip/nps_enet.c  | 5 ++---
>  drivers/net/ethernet/freescale/dpaa/dpaa_eth.c  | 3 ++-
>  drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c  | 2 ++
>  drivers/net/ethernet/freescale/enetc/enetc_ierb.c   | 2 +-
>  drivers/net/ethernet/freescale/fec_mpc52xx.c| 4 ++--
>  drivers/net/ethernet/freescale/fec_mpc52xx_phy.c| 3 ++-
>  drivers/net/ethernet/freescale/fec_ptp.c| 1 -
>  drivers/net/ethernet/freescale/fman/fman.c  | 1 +
>  drivers/net/ethernet/freescale/fman/fman_port.c | 1 +
>  drivers/net/ethernet/freescale/fman/mac.c   | 2 ++
>  drivers/net/ethernet/freescale/fs_enet/mac-fcc.c| 1 -
>  drivers/net/ethernet/freescale/fs_enet/mac-fec.c| 1 -
>  drivers/net/ethernet/freescale/fs_enet/mac-scc.c| 1 -
>  drivers/net/ethernet/freescale/fsl_pq_mdio.c| 1 +
>  drivers/net/ethernet/freescale/gianfar.c| 2 +-
>  drivers/net/ethernet/freescale/gianfar_ethtool.c| 2 ++
>  drivers/net/ethernet/freescale/ucc_geth.c   | 3 ++-
>  drivers/net/ethernet/freescale/xgmac_mdio.c | 4 ++--
>  drivers/net/ethernet/hisilicon/hns/hns_dsaf_ppe.c   | 3 ---
>  drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.c   | 4 
>  drivers/net/ethernet/ibm/ehea/ehea_main.c   | 1 +
>  drivers/net/ethernet/ibm/emac/core.c| 1 +
>  drivers/net/ethernet/ibm/emac/core.h| 1 -
>  drivers/net/ethernet/ibm/emac/mal.c | 2 ++
>  drivers/net/ethernet/ibm/emac/rgmii.c   | 2 ++
>  drivers/net/ethernet/ibm/emac/tah.c | 2 ++
>  drivers/net/ethernet/ibm/emac/zmii.c| 2 ++
>  drivers/net/ethernet/korina.c   | 2 +-
>  drivers/net/ethernet/marvell/mvmdio.c   | 2 +-
>  drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 1 -
>  drivers/net/ethernet/marvell/prestera/prestera_rxtx.c   | 3 ---
>  drivers/net/ethernet/marvell/sky2.c | 1 -
>  drivers/net/ethernet/mediatek/mtk_eth_soc.c | 3 ++-
>  drivers/net/ethernet/mediatek/mtk_star_emac.c   | 1 -
>  

Re: [PATCH v2] KVM: ppc64: Enable ring-based dirty memory tracking on ppc64: enable config options and implement relevant functions

2023-07-17 Thread Aneesh Kumar K.V
Kautuk Consul  writes:

> - Enable CONFIG_HAVE_KVM_DIRTY_RING_ACQ_REL as ppc64 is weakly
>   ordered.
> - Enable CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP because the
>   kvmppc_xive_native_set_attr is called in the context of an ioctl
>   syscall and will call kvmppc_xive_native_eq_sync for setting the
>   KVM_DEV_XIVE_EQ_SYNC attribute which will call mark_dirty_page()
>   when there isn't a running vcpu. Implemented the
>   kvm_arch_allow_write_without_running_vcpu to always return true
>   to allow mark_page_dirty_in_slot to mark the page dirty in the
>   memslot->dirty_bitmap in this case.
> - Set KVM_DIRTY_LOG_PAGE_OFFSET for the ring buffer's physical page
>   offset.
> - Implement the kvm_arch_mmu_enable_log_dirty_pt_masked function required
>   for the generic KVM code to call.
> - Add a check to kvmppc_vcpu_run_hv for checking whether the dirty
>   ring is soft full.
> - Implement the kvm_arch_flush_remote_tlbs_memslot function to support
>   the CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT config option.
>
> Test Results
> 
> On testing with live migration it was found that there is around
> 150-180 ms improvement in overall migration time with this patch.
>
> Bare Metal P9 testing with patch:
> 
> (qemu) info migrate
> globals:
> store-global-state: on
> only-migratable: off
> send-configuration: on
> send-section-footer: on
> decompress-error-check: on
> clear-bitmap-shift: 18
> Migration status: completed
> total time: 20694 ms
> downtime: 73 ms
> setup: 23 ms
> transferred ram: 2604370 kbytes
> throughput: 1033.55 mbps
> remaining ram: 0 kbytes
> total ram: 16777216 kbytes
> duplicate: 3555398 pages
> skipped: 0 pages
> normal: 642026 pages
> normal bytes: 2568104 kbytes
> dirty sync count: 3
> page size: 4 kbytes
> multifd bytes: 0 kbytes
> pages-per-second: 32455
> precopy ram: 2581549 kbytes
> downtime ram: 22820 kbytes
>
> Bare Metal P9 testing without patch:
> ---
> (qemu) info migrate
> globals:
> store-global-state: on
> only-migratable: off
> send-configuration: on
> send-section-footer: on
> decompress-error-check: on
> clear-bitmap-shift: 18
> Migration status: completed
> total time: 20873 ms
> downtime: 62 ms
> setup: 19 ms
> transferred ram: 2612900 kbytes
> throughput: 1027.83 mbps
> remaining ram: 0 kbytes
> total ram: 16777216 kbytes
> duplicate: 3553329 pages
> skipped: 0 pages
> normal: 644159 pages
> normal bytes: 2576636 kbytes
> dirty sync count: 4
> page size: 4 kbytes
> multifd bytes: 0 kbytes
> pages-per-second: 88297
> precopy ram: 2603645 kbytes
> downtime ram: 9254 kbytes
>
> Signed-off-by: Kautuk Consul 
> ---
>  Documentation/virt/kvm/api.rst  |  2 +-
>  arch/powerpc/include/uapi/asm/kvm.h |  2 ++
>  arch/powerpc/kvm/Kconfig|  2 ++
>  arch/powerpc/kvm/book3s.c   | 46 +
>  arch/powerpc/kvm/book3s_hv.c|  3 ++
>  include/linux/kvm_dirty_ring.h  |  5 
>  virt/kvm/dirty_ring.c   |  1 +
>  7 files changed, 60 insertions(+), 1 deletion(-)
>
> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> index c0ddd3035462..84c180ccd178 100644
> --- a/Documentation/virt/kvm/api.rst
> +++ b/Documentation/virt/kvm/api.rst
> @@ -8114,7 +8114,7 @@ regardless of what has actually been exposed through 
> the CPUID leaf.
>  8.29 KVM_CAP_DIRTY_LOG_RING/KVM_CAP_DIRTY_LOG_RING_ACQ_REL
>  --
>  
> -:Architectures: x86, arm64
> +:Architectures: x86, arm64, ppc64
>  :Parameters: args[0] - size of the dirty log ring
>  
>  KVM is capable of tracking dirty memory using ring buffers that are
> diff --git a/arch/powerpc/include/uapi/asm/kvm.h 
> b/arch/powerpc/include/uapi/asm/kvm.h
> index 9f18fa090f1f..f722309ed7fb 100644
> --- a/arch/powerpc/include/uapi/asm/kvm.h
> +++ b/arch/powerpc/include/uapi/asm/kvm.h
> @@ -33,6 +33,8 @@
>  /* Not always available, but if it is, this is the correct offset.  */
>  #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
>  
> +#define KVM_DIRTY_LOG_PAGE_OFFSET 64
> +
>  struct kvm_regs {
>   __u64 pc;
>   __u64 cr;
> diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
> index 902611954200..c93354ec3bd5 100644
> --- a/arch/powerpc/kvm/Kconfig
> +++ b/arch/powerpc/kvm/Kconfig
> @@ -26,6 +26,8 @@ config KVM
>   select IRQ_BYPASS_MANAGER
>   select HAVE_KVM_IRQ_BYPASS
>   select INTERVAL_TREE
> + select HAVE_KVM_DIRTY_RING_ACQ_REL
> + select NEED_KVM_DIRTY_RING_WITH_BITMAP
>  
>  config KVM_BOOK3S_HANDLER
>   bool
> diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
> index 686d8d9eda3e..01aa4fe2c424 100644
> --- a/arch/powerpc/kvm/book3s.c
> +++ b/arch/powerpc/kvm/book3s.c
> @@ -32,6 +32,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #include "book3s.h"
>  #include "trace.h"
> @@ -1070,6 +1071,51 @@ int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned 
> irqchip, unsigned 

[PATCH v2] KVM: ppc64: Enable ring-based dirty memory tracking on ppc64: enable config options and implement relevant functions

2023-07-17 Thread Kautuk Consul
- Enable CONFIG_HAVE_KVM_DIRTY_RING_ACQ_REL as ppc64 is weakly
  ordered.
- Enable CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP because the
  kvmppc_xive_native_set_attr is called in the context of an ioctl
  syscall and will call kvmppc_xive_native_eq_sync for setting the
  KVM_DEV_XIVE_EQ_SYNC attribute which will call mark_dirty_page()
  when there isn't a running vcpu. Implemented the
  kvm_arch_allow_write_without_running_vcpu to always return true
  to allow mark_page_dirty_in_slot to mark the page dirty in the
  memslot->dirty_bitmap in this case.
- Set KVM_DIRTY_LOG_PAGE_OFFSET for the ring buffer's physical page
  offset.
- Implement the kvm_arch_mmu_enable_log_dirty_pt_masked function required
  for the generic KVM code to call.
- Add a check to kvmppc_vcpu_run_hv for checking whether the dirty
  ring is soft full.
- Implement the kvm_arch_flush_remote_tlbs_memslot function to support
  the CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT config option.

Test Results

On testing with live migration it was found that there is around
150-180 ms improvement in overall migration time with this patch.

Bare Metal P9 testing with patch:

(qemu) info migrate
globals:
store-global-state: on
only-migratable: off
send-configuration: on
send-section-footer: on
decompress-error-check: on
clear-bitmap-shift: 18
Migration status: completed
total time: 20694 ms
downtime: 73 ms
setup: 23 ms
transferred ram: 2604370 kbytes
throughput: 1033.55 mbps
remaining ram: 0 kbytes
total ram: 16777216 kbytes
duplicate: 3555398 pages
skipped: 0 pages
normal: 642026 pages
normal bytes: 2568104 kbytes
dirty sync count: 3
page size: 4 kbytes
multifd bytes: 0 kbytes
pages-per-second: 32455
precopy ram: 2581549 kbytes
downtime ram: 22820 kbytes

Bare Metal P9 testing without patch:
---
(qemu) info migrate
globals:
store-global-state: on
only-migratable: off
send-configuration: on
send-section-footer: on
decompress-error-check: on
clear-bitmap-shift: 18
Migration status: completed
total time: 20873 ms
downtime: 62 ms
setup: 19 ms
transferred ram: 2612900 kbytes
throughput: 1027.83 mbps
remaining ram: 0 kbytes
total ram: 16777216 kbytes
duplicate: 3553329 pages
skipped: 0 pages
normal: 644159 pages
normal bytes: 2576636 kbytes
dirty sync count: 4
page size: 4 kbytes
multifd bytes: 0 kbytes
pages-per-second: 88297
precopy ram: 2603645 kbytes
downtime ram: 9254 kbytes

Signed-off-by: Kautuk Consul 
---
 Documentation/virt/kvm/api.rst  |  2 +-
 arch/powerpc/include/uapi/asm/kvm.h |  2 ++
 arch/powerpc/kvm/Kconfig|  2 ++
 arch/powerpc/kvm/book3s.c   | 46 +
 arch/powerpc/kvm/book3s_hv.c|  3 ++
 include/linux/kvm_dirty_ring.h  |  5 
 virt/kvm/dirty_ring.c   |  1 +
 7 files changed, 60 insertions(+), 1 deletion(-)

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index c0ddd3035462..84c180ccd178 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -8114,7 +8114,7 @@ regardless of what has actually been exposed through the 
CPUID leaf.
 8.29 KVM_CAP_DIRTY_LOG_RING/KVM_CAP_DIRTY_LOG_RING_ACQ_REL
 --
 
-:Architectures: x86, arm64
+:Architectures: x86, arm64, ppc64
 :Parameters: args[0] - size of the dirty log ring
 
 KVM is capable of tracking dirty memory using ring buffers that are
diff --git a/arch/powerpc/include/uapi/asm/kvm.h 
b/arch/powerpc/include/uapi/asm/kvm.h
index 9f18fa090f1f..f722309ed7fb 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -33,6 +33,8 @@
 /* Not always available, but if it is, this is the correct offset.  */
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
 
+#define KVM_DIRTY_LOG_PAGE_OFFSET 64
+
 struct kvm_regs {
__u64 pc;
__u64 cr;
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 902611954200..c93354ec3bd5 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -26,6 +26,8 @@ config KVM
select IRQ_BYPASS_MANAGER
select HAVE_KVM_IRQ_BYPASS
select INTERVAL_TREE
+   select HAVE_KVM_DIRTY_RING_ACQ_REL
+   select NEED_KVM_DIRTY_RING_WITH_BITMAP
 
 config KVM_BOOK3S_HANDLER
bool
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 686d8d9eda3e..01aa4fe2c424 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -32,6 +32,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "book3s.h"
 #include "trace.h"
@@ -1070,6 +1071,51 @@ int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned 
irqchip, unsigned pin)
 
 #endif /* CONFIG_KVM_XICS */
 
+/*
+ * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
+ * dirty pages.
+ *
+ * It write protects selected pages to enable dirty logging for them.
+ */
+void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
+