Re: [PATCH 6/8] mm: Allow arch specific arch_randomize_brk() with CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
Hi Christophe, Le 22/11/2021 à 09:48, Christophe Leroy a écrit : Commit e7142bf5d231 ("arm64, mm: make randomization selected by generic topdown mmap layout") introduced a default version of arch_randomize_brk() provided when CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT is selected. powerpc could select CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT but needs to provide its own arch_randomize_brk(). In order to allow that, don't make CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT select CONFIG_ARCH_HAS_ELF_RANDOMIZE. Instead, ensure that selecting CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT and selecting CONFIG_ARCH_HAS_ELF_RANDOMIZE has the same effect. This feels weird to me since if CONFIG_ARCH_HAS_ELF_RANDOMIZE is used somewhere else at some point, it is not natural to add CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT: can't we use a __weak function or a new CONFIG_ARCH_HAS_RANDOMIZE_BRK? Thanks, Alex Then only provide the default arch_randomize_brk() when the architecture has not selected CONFIG_ARCH_HAS_ELF_RANDOMIZE. Cc: Alexandre Ghiti Signed-off-by: Christophe Leroy --- arch/Kconfig | 1 - fs/binfmt_elf.c | 3 ++- include/linux/elf-randomize.h | 3 ++- mm/util.c | 2 ++ 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 26b8ed11639d..ef3ce947b7a1 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1000,7 +1000,6 @@ config HAVE_ARCH_COMPAT_MMAP_BASES config ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT bool depends on MMU - select ARCH_HAS_ELF_RANDOMIZE config HAVE_STACK_VALIDATION bool diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index f8c7f26f1fbb..28968a189a91 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1287,7 +1287,8 @@ static int load_elf_binary(struct linux_binprm *bprm) * (since it grows up, and may collide early with the stack * growing down), and into the unused ELF_ET_DYN_BASE region. 
*/ - if (IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) && + if ((IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) || +IS_ENABLED(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT)) && elf_ex->e_type == ET_DYN && !interpreter) { mm->brk = mm->start_brk = ELF_ET_DYN_BASE; } diff --git a/include/linux/elf-randomize.h b/include/linux/elf-randomize.h index da0dbb7b6be3..1e471ca7caaf 100644 --- a/include/linux/elf-randomize.h +++ b/include/linux/elf-randomize.h @@ -4,7 +4,8 @@ struct mm_struct; -#ifndef CONFIG_ARCH_HAS_ELF_RANDOMIZE +#if !defined(CONFIG_ARCH_HAS_ELF_RANDOMIZE) && \ + !defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) static inline unsigned long arch_mmap_rnd(void) { return 0; } # if defined(arch_randomize_brk) && defined(CONFIG_COMPAT_BRK) # define compat_brk_randomized diff --git a/mm/util.c b/mm/util.c index e58151a61255..edb9e94cceb5 100644 --- a/mm/util.c +++ b/mm/util.c @@ -344,6 +344,7 @@ unsigned long randomize_stack_top(unsigned long stack_top) } #ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT +#ifndef CONFIG_ARCH_HAS_ELF_RANDOMIZE unsigned long arch_randomize_brk(struct mm_struct *mm) { /* Is the current task 32bit ? */ @@ -352,6 +353,7 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) return randomize_page(mm->brk, SZ_1G); } +#endif unsigned long arch_mmap_rnd(void) {
Re: [PATCH v6 0/3] Introduce 64b relocatable kernel
Le 18/05/2021 à 12:12, Alexandre Ghiti a écrit : After multiple attempts, this patchset is now based on the fact that the 64b kernel mapping was moved outside the linear mapping. The first patch allows to build relocatable kernels but is not selected by default. That patch should ease KASLR implementation a lot. The second and third patches take advantage of an already existing powerpc script that checks relocations at compile-time, and uses it for riscv. @Palmer, any thought about that? There are no users for now, do you want to wait for a KASLR implementation to use it before merging this? If so, I can work on a KASLR implementation based on older implementation from Zong. Thanks, This patchset was tested on: * kernel: - rv32: OK - rv64 with RELOCATABLE: OK and checked that "suspicious" relocations are caught. - rv64 without RELOCATABLE: OK - powerpc: build only and checked that "suspicious" relocations are caught. * xipkernel: - rv32: build only - rv64: OK * nommukernel: - rv64: build only Changes in v6: * Remove the kernel move to vmalloc zone * Rebased on top of for-next * Remove relocatable property from 32b kernel as the kernel is mapped in the linear mapping and would then need to be copied physically too * CONFIG_RELOCATABLE depends on !XIP_KERNEL * Remove Reviewed-by from first patch as it changed a bit Changes in v5: * Add "static __init" to create_kernel_page_table function as reported by Kbuild test robot * Add reviewed-by from Zong * Rebase onto v5.7 Changes in v4: * Fix BPF region that overlapped with kernel's as suggested by Zong * Fix end of module region that could be larger than 2GB as suggested by Zong * Fix the size of the vm area reserved for the kernel as we could lose PMD_SIZE if the size was already aligned on PMD_SIZE * Split compile time relocations check patch into 2 patches as suggested by Anup * Applied Reviewed-by from Zong and Anup Changes in v3: * Move kernel mapping to vmalloc Changes in v2: * Make RELOCATABLE depend on MMU as 
suggested by Anup * Rename kernel_load_addr into kernel_virt_addr as suggested by Anup * Use __pa_symbol instead of __pa, as suggested by Zong * Rebased on top of v5.6-rc3 * Tested with sv48 patchset * Add Reviewed/Tested-by from Zong and Anup Alexandre Ghiti (3): riscv: Introduce CONFIG_RELOCATABLE powerpc: Move script to check relocations at compile time in scripts/ riscv: Check relocations at compile time arch/powerpc/tools/relocs_check.sh | 18 ++ arch/riscv/Kconfig | 12 +++ arch/riscv/Makefile| 5 ++- arch/riscv/Makefile.postlink | 36 arch/riscv/kernel/vmlinux.lds.S| 6 arch/riscv/mm/Makefile | 4 +++ arch/riscv/mm/init.c | 53 +- arch/riscv/tools/relocs_check.sh | 26 +++ scripts/relocs_check.sh| 20 +++ 9 files changed, 162 insertions(+), 18 deletions(-) create mode 100644 arch/riscv/Makefile.postlink create mode 100755 arch/riscv/tools/relocs_check.sh create mode 100755 scripts/relocs_check.sh
Re: [PATCH v5 1/4] riscv: Move kernel mapping to vmalloc zone
Le 7/21/20 à 7:36 PM, Palmer Dabbelt a écrit : On Tue, 21 Jul 2020 16:11:02 PDT (-0700), b...@kernel.crashing.org wrote: On Tue, 2020-07-21 at 14:36 -0400, Alex Ghiti wrote: > > I guess I don't understand why this is necessary at all. > > Specifically: why > > can't we just relocate the kernel within the linear map? That would > > let the > > bootloader put the kernel wherever it wants, modulo the physical > > memory size we > > support. We'd need to handle the regions that are coupled to the > > kernel's > > execution address, but we could just put them in an explicit memory > > region > > which is what we should probably be doing anyway. > > Virtual relocation in the linear mapping requires to move the kernel > physically too. Zong implemented this physical move in its KASLR RFC > patchset, which is cumbersome since finding an available physical spot > is harder than just selecting a virtual range in the vmalloc range. > > In addition, having the kernel mapping in the linear mapping prevents > the use of hugepage for the linear mapping resulting in performance loss > (at least for the GB that encompasses the kernel). > > Why do you find this "ugly" ? The vmalloc region is just a bunch of > available virtual addresses to whatever purpose we want, and as noted by > Zong, arm64 uses the same scheme. I don't get it :-) At least on powerpc we move the kernel in the linear mapping and it works fine with huge pages, what is your problem there ? You rely on punching small-page size holes in there ? That was my original suggestion, and I'm not actually sure it's invalid. It would mean that both the kernel's physical and virtual addresses are set by the bootloader, which may or may not be workable if we want to have an sv48+sv39 kernel. My initial approach to sv48+sv39 kernels would be to just throw away the sv39 memory on sv48 kernels, which would preserve the linear map but mean that there is no single physical address that's accessible for both. 
That would require some coordination between the bootloader and the kernel as to where it should be loaded, but maybe there's a better way to design the linear map. Right now we have a bunch of unwritten rules about where things need to be loaded, which is a recipe for disaster. We could copy the kernel around, but I'm not sure I really like that idea. We do zero the BSS right now, so it's not like we entirely rely on the bootloader to set up the kernel image, but with the hart race boot scheme we have right now we'd at least need to leave a stub sitting around. Maybe we just throw away SBI v0.1, though, that's why we called it all legacy in the first place. My bigger worry is that anything that involves running the kernel at arbitrary virtual addresses means we need a PIC kernel, which means every global symbol needs an indirection. That's probably not so bad for shared libraries, but the kernel has a lot of global symbols. PLT references probably aren't so scary, as we have an incoherent instruction cache so the virtual function predictor isn't that hard to build, but making all global data accesses GOT-relative seems like a disaster for performance. This fixed-VA thing really just exists so we don't have to be full-on PIC. In theory I think we could just get away with pretending that medany is PIC, which I believe works as long as the data and text offset stays constant, you don't have any symbols between 2GiB and -2GiB (as those may stay fixed, even in medany), and you deal with GP accordingly (which should work itself out in the current startup code). We rely on this for some of the early boot code (and will soon for kexec), but that's a very controlled code base and we've already had some issues. I'd be much more comfortable adding an explicit semi-PIC code model, as I tend to miss something when doing these sorts of things and then we could at least add it to the GCC test runs and guarantee it actually works. 
Not really sure I want to deal with that, though. It would, however, be the only way to get random virtual addresses during kernel execution. At least in the old days, there were a number of assumptions that the kernel text/data/bss resides in the linear mapping. Ya, it terrified me as well. Alex says arm64 puts the kernel in the vmalloc region, so assuming that's the case it must be possible. I didn't get that from reading the arm64 port (I guess it's no secret that pretty much all I do is copy their code) See https://elixir.bootlin.com/linux/latest/source/arch/arm64/mm/mmu.c#L615. If you change that you need to ensure that it's still physically contiguous and you'll have to tweak __va and __pa, which might induce extra overhead. I'm operating under the assumption that we don't want to add an additional load to virt2phys conversions. arm64 bends over b
Re: [PATCH v5 1/4] riscv: Move kernel mapping to vmalloc zone
Hi Palmer, Le 7/21/20 à 3:05 PM, Palmer Dabbelt a écrit : On Tue, 21 Jul 2020 11:36:10 PDT (-0700), a...@ghiti.fr wrote: Let's try to make progress here: I add linux-mm in CC to get feedback on this patch as it blocks sv48 support too. Sorry for being slow here. I haven't replied because I hadn't really fleshed No problem :) out the design yet, but just so everyone's on the same page my problems with this are: * We waste vmalloc space on 32-bit systems, where there isn't a lot of it. * On 64-bit systems the VA space around the kernel is precious because it's the only place we can place text (modules, BPF, whatever). If we start putting the kernel in the vmalloc space then we either have to pre-allocate a bunch of space around it (essentially making it a fixed mapping anyway) or it becomes likely that we won't be able to find space for modules as they're loaded into running systems. Let's note that we already have this issue for BPF and modules right now. But by keeping the kernel 'in the end' of the vmalloc region, that quite mitigates this problem: if we exhaust the vmalloc region in 64bit and then start allocating here, I think the whole system will have other problems. * Relying on a relocatable kernel for sv48 support introduces a fairly large performance hit. I understand the performance penalty but I struggle to call it "fairly large": can we benchmark this somehow ? Roughly, my proposal would be to: * Leave the 32-bit memory map alone. On 32-bit systems we can load modules anywhere and we only have one VA width, so we're not really solving any problems with these changes. Ok that's possible although a lot of ifdef will get involved :) * Statically allocate a 2GiB portion of the VA space for all our text, as its own region. We'd link/relocate the kernel here instead of around PAGE_OFFSET, which would decouple the kernel from the physical memory layout of the system. 
This would have the side effect of sorting out a bunch of bootloader headaches that we currently have. This amounts to doing the same as this patch but instead of using the vmalloc region, we'd use our own right ? I believe we'd then lose the vmalloc facilities to allocate modules around this zone. * Sort out how to maintain a linear map as the canonical hole moves around between the VA widths without adding a bunch of overhead to the virt2phys and friends. This is probably going to be the trickiest part, but I think if we just change the page table code to essentially lie about VAs when an sv39 system runs an sv48+sv39 kernel we could make it work -- there'd be some logical complexity involved, but it would remain fast. I have to think about that. This doesn't solve the problem of virtually relocatable kernels, but it does let us decouple that from the sv48 stuff. It also lets us stop relying on a fixed physical address the kernel is loaded into, which is another thing I don't like. Agreed on this one. I know this may be a more complicated approach, but there aren't any sv48 systems around right now so I just don't see the rush to support them, particularly when there's a cost to what already exists (for those who haven't been watching, so far all the sv48 patch sets have imposed a significant performance penalty on all systems). Alex Alex Le 7/9/20 à 7:11 AM, Alex Ghiti a écrit : Hi Palmer, Le 7/9/20 à 1:05 AM, Palmer Dabbelt a écrit : On Sun, 07 Jun 2020 00:59:46 PDT (-0700), a...@ghiti.fr wrote: This is a preparatory patch for relocatable kernel. The kernel used to be linked at PAGE_OFFSET address and used to be loaded physically at the beginning of the main memory. Therefore, we could use the linear mapping for the kernel mapping. 
But the relocated kernel base address will be different from PAGE_OFFSET and since in the linear mapping, two different virtual addresses cannot point to the same physical address, the kernel mapping needs to lie outside the linear mapping. I know it's been a while, but I keep opening this up to review it and just can't get over how ugly it is to put the kernel's linear map in the vmalloc region. I guess I don't understand why this is necessary at all. Specifically: why can't we just relocate the kernel within the linear map? That would let the bootloader put the kernel wherever it wants, modulo the physical memory size we support. We'd need to handle the regions that are coupled to the kernel's execution address, but we could just put them in an explicit memory region which is what we should probably be doing anyway. Virtual relocation in the linear mapping requires to move the kernel physically too. Zong implemented this physical move in its KASLR RFC patchset, which is cumbersome since finding an available physical spot is harder than just selecting a virtual range in the vmalloc range. In addition, having the kernel mapping in the linear mapping prevents the use o
Re: [PATCH v5 1/4] riscv: Move kernel mapping to vmalloc zone
Hi Benjamin, Le 7/21/20 à 7:11 PM, Benjamin Herrenschmidt a écrit : On Tue, 2020-07-21 at 14:36 -0400, Alex Ghiti wrote: I guess I don't understand why this is necessary at all. Specifically: why can't we just relocate the kernel within the linear map? That would let the bootloader put the kernel wherever it wants, modulo the physical memory size we support. We'd need to handle the regions that are coupled to the kernel's execution address, but we could just put them in an explicit memory region which is what we should probably be doing anyway. Virtual relocation in the linear mapping requires to move the kernel physically too. Zong implemented this physical move in its KASLR RFC patchset, which is cumbersome since finding an available physical spot is harder than just selecting a virtual range in the vmalloc range. In addition, having the kernel mapping in the linear mapping prevents the use of hugepage for the linear mapping resulting in performance loss (at least for the GB that encompasses the kernel). Why do you find this "ugly" ? The vmalloc region is just a bunch of available virtual addresses to whatever purpose we want, and as noted by Zong, arm64 uses the same scheme. I don't get it :-) At least on powerpc we move the kernel in the linear mapping and it works fine with huge pages, what is your problem there ? You rely on punching small-page size holes in there ? ARCH_HAS_STRICT_KERNEL_RWX prevents the use of a hugepage for the kernel mapping in the direct mapping as it sets different permissions to different part of the kernel (data, text..etc). At least in the old days, there were a number of assumptions that the kernel text/data/bss resides in the linear mapping. If you change that you need to ensure that it's still physically contiguous and you'll have to tweak __va and __pa, which might induce extra overhead. Yes that's done in this patch and indeed there is an overhead to those functions. Cheers, Ben. Thanks, Alex
Re: [PATCH v5 1/4] riscv: Move kernel mapping to vmalloc zone
Let's try to make progress here: I add linux-mm in CC to get feedback on this patch as it blocks sv48 support too. Alex Le 7/9/20 à 7:11 AM, Alex Ghiti a écrit : Hi Palmer, Le 7/9/20 à 1:05 AM, Palmer Dabbelt a écrit : On Sun, 07 Jun 2020 00:59:46 PDT (-0700), a...@ghiti.fr wrote: This is a preparatory patch for relocatable kernel. The kernel used to be linked at PAGE_OFFSET address and used to be loaded physically at the beginning of the main memory. Therefore, we could use the linear mapping for the kernel mapping. But the relocated kernel base address will be different from PAGE_OFFSET and since in the linear mapping, two different virtual addresses cannot point to the same physical address, the kernel mapping needs to lie outside the linear mapping. I know it's been a while, but I keep opening this up to review it and just can't get over how ugly it is to put the kernel's linear map in the vmalloc region. I guess I don't understand why this is necessary at all. Specifically: why can't we just relocate the kernel within the linear map? That would let the bootloader put the kernel wherever it wants, modulo the physical memory size we support. We'd need to handle the regions that are coupled to the kernel's execution address, but we could just put them in an explicit memory region which is what we should probably be doing anyway. Virtual relocation in the linear mapping requires to move the kernel physically too. Zong implemented this physical move in its KASLR RFC patchset, which is cumbersome since finding an available physical spot is harder than just selecting a virtual range in the vmalloc range. In addition, having the kernel mapping in the linear mapping prevents the use of hugepage for the linear mapping resulting in performance loss (at least for the GB that encompasses the kernel). Why do you find this "ugly" ? The vmalloc region is just a bunch of available virtual addresses to whatever purpose we want, and as noted by Zong, arm64 uses the same scheme. 
In addition, because modules and BPF must be close to the kernel (inside +-2GB window), the kernel is placed at the end of the vmalloc zone minus 2GB, which leaves room for modules and BPF. The kernel could not be placed at the beginning of the vmalloc zone since other vmalloc allocations from the kernel could get all the +-2GB window around the kernel which would prevent new modules and BPF programs to be loaded. Well, that's not enough to make sure this doesn't happen -- it's just enough to make sure it doesn't happen very quickily. That's the same boat we're already in, though, so it's not like it's worse. Indeed, that's not worse, I haven't found a way to reserve vmalloc area without actually allocating it. Signed-off-by: Alexandre Ghiti Reviewed-by: Zong Li --- arch/riscv/boot/loader.lds.S | 3 +- arch/riscv/include/asm/page.h | 10 +- arch/riscv/include/asm/pgtable.h | 38 ++--- arch/riscv/kernel/head.S | 3 +- arch/riscv/kernel/module.c | 4 +-- arch/riscv/kernel/vmlinux.lds.S | 3 +- arch/riscv/mm/init.c | 58 +--- arch/riscv/mm/physaddr.c | 2 +- 8 files changed, 88 insertions(+), 33 deletions(-) diff --git a/arch/riscv/boot/loader.lds.S b/arch/riscv/boot/loader.lds.S index 47a5003c2e28..62d94696a19c 100644 --- a/arch/riscv/boot/loader.lds.S +++ b/arch/riscv/boot/loader.lds.S @@ -1,13 +1,14 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include +#include OUTPUT_ARCH(riscv) ENTRY(_start) SECTIONS { - . = PAGE_OFFSET; + . 
= KERNEL_LINK_ADDR; .payload : { *(.payload) diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index 2d50f76efe48..48bb09b6a9b7 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -90,18 +90,26 @@ typedef struct page *pgtable_t; #ifdef CONFIG_MMU extern unsigned long va_pa_offset; +extern unsigned long va_kernel_pa_offset; extern unsigned long pfn_base; #define ARCH_PFN_OFFSET (pfn_base) #else #define va_pa_offset 0 +#define va_kernel_pa_offset 0 #define ARCH_PFN_OFFSET (PAGE_OFFSET >> PAGE_SHIFT) #endif /* CONFIG_MMU */ extern unsigned long max_low_pfn; extern unsigned long min_low_pfn; +extern unsigned long kernel_virt_addr; #define __pa_to_va_nodebug(x) ((void *)((unsigned long) (x) + va_pa_offset)) -#define __va_to_pa_nodebug(x) ((unsigned long)(x) - va_pa_offset) +#define linear_mapping_va_to_pa(x) ((unsigned long)(x) - va_pa_offset) +#define kernel_mapping_va_to_pa(x) \ + ((unsigned long)(x) - va_kernel_pa_offset) +#define __va_to_pa_nodebug(x) \ + (((x) >= PAGE_OFFSET) ? \ + linear_mapping_va_to_pa(x) : kernel_mapping_va_to_pa(x)) #ifdef CONFIG_DEBUG_VIRTUAL extern phys_addr_t __virt_to_phys(unsigned long x); diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/in
Re: [PATCH v5 1/4] riscv: Move kernel mapping to vmalloc zone
Hi Palmer, Le 7/9/20 à 1:05 AM, Palmer Dabbelt a écrit : On Sun, 07 Jun 2020 00:59:46 PDT (-0700), a...@ghiti.fr wrote: This is a preparatory patch for relocatable kernel. The kernel used to be linked at PAGE_OFFSET address and used to be loaded physically at the beginning of the main memory. Therefore, we could use the linear mapping for the kernel mapping. But the relocated kernel base address will be different from PAGE_OFFSET and since in the linear mapping, two different virtual addresses cannot point to the same physical address, the kernel mapping needs to lie outside the linear mapping. I know it's been a while, but I keep opening this up to review it and just can't get over how ugly it is to put the kernel's linear map in the vmalloc region. I guess I don't understand why this is necessary at all. Specifically: why can't we just relocate the kernel within the linear map? That would let the bootloader put the kernel wherever it wants, modulo the physical memory size we support. We'd need to handle the regions that are coupled to the kernel's execution address, but we could just put them in an explicit memory region which is what we should probably be doing anyway. Virtual relocation in the linear mapping requires to move the kernel physically too. Zong implemented this physical move in its KASLR RFC patchset, which is cumbersome since finding an available physical spot is harder than just selecting a virtual range in the vmalloc range. In addition, having the kernel mapping in the linear mapping prevents the use of hugepage for the linear mapping resulting in performance loss (at least for the GB that encompasses the kernel). Why do you find this "ugly" ? The vmalloc region is just a bunch of available virtual addresses to whatever purpose we want, and as noted by Zong, arm64 uses the same scheme. 
In addition, because modules and BPF must be close to the kernel (inside +-2GB window), the kernel is placed at the end of the vmalloc zone minus 2GB, which leaves room for modules and BPF. The kernel could not be placed at the beginning of the vmalloc zone since other vmalloc allocations from the kernel could get all the +-2GB window around the kernel which would prevent new modules and BPF programs to be loaded. Well, that's not enough to make sure this doesn't happen -- it's just enough to make sure it doesn't happen very quickily. That's the same boat we're already in, though, so it's not like it's worse. Indeed, that's not worse, I haven't found a way to reserve vmalloc area without actually allocating it. Signed-off-by: Alexandre Ghiti Reviewed-by: Zong Li --- arch/riscv/boot/loader.lds.S | 3 +- arch/riscv/include/asm/page.h | 10 +- arch/riscv/include/asm/pgtable.h | 38 ++--- arch/riscv/kernel/head.S | 3 +- arch/riscv/kernel/module.c | 4 +-- arch/riscv/kernel/vmlinux.lds.S | 3 +- arch/riscv/mm/init.c | 58 +--- arch/riscv/mm/physaddr.c | 2 +- 8 files changed, 88 insertions(+), 33 deletions(-) diff --git a/arch/riscv/boot/loader.lds.S b/arch/riscv/boot/loader.lds.S index 47a5003c2e28..62d94696a19c 100644 --- a/arch/riscv/boot/loader.lds.S +++ b/arch/riscv/boot/loader.lds.S @@ -1,13 +1,14 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include +#include OUTPUT_ARCH(riscv) ENTRY(_start) SECTIONS { - . = PAGE_OFFSET; + . 
= KERNEL_LINK_ADDR; .payload : { *(.payload) diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index 2d50f76efe48..48bb09b6a9b7 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -90,18 +90,26 @@ typedef struct page *pgtable_t; #ifdef CONFIG_MMU extern unsigned long va_pa_offset; +extern unsigned long va_kernel_pa_offset; extern unsigned long pfn_base; #define ARCH_PFN_OFFSET (pfn_base) #else #define va_pa_offset 0 +#define va_kernel_pa_offset 0 #define ARCH_PFN_OFFSET (PAGE_OFFSET >> PAGE_SHIFT) #endif /* CONFIG_MMU */ extern unsigned long max_low_pfn; extern unsigned long min_low_pfn; +extern unsigned long kernel_virt_addr; #define __pa_to_va_nodebug(x) ((void *)((unsigned long) (x) + va_pa_offset)) -#define __va_to_pa_nodebug(x) ((unsigned long)(x) - va_pa_offset) +#define linear_mapping_va_to_pa(x) ((unsigned long)(x) - va_pa_offset) +#define kernel_mapping_va_to_pa(x) \ + ((unsigned long)(x) - va_kernel_pa_offset) +#define __va_to_pa_nodebug(x) \ + (((x) >= PAGE_OFFSET) ? \ + linear_mapping_va_to_pa(x) : kernel_mapping_va_to_pa(x)) #ifdef CONFIG_DEBUG_VIRTUAL extern phys_addr_t __virt_to_phys(unsigned long x); diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 35b60035b6b0..94ef3b49dfb6 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -11,23 +11,29 @@ #include -#ifndef __ASSEMBLY__ -
Re: [PATCH v5 0/4] vmalloc kernel mapping and relocatable kernel
Hi Palmer, Le 6/7/20 à 3:59 AM, Alexandre Ghiti a écrit : This patchset originally implemented relocatable kernel support but now also moves the kernel mapping into the vmalloc zone. The first patch explains why we need to move the kernel into vmalloc zone (instead of memcpying it around). That patch should ease KASLR implementation a lot. The second patch allows to build relocatable kernels but is not selected by default. The third and fourth patches take advantage of an already existing powerpc script that checks relocations at compile-time, and uses it for riscv. Changes in v5: * Add "static __init" to create_kernel_page_table function as reported by Kbuild test robot * Add reviewed-by from Zong * Rebase onto v5.7 Changes in v4: * Fix BPF region that overlapped with kernel's as suggested by Zong * Fix end of module region that could be larger than 2GB as suggested by Zong * Fix the size of the vm area reserved for the kernel as we could lose PMD_SIZE if the size was already aligned on PMD_SIZE * Split compile time relocations check patch into 2 patches as suggested by Anup * Applied Reviewed-by from Zong and Anup Changes in v3: * Move kernel mapping to vmalloc Changes in v2: * Make RELOCATABLE depend on MMU as suggested by Anup * Rename kernel_load_addr into kernel_virt_addr as suggested by Anup * Use __pa_symbol instead of __pa, as suggested by Zong * Rebased on top of v5.6-rc3 * Tested with sv48 patchset * Add Reviewed/Tested-by from Zong and Anup Alexandre Ghiti (4): riscv: Move kernel mapping to vmalloc zone riscv: Introduce CONFIG_RELOCATABLE powerpc: Move script to check relocations at compile time in scripts/ riscv: Check relocations at compile time arch/powerpc/tools/relocs_check.sh | 18 + arch/riscv/Kconfig | 12 +++ arch/riscv/Makefile| 5 +- arch/riscv/Makefile.postlink | 36 + arch/riscv/boot/loader.lds.S | 3 +- arch/riscv/include/asm/page.h | 10 ++- arch/riscv/include/asm/pgtable.h | 38 ++--- arch/riscv/kernel/head.S | 3 +- arch/riscv/kernel/module.c | 
4 +- arch/riscv/kernel/vmlinux.lds.S| 9 ++- arch/riscv/mm/Makefile | 4 + arch/riscv/mm/init.c | 121 + arch/riscv/mm/physaddr.c | 2 +- arch/riscv/tools/relocs_check.sh | 26 +++ scripts/relocs_check.sh| 20 + 15 files changed, 259 insertions(+), 52 deletions(-) create mode 100644 arch/riscv/Makefile.postlink create mode 100755 arch/riscv/tools/relocs_check.sh create mode 100755 scripts/relocs_check.sh Do you have any remark regarding this series ? Thanks, Alex
Re: [PATCH v5 1/4] riscv: Move kernel mapping to vmalloc zone
Hi Atish, Le 6/11/20 à 5:34 PM, Atish Patra a écrit : On Sun, Jun 7, 2020 at 1:01 AM Alexandre Ghiti wrote: This is a preparatory patch for relocatable kernel. The kernel used to be linked at PAGE_OFFSET address and used to be loaded physically at the beginning of the main memory. Therefore, we could use the linear mapping for the kernel mapping. But the relocated kernel base address will be different from PAGE_OFFSET and since in the linear mapping, two different virtual addresses cannot point to the same physical address, the kernel mapping needs to lie outside the linear mapping. In addition, because modules and BPF must be close to the kernel (inside +-2GB window), the kernel is placed at the end of the vmalloc zone minus 2GB, which leaves room for modules and BPF. The kernel could not be placed at the beginning of the vmalloc zone since other vmalloc allocations from the kernel could get all the +-2GB window around the kernel which would prevent new modules and BPF programs to be loaded. Signed-off-by: Alexandre Ghiti Reviewed-by: Zong Li --- arch/riscv/boot/loader.lds.S | 3 +- arch/riscv/include/asm/page.h| 10 +- arch/riscv/include/asm/pgtable.h | 38 ++--- arch/riscv/kernel/head.S | 3 +- arch/riscv/kernel/module.c | 4 +-- arch/riscv/kernel/vmlinux.lds.S | 3 +- arch/riscv/mm/init.c | 58 +--- arch/riscv/mm/physaddr.c | 2 +- 8 files changed, 88 insertions(+), 33 deletions(-) diff --git a/arch/riscv/boot/loader.lds.S b/arch/riscv/boot/loader.lds.S index 47a5003c2e28..62d94696a19c 100644 --- a/arch/riscv/boot/loader.lds.S +++ b/arch/riscv/boot/loader.lds.S @@ -1,13 +1,14 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include +#include OUTPUT_ARCH(riscv) ENTRY(_start) SECTIONS { - . = PAGE_OFFSET; + . 
= KERNEL_LINK_ADDR; .payload : { *(.payload) diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index 2d50f76efe48..48bb09b6a9b7 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -90,18 +90,26 @@ typedef struct page *pgtable_t; #ifdef CONFIG_MMU extern unsigned long va_pa_offset; +extern unsigned long va_kernel_pa_offset; extern unsigned long pfn_base; #define ARCH_PFN_OFFSET(pfn_base) #else #define va_pa_offset 0 +#define va_kernel_pa_offset0 #define ARCH_PFN_OFFSET(PAGE_OFFSET >> PAGE_SHIFT) #endif /* CONFIG_MMU */ extern unsigned long max_low_pfn; extern unsigned long min_low_pfn; +extern unsigned long kernel_virt_addr; #define __pa_to_va_nodebug(x) ((void *)((unsigned long) (x) + va_pa_offset)) -#define __va_to_pa_nodebug(x) ((unsigned long)(x) - va_pa_offset) +#define linear_mapping_va_to_pa(x) ((unsigned long)(x) - va_pa_offset) +#define kernel_mapping_va_to_pa(x) \ + ((unsigned long)(x) - va_kernel_pa_offset) +#define __va_to_pa_nodebug(x) \ + (((x) >= PAGE_OFFSET) ? \ + linear_mapping_va_to_pa(x) : kernel_mapping_va_to_pa(x)) #ifdef CONFIG_DEBUG_VIRTUAL extern phys_addr_t __virt_to_phys(unsigned long x); diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 35b60035b6b0..94ef3b49dfb6 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -11,23 +11,29 @@ #include -#ifndef __ASSEMBLY__ - -/* Page Upper Directory not used in RISC-V */ -#include -#include -#include -#include - -#ifdef CONFIG_MMU +#ifndef CONFIG_MMU +#define KERNEL_VIRT_ADDR PAGE_OFFSET +#define KERNEL_LINK_ADDR PAGE_OFFSET +#else +/* + * Leave 2GB for modules and BPF that must lie within a 2GB range around + * the kernel. 
+ */ +#define KERNEL_VIRT_ADDR (VMALLOC_END - SZ_2G + 1) +#define KERNEL_LINK_ADDR KERNEL_VIRT_ADDR #define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) #define VMALLOC_END (PAGE_OFFSET - 1) #define VMALLOC_START(PAGE_OFFSET - VMALLOC_SIZE) #define BPF_JIT_REGION_SIZE(SZ_128M) -#define BPF_JIT_REGION_START (PAGE_OFFSET - BPF_JIT_REGION_SIZE) -#define BPF_JIT_REGION_END (VMALLOC_END) +#define BPF_JIT_REGION_START PFN_ALIGN((unsigned long)&_end) +#define BPF_JIT_REGION_END (BPF_JIT_REGION_START + BPF_JIT_REGION_SIZE) + As these mappings have changed a few times in recent months including this one, I think it would be better to have virtual memory layout documentation in RISC-V similar to other architectures. If you can include the page table layout for 3/4 level page tables in the same document, that would be really helpful. Yes, I'll do that in a separate commit. Thanks, Alex +#ifdef CONFIG_64BIT +#define VMALLOC_MODULE_START BPF_JIT_REGION_END +#define VMALLOC_MODULE_END (((unsigned long)&_start & PAGE_MASK) + SZ_2G) +#endif /* * Roughly size the vmemmap space to be large enough to fit enough @@ -57,9 +63,16 @@
Re: [PATCH v5 2/4] riscv: Introduce CONFIG_RELOCATABLE
Hi Jerome, Le 6/10/20 à 10:10 AM, Jerome Forissier a écrit : On 6/7/20 9:59 AM, Alexandre Ghiti wrote: [...] +config RELOCATABLE + bool + depends on MMU + help + This builds a kernel as a Position Independent Executable (PIE), + which retains all relocation metadata required to relocate the + kernel binary at runtime to a different virtual address than the + address it was linked at. + Since RISCV uses the RELA relocation format, this requires a + relocation pass at runtime even if the kernel is loaded at the + same address it was linked at. Is this true? I thought that the GNU linker would write the "proper" values by default, contrary to the LLVM linker (ld.lld) which would need a special flag: --apply-dynamic-relocs (by default the relocated places are set to zero). At least, it is my experience with Aarch64 on a different project. So, sorry if I'm talking nonsense here -- I have not looked at the details. It seems that you're right, at least for aarch64 since they specifically specify the --no-apply-dynamic-relocs option. I retried to boot without relocating at runtime, and it fails on riscv. Can this be arch specific ? Thanks, Alex
Re: [PATCH v4 1/4] riscv: Move kernel mapping to vmalloc zone
Hi Zong, Le 6/3/20 à 10:52 PM, Zong Li a écrit : On Wed, Jun 3, 2020 at 4:01 PM Alexandre Ghiti wrote: This is a preparatory patch for relocatable kernel. The kernel used to be linked at PAGE_OFFSET address and used to be loaded physically at the beginning of the main memory. Therefore, we could use the linear mapping for the kernel mapping. But the relocated kernel base address will be different from PAGE_OFFSET and since in the linear mapping, two different virtual addresses cannot point to the same physical address, the kernel mapping needs to lie outside the linear mapping. In addition, because modules and BPF must be close to the kernel (inside +-2GB window), the kernel is placed at the end of the vmalloc zone minus 2GB, which leaves room for modules and BPF. The kernel could not be placed at the beginning of the vmalloc zone since other vmalloc allocations from the kernel could get all the +-2GB window around the kernel which would prevent new modules and BPF programs to be loaded. Signed-off-by: Alexandre Ghiti --- arch/riscv/boot/loader.lds.S | 3 +- arch/riscv/include/asm/page.h| 10 +- arch/riscv/include/asm/pgtable.h | 38 ++--- arch/riscv/kernel/head.S | 3 +- arch/riscv/kernel/module.c | 4 +-- arch/riscv/kernel/vmlinux.lds.S | 3 +- arch/riscv/mm/init.c | 58 +--- arch/riscv/mm/physaddr.c | 2 +- 8 files changed, 88 insertions(+), 33 deletions(-) diff --git a/arch/riscv/boot/loader.lds.S b/arch/riscv/boot/loader.lds.S index 47a5003c2e28..62d94696a19c 100644 --- a/arch/riscv/boot/loader.lds.S +++ b/arch/riscv/boot/loader.lds.S @@ -1,13 +1,14 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include +#include OUTPUT_ARCH(riscv) ENTRY(_start) SECTIONS { - . = PAGE_OFFSET; + . 
= KERNEL_LINK_ADDR; .payload : { *(.payload) diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index 2d50f76efe48..48bb09b6a9b7 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -90,18 +90,26 @@ typedef struct page *pgtable_t; #ifdef CONFIG_MMU extern unsigned long va_pa_offset; +extern unsigned long va_kernel_pa_offset; extern unsigned long pfn_base; #define ARCH_PFN_OFFSET(pfn_base) #else #define va_pa_offset 0 +#define va_kernel_pa_offset0 #define ARCH_PFN_OFFSET(PAGE_OFFSET >> PAGE_SHIFT) #endif /* CONFIG_MMU */ extern unsigned long max_low_pfn; extern unsigned long min_low_pfn; +extern unsigned long kernel_virt_addr; #define __pa_to_va_nodebug(x) ((void *)((unsigned long) (x) + va_pa_offset)) -#define __va_to_pa_nodebug(x) ((unsigned long)(x) - va_pa_offset) +#define linear_mapping_va_to_pa(x) ((unsigned long)(x) - va_pa_offset) +#define kernel_mapping_va_to_pa(x) \ + ((unsigned long)(x) - va_kernel_pa_offset) +#define __va_to_pa_nodebug(x) \ + (((x) >= PAGE_OFFSET) ? \ + linear_mapping_va_to_pa(x) : kernel_mapping_va_to_pa(x)) #ifdef CONFIG_DEBUG_VIRTUAL extern phys_addr_t __virt_to_phys(unsigned long x); diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 35b60035b6b0..94ef3b49dfb6 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -11,23 +11,29 @@ #include -#ifndef __ASSEMBLY__ - -/* Page Upper Directory not used in RISC-V */ -#include -#include -#include -#include - -#ifdef CONFIG_MMU +#ifndef CONFIG_MMU +#define KERNEL_VIRT_ADDR PAGE_OFFSET +#define KERNEL_LINK_ADDR PAGE_OFFSET +#else +/* + * Leave 2GB for modules and BPF that must lie within a 2GB range around + * the kernel. 
+ */ +#define KERNEL_VIRT_ADDR (VMALLOC_END - SZ_2G + 1) +#define KERNEL_LINK_ADDR KERNEL_VIRT_ADDR #define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) #define VMALLOC_END (PAGE_OFFSET - 1) #define VMALLOC_START(PAGE_OFFSET - VMALLOC_SIZE) #define BPF_JIT_REGION_SIZE(SZ_128M) -#define BPF_JIT_REGION_START (PAGE_OFFSET - BPF_JIT_REGION_SIZE) -#define BPF_JIT_REGION_END (VMALLOC_END) +#define BPF_JIT_REGION_START PFN_ALIGN((unsigned long)&_end) +#define BPF_JIT_REGION_END (BPF_JIT_REGION_START + BPF_JIT_REGION_SIZE) + +#ifdef CONFIG_64BIT +#define VMALLOC_MODULE_START BPF_JIT_REGION_END +#define VMALLOC_MODULE_END (((unsigned long)&_start & PAGE_MASK) + SZ_2G) +#endif /* * Roughly size the vmemmap space to be large enough to fit enough @@ -57,9 +63,16 @@ #define FIXADDR_SIZE PGDIR_SIZE #endif #define FIXADDR_START(FIXADDR_TOP - FIXADDR_SIZE) - #endif +#ifndef __ASSEMBLY__ + +/* Page Upper Directory not used in RISC-V */ +#include +#include +#include +#include + #ifdef CONFIG_64BIT #include #else @@ -483,6 +496,7 @@ static inline void __kernel_map_pages(struct page *page, int numpages, int enabl #define
Re: [PATCH v3 1/3] riscv: Move kernel mapping to vmalloc zone
Hi Zong, Le 5/27/20 à 3:29 AM, Alex Ghiti a écrit : Le 5/27/20 à 2:05 AM, Zong Li a écrit : On Wed, May 27, 2020 at 1:06 AM Alex Ghiti wrote: Hi Zong, Le 5/26/20 à 5:43 AM, Zong Li a écrit : On Sun, May 24, 2020 at 4:54 PM Alexandre Ghiti wrote: This is a preparatory patch for relocatable kernel. The kernel used to be linked at PAGE_OFFSET address and used to be loaded physically at the beginning of the main memory. Therefore, we could use the linear mapping for the kernel mapping. But the relocated kernel base address will be different from PAGE_OFFSET and since in the linear mapping, two different virtual addresses cannot point to the same physical address, the kernel mapping needs to lie outside the linear mapping. In addition, because modules and BPF must be close to the kernel (inside +-2GB window), the kernel is placed at the end of the vmalloc zone minus 2GB, which leaves room for modules and BPF. The kernel could not be placed at the beginning of the vmalloc zone since other vmalloc allocations from the kernel could get all the +-2GB window around the kernel which would prevent new modules and BPF programs to be loaded. Signed-off-by: Alexandre Ghiti --- arch/riscv/boot/loader.lds.S | 3 +- arch/riscv/include/asm/page.h | 10 +- arch/riscv/include/asm/pgtable.h | 37 +--- arch/riscv/kernel/head.S | 3 +- arch/riscv/kernel/module.c | 4 +-- arch/riscv/kernel/vmlinux.lds.S | 3 +- arch/riscv/mm/init.c | 58 +--- arch/riscv/mm/physaddr.c | 2 +- 8 files changed, 87 insertions(+), 33 deletions(-) diff --git a/arch/riscv/boot/loader.lds.S b/arch/riscv/boot/loader.lds.S index 47a5003c2e28..62d94696a19c 100644 --- a/arch/riscv/boot/loader.lds.S +++ b/arch/riscv/boot/loader.lds.S @@ -1,13 +1,14 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include +#include OUTPUT_ARCH(riscv) ENTRY(_start) SECTIONS { - . = PAGE_OFFSET; + . 
= KERNEL_LINK_ADDR; .payload : { *(.payload) diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index 2d50f76efe48..48bb09b6a9b7 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -90,18 +90,26 @@ typedef struct page *pgtable_t; #ifdef CONFIG_MMU extern unsigned long va_pa_offset; +extern unsigned long va_kernel_pa_offset; extern unsigned long pfn_base; #define ARCH_PFN_OFFSET (pfn_base) #else #define va_pa_offset 0 +#define va_kernel_pa_offset 0 #define ARCH_PFN_OFFSET (PAGE_OFFSET >> PAGE_SHIFT) #endif /* CONFIG_MMU */ extern unsigned long max_low_pfn; extern unsigned long min_low_pfn; +extern unsigned long kernel_virt_addr; #define __pa_to_va_nodebug(x) ((void *)((unsigned long) (x) + va_pa_offset)) -#define __va_to_pa_nodebug(x) ((unsigned long)(x) - va_pa_offset) +#define linear_mapping_va_to_pa(x) ((unsigned long)(x) - va_pa_offset) +#define kernel_mapping_va_to_pa(x) \ + ((unsigned long)(x) - va_kernel_pa_offset) +#define __va_to_pa_nodebug(x) \ + (((x) >= PAGE_OFFSET) ? \ + linear_mapping_va_to_pa(x) : kernel_mapping_va_to_pa(x)) #ifdef CONFIG_DEBUG_VIRTUAL extern phys_addr_t __virt_to_phys(unsigned long x); diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 35b60035b6b0..25213cfaf680 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -11,23 +11,29 @@ #include -#ifndef __ASSEMBLY__ - -/* Page Upper Directory not used in RISC-V */ -#include -#include -#include -#include - -#ifdef CONFIG_MMU +#ifndef CONFIG_MMU +#define KERNEL_VIRT_ADDR PAGE_OFFSET +#define KERNEL_LINK_ADDR PAGE_OFFSET +#else +/* + * Leave 2GB for modules and BPF that must lie within a 2GB range around + * the kernel. 
+ */ +#define KERNEL_VIRT_ADDR (VMALLOC_END - SZ_2G + 1) +#define KERNEL_LINK_ADDR KERNEL_VIRT_ADDR #define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) #define VMALLOC_END (PAGE_OFFSET - 1) #define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE) #define BPF_JIT_REGION_SIZE (SZ_128M) -#define BPF_JIT_REGION_START (PAGE_OFFSET - BPF_JIT_REGION_SIZE) -#define BPF_JIT_REGION_END (VMALLOC_END) +#define BPF_JIT_REGION_START (kernel_virt_addr) +#define BPF_JIT_REGION_END (kernel_virt_addr + BPF_JIT_REGION_SIZE) It seems to have a potential risk here, the region of bpf is overlapping with kernel mapping, so if kernel size is bigger than 128MB, bpf region would be occupied and run out by kernel mapping. Is there the risk as I mentioned? Sorry I forgot to answer this one: I was confident that 128MB was large enough for kernel and BPF. But I see no reason to leave this risk so I'll change kernel_virt_addr for _end so BPF will have its 128MB reserved.
Re: [PATCH v3 1/3] riscv: Move kernel mapping to vmalloc zone
Le 5/27/20 à 2:05 AM, Zong Li a écrit : On Wed, May 27, 2020 at 1:06 AM Alex Ghiti wrote: Hi Zong, Le 5/26/20 à 5:43 AM, Zong Li a écrit : On Sun, May 24, 2020 at 4:54 PM Alexandre Ghiti wrote: This is a preparatory patch for relocatable kernel. The kernel used to be linked at PAGE_OFFSET address and used to be loaded physically at the beginning of the main memory. Therefore, we could use the linear mapping for the kernel mapping. But the relocated kernel base address will be different from PAGE_OFFSET and since in the linear mapping, two different virtual addresses cannot point to the same physical address, the kernel mapping needs to lie outside the linear mapping. In addition, because modules and BPF must be close to the kernel (inside +-2GB window), the kernel is placed at the end of the vmalloc zone minus 2GB, which leaves room for modules and BPF. The kernel could not be placed at the beginning of the vmalloc zone since other vmalloc allocations from the kernel could get all the +-2GB window around the kernel which would prevent new modules and BPF programs to be loaded. Signed-off-by: Alexandre Ghiti --- arch/riscv/boot/loader.lds.S | 3 +- arch/riscv/include/asm/page.h| 10 +- arch/riscv/include/asm/pgtable.h | 37 +--- arch/riscv/kernel/head.S | 3 +- arch/riscv/kernel/module.c | 4 +-- arch/riscv/kernel/vmlinux.lds.S | 3 +- arch/riscv/mm/init.c | 58 +--- arch/riscv/mm/physaddr.c | 2 +- 8 files changed, 87 insertions(+), 33 deletions(-) diff --git a/arch/riscv/boot/loader.lds.S b/arch/riscv/boot/loader.lds.S index 47a5003c2e28..62d94696a19c 100644 --- a/arch/riscv/boot/loader.lds.S +++ b/arch/riscv/boot/loader.lds.S @@ -1,13 +1,14 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include +#include OUTPUT_ARCH(riscv) ENTRY(_start) SECTIONS { - . = PAGE_OFFSET; + . 
= KERNEL_LINK_ADDR; .payload : { *(.payload) diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index 2d50f76efe48..48bb09b6a9b7 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -90,18 +90,26 @@ typedef struct page *pgtable_t; #ifdef CONFIG_MMU extern unsigned long va_pa_offset; +extern unsigned long va_kernel_pa_offset; extern unsigned long pfn_base; #define ARCH_PFN_OFFSET(pfn_base) #else #define va_pa_offset 0 +#define va_kernel_pa_offset0 #define ARCH_PFN_OFFSET(PAGE_OFFSET >> PAGE_SHIFT) #endif /* CONFIG_MMU */ extern unsigned long max_low_pfn; extern unsigned long min_low_pfn; +extern unsigned long kernel_virt_addr; #define __pa_to_va_nodebug(x) ((void *)((unsigned long) (x) + va_pa_offset)) -#define __va_to_pa_nodebug(x) ((unsigned long)(x) - va_pa_offset) +#define linear_mapping_va_to_pa(x) ((unsigned long)(x) - va_pa_offset) +#define kernel_mapping_va_to_pa(x) \ + ((unsigned long)(x) - va_kernel_pa_offset) +#define __va_to_pa_nodebug(x) \ + (((x) >= PAGE_OFFSET) ? \ + linear_mapping_va_to_pa(x) : kernel_mapping_va_to_pa(x)) #ifdef CONFIG_DEBUG_VIRTUAL extern phys_addr_t __virt_to_phys(unsigned long x); diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 35b60035b6b0..25213cfaf680 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -11,23 +11,29 @@ #include -#ifndef __ASSEMBLY__ - -/* Page Upper Directory not used in RISC-V */ -#include -#include -#include -#include - -#ifdef CONFIG_MMU +#ifndef CONFIG_MMU +#define KERNEL_VIRT_ADDR PAGE_OFFSET +#define KERNEL_LINK_ADDR PAGE_OFFSET +#else +/* + * Leave 2GB for modules and BPF that must lie within a 2GB range around + * the kernel. 
+ */ +#define KERNEL_VIRT_ADDR (VMALLOC_END - SZ_2G + 1) +#define KERNEL_LINK_ADDR KERNEL_VIRT_ADDR #define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) #define VMALLOC_END (PAGE_OFFSET - 1) #define VMALLOC_START(PAGE_OFFSET - VMALLOC_SIZE) #define BPF_JIT_REGION_SIZE(SZ_128M) -#define BPF_JIT_REGION_START (PAGE_OFFSET - BPF_JIT_REGION_SIZE) -#define BPF_JIT_REGION_END (VMALLOC_END) +#define BPF_JIT_REGION_START (kernel_virt_addr) +#define BPF_JIT_REGION_END (kernel_virt_addr + BPF_JIT_REGION_SIZE) It seems to have a potential risk here, the region of bpf is overlapping with kernel mapping, so if kernel size is bigger than 128MB, bpf region would be occupied and run out by kernel mapping. Is there the risk as I mentioned? Sorry I forgot to answer this one: I was confident that 128MB was large enough for kernel and BPF. But I see no reason to leave this risk so I'll change kernel_virt_addr for _end so BPF will have its 128MB reserved. Thanks ! Alex + +#ifdef CONFIG_64BIT +#define VMALLOC_MODULE_START BPF
Re: [PATCH v3 1/3] riscv: Move kernel mapping to vmalloc zone
Hi Zong, Le 5/26/20 à 5:43 AM, Zong Li a écrit : On Sun, May 24, 2020 at 4:54 PM Alexandre Ghiti wrote: This is a preparatory patch for relocatable kernel. The kernel used to be linked at PAGE_OFFSET address and used to be loaded physically at the beginning of the main memory. Therefore, we could use the linear mapping for the kernel mapping. But the relocated kernel base address will be different from PAGE_OFFSET and since in the linear mapping, two different virtual addresses cannot point to the same physical address, the kernel mapping needs to lie outside the linear mapping. In addition, because modules and BPF must be close to the kernel (inside +-2GB window), the kernel is placed at the end of the vmalloc zone minus 2GB, which leaves room for modules and BPF. The kernel could not be placed at the beginning of the vmalloc zone since other vmalloc allocations from the kernel could get all the +-2GB window around the kernel which would prevent new modules and BPF programs to be loaded. Signed-off-by: Alexandre Ghiti --- arch/riscv/boot/loader.lds.S | 3 +- arch/riscv/include/asm/page.h| 10 +- arch/riscv/include/asm/pgtable.h | 37 +--- arch/riscv/kernel/head.S | 3 +- arch/riscv/kernel/module.c | 4 +-- arch/riscv/kernel/vmlinux.lds.S | 3 +- arch/riscv/mm/init.c | 58 +--- arch/riscv/mm/physaddr.c | 2 +- 8 files changed, 87 insertions(+), 33 deletions(-) diff --git a/arch/riscv/boot/loader.lds.S b/arch/riscv/boot/loader.lds.S index 47a5003c2e28..62d94696a19c 100644 --- a/arch/riscv/boot/loader.lds.S +++ b/arch/riscv/boot/loader.lds.S @@ -1,13 +1,14 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include +#include OUTPUT_ARCH(riscv) ENTRY(_start) SECTIONS { - . = PAGE_OFFSET; + . 
= KERNEL_LINK_ADDR; .payload : { *(.payload) diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index 2d50f76efe48..48bb09b6a9b7 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -90,18 +90,26 @@ typedef struct page *pgtable_t; #ifdef CONFIG_MMU extern unsigned long va_pa_offset; +extern unsigned long va_kernel_pa_offset; extern unsigned long pfn_base; #define ARCH_PFN_OFFSET(pfn_base) #else #define va_pa_offset 0 +#define va_kernel_pa_offset0 #define ARCH_PFN_OFFSET(PAGE_OFFSET >> PAGE_SHIFT) #endif /* CONFIG_MMU */ extern unsigned long max_low_pfn; extern unsigned long min_low_pfn; +extern unsigned long kernel_virt_addr; #define __pa_to_va_nodebug(x) ((void *)((unsigned long) (x) + va_pa_offset)) -#define __va_to_pa_nodebug(x) ((unsigned long)(x) - va_pa_offset) +#define linear_mapping_va_to_pa(x) ((unsigned long)(x) - va_pa_offset) +#define kernel_mapping_va_to_pa(x) \ + ((unsigned long)(x) - va_kernel_pa_offset) +#define __va_to_pa_nodebug(x) \ + (((x) >= PAGE_OFFSET) ? \ + linear_mapping_va_to_pa(x) : kernel_mapping_va_to_pa(x)) #ifdef CONFIG_DEBUG_VIRTUAL extern phys_addr_t __virt_to_phys(unsigned long x); diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 35b60035b6b0..25213cfaf680 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -11,23 +11,29 @@ #include -#ifndef __ASSEMBLY__ - -/* Page Upper Directory not used in RISC-V */ -#include -#include -#include -#include - -#ifdef CONFIG_MMU +#ifndef CONFIG_MMU +#define KERNEL_VIRT_ADDR PAGE_OFFSET +#define KERNEL_LINK_ADDR PAGE_OFFSET +#else +/* + * Leave 2GB for modules and BPF that must lie within a 2GB range around + * the kernel. 
+ */ +#define KERNEL_VIRT_ADDR (VMALLOC_END - SZ_2G + 1) +#define KERNEL_LINK_ADDR KERNEL_VIRT_ADDR #define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) #define VMALLOC_END (PAGE_OFFSET - 1) #define VMALLOC_START(PAGE_OFFSET - VMALLOC_SIZE) #define BPF_JIT_REGION_SIZE(SZ_128M) -#define BPF_JIT_REGION_START (PAGE_OFFSET - BPF_JIT_REGION_SIZE) -#define BPF_JIT_REGION_END (VMALLOC_END) +#define BPF_JIT_REGION_START (kernel_virt_addr) +#define BPF_JIT_REGION_END (kernel_virt_addr + BPF_JIT_REGION_SIZE) It seems to have a potential risk here, the region of bpf is overlapping with kernel mapping, so if kernel size is bigger than 128MB, bpf region would be occupied and run out by kernel mapping. + +#ifdef CONFIG_64BIT +#define VMALLOC_MODULE_START BPF_JIT_REGION_END +#define VMALLOC_MODULE_END VMALLOC_END +#endif Although kernel_virt_addr is a fixed address now, I think it could be changed for the purpose of relocatable or KASLR, so if kernel_virt_addr is moved to far from VMALLOC_END than 2G, the region of module would be too big. Yes you're right, that's wrong to allow modules to lie outside the 2G window, thanks for noticing. In addition, the region
Re: [PATCH v2] powerpc: Do not consider weak unresolved symbol relocations as bad
On 1/18/20 12:03 PM, Alexandre Ghiti wrote: Commit 8580ac9404f6 ("bpf: Process in-kernel BTF") introduced two weak symbols that may be unresolved at link time which result in an absolute relocation to 0. relocs_check.sh emits the following warning: "WARNING: 2 bad relocations c1a41478 R_PPC64_ADDR64_binary__btf_vmlinux_bin_start c1a41480 R_PPC64_ADDR64_binary__btf_vmlinux_bin_end" whereas those relocations are legitimate even for a relocatable kernel compiled with -pie option. relocs_check.sh already excluded some weak unresolved symbols explicitly: remove those hardcoded symbols and add some logic that parses the symbols using nm, retrieves all the weak unresolved symbols and excludes those from the list of the potential bad relocations. Reported-by: Stephen Rothwell Signed-off-by: Alexandre Ghiti --- Changes in v2: - Follow Stephen advice of using grep -F instead of looping over weak symbols using read, patch is way smaller and cleaner. - Add missing nm in comment arch/powerpc/Makefile.postlink | 4 ++-- arch/powerpc/tools/relocs_check.sh | 20 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/Makefile.postlink b/arch/powerpc/Makefile.postlink index 134f12f89b92..2268396ff4bb 100644 --- a/arch/powerpc/Makefile.postlink +++ b/arch/powerpc/Makefile.postlink @@ -17,11 +17,11 @@ quiet_cmd_head_check = CHKHEAD $@ quiet_cmd_relocs_check = CHKREL $@ ifdef CONFIG_PPC_BOOK3S_64 cmd_relocs_check = \ - $(CONFIG_SHELL) $(srctree)/arch/powerpc/tools/relocs_check.sh "$(OBJDUMP)" "$@" ; \ + $(CONFIG_SHELL) $(srctree)/arch/powerpc/tools/relocs_check.sh "$(OBJDUMP)" "$(NM)" "$@" ; \ $(BASH) $(srctree)/arch/powerpc/tools/unrel_branch_check.sh "$(OBJDUMP)" "$@" else cmd_relocs_check = \ - $(CONFIG_SHELL) $(srctree)/arch/powerpc/tools/relocs_check.sh "$(OBJDUMP)" "$@" + $(CONFIG_SHELL) $(srctree)/arch/powerpc/tools/relocs_check.sh "$(OBJDUMP)" "$(NM)" "$@" endif # `@true` prevents complaint when there is nothing to be done diff --git 
a/arch/powerpc/tools/relocs_check.sh b/arch/powerpc/tools/relocs_check.sh index 7b9fe0a567cf..014e00e74d2b 100755 --- a/arch/powerpc/tools/relocs_check.sh +++ b/arch/powerpc/tools/relocs_check.sh @@ -10,14 +10,21 @@ # based on relocs_check.pl # Copyright © 2009 IBM Corporation -if [ $# -lt 2 ]; then - echo "$0 [path to objdump] [path to vmlinux]" 1>&2 +if [ $# -lt 3 ]; then + echo "$0 [path to objdump] [path to nm] [path to vmlinux]" 1>&2 exit 1 fi -# Have Kbuild supply the path to objdump so we handle cross compilation. +# Have Kbuild supply the path to objdump and nm so we handle cross compilation. objdump="$1" -vmlinux="$2" +nm="$2" +vmlinux="$3" + +# Remove from the bad relocations those that match an undefined weak symbol +# which will result in an absolute relocation to 0. +# Weak unresolved symbols are of that form in nm output: +# " w _binary__btf_vmlinux_bin_end" +undef_weak_symbols=$($nm "$vmlinux" | awk '$1 ~ /w/ { print $2 }') bad_relocs=$( $objdump -R "$vmlinux" | @@ -26,8 +33,6 @@ $objdump -R "$vmlinux" | # These relocations are okay # On PPC64: # R_PPC64_RELATIVE, R_PPC64_NONE - # R_PPC64_ADDR64 mach_ - # R_PPC64_ADDR64 __crc_ # On PPC: # R_PPC_RELATIVE, R_PPC_ADDR16_HI, # R_PPC_ADDR16_HA,R_PPC_ADDR16_LO, @@ -39,8 +44,7 @@ R_PPC_ADDR16_HI R_PPC_ADDR16_HA R_PPC_RELATIVE R_PPC_NONE' | - grep -E -v '\ if [ -z "$bad_relocs" ]; then Hi guys, Any thought about that ? I do think this patch makes the whole check about absolute relocations clearer. And in the future, it will avoid anyone to spend some time on those "bad" relocations which actually aren't. Thanks, Alex
Re: [PATCH] powerpc: Do not consider weak unresolved symbol relocations as bad
Hi Stephen, On 1/15/20 6:39 PM, Stephen Rothwell wrote: Hi Alexandre, Thanks for sorting this out. Just a few comments below. On Wed, 15 Jan 2020 15:46:48 -0500 Alexandre Ghiti wrote: # Have Kbuild supply the path to objdump so we handle cross compilation. ^ "and nm" +# Remove from the bad relocations those that match an undefined weak symbol +# which will result in an absolute relocation to 0. +# Weak unresolved symbols are of that form in nm output: +# " w _binary__btf_vmlinux_bin_end" +undef_weak_symbols=$($nm "$vmlinux" | awk -e '$1 ~ /w/ { print $2 }') + +while IFS= read -r weak_symbol; do + bad_relocs="$(echo -n "$bad_relocs" | sed "/$weak_symbol/d")" +done <<< "$undef_weak_symbols" This is not a bash script, and the above is a bashism :-( Also, my version of awk (mawk) doesn't have a -e option. How about something like : undef_weak_symbols=$($nm "$vmlinux" | awk '$1 ~ /w/ { print $2 }') if [ "$undef_weak_symbols" ]; then bad_relocs="$(echo "$bad_relocs" | grep -F -w -v "$undef_weak_symbols")" fi Or do this near the top and add the grep to the others. Yes that's quite better, thanks, I'll send a new version tomorrow. Thanks again, Alex
Re: [PATCH v8 4/4] hugetlb: allow to free gigantic pages regardless of the configuration
On 3/28/19 4:43 PM, Mike Kravetz wrote: On 3/26/19 11:36 PM, Alexandre Ghiti wrote: On systems without CONTIG_ALLOC activated but that support gigantic pages, boottime reserved gigantic pages cannot be freed at all. This patch simply enables the possibility to hand back those pages to the memory allocator. Signed-off-by: Alexandre Ghiti Acked-by: David S. Miller [sparc] Thanks for all the updates Reviewed-by: Mike Kravetz Thanks for all your reviews :) Alex
Re: [PATCH v7 4/4] hugetlb: allow to free gigantic pages regardless of the configuration
On 3/17/19 2:31 PM, christophe leroy wrote: Le 17/03/2019 à 17:28, Alexandre Ghiti a écrit : On systems without CONTIG_ALLOC activated but that support gigantic pages, boottime reserved gigantic pages can not be freed at all. This patch simply enables the possibility to hand back those pages to memory allocator. Signed-off-by: Alexandre Ghiti Acked-by: David S. Miller [sparc] --- arch/arm64/Kconfig | 2 +- arch/arm64/include/asm/hugetlb.h | 4 -- arch/powerpc/include/asm/book3s/64/hugetlb.h | 7 --- arch/powerpc/platforms/Kconfig.cputype | 2 +- arch/s390/Kconfig | 2 +- arch/s390/include/asm/hugetlb.h | 3 -- arch/sh/Kconfig | 2 +- arch/sparc/Kconfig | 2 +- arch/x86/Kconfig | 2 +- arch/x86/include/asm/hugetlb.h | 4 -- include/asm-generic/hugetlb.h | 14 + include/linux/gfp.h | 2 +- mm/hugetlb.c | 54 ++-- mm/page_alloc.c | 4 +- 14 files changed, 61 insertions(+), 43 deletions(-) [...] diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h index 71d7b77eea50..aaf14974ee5f 100644 --- a/include/asm-generic/hugetlb.h +++ b/include/asm-generic/hugetlb.h @@ -126,4 +126,18 @@ static inline pte_t huge_ptep_get(pte_t *ptep) } #endif +#ifndef __HAVE_ARCH_GIGANTIC_PAGE_RUNTIME_SUPPORTED +#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE +static inline bool gigantic_page_runtime_supported(void) +{ + return true; +} +#else +static inline bool gigantic_page_runtime_supported(void) +{ + return false; +} +#endif /* CONFIG_ARCH_HAS_GIGANTIC_PAGE */ What about the following instead: static inline bool gigantic_page_runtime_supported(void) { return IS_ENABLED(CONFIG_ARCH_HAS_GIGANTIC_PAGE); } Totally, it already was like that in v2 or v3... 
+#endif /* __HAVE_ARCH_GIGANTIC_PAGE_RUNTIME_SUPPORTED */ + #endif /* _ASM_GENERIC_HUGETLB_H */ diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 1f1ad9aeebb9..58ea44bf75de 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -589,8 +589,8 @@ static inline bool pm_suspended_storage(void) /* The below functions must be run on a range from a single zone. */ extern int alloc_contig_range(unsigned long start, unsigned long end, unsigned migratetype, gfp_t gfp_mask); -extern void free_contig_range(unsigned long pfn, unsigned nr_pages); #endif +extern void free_contig_range(unsigned long pfn, unsigned int nr_pages); 'extern' is unneeded and should be avoided (iaw checkpatch) Ok, I did fix a checkpatch warning here, but did not notice the 'extern' one. Thanks for your time, Alex Christophe #ifdef CONFIG_CMA /* CMA stuff */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index afef61656c1e..4e55aa38704f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1058,6 +1058,7 @@ static void free_gigantic_page(struct page *page, unsigned int order) free_contig_range(page_to_pfn(page), 1 << order); } +#ifdef CONFIG_CONTIG_ALLOC static int __alloc_gigantic_page(unsigned long start_pfn, unsigned long nr_pages, gfp_t gfp_mask) { @@ -1142,11 +1143,20 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, static void prep_new_huge_page(struct hstate *h, struct page *page, int nid); static void prep_compound_gigantic_page(struct page *page, unsigned int order); +#else /* !CONFIG_CONTIG_ALLOC */ +static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, + int nid, nodemask_t *nodemask) +{ + return NULL; +} +#endif /* CONFIG_CONTIG_ALLOC */ #else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */ -static inline bool gigantic_page_supported(void) { return false; } static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, - int nid, nodemask_t *nodemask) { return NULL; } + int nid, nodemask_t *nodemask) +{ + return NULL; +} static inline void 
free_gigantic_page(struct page *page, unsigned int order) { } static inline void destroy_compound_gigantic_page(struct page *page, unsigned int order) { } @@ -1156,7 +1166,7 @@ static void update_and_free_page(struct hstate *h, struct page *page) { int i; - if (hstate_is_gigantic(h) && !gigantic_page_supported()) + if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) return; h->nr_huge_pages--; @@ -2276,13 +2286,27 @@ static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, } #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) -static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, - nodemask_t *nodes_allowed) +static
Re: [PATCH v6 4/4] hugetlb: allow to free gigantic pages regardless of the configuration
On 3/8/19 2:05 PM, Mike Kravetz wrote: On 3/7/19 5:20 AM, Alexandre Ghiti wrote: On systems without CONTIG_ALLOC activated but that support gigantic pages, boottime reserved gigantic pages cannot be freed at all. This patch simply enables the possibility to hand back those pages to the memory allocator. Signed-off-by: Alexandre Ghiti Acked-by: David S. Miller [sparc] Reviewed-by: Mike Kravetz Thanks Mike, Alex
Re: [PATCH v5 4/4] hugetlb: allow to free gigantic pages regardless of the configuration
On 3/6/19 2:16 PM, Dave Hansen wrote: On 3/6/19 11:00 AM, Alexandre Ghiti wrote: +static int set_max_huge_pages(struct hstate *h, unsigned long count, + nodemask_t *nodes_allowed) { unsigned long min_count, ret; - if (hstate_is_gigantic(h) && !gigantic_page_supported()) - return h->max_huge_pages; + /* +* Gigantic pages allocation depends on the capability for large page +* range allocation. If the system cannot provide alloc_contig_range, +* allow users to free gigantic pages. +*/ + if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) { + spin_lock(_lock); + if (count > persistent_huge_pages(h)) { + spin_unlock(_lock); + return -EINVAL; + } + goto decrease_pool; + } We talked about it during the last round and I don't seen any mention of it here in comments or the changelog: Why is this a goto? Why don't we just let the code fall through to the "decrease_pool" label? Why is this new block needed at all? Can't we just remove the old check and let it be? I'll get rid of the goto, I don't know how to justify it properly in a comment, maybe because it is not necessary. This is not a new block, this means exactly the same as before (remember gigantic_page_supported() actually meant CONTIG_ALLOC before this series), except that now we allow a user to free boottime allocated gigantic pages. And no we cannot just remove the check and let it be since it would modify the current behaviour, which is to return an error when trying to allocate gigantic pages whereas alloc_contig_range is not defined. I thought it was clearly commented above, I can try to make it more explicit.
Re: [PATCH v5 3/4] mm: Simplify MEMORY_ISOLATION && COMPACTION || CMA into CONTIG_ALLOC
On 3/6/19 2:30 PM, Vlastimil Babka wrote: On 3/6/19 8:00 PM, Alexandre Ghiti wrote: This condition allows to define alloc_contig_range, so simplify it into a more accurate naming. Suggested-by: Vlastimil Babka Signed-off-by: Alexandre Ghiti Acked-by: Vlastimil Babka (you could have sent this with my ack from v4 as there wasn't significant change, just the one I suggested :) Thanks, that's good to know. Alex
Re: [PATCH v5 2/4] sparc: Advertise gigantic page support
On 3/6/19 2:04 PM, David Miller wrote: From: Alexandre Ghiti Date: Wed, 6 Mar 2019 14:00:03 -0500 sparc actually supports gigantic pages and selecting ARCH_HAS_GIGANTIC_PAGE allows it to allocate and free gigantic pages at runtime. sparc allows configuration such as huge pages of 16GB, pages of 8KB and MAX_ORDER = 13 (default): HPAGE_SHIFT (34) - PAGE_SHIFT (13) = 21 >= MAX_ORDER (13) Signed-off-by: Alexandre Ghiti Much better. Acked-by: David S. Miller Thanks ! Alex
Re: [PATCH v3] hugetlb: allow to free gigantic pages regardless of the configuration
On 2/15/19 12:34 PM, Dave Hansen wrote: -#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) +#ifdef CONFIG_CONTIG_ALLOC /* The below functions must be run on a range from a single zone. */ extern int alloc_contig_range(unsigned long start, unsigned long end, unsigned migratetype, gfp_t gfp_mask); -extern void free_contig_range(unsigned long pfn, unsigned nr_pages); #endif +extern void free_contig_range(unsigned long pfn, unsigned int nr_pages); There's a lot of stuff going on in this patch. Adding/removing config options. Please get rid of these superfluous changes or at least break them out. I agree that this patch does a lot of things. I am going at least to split it into 2 separate patches, one suggested-by Vlastimil regarding the renaming of MEMORY_ISOLATION && COMPACTION || CMA, and another that indeed does what was primarily intended. #ifdef CONFIG_CMA /* CMA stuff */ diff --git a/mm/Kconfig b/mm/Kconfig index 25c71eb8a7db..138a8df9b813 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -252,12 +252,17 @@ config MIGRATION pages as migration can relocate pages to satisfy a huge page allocation instead of reclaiming. + config ARCH_ENABLE_HUGEPAGE_MIGRATION bool Like this. :) My apologies for that. config ARCH_ENABLE_THP_MIGRATION bool +config CONTIG_ALLOC + def_bool y + depends on (MEMORY_ISOLATION && COMPACTION) || CMA + config PHYS_ADDR_T_64BIT def_bool 64BIT Please think carefully through the Kconfig dependencies. 'select' is *not* the same as 'depends on'. This replaces a bunch of arch-specific "select ARCH_HAS_GIGANTIC_PAGE" with a 'depends on'. I *think* that ends up being OK, but it absolutely needs to be addressed in the changelog about why *you* think it is OK and why it doesn't change the functionality of any of the patched architectures. Ok. 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index afef61656c1e..e686c92212e9 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1035,7 +1035,6 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) ((node = hstate_next_node_to_free(hs, mask)) || 1); \ nr_nodes--) -#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE static void destroy_compound_gigantic_page(struct page *page, unsigned int order) { Whats the result of this #ifdef removal? A universally larger kernel even for architectures that do not support runtime gigantic page alloc/free? That doesn't seem like a good thing. Ok, I agree, now that we removed the "wrong" definition of ARCH_HAS_GIGANTIC_PAGE, we can actually use this define for architectures to show they support gigantic pages and avoid the problem you mention. Thanks. @@ -1058,6 +1057,12 @@ static void free_gigantic_page(struct page *page, unsigned int order) free_contig_range(page_to_pfn(page), 1 << order); } +static inline bool gigantic_page_runtime_allocation_supported(void) +{ + return IS_ENABLED(CONFIG_CONTIG_ALLOC); +} Why bother having this function? Why don't the callers just check the config option directly? Ok, this function is only used once in set_max_huge_pages where you mention the need for a comment, so I can get rid of it. Thanks. 
+#ifdef CONFIG_CONTIG_ALLOC static int __alloc_gigantic_page(unsigned long start_pfn, unsigned long nr_pages, gfp_t gfp_mask) { @@ -1143,22 +1148,15 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, static void prep_new_huge_page(struct hstate *h, struct page *page, int nid); static void prep_compound_gigantic_page(struct page *page, unsigned int order); -#else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */ -static inline bool gigantic_page_supported(void) { return false; } +#else /* !CONFIG_CONTIG_ALLOC */ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { return NULL; } -static inline void free_gigantic_page(struct page *page, unsigned int order) { } -static inline void destroy_compound_gigantic_page(struct page *page, - unsigned int order) { } #endif static void update_and_free_page(struct hstate *h, struct page *page) { int i; - if (hstate_is_gigantic(h) && !gigantic_page_supported()) - return; I don't get the point of removing this check. Logically, this reads as checking if the architecture supports gigantic hstates and has nothing to do with allocation. I think this check was wrong from the beginning: gigantic_page_supported() was only checking (MEMORY_ISOLATION && COMPACTION) || CMA, which has nothing to do with the capability to free gigantic pages. But then I went through all the architectures to see if removing this test could affect any of them. And I
Re: [PATCH] hugetlb: allow to free gigantic pages regardless of the configuration
On 2/13/19 6:27 AM, Vlastimil Babka wrote: On 1/17/19 7:39 PM, Alexandre Ghiti wrote: From: Alexandre Ghiti On systems without CMA or (MEMORY_ISOLATION && COMPACTION) activated but that support gigantic pages, boottime reserved gigantic pages can not be freed at all. This patch simply enables the possibility to hand back those pages to memory allocator. This commit then renames gigantic_page_supported and ARCH_HAS_GIGANTIC_PAGE to make them more accurate. Indeed, those values being false does not mean that the system cannot use gigantic pages: it just means that runtime allocation of gigantic pages is not supported, one can still allocate boottime gigantic pages if the architecture supports it. Signed-off-by: Alexandre Ghiti I'm fine with the change, but wonder if this can be structured better in a way which would remove the duplicated "if (MEMORY_ISOLATION && COMPACTION) || CMA" from all arches, as well as the duplicated gigantic_page_runtime_allocation_supported() Yeah, totally, we can factorize more than what I did. I prepared a v2 of this patch that does exactly that: remove the triplet from arch specific code and the duplicated gigantic_page_runtime_allocation_supported. something like: - "select ARCH_HAS_GIGANTIC_PAGE" has no conditions, it just says the arch can support them either at boottime or runtime (but runtime is usable only if other conditions are met) And the v2 gets rid of ARCH_HAS_GIGANTIC_PAGE totally since it is not needed by arch to advertise the fact they support gigantic page, actually, when selected, it really just means that an arch has the means to allocate runtime gigantic page: it is equivalent to (MEMORY_ISOLATION && COMPACTION) || CMA. - gigantic_page_runtime_allocation_supported() is a function that returns true if ARCH_HAS_GIGANTIC_PAGE && ((MEMORY_ISOLATION && COMPACTION) || CMA) and there's a single instance, not per-arch. 
- code for freeing gigantic pages can probably still be conditional on ARCH_HAS_GIGANTIC_PAGE BTW I wanted also to do something about the "(MEMORY_ISOLATION && COMPACTION) || CMA" ugliness itself, i.e. put the common parts behind some new kconfig (COMPACTION_CORE ?) and expose it better to users, but I can take a stab on that once the above part is settled. Vlastimil I send the v2 right away, if you can take a look Vlastimil, that would be great. Note that Andrew already picked this patch in its tree, I'm not sure how to proceed. Thanks for your remarks ! Alex
Re: [PATCH] hugetlb: allow to free gigantic pages regardless of the configuration
On 2/5/19 6:23 AM, Michael Ellerman wrote: Alexandre Ghiti writes: From: Alexandre Ghiti On systems without CMA or (MEMORY_ISOLATION && COMPACTION) activated but that support gigantic pages, boottime reserved gigantic pages can not be freed at all. This patchs simply enables the possibility to hand back those pages to memory allocator. This commit then renames gigantic_page_supported and ARCH_HAS_GIGANTIC_PAGE to make them more accurate. Indeed, those values being false does not mean that the system cannot use gigantic pages: it just means that runtime allocation of gigantic pages is not supported, one can still allocate boottime gigantic pages if the architecture supports it. Signed-off-by: Alexandre Ghiti --- - Compiled on all architectures - Tested on riscv architecture arch/arm64/Kconfig | 2 +- arch/arm64/include/asm/hugetlb.h | 7 +++-- arch/powerpc/include/asm/book3s/64/hugetlb.h | 4 +-- arch/powerpc/platforms/Kconfig.cputype | 2 +- The powerpc parts look fine. Acked-by: Michael Ellerman (powerpc) Thank you Michael, Alex cheers arch/s390/Kconfig| 2 +- arch/s390/include/asm/hugetlb.h | 7 +++-- arch/x86/Kconfig | 2 +- arch/x86/include/asm/hugetlb.h | 7 +++-- fs/Kconfig | 2 +- include/linux/gfp.h | 2 +- mm/hugetlb.c | 43 +++- mm/page_alloc.c | 4 +-- 12 files changed, 48 insertions(+), 36 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index a4168d366127..18239cbd7fcd 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -18,7 +18,7 @@ config ARM64 select ARCH_HAS_FAST_MULTIPLIER select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL - select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA + select ARCH_HAS_GIGANTIC_PAGE_RUNTIME_ALLOCATION if (MEMORY_ISOLATION && COMPACTION) || CMA select ARCH_HAS_KCOV select ARCH_HAS_MEMBARRIER_SYNC_CORE select ARCH_HAS_PTE_SPECIAL diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index fb6609875455..797fc77eabcd 100644 --- 
a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -65,8 +65,11 @@ extern void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr, #include -#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE -static inline bool gigantic_page_supported(void) { return true; } +#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE_RUNTIME_ALLOCATION +static inline bool gigantic_page_runtime_allocation_supported(void) +{ + return true; +} #endif #endif /* __ASM_HUGETLB_H */ diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h b/arch/powerpc/include/asm/book3s/64/hugetlb.h index 5b0177733994..7711f0e2c7e5 100644 --- a/arch/powerpc/include/asm/book3s/64/hugetlb.h +++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h @@ -32,8 +32,8 @@ static inline int hstate_get_psize(struct hstate *hstate) } } -#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE -static inline bool gigantic_page_supported(void) +#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE_RUNTIME_ALLOCATION +static inline bool gigantic_page_runtime_allocation_supported(void) { return true; } diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 8c7464c3f27f..779e06bac697 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -319,7 +319,7 @@ config ARCH_ENABLE_SPLIT_PMD_PTLOCK config PPC_RADIX_MMU bool "Radix MMU Support" depends on PPC_BOOK3S_64 - select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA + select ARCH_HAS_GIGANTIC_PAGE_RUNTIME_ALLOCATION if (MEMORY_ISOLATION && COMPACTION) || CMA default y help Enable support for the Power ISA 3.0 Radix style MMU. 
Currently this diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index ed554b09eb3f..6776eef6a9ae 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -69,7 +69,7 @@ config S390 select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL - select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA + select ARCH_HAS_GIGANTIC_PAGE_RUNTIME_ALLOCATION if (MEMORY_ISOLATION && COMPACTION) || CMA select ARCH_HAS_KCOV select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SET_MEMORY diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index 2d1afa58a4b6..57c952f5388e 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -116,7 +116,10 @@ static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot) return pte_modify(pte, newprot); }
Re: [PATCH] hugetlb: allow to free gigantic pages regardless of the configuration
On 1/17/19 1:39 PM, Alexandre Ghiti wrote: From: Alexandre Ghiti On systems without CMA or (MEMORY_ISOLATION && COMPACTION) activated but that support gigantic pages, boottime reserved gigantic pages can not be freed at all. This patchs simply enables the possibility to hand back those pages to memory allocator. This commit then renames gigantic_page_supported and ARCH_HAS_GIGANTIC_PAGE to make them more accurate. Indeed, those values being false does not mean that the system cannot use gigantic pages: it just means that runtime allocation of gigantic pages is not supported, one can still allocate boottime gigantic pages if the architecture supports it. Signed-off-by: Alexandre Ghiti --- - Compiled on all architectures - Tested on riscv architecture arch/arm64/Kconfig | 2 +- arch/arm64/include/asm/hugetlb.h | 7 +++-- arch/powerpc/include/asm/book3s/64/hugetlb.h | 4 +-- arch/powerpc/platforms/Kconfig.cputype | 2 +- arch/s390/Kconfig| 2 +- arch/s390/include/asm/hugetlb.h | 7 +++-- arch/x86/Kconfig | 2 +- arch/x86/include/asm/hugetlb.h | 7 +++-- fs/Kconfig | 2 +- include/linux/gfp.h | 2 +- mm/hugetlb.c | 43 +++- mm/page_alloc.c | 4 +-- 12 files changed, 48 insertions(+), 36 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index a4168d366127..18239cbd7fcd 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -18,7 +18,7 @@ config ARM64 select ARCH_HAS_FAST_MULTIPLIER select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL - select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA + select ARCH_HAS_GIGANTIC_PAGE_RUNTIME_ALLOCATION if (MEMORY_ISOLATION && COMPACTION) || CMA select ARCH_HAS_KCOV select ARCH_HAS_MEMBARRIER_SYNC_CORE select ARCH_HAS_PTE_SPECIAL diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index fb6609875455..797fc77eabcd 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -65,8 +65,11 @@ extern void set_huge_swap_pte_at(struct mm_struct 
*mm, unsigned long addr, #include -#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE -static inline bool gigantic_page_supported(void) { return true; } +#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE_RUNTIME_ALLOCATION +static inline bool gigantic_page_runtime_allocation_supported(void) +{ + return true; +} #endif #endif /* __ASM_HUGETLB_H */ diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h b/arch/powerpc/include/asm/book3s/64/hugetlb.h index 5b0177733994..7711f0e2c7e5 100644 --- a/arch/powerpc/include/asm/book3s/64/hugetlb.h +++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h @@ -32,8 +32,8 @@ static inline int hstate_get_psize(struct hstate *hstate) } } -#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE -static inline bool gigantic_page_supported(void) +#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE_RUNTIME_ALLOCATION +static inline bool gigantic_page_runtime_allocation_supported(void) { return true; } diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 8c7464c3f27f..779e06bac697 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -319,7 +319,7 @@ config ARCH_ENABLE_SPLIT_PMD_PTLOCK config PPC_RADIX_MMU bool "Radix MMU Support" depends on PPC_BOOK3S_64 - select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA + select ARCH_HAS_GIGANTIC_PAGE_RUNTIME_ALLOCATION if (MEMORY_ISOLATION && COMPACTION) || CMA default y help Enable support for the Power ISA 3.0 Radix style MMU. 
Currently this diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index ed554b09eb3f..6776eef6a9ae 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -69,7 +69,7 @@ config S390 select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL - select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA + select ARCH_HAS_GIGANTIC_PAGE_RUNTIME_ALLOCATION if (MEMORY_ISOLATION && COMPACTION) || CMA select ARCH_HAS_KCOV select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SET_MEMORY diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index 2d1afa58a4b6..57c952f5388e 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -116,7 +116,10 @@ static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot) return pte_modify(pte, newprot); } -#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE -static inline bool gigantic_page_supported(void) { return true; } +#ifdef
Re: [PATCH v6 00/11] hugetlb: Factorize hugetlb architecture primitives
Hi everyone, Does someone need anything more to be done regarding this series ? Thanks, Alex On 08/06/2018 05:57 PM, Alexandre Ghiti wrote: [CC linux-mm for inclusion in -mm tree] In order to reduce copy/paste of functions across architectures and then make riscv hugetlb port (and future ports) simpler and smaller, this patchset intends to factorize the numerous hugetlb primitives that are defined across all the architectures. Except for prepare_hugepage_range, this patchset moves the versions that are just pass-through to standard pte primitives into asm-generic/hugetlb.h by using the same #ifdef semantic that can be found in asm-generic/pgtable.h, i.e. __HAVE_ARCH_***. s390 architecture has not been tackled in this series since it does not use asm-generic/hugetlb.h at all. This patchset has been compiled on all addressed architectures with success (except for parisc, but the problem does not come from this series). v6: - Remove nohash/32 and book3s/32 powerpc specific implementations in order to use the generic ones. - Add all the Reviewed-by, Acked-by and Tested-by in the commits, thanks to everyone. v5: As suggested by Mike Kravetz, no need to move the #include for arm and x86 architectures, let it live at the top of the file. v4: Fix powerpc build error due to misplacing of #include outside of #ifdef CONFIG_HUGETLB_PAGE, as pointed by Christophe Leroy. 
v1, v2, v3: Same version, just problems with email provider and misuse of --batch-size option of git send-email Alexandre Ghiti (11): hugetlb: Harmonize hugetlb.h arch specific defines with pgtable.h hugetlb: Introduce generic version of hugetlb_free_pgd_range hugetlb: Introduce generic version of set_huge_pte_at hugetlb: Introduce generic version of huge_ptep_get_and_clear hugetlb: Introduce generic version of huge_ptep_clear_flush hugetlb: Introduce generic version of huge_pte_none hugetlb: Introduce generic version of huge_pte_wrprotect hugetlb: Introduce generic version of prepare_hugepage_range hugetlb: Introduce generic version of huge_ptep_set_wrprotect hugetlb: Introduce generic version of huge_ptep_set_access_flags hugetlb: Introduce generic version of huge_ptep_get arch/arm/include/asm/hugetlb-3level.h| 32 +- arch/arm/include/asm/hugetlb.h | 30 -- arch/arm64/include/asm/hugetlb.h | 39 +++- arch/ia64/include/asm/hugetlb.h | 47 ++- arch/mips/include/asm/hugetlb.h | 40 +++-- arch/parisc/include/asm/hugetlb.h| 33 +++ arch/powerpc/include/asm/book3s/32/pgtable.h | 6 -- arch/powerpc/include/asm/book3s/64/pgtable.h | 1 + arch/powerpc/include/asm/hugetlb.h | 43 ++ arch/powerpc/include/asm/nohash/32/pgtable.h | 6 -- arch/powerpc/include/asm/nohash/64/pgtable.h | 1 + arch/sh/include/asm/hugetlb.h| 54 ++--- arch/sparc/include/asm/hugetlb.h | 40 +++-- arch/x86/include/asm/hugetlb.h | 69 -- include/asm-generic/hugetlb.h| 88 +++- 15 files changed, 135 insertions(+), 394 deletions(-)
Re: [PATCH v6 00/11] hugetlb: Factorize hugetlb architecture primitives
Thanks for your time, Alex Le 07/08/2018 à 09:54, Ingo Molnar a écrit : * Alexandre Ghiti wrote: [CC linux-mm for inclusion in -mm tree] In order to reduce copy/paste of functions across architectures and then make riscv hugetlb port (and future ports) simpler and smaller, this patchset intends to factorize the numerous hugetlb primitives that are defined across all the architectures. Except for prepare_hugepage_range, this patchset moves the versions that are just pass-through to standard pte primitives into asm-generic/hugetlb.h by using the same #ifdef semantic that can be found in asm-generic/pgtable.h, i.e. __HAVE_ARCH_***. s390 architecture has not been tackled in this serie since it does not use asm-generic/hugetlb.h at all. This patchset has been compiled on all addressed architectures with success (except for parisc, but the problem does not come from this series). v6: - Remove nohash/32 and book3s/32 powerpc specific implementations in order to use the generic ones. - Add all the Reviewed-by, Acked-by and Tested-by in the commits, thanks to everyone. v5: As suggested by Mike Kravetz, no need to move the #include for arm and x86 architectures, let it live at the top of the file. v4: Fix powerpc build error due to misplacing of #include outside of #ifdef CONFIG_HUGETLB_PAGE, as pointed by Christophe Leroy. 
v1, v2, v3: Same version, just problems with email provider and misuse of --batch-size option of git send-email Alexandre Ghiti (11): hugetlb: Harmonize hugetlb.h arch specific defines with pgtable.h hugetlb: Introduce generic version of hugetlb_free_pgd_range hugetlb: Introduce generic version of set_huge_pte_at hugetlb: Introduce generic version of huge_ptep_get_and_clear hugetlb: Introduce generic version of huge_ptep_clear_flush hugetlb: Introduce generic version of huge_pte_none hugetlb: Introduce generic version of huge_pte_wrprotect hugetlb: Introduce generic version of prepare_hugepage_range hugetlb: Introduce generic version of huge_ptep_set_wrprotect hugetlb: Introduce generic version of huge_ptep_set_access_flags hugetlb: Introduce generic version of huge_ptep_get arch/arm/include/asm/hugetlb-3level.h| 32 +- arch/arm/include/asm/hugetlb.h | 30 -- arch/arm64/include/asm/hugetlb.h | 39 +++- arch/ia64/include/asm/hugetlb.h | 47 ++- arch/mips/include/asm/hugetlb.h | 40 +++-- arch/parisc/include/asm/hugetlb.h| 33 +++ arch/powerpc/include/asm/book3s/32/pgtable.h | 6 -- arch/powerpc/include/asm/book3s/64/pgtable.h | 1 + arch/powerpc/include/asm/hugetlb.h | 43 ++ arch/powerpc/include/asm/nohash/32/pgtable.h | 6 -- arch/powerpc/include/asm/nohash/64/pgtable.h | 1 + arch/sh/include/asm/hugetlb.h| 54 ++--- arch/sparc/include/asm/hugetlb.h | 40 +++-- arch/x86/include/asm/hugetlb.h | 69 -- include/asm-generic/hugetlb.h| 88 +++- 15 files changed, 135 insertions(+), 394 deletions(-) The x86 bits look good to me (assuming it's all tested on all relevant architectures, etc.) Acked-by: Ingo Molnar Thanks, Ingo
Re: [PATCH v5 09/11] hugetlb: Introduce generic version of huge_ptep_set_wrprotect
Ok, I tried every defconfig available: - for the nohash/32, I found that I could use mpc885_ads_defconfig and I activated HUGETLBFS. I removed the definition of huge_ptep_set_wrprotect from nohash/32/pgtable.h, added an #error in include/asm-generic/hugetlb.h right before the generic definition of huge_ptep_set_wrprotect, and fell onto it at compile-time: => I'm pretty confident then that removing the definition of huge_ptep_set_wrprotect does not break anything in this case. - regarding book3s/32, I did not find any defconfig with CONFIG_PPC_BOOK3S_32, CONFIG_PPC32 allowing to enable huge page support (ie CONFIG_SYS_SUPPORTS_HUGETLBFS) => Do you have a defconfig that would allow me to try the same as above ? Thanks, Alex On 07/31/2018 11:17 AM, Alexandre Ghiti wrote: On 07/31/2018 12:06 PM, Michael Ellerman wrote: Alexandre Ghiti writes: arm, ia64, mips, sh, x86 architectures use the same version of huge_ptep_set_wrprotect, so move this generic implementation into asm-generic/hugetlb.h. Note: powerpc uses twice for book3s/32 and nohash/32 the same version as the above architectures, but the modification was not straightforward and hence has not been done. Do you remember what the problem was there? It looks like you should just be able to drop them like the others. I assume there's some header spaghetti that causes problems though? Yes, the header spaghetti frightened me a bit. Maybe I should have tried harder: I can try to remove them and find the right defconfigs to compile both to begin with. And to guarantee the functionality is preserved, can I use the testsuite of libhugetlbfs with qemu ? 
Alex cheers Signed-off-by: Alexandre Ghiti Reviewed-by: Mike Kravetz --- arch/arm/include/asm/hugetlb-3level.h | 6 -- arch/arm64/include/asm/hugetlb.h | 1 + arch/ia64/include/asm/hugetlb.h | 6 -- arch/mips/include/asm/hugetlb.h | 6 -- arch/parisc/include/asm/hugetlb.h | 1 + arch/powerpc/include/asm/book3s/32/pgtable.h | 2 ++ arch/powerpc/include/asm/book3s/64/pgtable.h | 1 + arch/powerpc/include/asm/nohash/32/pgtable.h | 2 ++ arch/powerpc/include/asm/nohash/64/pgtable.h | 1 + arch/sh/include/asm/hugetlb.h | 6 -- arch/sparc/include/asm/hugetlb.h | 1 + arch/x86/include/asm/hugetlb.h | 6 -- include/asm-generic/hugetlb.h | 8 13 files changed, 17 insertions(+), 30 deletions(-) diff --git a/arch/arm/include/asm/hugetlb-3level.h b/arch/arm/include/asm/hugetlb-3level.h index b897541520ef..8247cd6a2ac6 100644 --- a/arch/arm/include/asm/hugetlb-3level.h +++ b/arch/arm/include/asm/hugetlb-3level.h @@ -37,12 +37,6 @@ static inline pte_t huge_ptep_get(pte_t *ptep) return retval; } -static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - ptep_set_wrprotect(mm, addr, ptep); -} - static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty) diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index 3e7f6e69b28d..f4f69ae5466e 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -48,6 +48,7 @@ extern int huge_ptep_set_access_flags(struct vm_area_struct *vma, #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR extern pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); +#define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT extern void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep); #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH diff --git a/arch/ia64/include/asm/hugetlb.h b/arch/ia64/include/asm/hugetlb.h index cbe296271030..49d1f7949f3a 100644 --- 
a/arch/ia64/include/asm/hugetlb.h +++ b/arch/ia64/include/asm/hugetlb.h @@ -27,12 +27,6 @@ static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, { } -static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - ptep_set_wrprotect(mm, addr, ptep); -} - static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty) diff --git a/arch/mips/include/asm/hugetlb.h b/arch/mips/include/asm/hugetlb.h index 6ff2531cfb1d..3dcf5debf8c4 100644 --- a/arch/mips/include/asm/hugetlb.h +++ b/arch/mips/include/asm/hugetlb.h @@ -63,12 +63,6 @@ static inline int huge_pte_none(pte_t pte) return !val || (val == (unsigned long)invalid_pte_table); } -static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - ptep_set_wrprotect(mm, addr, ptep); -} -
Re: [PATCH v4 00/11] hugetlb: Factorize hugetlb architecture primitives
Actually, I went back to v4.17, and I have the same errors. I used: $ make ARCH=parisc O=build_parisc generic-64bit_defconfig $ PATH=/home/alex/wip/toolchain/gcc-8.1.0-nolibc/hppa64-linux/bin:$PATH make ARCH=parisc CROSS_COMPILE=hppa64-linux- I downloaded the crosscompiler here: https://mirrors.edge.kernel.org/pub/tools/crosstool/files/bin/x86_64/8.1.0/ On 07/26/2018 05:01 PM, Alex Ghiti wrote: Hi Helge, Thanks for your tests. In case it can help you, this is what I get when I try to build generic-64bit_defconfig (I truncated the output): ... LD vmlinux.o MODPOST vmlinux.o hppa64-linux-ld: init/main.o(.text+0x98): cannot reach strreplace init/main.o: In function `initcall_blacklisted': init/.tmp_main.o:(.text+0x98): relocation truncated to fit: R_PARISC_PCREL22F against symbol `strreplace' defined in .text section in lib/string.o hppa64-linux-ld: init/main.o(.text+0xbc): cannot reach strcmp init/.tmp_main.o:(.text+0xbc): relocation truncated to fit: R_PARISC_PCREL22F against symbol `strcmp' defined in .text section in lib/string.o hppa64-linux-ld: init/main.o(.text+0x21c): cannot reach strcpy init/main.o: In function `do_one_initcall': (.text+0x21c): relocation truncated to fit: R_PARISC_PCREL22F against symbol `strcpy' defined in .text section in lib/string.o hppa64-linux-ld: init/main.o(.text+0x250): cannot reach strlcat (.text+0x250): relocation truncated to fit: R_PARISC_PCREL22F against symbol `strlcat' defined in .text section in lib/string.o hppa64-linux-ld: init/main.o(.init.text+0x1d4): cannot reach strcmp init/main.o: In function `do_early_param': init/.tmp_main.o:(.init.text+0x1d4): relocation truncated to fit: R_PARISC_PCREL22F against symbol `strcmp' defined in .text section in lib/string.o hppa64-linux-ld: init/main.o(.init.text+0x250): cannot reach strcmp init/.tmp_main.o:(.init.text+0x250): relocation truncated to fit: R_PARISC_PCREL22F against symbol `strcmp' defined in .text section in lib/string.o hppa64-linux-ld: init/main.o(.init.text+0x294): 
cannot reach strlen init/main.o: In function `repair_env_string': init/.tmp_main.o:(.init.text+0x294): relocation truncated to fit: R_PARISC_PCREL22F against symbol `strlen' defined in .text section in lib/string.o hppa64-linux-ld: init/main.o(.init.text+0x2f0): cannot reach strlen init/.tmp_main.o:(.init.text+0x2f0): relocation truncated to fit: R_PARISC_PCREL22F against symbol `strlen' defined in .text section in lib/string.o hppa64-linux-ld: init/main.o(.init.text+0x308): cannot reach memmove init/.tmp_main.o:(.init.text+0x308): relocation truncated to fit: R_PARISC_PCREL22F against symbol `memmove' defined in .text section in lib/string.o hppa64-linux-ld: init/main.o(.init.text+0x454): cannot reach strlen init/main.o: In function `unknown_bootoption': init/.tmp_main.o:(.init.text+0x454): relocation truncated to fit: R_PARISC_PCREL22F against symbol `strlen' defined in .text section in lib/string.o hppa64-linux-ld: init/main.o(.init.text+0x4dc): cannot reach strchr init/.tmp_main.o:(.init.text+0x4dc): additional relocation overflows omitted from the output hppa64-linux-ld: init/main.o(.init.text+0x638): cannot reach strncmp hppa64-linux-ld: init/main.o(.init.text+0x694): cannot reach get_option hppa64-linux-ld: init/main.o(.init.text+0x744): cannot reach strsep hppa64-linux-ld: init/main.o(.init.text+0x798): cannot reach strlen hppa64-linux-ld: init/main.o(.init.text+0x7d0): cannot reach strcpy hppa64-linux-ld: init/main.o(.init.text+0x954): cannot reach strlcpy hppa64-linux-ld: init/main.o(.init.text+0xab8): cannot reach strlen hppa64-linux-ld: init/main.o(.init.text+0xafc): cannot reach strlen hppa64-linux-ld: init/main.o(.init.text+0xb40): cannot reach strlen hppa64-linux-ld: init/main.o(.init.text+0xb84): cannot reach strlen hppa64-linux-ld: init/main.o(.init.text+0xbd0): cannot reach strcpy hppa64-linux-ld: init/main.o(.init.text+0xbe8): cannot reach strcpy hppa64-linux-ld: init/main.o(.init.text+0xc3c): cannot reach build_all_zonelists hppa64-linux-ld: 
init/main.o(.init.text+0x1200): cannot reach unknown hppa64-linux-ld: init/main.o(.init.text+0x1278): cannot reach wait_for_completion hppa64-linux-ld: init/main.o(.init.text+0x12b0): cannot reach _raw_spin_lock hppa64-linux-ld: init/main.o(.init.text+0x147c): cannot reach strcpy hppa64-linux-ld: init/main.o(.ref.text+0x40): cannot reach kernel_thread hppa64-linux-ld: init/main.o(.ref.text+0x60): cannot reach find_task_by_pid_ns hppa64-linux-ld: init/main.o(.ref.text+0x98): cannot reach set_cpus_allowed_ptr hppa64-linux-ld: init/main.o(.ref.text+0xbc): cannot reach kernel_thread hppa64-linux-ld: init/main.o(.ref.text+0xd4): cannot reach find_task_by_pid_ns hppa64-linux-ld: init/main.o(.ref.text+0x108): cannot reach complete hppa64-linux-ld: init/main.o(.ref.text+0x128): cannot reach cpu_startup_entry hppa64-linux-ld: init/main.o(.ref.text+0x164): cannot reach unknown hppa64-linux-ld: init/main.o(.ref.text+0x174): cannot reach
Re: [PATCH v4 00/11] hugetlb: Factorize hugetlb architecture primitives
-linux-ld: init/main.o(.ref.text+0x238): cannot reach unknown hppa64-linux-ld: init/main.o(.ref.text+0x258): cannot reach panic hppa64-linux-ld: init/main.o(.ref.text+0x268): cannot reach printk hppa64-linux-ld: init/main.o(.ref.text+0x280): cannot reach unknown hppa64-linux-ld: init/main.o(.ref.text+0x29c): cannot reach unknown hppa64-linux-ld: init/main.o(.ref.text+0x2b8): cannot reach unknown hppa64-linux-ld: init/main.o(.ref.text+0x2d4): cannot reach unknown hppa64-linux-ld: init/main.o(.ref.text+0x2f0): cannot reach panic hppa64-linux-ld: init/do_mounts.o(.text+0x30): cannot reach strncasecmp hppa64-linux-ld: init/do_mounts.o(.text+0x158): cannot reach strncmp hppa64-linux-ld: init/do_mounts.o(.text+0x180): cannot reach strchr ... On 07/26/2018 12:59 PM, Helge Deller wrote: * Alex Ghiti : This is the result of the build for all arches tackled in this series rebased on 4.18-rc6: ... parisc: generic-64bit_defconfig: with huge page does not link generic-64bit_defconfig: without huge page does not link BUT not because of this series, any feedback welcome. Strange, but I will check that later Anyway, I applied your v4-patch to my parisc64 tree, built the kernel, started it and ran some hugetlb LTP testcases successfully. So, please add: Tested-by: Helge Deller # parisc Helge
Re: [PATCH v4 00/11] hugetlb: Factorize hugetlb architecture primitives
Hi Christophe, Sorry, I should have done it already: with and without huge page activated, the build for mpc885_ads_defconfig is OK. Thanks, Alex On 07/26/2018 03:13 PM, LEROY Christophe wrote: Alex Ghiti a écrit : Hi everyone, This is the result of the build for all arches tackled in this series rebased on 4.18-rc6: arm: versatile_defconfig: without huge page OK keystone_defconfig: with huge page OK arm64: defconfig: with huge page OK ia64: generic_defconfig: with huge page OK mips: Paul Burton tested cavium octeon with huge page OK parisc: generic-64bit_defconfig: with huge page does not link generic-64bit_defconfig: without huge page does not link BUT not because of this series, any feedback welcome. powerpc: ppc64_defconfig: without huge page OK ppc64_defconfig: with huge page OK Can you also test ppc32 both with and without hugepage (mpc885_ads_defconfig) Thanks Christophe sh: dreamcast_defconfig: with huge page OK sparc: sparc32_defconfig: without huge page OK sparc64: sparc64_defconfig: with huge page OK x86: with huge page OK Alex On 07/23/2018 02:00 PM, Michael Ellerman wrote: Alex Ghiti writes: Does anyone have any suggestion about those patches ? Cross compiling it for some non-x86 arches would be a good start :) There are cross compilers available here: https://mirrors.edge.kernel.org/pub/tools/crosstool/ cheers On 07/09/2018 02:16 PM, Michal Hocko wrote: [CC hugetlb guys - http://lkml.kernel.org/r/20180705110716.3919-1-a...@ghiti.fr] On Thu 05-07-18 11:07:05, Alexandre Ghiti wrote: In order to reduce copy/paste of functions across architectures and then make riscv hugetlb port (and future ports) simpler and smaller, this patchset intends to factorize the numerous hugetlb primitives that are defined across all the architectures. 
Except for prepare_hugepage_range, this patchset moves the versions that are just pass-through to standard pte primitives into asm-generic/hugetlb.h by using the same #ifdef semantic that can be found in asm-generic/pgtable.h, i.e. __HAVE_ARCH_***. s390 architecture has not been tackled in this serie since it does not use asm-generic/hugetlb.h at all. powerpc could be factorized a bit more (cf huge_ptep_set_wrprotect). This patchset has been compiled on x86 only. Changelog: v4: Fix powerpc build error due to misplacing of #include outside of #ifdef CONFIG_HUGETLB_PAGE, as pointed by Christophe Leroy. v1, v2, v3: Same version, just problems with email provider and misuse of --batch-size option of git send-email Alexandre Ghiti (11): hugetlb: Harmonize hugetlb.h arch specific defines with pgtable.h hugetlb: Introduce generic version of hugetlb_free_pgd_range hugetlb: Introduce generic version of set_huge_pte_at hugetlb: Introduce generic version of huge_ptep_get_and_clear hugetlb: Introduce generic version of huge_ptep_clear_flush hugetlb: Introduce generic version of huge_pte_none hugetlb: Introduce generic version of huge_pte_wrprotect hugetlb: Introduce generic version of prepare_hugepage_range hugetlb: Introduce generic version of huge_ptep_set_wrprotect hugetlb: Introduce generic version of huge_ptep_set_access_flags hugetlb: Introduce generic version of huge_ptep_get arch/arm/include/asm/hugetlb-3level.h | 32 +- arch/arm/include/asm/hugetlb.h | 33 +-- arch/arm64/include/asm/hugetlb.h | 39 +++- arch/ia64/include/asm/hugetlb.h | 47 ++- arch/mips/include/asm/hugetlb.h | 40 +++-- arch/parisc/include/asm/hugetlb.h | 33 +++ arch/powerpc/include/asm/book3s/32/pgtable.h | 2 + arch/powerpc/include/asm/book3s/64/pgtable.h | 1 + arch/powerpc/include/asm/hugetlb.h | 43 ++ arch/powerpc/include/asm/nohash/32/pgtable.h | 2 + arch/powerpc/include/asm/nohash/64/pgtable.h | 1 + arch/sh/include/asm/hugetlb.h | 54 ++--- arch/sparc/include/asm/hugetlb.h | 40 +++-- 
arch/x86/include/asm/hugetlb.h | 72 +-- include/asm-generic/hugetlb.h | 88 +++- 15 files changed, 143 insertions(+), 384 deletions(-) -- 2.16.2
Re: [PATCH v4 00/11] hugetlb: Factorize hugetlb architecture primitives
Hi everyone, This is the result of the build for all arches tackled in this series rebased on 4.18-rc6: arm: versatile_defconfig: without huge page OK keystone_defconfig: with huge page OK arm64: defconfig: with huge page OK ia64: generic_defconfig: with huge page OK mips: Paul Burton tested cavium octeon with huge page OK parisc: generic-64bit_defconfig: with huge page does not link generic-64bit_defconfig: without huge page does not link BUT not because of this series, any feedback welcome. powerpc: ppc64_defconfig: without huge page OK ppc64_defconfig: with huge page OK sh: dreamcast_defconfig: with huge page OK sparc: sparc32_defconfig: without huge page OK sparc64: sparc64_defconfig: with huge page OK x86: with huge page OK Alex On 07/23/2018 02:00 PM, Michael Ellerman wrote: Alex Ghiti writes: Does anyone have any suggestion about those patches ? Cross compiling it for some non-x86 arches would be a good start :) There are cross compilers available here: https://mirrors.edge.kernel.org/pub/tools/crosstool/ cheers On 07/09/2018 02:16 PM, Michal Hocko wrote: [CC hugetlb guys - http://lkml.kernel.org/r/20180705110716.3919-1-a...@ghiti.fr] On Thu 05-07-18 11:07:05, Alexandre Ghiti wrote: In order to reduce copy/paste of functions across architectures and then make riscv hugetlb port (and future ports) simpler and smaller, this patchset intends to factorize the numerous hugetlb primitives that are defined across all the architectures. Except for prepare_hugepage_range, this patchset moves the versions that are just pass-through to standard pte primitives into asm-generic/hugetlb.h by using the same #ifdef semantic that can be found in asm-generic/pgtable.h, i.e. __HAVE_ARCH_***. s390 architecture has not been tackled in this serie since it does not use asm-generic/hugetlb.h at all. powerpc could be factorized a bit more (cf huge_ptep_set_wrprotect). This patchset has been compiled on x86 only. 
Changelog: v4: Fix powerpc build error due to misplacing of #include outside of #ifdef CONFIG_HUGETLB_PAGE, as pointed by Christophe Leroy. v1, v2, v3: Same version, just problems with email provider and misuse of --batch-size option of git send-email Alexandre Ghiti (11): hugetlb: Harmonize hugetlb.h arch specific defines with pgtable.h hugetlb: Introduce generic version of hugetlb_free_pgd_range hugetlb: Introduce generic version of set_huge_pte_at hugetlb: Introduce generic version of huge_ptep_get_and_clear hugetlb: Introduce generic version of huge_ptep_clear_flush hugetlb: Introduce generic version of huge_pte_none hugetlb: Introduce generic version of huge_pte_wrprotect hugetlb: Introduce generic version of prepare_hugepage_range hugetlb: Introduce generic version of huge_ptep_set_wrprotect hugetlb: Introduce generic version of huge_ptep_set_access_flags hugetlb: Introduce generic version of huge_ptep_get arch/arm/include/asm/hugetlb-3level.h| 32 +- arch/arm/include/asm/hugetlb.h | 33 +-- arch/arm64/include/asm/hugetlb.h | 39 +++- arch/ia64/include/asm/hugetlb.h | 47 ++- arch/mips/include/asm/hugetlb.h | 40 +++-- arch/parisc/include/asm/hugetlb.h| 33 +++ arch/powerpc/include/asm/book3s/32/pgtable.h | 2 + arch/powerpc/include/asm/book3s/64/pgtable.h | 1 + arch/powerpc/include/asm/hugetlb.h | 43 ++ arch/powerpc/include/asm/nohash/32/pgtable.h | 2 + arch/powerpc/include/asm/nohash/64/pgtable.h | 1 + arch/sh/include/asm/hugetlb.h| 54 ++--- arch/sparc/include/asm/hugetlb.h | 40 +++-- arch/x86/include/asm/hugetlb.h | 72 +-- include/asm-generic/hugetlb.h| 88 +++- 15 files changed, 143 insertions(+), 384 deletions(-) -- 2.16.2
Re: [PATCH v4 00/11] hugetlb: Factorize hugetlb architecture primitives
Ok will do and report when done. Thanks for your feedback, Alex On 07/23/2018 02:00 PM, Michael Ellerman wrote: Alex Ghiti writes: Does anyone have any suggestion about those patches ? Cross compiling it for some non-x86 arches would be a good start :) There are cross compilers available here: https://mirrors.edge.kernel.org/pub/tools/crosstool/ cheers On 07/09/2018 02:16 PM, Michal Hocko wrote: [CC hugetlb guys - http://lkml.kernel.org/r/20180705110716.3919-1-a...@ghiti.fr] On Thu 05-07-18 11:07:05, Alexandre Ghiti wrote: In order to reduce copy/paste of functions across architectures and then make riscv hugetlb port (and future ports) simpler and smaller, this patchset intends to factorize the numerous hugetlb primitives that are defined across all the architectures. Except for prepare_hugepage_range, this patchset moves the versions that are just pass-through to standard pte primitives into asm-generic/hugetlb.h by using the same #ifdef semantic that can be found in asm-generic/pgtable.h, i.e. __HAVE_ARCH_***. s390 architecture has not been tackled in this series since it does not use asm-generic/hugetlb.h at all. powerpc could be factorized a bit more (cf huge_ptep_set_wrprotect). This patchset has been compiled on x86 only. Changelog: v4: Fix powerpc build error due to misplacing of #include outside of #ifdef CONFIG_HUGETLB_PAGE, as pointed out by Christophe Leroy. 
v1, v2, v3: Same version, just problems with email provider and misuse of --batch-size option of git send-email Alexandre Ghiti (11): hugetlb: Harmonize hugetlb.h arch specific defines with pgtable.h hugetlb: Introduce generic version of hugetlb_free_pgd_range hugetlb: Introduce generic version of set_huge_pte_at hugetlb: Introduce generic version of huge_ptep_get_and_clear hugetlb: Introduce generic version of huge_ptep_clear_flush hugetlb: Introduce generic version of huge_pte_none hugetlb: Introduce generic version of huge_pte_wrprotect hugetlb: Introduce generic version of prepare_hugepage_range hugetlb: Introduce generic version of huge_ptep_set_wrprotect hugetlb: Introduce generic version of huge_ptep_set_access_flags hugetlb: Introduce generic version of huge_ptep_get arch/arm/include/asm/hugetlb-3level.h| 32 +- arch/arm/include/asm/hugetlb.h | 33 +-- arch/arm64/include/asm/hugetlb.h | 39 +++- arch/ia64/include/asm/hugetlb.h | 47 ++- arch/mips/include/asm/hugetlb.h | 40 +++-- arch/parisc/include/asm/hugetlb.h| 33 +++ arch/powerpc/include/asm/book3s/32/pgtable.h | 2 + arch/powerpc/include/asm/book3s/64/pgtable.h | 1 + arch/powerpc/include/asm/hugetlb.h | 43 ++ arch/powerpc/include/asm/nohash/32/pgtable.h | 2 + arch/powerpc/include/asm/nohash/64/pgtable.h | 1 + arch/sh/include/asm/hugetlb.h| 54 ++--- arch/sparc/include/asm/hugetlb.h | 40 +++-- arch/x86/include/asm/hugetlb.h | 72 +-- include/asm-generic/hugetlb.h| 88 +++- 15 files changed, 143 insertions(+), 384 deletions(-) -- 2.16.2
Re: [PATCH v4 00/11] hugetlb: Factorize hugetlb architecture primitives
Does anyone have any suggestions about those patches? On 07/09/2018 02:16 PM, Michal Hocko wrote: [CC hugetlb guys - http://lkml.kernel.org/r/20180705110716.3919-1-a...@ghiti.fr] On Thu 05-07-18 11:07:05, Alexandre Ghiti wrote: In order to reduce copy/paste of functions across architectures and then make riscv hugetlb port (and future ports) simpler and smaller, this patchset intends to factorize the numerous hugetlb primitives that are defined across all the architectures. Except for prepare_hugepage_range, this patchset moves the versions that are just pass-through to standard pte primitives into asm-generic/hugetlb.h by using the same #ifdef semantic that can be found in asm-generic/pgtable.h, i.e. __HAVE_ARCH_***. s390 architecture has not been tackled in this series since it does not use asm-generic/hugetlb.h at all. powerpc could be factorized a bit more (cf huge_ptep_set_wrprotect). This patchset has been compiled on x86 only. Changelog: v4: Fix powerpc build error due to misplacing of #include outside of #ifdef CONFIG_HUGETLB_PAGE, as pointed out by Christophe Leroy. 
v1, v2, v3: Same version, just problems with email provider and misuse of --batch-size option of git send-email Alexandre Ghiti (11): hugetlb: Harmonize hugetlb.h arch specific defines with pgtable.h hugetlb: Introduce generic version of hugetlb_free_pgd_range hugetlb: Introduce generic version of set_huge_pte_at hugetlb: Introduce generic version of huge_ptep_get_and_clear hugetlb: Introduce generic version of huge_ptep_clear_flush hugetlb: Introduce generic version of huge_pte_none hugetlb: Introduce generic version of huge_pte_wrprotect hugetlb: Introduce generic version of prepare_hugepage_range hugetlb: Introduce generic version of huge_ptep_set_wrprotect hugetlb: Introduce generic version of huge_ptep_set_access_flags hugetlb: Introduce generic version of huge_ptep_get arch/arm/include/asm/hugetlb-3level.h| 32 +- arch/arm/include/asm/hugetlb.h | 33 +-- arch/arm64/include/asm/hugetlb.h | 39 +++- arch/ia64/include/asm/hugetlb.h | 47 ++- arch/mips/include/asm/hugetlb.h | 40 +++-- arch/parisc/include/asm/hugetlb.h| 33 +++ arch/powerpc/include/asm/book3s/32/pgtable.h | 2 + arch/powerpc/include/asm/book3s/64/pgtable.h | 1 + arch/powerpc/include/asm/hugetlb.h | 43 ++ arch/powerpc/include/asm/nohash/32/pgtable.h | 2 + arch/powerpc/include/asm/nohash/64/pgtable.h | 1 + arch/sh/include/asm/hugetlb.h| 54 ++--- arch/sparc/include/asm/hugetlb.h | 40 +++-- arch/x86/include/asm/hugetlb.h | 72 +-- include/asm-generic/hugetlb.h| 88 +++- 15 files changed, 143 insertions(+), 384 deletions(-) -- 2.16.2
Re: [PATCH v3 02/11] hugetlb: Introduce generic version of hugetlb_free_pgd_range
My bad, when I moved the #include at the bottom of the file, I did not pay attention to that #ifdef. I'm going to fix powerpc and check other architectures if I did not make the same mistake. I'll send a v4 as soon as possible. Thanks for your comment, Alex On 07/05/2018 10:22 AM, Christophe Leroy wrote: On 07/05/2018 05:16 AM, Alexandre Ghiti wrote: arm, arm64, mips, parisc, sh, x86 architectures use the same version of hugetlb_free_pgd_range, so move this generic implementation into asm-generic/hugetlb.h. Signed-off-by: Alexandre Ghiti Build failure on mpc885_ads_defconfig CC arch/powerpc/kernel/setup-common.o In file included from arch/powerpc/kernel/setup-common.c:37: ./include/linux/hugetlb.h:191:65: error: expected identifier or '(' before '{' token #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; }) ^ ./include/asm-generic/hugetlb.h:44:20: note: in expansion of macro 'hugetlb_free_pgd_range' static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, ^~ see below --- arch/arm/include/asm/hugetlb.h | 12 ++-- arch/arm64/include/asm/hugetlb.h | 10 -- arch/ia64/include/asm/hugetlb.h | 5 +++-- arch/mips/include/asm/hugetlb.h | 13 ++--- arch/parisc/include/asm/hugetlb.h | 12 ++-- arch/powerpc/include/asm/hugetlb.h | 4 +++- arch/sh/include/asm/hugetlb.h | 12 ++-- arch/sparc/include/asm/hugetlb.h | 4 +++- arch/x86/include/asm/hugetlb.h | 11 ++- include/asm-generic/hugetlb.h | 11 +++ 10 files changed, 30 insertions(+), 64 deletions(-) [snip] diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index 3225eb6402cc..de46ee16b615 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -4,7 +4,6 @@ #ifdef CONFIG_HUGETLB_PAGE #include -#include extern struct kmem_cache *hugepte_cache; @@ -113,6 +112,7 @@ static inline void flush_hugetlb_page(struct vm_area_struct *vma, void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr); #endif +#define 
__HAVE_ARCH_HUGETLB_FREE_PGD_RANGE void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); @@ -193,4 +193,6 @@ static inline pte_t *hugepte_offset(hugepd_t hpd, unsigned long addr, } #endif /* CONFIG_HUGETLB_PAGE */ +#include + That include was previously inside #ifdef CONFIG_HUGETLB_PAGE. Why put it outside ? Christophe
Re: [PATCH 05/11] hugetlb: Introduce generic version of huge_ptep_clear_flush
Please drop this serie, sorry for the noise. On 07/05/2018 04:58 AM, Alexandre Ghiti wrote: arm, x86 architectures use the same version of huge_ptep_clear_flush, so move this generic implementation into asm-generic/hugetlb.h. Signed-off-by: Alexandre Ghiti --- arch/arm/include/asm/hugetlb-3level.h | 6 -- arch/arm64/include/asm/hugetlb.h | 1 + arch/ia64/include/asm/hugetlb.h | 1 + arch/mips/include/asm/hugetlb.h | 1 + arch/parisc/include/asm/hugetlb.h | 1 + arch/powerpc/include/asm/hugetlb.h| 1 + arch/sh/include/asm/hugetlb.h | 1 + arch/sparc/include/asm/hugetlb.h | 1 + arch/x86/include/asm/hugetlb.h| 6 -- include/asm-generic/hugetlb.h | 8 10 files changed, 15 insertions(+), 12 deletions(-) diff --git a/arch/arm/include/asm/hugetlb-3level.h b/arch/arm/include/asm/hugetlb-3level.h index ad36e84b819a..b897541520ef 100644 --- a/arch/arm/include/asm/hugetlb-3level.h +++ b/arch/arm/include/asm/hugetlb-3level.h @@ -37,12 +37,6 @@ static inline pte_t huge_ptep_get(pte_t *ptep) return retval; } -static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, -unsigned long addr, pte_t *ptep) -{ - ptep_clear_flush(vma, addr, ptep); -} - static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index 6ae0bcafe162..4c8dd488554d 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -71,6 +71,7 @@ extern pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); extern void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep); +#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH extern void huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); #define __HAVE_ARCH_HUGE_PTE_CLEAR diff --git a/arch/ia64/include/asm/hugetlb.h b/arch/ia64/include/asm/hugetlb.h index 6719c74da0de..41b5f6adeee4 100644 --- a/arch/ia64/include/asm/hugetlb.h +++ 
b/arch/ia64/include/asm/hugetlb.h @@ -20,6 +20,7 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, REGION_NUMBER((addr)+(len)-1) == RGN_HPAGE); } +#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { diff --git a/arch/mips/include/asm/hugetlb.h b/arch/mips/include/asm/hugetlb.h index 0959cc5a41fa..7df1f116a3cc 100644 --- a/arch/mips/include/asm/hugetlb.h +++ b/arch/mips/include/asm/hugetlb.h @@ -48,6 +48,7 @@ static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, return pte; } +#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { diff --git a/arch/parisc/include/asm/hugetlb.h b/arch/parisc/include/asm/hugetlb.h index 6e281e1bb336..9afff26747a1 100644 --- a/arch/parisc/include/asm/hugetlb.h +++ b/arch/parisc/include/asm/hugetlb.h @@ -32,6 +32,7 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } +#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index ec3e0c2e78f8..de0769f0b5b2 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -143,6 +143,7 @@ static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, #endif } +#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { diff --git a/arch/sh/include/asm/hugetlb.h b/arch/sh/include/asm/hugetlb.h index 08ee6c00b5e9..9abf9c86b769 100644 --- a/arch/sh/include/asm/hugetlb.h +++ b/arch/sh/include/asm/hugetlb.h @@ -25,6 +25,7 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } +#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH static inline void 
huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { diff --git a/arch/sparc/include/asm/hugetlb.h b/arch/sparc/include/asm/hugetlb.h index 944e3a4bfaff..651a9593fcee 100644 --- a/arch/sparc/include/asm/hugetlb.h +++ b/arch/sparc/include/asm/hugetlb.h @@ -42,6 +42,7 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } +#define
Re: [PATCH v2 06/11] hugetlb: Introduce generic version of huge_pte_none
Please drop this serie, sorry for the noise. On 07/05/2018 05:12 AM, Alexandre Ghiti wrote: arm, arm64, ia64, parisc, powerpc, sh, sparc, x86 architectures use the same version of huge_pte_none, so move this generic implementation into asm-generic/hugetlb.h. Signed-off-by: Alexandre Ghiti --- arch/arm/include/asm/hugetlb.h | 5 - arch/arm64/include/asm/hugetlb.h | 5 - arch/ia64/include/asm/hugetlb.h| 5 - arch/mips/include/asm/hugetlb.h| 1 + arch/parisc/include/asm/hugetlb.h | 5 - arch/powerpc/include/asm/hugetlb.h | 5 - arch/sh/include/asm/hugetlb.h | 5 - arch/sparc/include/asm/hugetlb.h | 5 - arch/x86/include/asm/hugetlb.h | 5 - include/asm-generic/hugetlb.h | 7 +++ 10 files changed, 8 insertions(+), 40 deletions(-) diff --git a/arch/arm/include/asm/hugetlb.h b/arch/arm/include/asm/hugetlb.h index 047b893ef95d..3d2ce4dbc145 100644 --- a/arch/arm/include/asm/hugetlb.h +++ b/arch/arm/include/asm/hugetlb.h @@ -43,11 +43,6 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } -static inline int huge_pte_none(pte_t pte) -{ - return pte_none(pte); -} - static inline pte_t huge_pte_wrprotect(pte_t pte) { return pte_wrprotect(pte); diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index 4c8dd488554d..49247c6f94db 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -42,11 +42,6 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } -static inline int huge_pte_none(pte_t pte) -{ - return pte_none(pte); -} - static inline pte_t huge_pte_wrprotect(pte_t pte) { return pte_wrprotect(pte); diff --git a/arch/ia64/include/asm/hugetlb.h b/arch/ia64/include/asm/hugetlb.h index 41b5f6adeee4..bf573500b3c4 100644 --- a/arch/ia64/include/asm/hugetlb.h +++ b/arch/ia64/include/asm/hugetlb.h @@ -26,11 +26,6 @@ static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, { } -static inline int huge_pte_none(pte_t pte) -{ - return pte_none(pte); -} - static inline pte_t 
huge_pte_wrprotect(pte_t pte) { return pte_wrprotect(pte); diff --git a/arch/mips/include/asm/hugetlb.h b/arch/mips/include/asm/hugetlb.h index 7df1f116a3cc..1c9c4531376c 100644 --- a/arch/mips/include/asm/hugetlb.h +++ b/arch/mips/include/asm/hugetlb.h @@ -55,6 +55,7 @@ static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, flush_tlb_page(vma, addr & huge_page_mask(hstate_vma(vma))); } +#define __HAVE_ARCH_HUGE_PTE_NONE static inline int huge_pte_none(pte_t pte) { unsigned long val = pte_val(pte) & ~_PAGE_GLOBAL; diff --git a/arch/parisc/include/asm/hugetlb.h b/arch/parisc/include/asm/hugetlb.h index 9afff26747a1..c09d8c74553c 100644 --- a/arch/parisc/include/asm/hugetlb.h +++ b/arch/parisc/include/asm/hugetlb.h @@ -38,11 +38,6 @@ static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, { } -static inline int huge_pte_none(pte_t pte) -{ - return pte_none(pte); -} - static inline pte_t huge_pte_wrprotect(pte_t pte) { return pte_wrprotect(pte); diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index de0769f0b5b2..530b817e097c 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -152,11 +152,6 @@ static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, flush_hugetlb_page(vma, addr); } -static inline int huge_pte_none(pte_t pte) -{ - return pte_none(pte); -} - static inline pte_t huge_pte_wrprotect(pte_t pte) { return pte_wrprotect(pte); diff --git a/arch/sh/include/asm/hugetlb.h b/arch/sh/include/asm/hugetlb.h index 9abf9c86b769..a9f8266f33cf 100644 --- a/arch/sh/include/asm/hugetlb.h +++ b/arch/sh/include/asm/hugetlb.h @@ -31,11 +31,6 @@ static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, { } -static inline int huge_pte_none(pte_t pte) -{ - return pte_none(pte); -} - static inline pte_t huge_pte_wrprotect(pte_t pte) { return pte_wrprotect(pte); diff --git a/arch/sparc/include/asm/hugetlb.h b/arch/sparc/include/asm/hugetlb.h index 
651a9593fcee..5bbd712e 100644 --- a/arch/sparc/include/asm/hugetlb.h +++ b/arch/sparc/include/asm/hugetlb.h @@ -48,11 +48,6 @@ static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, { } -static inline int huge_pte_none(pte_t pte) -{ - return pte_none(pte); -} - static inline pte_t huge_pte_wrprotect(pte_t pte) { return pte_wrprotect(pte); diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h index 8347d5abf882..c5fdc53b6e41 100644 --- a/arch/x86/include/asm/hugetlb.h +++ b/arch/x86/include/asm/hugetlb.h @@ -27,11 +27,6 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } -static inline int