On Thu, Jan 08, 2026 at 09:25:30AM +0000, Ard Biesheuvel wrote:
> The primary mapping of the kernel image is made using huge pages where
> possible, mostly to minimize TLB pressure (Only the entry text section
> requires alignment to 2 MiB). This involves some rounding and padding of
> the .text and .rodata sections, resulting in gaps.  These gaps are
> smaller than a huge page, and are remapped using different permissions,
> resulting in fragmentation of the huge page mappings at the edges of
> those regions.
> 
> Similarly, there is a gap between .data and .bss, where the init text
> and data regions reside. This means that the end of the .data region and
> the start of the .bss region are not covered by huge page mappings
> either, even though both regions use the same permissions (RW+NX).
> 
> Improve the situation, by placing .data and .bss adjacently in the
> linker map, and putting the init text and data regions after .rodata,
> taking the place of the rodata/data gap. This results in one fewer gap,
> and a more efficient mapping of the .data and .bss regions.
> 
> To preserve the x86_64 ELF layout with PT_LOAD regions aligned to 2 MiB,
> start the second ELF segment at .init.data and align it to 2 MiB.  The
> resulting padding will be covered by the init region and will be freed
> along with it after boot.
> 
> defconfig + Clang 19:
> 
> Before:
> 
>   0xffffffff81000000-0xffffffff82200000    18M  ro  PSE  GLB x  pmd
>   0xffffffff82200000-0xffffffff8231c000  1136K  ro       GLB x  pte
>   0xffffffff8231c000-0xffffffff82400000   912K  RW       GLB NX pte
>   0xffffffff82400000-0xffffffff82a00000     6M  ro  PSE  GLB NX pmd
>   0xffffffff82a00000-0xffffffff82b40000  1280K  ro       GLB NX pte
>   0xffffffff82b40000-0xffffffff82c00000   768K  RW       GLB NX pte
>   0xffffffff82c00000-0xffffffff83400000     8M  RW  PSE  GLB NX pmd
>   0xffffffff83400000-0xffffffff83800000     4M  RW       GLB NX pte
> 
> After:
> 
>   0xffffffff81000000-0xffffffff82200000    18M  ro  PSE  GLB x  pmd
>   0xffffffff82200000-0xffffffff8231c000  1136K  ro       GLB x  pte
>   0xffffffff8231c000-0xffffffff82400000   912K  RW       GLB NX pte
>   0xffffffff82400000-0xffffffff82a00000     6M  ro  PSE  GLB NX pmd
>   0xffffffff82a00000-0xffffffff82b40000  1280K  ro       GLB NX pte
>   0xffffffff82b40000-0xffffffff82c00000   768K  RW       GLB NX pte
>   0xffffffff82c00000-0xffffffff82e00000     2M  RW  PSE  GLB NX pmd
>   0xffffffff82e00000-0xffffffff83000000     2M  RW       GLB NX pte
>   0xffffffff83000000-0xffffffff83800000     8M  RW  PSE  GLB NX pmd
> 
> With the gaps removed/unmapped (pti=on)
> 
> Before:
> 
>   0xffffffff81000000-0xffffffff81200000     2M  ro  PSE  GLB x  pmd
>   0xffffffff81200000-0xffffffff82200000    16M  ro  PSE      x  pmd
>   0xffffffff82200000-0xffffffff8231c000  1136K  ro           x  pte
>   0xffffffff8231c000-0xffffffff82400000   912K                  pte
>   0xffffffff82400000-0xffffffff82a00000     6M  ro  PSE      NX pmd
>   0xffffffff82a00000-0xffffffff82b40000  1280K  ro           NX pte
>   0xffffffff82b40000-0xffffffff82c00000   768K                  pte
>   0xffffffff82c00000-0xffffffff83400000     8M  RW  PSE      NX pmd
>   0xffffffff83400000-0xffffffff8342a000   168K  RW           NX pte
>   0xffffffff8342a000-0xffffffff836f3000  2852K                  pte
>   0xffffffff836f3000-0xffffffff83800000  1076K  RW           NX pte
> 
> After:
> 
>   0xffffffff81000000-0xffffffff81200000     2M  ro  PSE  GLB x  pmd
>   0xffffffff81200000-0xffffffff82200000    16M  ro  PSE      x  pmd
>   0xffffffff82200000-0xffffffff8231c000  1136K  ro           x  pte
>   0xffffffff8231c000-0xffffffff82400000   912K                  pte
>   0xffffffff82400000-0xffffffff82a00000     6M  ro  PSE      NX pmd
>   0xffffffff82a00000-0xffffffff82b40000  1280K  ro           NX pte
>   0xffffffff82b40000-0xffffffff82e3d000  3060K                  pte
>   0xffffffff82e3d000-0xffffffff83000000  1804K  RW           NX pte
>   0xffffffff83000000-0xffffffff83800000     8M  RW  PSE      NX pmd
> 
> Signed-off-by: Ard Biesheuvel <[email protected]>
> ---
>  arch/x86/kernel/vmlinux.lds.S | 91 +++++++++++---------
>  arch/x86/mm/init_64.c         |  5 +-
>  arch/x86/mm/pat/set_memory.c  |  2 +-
>  3 files changed, 52 insertions(+), 46 deletions(-)

I guess we could do this - I don't see why not... we'll have to take it for
a longer spin, though.

> diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
> index 3a24a3fc55f5..1dee2987c42b 100644
> --- a/arch/x86/kernel/vmlinux.lds.S
> +++ b/arch/x86/kernel/vmlinux.lds.S
> @@ -61,12 +61,15 @@ const_cpu_current_top_of_stack = cpu_current_top_of_stack;
>  #define X86_ALIGN_RODATA_BEGIN       . = ALIGN(HPAGE_SIZE);
>  
>  #define X86_ALIGN_RODATA_END                                 \
> -             . = ALIGN(HPAGE_SIZE);                          \
> -             __end_rodata_hpage_align = .;                   \

$ git grep __end_rodata_hpage_align
arch/x86/include/asm/sections.h:13:extern char __end_rodata_hpage_align[];
arch/x86/tools/relocs.c:93:     "__end_rodata_hpage_align|"

I guess you want to remove those too and state that that marker is unused.
Better yet, do that in a pre-patch.

> -             __end_rodata_aligned = .;
> +             . = ALIGN(PAGE_SIZE);                           \
> +             __end_rodata_aligned = ALIGN(HPAGE_SIZE);
>  
>  #define ALIGN_ENTRY_TEXT_BEGIN       . = ALIGN(PMD_SIZE);
>  #define ALIGN_ENTRY_TEXT_END . = ALIGN(PMD_SIZE);
> +
> +#define DATA_SEGMENT_START                                   \
> +     . = ALIGN(HPAGE_SIZE);                                  \
> +     __data_segment_start = .;
>  #else
>  
>  #define X86_ALIGN_RODATA_BEGIN

...

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

Reply via email to