Subject: [PATCH] x86, 64bit: add support for loading kernel above 512G

Currently the kernel is not allowed to be loaded above 512G; the early
boot code rejects such load addresses as too large.

We only need to add one extra spare page to serve as the level3 table
that points into another 512G range.

We need to check the _text range, set the level4 (PGD) entry to point
to that spare level3 page, and set the level3 entry to point to a
level2 page, so that [_text, _end] is covered by the extra mapping.

We need this in order to load a relocatable bzImage above 512G.

-v2: handle crossing a GB boundary, as hpa insists on.
    Tested crossing the 1G, 5G, 512G and 513G boundaries.
    Crossing 1024G should be double-checked, but it should work.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>

---
 arch/x86/kernel/head_64.S |  142 ++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 131 insertions(+), 11 deletions(-)

Index: linux-2.6/arch/x86/kernel/head_64.S
===================================================================
--- linux-2.6.orig/arch/x86/kernel/head_64.S
+++ linux-2.6/arch/x86/kernel/head_64.S
@@ -78,12 +78,6 @@ startup_64:
 	testl	%eax, %eax
 	jnz	bad_address
 
-	/* Is the address too large? */
-	leaq	_text(%rip), %rdx
-	movq	$PGDIR_SIZE, %rax
-	cmpq	%rax, %rdx
-	jae	bad_address
-
 	/* Fixup the physical addresses in the page table
 	 */
 	addq	%rbp, init_level4_pgt + 0(%rip)
@@ -97,28 +91,147 @@ startup_64:
 
 	addq	%rbp, level2_fixmap_pgt + (506*8)(%rip)
 
-	/* Add an Identity mapping if I am above 1G */
+	/* Add an Identity mapping if _end is above 1G */
+	leaq	_end(%rip), %r9
+	decq	%r9
+	cmp	$PUD_SIZE, %r9
+	jl	ident_complete
+
+	/* get end */
+	andq	$PMD_PAGE_MASK, %r9
+	/* round start to 1G if it is below 1G */
 	leaq	_text(%rip), %rdi
 	andq	$PMD_PAGE_MASK, %rdi
+	cmp	$PUD_SIZE, %rdi
+	jg	1f
+	movq	$PUD_SIZE, %rdi
+1:
+	/* get 512G index */
+	movq	%r9, %r10
+	shrq	$PGDIR_SHIFT, %r10
+	andq	$(PTRS_PER_PGD - 1), %r10
+	movq	%rdi, %rax
+	shrq	$PGDIR_SHIFT, %rax
+	andq	$(PTRS_PER_PGD - 1), %rax
+
+	/* cross two 512G ? */
+	cmp	%r10, %rax
+	jne	set_level3_other_512g
+
+	/* all in first 512G ? */
+	cmp	$0, %rax
+	je	skip_level3_spare
+
+	/* same 512G other than first 512g */
+	leaq    (level3_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
+	leaq    init_level4_pgt(%rip), %rbx
+	movq    %rdx, 0(%rbx, %rax, 8)
+	addq    $L4_PAGE_OFFSET, %rax
+	movq    %rdx, 0(%rbx, %rax, 8)
+
+	/* get 1G index */
+	movq    %r9, %r10
+	shrq    $PUD_SHIFT, %r10
+	andq    $(PTRS_PER_PUD - 1), %r10
+        movq    %rdi, %rax
+        shrq    $PUD_SHIFT, %rax
+        andq    $(PTRS_PER_PUD - 1), %rax
+
+	/* same 1G ? */
+	cmp     %r10, %rax
+	je	set_level2_start_only_not_first_512g
+
+	/* set level2 for end */
+	leaq    level3_spare_pgt(%rip), %rbx
+	leaq    (level2_spare2_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
+	movq    %rdx, 0(%rbx, %r10, 8)
+
+set_level2_start_only_not_first_512g:
+	leaq    level3_spare_pgt(%rip), %rbx
+	leaq    (level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
+	movq    %rdx, 0(%rbx, %rax, 8)
+
+	jmp	set_level2_spare
+
+set_level3_other_512g:
+	/* start is in first 512G ? */
+	cmp	$0, %rax
+	/* for level2 last on first 512g */
+	leaq	level3_ident_pgt(%rip), %rcx
+	je	set_level2_start_other_512g
+
+	/* Set level3 for _text */
+	leaq	(level3_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
+	leaq	init_level4_pgt(%rip), %rbx
+	movq	%rdx, 0(%rbx, %rax, 8)
+	addq	$L4_PAGE_OFFSET, %rax
+	movq	%rdx, 0(%rbx, %rax, 8)
 
+	/* for level2 last not on first 512G */
+	leaq	level3_spare_pgt(%rip), %rcx
+
+set_level2_start_other_512g:
+	/* always need to set level2 */
 	movq	%rdi, %rax
 	shrq	$PUD_SHIFT, %rax
 	andq	$(PTRS_PER_PUD - 1), %rax
-	jz	ident_complete
-
+	movq	%rcx, %rbx    /* %rcx has level3_spare_pgt or level3_ident_pgt */
 	leaq	(level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
+	movq	%rdx, 0(%rbx, %rax, 8)
+
+set_level3_end_other_512g:
+	leaq	(level3_spare2_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
+	leaq	init_level4_pgt(%rip), %rbx
+	movq	%rdx, 0(%rbx, %r10, 8)
+	addq	$L4_PAGE_OFFSET, %r10
+	movq	%rdx, 0(%rbx, %r10, 8)
+
+	/* always need to set level2 */
+	movq	%r9, %r10
+	shrq	$PUD_SHIFT, %r10
+	andq	$(PTRS_PER_PUD - 1), %r10
+	leaq	level3_spare2_pgt(%rip), %rbx
+	leaq	(level2_spare2_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
+	movq	%rdx, 0(%rbx, %r10, 8)
+
+	jmp	set_level2_spare
+
+skip_level3_spare:
+	/* get 1G index */
+	movq	%r9, %r10
+	shrq	$PUD_SHIFT, %r10
+	andq	$(PTRS_PER_PUD - 1), %r10
+	movq	%rdi, %rax
+	shrq	$PUD_SHIFT, %rax
+	andq	$(PTRS_PER_PUD - 1), %rax
+
+	/* same 1G ? */
+	cmp	%r10, %rax
+	je	set_level2_start_only_first_512g
+
+	/* set level2 without level3 spare */
 	leaq	level3_ident_pgt(%rip), %rbx
+	leaq	(level2_spare2_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
+	movq	%rdx, 0(%rbx, %r10, 8)
+
+set_level2_start_only_first_512g:
+	/*  set level2 without level3 spare */
+	leaq	level3_ident_pgt(%rip), %rbx
+	leaq	(level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
 	movq	%rdx, 0(%rbx, %rax, 8)
 
+set_level2_spare:
 	movq	%rdi, %rax
 	shrq	$PMD_SHIFT, %rax
 	andq	$(PTRS_PER_PMD - 1), %rax
 	leaq	__PAGE_KERNEL_IDENT_LARGE_EXEC(%rdi), %rdx
 	leaq	level2_spare_pgt(%rip), %rbx
-	leaq	_end(%rip), %r8
-	decq	%r8
+	movq	%r9, %r8
 	shrq	$PMD_SHIFT, %r8
 	andq	$(PTRS_PER_PMD - 1), %r8
+	cmp	%r8, %rax
+	jl	1f
+	addq	$PTRS_PER_PMD, %r8
 1:	movq	%rdx, 0(%rbx, %rax, 8)
 	addq	$PMD_SIZE, %rdx
 	incq	%rax
@@ -435,8 +548,15 @@ NEXT_PAGE(level2_kernel_pgt)
 	PMDS(0, __PAGE_KERNEL_LARGE_EXEC,
 		KERNEL_IMAGE_SIZE/PMD_SIZE)
 
+NEXT_PAGE(level3_spare_pgt)
+	.fill   512, 8, 0
+NEXT_PAGE(level3_spare2_pgt)
+	.fill   512, 8, 0
+
 NEXT_PAGE(level2_spare_pgt)
 	.fill   512, 8, 0
+NEXT_PAGE(level2_spare2_pgt)
+	.fill   512, 8, 0
 
 #undef PMDS
 #undef NEXT_PAGE
