Note that the page-table setup code in the middle of the file was copied
from boothdr.S. We can rework this in a future patch, but keep it
identical for now, as we know it works.
---
i386/i386/msr.h | 1 +
x86_64/Makefrag.am | 1 +
x86_64/cpuboot.S | 412 +++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 414 insertions(+)
create mode 100644 x86_64/cpuboot.S
diff --git a/i386/i386/msr.h b/i386/i386/msr.h
index 42b01062..87898b9e 100644
--- a/i386/i386/msr.h
+++ b/i386/i386/msr.h
@@ -20,6 +20,7 @@
#define _MACHINE_MSR_H_
#define MSR_REG_EFER 0xC0000080
+# define MSR_REG_EFER_LONG_MODE_EN (1 << 8)
#define MSR_REG_STAR 0xC0000081
#define MSR_REG_LSTAR 0xC0000082
#define MSR_REG_CSTAR 0xC0000083
diff --git a/x86_64/Makefrag.am b/x86_64/Makefrag.am
index 2b79e771..e0d4d2f9 100644
--- a/x86_64/Makefrag.am
+++ b/x86_64/Makefrag.am
@@ -92,6 +92,7 @@ libkernel_a_SOURCES += \
i386/i386/percpu.h \
i386/i386/percpu.c \
x86_64/cswitch.S \
+ x86_64/cpuboot.S \
x86_64/debug_trace.S \
x86_64/idt_inittab.S \
x86_64/locore.S \
diff --git a/x86_64/cpuboot.S b/x86_64/cpuboot.S
new file mode 100644
index 00000000..38c67771
--- /dev/null
+++ b/x86_64/cpuboot.S
@@ -0,0 +1,412 @@
+/*
+ * Copyright (C) 2025 Free Software Foundation
+ *
+ * This program is free software ; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation ; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY ; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with the program ; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#if NCPUS > 1
+#include <mach/machine/asm.h>
+#include <i386/i386asm.h>
+#include <i386/proc_reg.h>
+#include <i386/apic.h>
+#include <i386/cpu_number.h>
+#include <i386/seg.h>
+#include <i386/msr.h>
+#include <i386/gdt.h>
+
+/* Offset of a label from the start of the relocatable block (apboot). */
+#define RELOC(addr) (addr - apboot)
+/*
+ * NOTE(review): despite its name, this mask is folded into CR0_SET_FLAGS
+ * below, so CR0_CD and CR0_NW end up being set, not cleared - confirm that
+ * this is intended.
+ */
+#define CR0_CLEAR_FLAGS_CACHE_ENABLE (CR0_CD | CR0_NW)
+/* CR0 bits OR-ed in by apboot; CR0_PE is what enters protected mode. */
+#define CR0_SET_FLAGS (CR0_CLEAR_FLAGS_CACHE_ENABLE | CR0_PE)
+/* CR0 bits masked off by apboot before the bits above are set. */
+#define CR0_CLEAR_FLAGS (CR0_PG | CR0_AM | CR0_WP | CR0_NE | CR0_TS | CR0_EM | CR0_MP)
+/* Selectors of the code and data entries of gdt_tmp. */
+#define BOOT_CS 0x8
+#define BOOT_DS 0x10
+
+/* Number of 64 bit words */
+#define GDT_DESCR_M64 2
+#define GDT_TABLE_M64 14
+
+/* Shift counts used to place the access byte and the flags into a descriptor quad. */
+#define SEG_ACCESS_OFS 40
+#define SEG_FLAGS_OFS 52
+
+.globl apboot, apbootend, gdt_descr_tmp, apboot_jmp_offset
+
+/* NOTE: apboot to apbootend is copied at runtime to a region above 64k
+ * so it can be called as a SIPI vector. Manual relocations are thus required
+ * on instructions that take absolute addresses. Since we need to patch it,
+ * we use the data section to put this block of code.
+ *
+ * NOTE(review): the directive below actually selects .boot.text, not a data
+ * section - confirm which one is meant.
+ * gdt_descr_tmp and apboot_jmp_offset are exported; presumably they are the
+ * locations fixed up by the code that performs the copy - verify against
+ * the caller.
+ */
+
+/* NB: cannot use (%rip) in below relocated code */
+.section .boot.text
+.align 4096
+ .code16
+/*
+ * apboot: 16-bit entry point of an application processor.
+ * Makes DS equal to CS, disables interrupts, zeroes CR3 and the remaining
+ * data segment registers, sets CR0_PE (and the other CR0_SET_FLAGS bits),
+ * loads the temporary GDT and far-jumps to apboot32 through the pointer
+ * stored at apboot_jmp_offset.
+ * Uses %eax and %dx as scratch.
+ */
+apboot:
+ /* This is now address CS:0 in real mode */
+
+ /* Set data seg same as code seg */
+ mov %cs, %dx
+ mov %dx, %ds
+
+ cli
+ /* Zero CR3; %ax stays zero for the segment loads below */
+ xorl %eax, %eax
+ movl %eax, %cr3
+
+ mov %ax, %es
+ mov %ax, %fs
+ mov %ax, %gs
+ mov %ax, %ss
+

+ /* Clear CR0_CLEAR_FLAGS, then set CR0_SET_FLAGS (includes CR0_PE) */
+ movl %cr0, %eax
+ andl $~CR0_CLEAR_FLAGS, %eax
+ orl $CR0_SET_FLAGS, %eax
+ movl %eax, %cr0
+
+ /* Both operands are offsets from apboot, addressed through DS (== CS) */
+ lgdt RELOC(gdt_descr_tmp)
+ /* Indirect far jump: 32-bit offset followed by the BOOT_CS selector */
+ ljmpl *RELOC(apboot_jmp_offset)
+/*
+ * apboot32: 32-bit protected-mode stage.
+ * Loads the BOOT_DS selector into ds/es/ss (fs and gs are left null) and
+ * fills the AP_* page tables with 2MiB mappings before falling through to
+ * switch64.
+ * Uses %eax and %ecx as scratch; %edx is overwritten by mul.
+ */
+apboot32:
+ .code32
+ xorl %eax, %eax
+ movw %ax, %ds
+ movw %ax, %es
+ movw %ax, %fs
+ movw %ax, %gs
+ movw $BOOT_DS, %ax
+ movw %ax, %ds
+ movw %ax, %es
+ movw %ax, %ss
+
+ /* Protected mode! */
+
+ /*
+ * Prepare minimal page mapping to jump to 64 bit and to C code.
+ * The first 4GB is identity mapped, and the first 2GB are re-mapped
+ * to high addresses at KERNEL_MAP_BASE
+ *
+ * Only the low 32 bits of each 8-byte entry are stored below; the
+ * upper halves keep the zero fill the tables are reserved with.
+ */
+
+ movl $AP_p3table,%eax
+ or $(PTE_V|PTE_W),%eax
+ movl %eax,(AP_p4table)
+ /*
+ * Fill 4 entries in L3 table to cover the whole 32-bit 4GB address
+ * space. Part of it might be remapped later if the kernel is mapped
+ * below 4G.
+ */
+ movl $AP_p2table,%eax
+ or $(PTE_V|PTE_W),%eax
+ movl %eax,(AP_p3table)
+ movl $AP_p2table1,%eax
+ or $(PTE_V|PTE_W),%eax
+ movl %eax,(AP_p3table + 8)
+ movl $AP_p2table2,%eax
+ or $(PTE_V|PTE_W),%eax
+ movl %eax,(AP_p3table + 16)
+ movl $AP_p2table3,%eax
+ or $(PTE_V|PTE_W),%eax
+ movl %eax,(AP_p3table + 24)
+ /* point each page table level two entry to a page */
+ /*
+ * One loop indexes all 2048 entries from AP_p2table, so the four L2
+ * tables must stay contiguous and in order in memory.
+ */
+ mov $0,%ecx
+.map_p2_table:
+ mov $0x200000,%eax // 2MiB page, should be always available
+ mul %ecx
+ or $(PTE_V|PTE_W|PTE_S),%eax // enable 2MiB page instead of 4k
+ mov %eax,AP_p2table(,%ecx,8)
+ inc %ecx
+ cmp $2048,%ecx // 512 entries per table, map 4 L2 tables
+ jne .map_p2_table
+
+ /*
+ * KERNEL_MAP_BASE must be aligned to 2GB.
+ * Depending on kernel starting address, we might need to add another
+ * entry in the L4 table (controlling 512 GB chunks). In any case, we
+ * add two entries in L3 table to make sure we map 2GB for the kernel.
+ * Note that this may override part of the mapping created above.
+ */
+.kernel_map:
+#if KERNEL_MAP_BASE >= (1U << 39)
+ movl $AP_p3ktable,%eax
+ or $(PTE_V|PTE_W),%eax
+ movl %eax,(AP_p4table + (8 * ((KERNEL_MAP_BASE >> 39) & 0x1FF))) // select 512G block
+ movl $AP_p2ktable1,%eax
+ or $(PTE_V|PTE_W),%eax
+ movl %eax,(AP_p3ktable + (8 * ((KERNEL_MAP_BASE >> 30) & 0x1FF) )) // select first 1G block
+ movl $AP_p2ktable2,%eax
+ or $(PTE_V|PTE_W),%eax
+ movl %eax,(AP_p3ktable + (8 * (((KERNEL_MAP_BASE >> 30) & 0x1FF) + 1) )) // select second 1G block
+#else
+ movl $AP_p2ktable1,%eax
+ or $(PTE_V|PTE_W),%eax
+ movl %eax,(AP_p3table + (8 * ((KERNEL_MAP_BASE >> 30) & 0x1FF) )) // select first 1G block
+ movl $AP_p2ktable2,%eax
+ or $(PTE_V|PTE_W),%eax
+ movl %eax,(AP_p3table + (8 * (((KERNEL_MAP_BASE >> 30) & 0x1FF) + 1) )) // select second 1G block
+#endif
+
+ /*
+ * Same fill as above for the 1024 entries of the two kernel L2
+ * tables, which must also stay contiguous: physical 0-2GB.
+ */
+ mov $0,%ecx
+.map_p2k_table:
+ mov $0x200000,%eax // 2MiB page, should be always available
+ mul %ecx
+ or $(PTE_V|PTE_W|PTE_S),%eax // enable 2MiB page instead of 4K
+ mov %eax,AP_p2ktable1(,%ecx,8)
+ inc %ecx
+ cmp $1024,%ecx // 512 entries per table, map 2 L2 tables
+ jne .map_p2k_table
+
+
+/*
+ * switch64: turn on paging and long mode, then far-return to start64 at
+ * its KERNELBASE alias.
+ * Uses %eax, %ebx, %ecx, %edx, %rdi as scratch; leaves the value looked up
+ * in cpu_id_lut in %ebp and int_stack_top[that value] in %rsp.
+ */
+switch64:
+ /*
+ * Jump to 64 bit mode, we have to
+ * - enable PAE
+ * - enable long mode
+ * - enable paging and load the tables filled above in CR3
+ * - jump to a 64-bit code segment at a 32-bit address
+ */
+ mov %cr4,%eax
+ or $CR4_PAE,%eax
+ mov %eax,%cr4
+ /* rdmsr/wrmsr operate on the MSR selected by %ecx through %edx:%eax */
+ mov $MSR_REG_EFER,%ecx
+ rdmsr
+ or $MSR_REG_EFER_LONG_MODE_EN,%eax
+ wrmsr
+ mov $AP_p4table,%eax
+ mov %eax,%cr3
+ mov %cr0,%eax
+ or $CR0_PG,%eax
+ or $CR0_WP,%eax
+ mov %eax,%cr0
+
+ /* Load null Interrupt descriptor table */
+ /* apboot_idt_ptr is ten zero bytes, i.e. limit 0 and base 0 */
+ movl $apboot_idt_ptr, %eax
+ lidtl (%eax)
+
+ /* apboot_gdt64_descr: 16-bit limit followed by the address of apboot_gdt64 */
+ lgdtl apboot_gdt64_descr
+ /*
+ * $1f is an absolute address, not an apboot-relative offset.
+ * NOTE(review): execution therefore appears to continue in the
+ * original image rather than in the copy started by the SIPI -
+ * confirm this is the intent.
+ */
+ ljmpl $KERNEL_CS, $1f
+1:
+ .code64
+ xorl %eax, %eax
+ movw %ax, %ds
+ movw %ax, %es
+ movw %ax, %fs
+ movw %ax, %gs
+ movw $KERNEL_DS, %ax
+ movw %ax, %ds
+ movw %ax, %es
+ movw %ax, %fs
+ movw %ax, %gs
+ movw %ax, %ss
+
+ /* Long mode! */
+
+ /* Get CPU number into rbp */
+ /*
+ * CPUID leaf 1 reports the initial APIC ID in EBX[31:24]; it is masked
+ * with apic_id_mask and used as index into the 4-byte entries of
+ * cpu_id_lut. cpuid overwrites %eax, %ebx, %ecx and %edx.
+ */
+ movq $1, %rax
+ cpuid
+ shrl $24, %ebx
+ andb %cs:apic_id_mask, %bl
+ movq $cpu_id_lut, %rdi
+ movl %cs:(%rdi, %rbx, 4), %ebp
+
+ /* set up mini stack to do far return */
+ /* %rsp = int_stack_top[%rbp], 8-byte entries */
+ movq $EXT(int_stack_top), %rdi
+ movq (%rdi, %rbp, 8), %rsp
+
+ /* jump to a 64-bit code segment at a 64-bit address */
+ /* retfq pops the offset first, then the selector */
+ pushq $KERNEL_CS
+ pushq $(start64 + KERNELBASE)
+ retfq
+
+/*
+ * start64: reached by far return from switch64 at start64 + KERNELBASE.
+ * Reloads the segment registers (gs gets PERCPU_DS), finds this CPU's
+ * number, records it in its percpu_array slot, points both GS base MSRs
+ * at that slot, enables the local APIC, switches to the CPU's interrupt
+ * stack and calls cpu_ap_main, which is not expected to return.
+ */
+start64:
+ xorl %eax, %eax
+ movw %ax, %ds
+ movw %ax, %es
+ movw %ax, %fs
+ movw %ax, %gs
+ movw $KERNEL_DS, %ax
+ movw %ax, %ds
+ movw %ax, %es
+ movw %ax, %fs
+ movw %ax, %ss
+ movw $PERCPU_DS, %ax
+ movw %ax, %gs
+
+ /* Get CPU number into rbp */
+ /*
+ * CPUID leaf 1 reports the initial APIC ID in EBX[31:24]; it is masked
+ * with apic_id_mask and used as index into the 4-byte entries of
+ * cpu_id_lut. cpuid overwrites %eax, %ebx, %ecx and %edx.
+ */
+ movq $1, %rax
+ cpuid
+ shrl $24, %ebx
+ andb %cs:apic_id_mask, %bl
+ movq $cpu_id_lut, %rdi
+ movl %cs:(%rdi, %rbx, 4), %ebp
+
+ /* set up mini stack */
+ movq $EXT(int_stack_top), %rdi
+ movq (%rdi, %rbp, 8), %rsp
+
+ /* Access per_cpu area */
+ /* %rax = percpu_array + cpu * PC_SIZE; mul also overwrites %rdx */
+ movq %rbp, %rax
+ movq $PC_SIZE,%rbx
+ mul %rbx
+ addq $percpu_array, %rax
+
+ /* Record our cpu number */
+ /*
+ * NOTE(review): this is an 8-byte store - confirm that the field at
+ * PERCPU_CPU_ID is 64 bits wide, otherwise the following 4 bytes get
+ * zeroed.
+ */
+ movq %rbp, (PERCPU_CPU_ID)(%rax)
+
+ /* Save address of percpu area for later */
+ leaq (%rax), %r8
+
+ /* Set up Kernel GS base */
+ /*
+ * wrmsr writes %edx:%eax; reload %rax explicitly instead of relying
+ * on it still holding the per-cpu address computed above.
+ */
+ movq %r8, %rax
+ movq %r8, %rdx
+ shrq $32, %rdx
+ movl $MSR_REG_KGSBASE, %ecx
+ wrmsr
+
+ /* Set up GS base */
+ movq %r8, %rax
+ movq %r8, %rdx
+ shrq $32, %rdx
+ movl $MSR_REG_GSBASE, %ecx
+ wrmsr
+
+ /* Enable local apic in xAPIC mode */
+ /* Read-modify-write: set the enable bit, clear the BSP and x2APIC bits */
+ xorq %rax, %rax
+ xorq %rdx, %rdx
+ movq $APIC_MSR, %rcx
+ rdmsr
+ orq $APIC_MSR_ENABLE, %rax
+ andq $(~(APIC_MSR_BSP | APIC_MSR_X2APIC)), %rax
+ movq $APIC_MSR, %rcx
+ wrmsr
+
+ /* Load int_stack_top[cpu] -> rsp */
+ CPU_NUMBER(%edx)
+ movq $EXT(int_stack_top), %rdi
+ movq (%rdi, %rdx, 8), %rsp
+
+ /* Ensure stack alignment */
+ andq $(~0xf), %rsp
+
+ /* Reset EFLAGS to a known state */
+ pushq $0
+ popfq
+
+ /* Finish the cpu configuration */
+ /*
+ * The call displacement is computed from this code's link-time
+ * address, but it runs KERNELBASE higher (see the far return to
+ * start64 + KERNELBASE), so KERNELBASE is subtracted from the target
+ * to compensate.
+ */
+ call EXT(cpu_ap_main) - KERNELBASE
+
+3:
+ /* NOT REACHED */
+ hlt
+ jmp 3b
+
+/*
+ * Temporary GDT used between real mode and long mode, plus the far
+ * pointer apboot jumps through. Everything here lies before apbootend.
+ */
+.align 16
+ /* 2 bytes of padding in front of the 6-byte descriptor */
+ .word 0
+gdt_descr_tmp:
+ /* limit: three 8-byte entries */
+ .short 3*8-1
+ /*
+ * Stored as an offset from apboot.
+ * NOTE(review): lgdt needs a linear base, so presumably this field is
+ * fixed up once the destination of the copy is known - verify.
+ */
+ .long RELOC(gdt_tmp)
+
+.align 16
+gdt_tmp:
+ /* 0 */
+ .quad 0
+ /* BOOT_CS */
+ /* limit 15:0 = 0xffff, base = 0, limit 19:16 = 0xf */
+ .word 0xffff
+ .word 0x0000
+ .byte 0x00
+ .byte ACC_PL_K | ACC_CODE_R | ACC_P
+ .byte ((SZ_32 | SZ_G) << 4) | 0xf
+ .byte 0x00
+ /* BOOT_DS */
+ /* same base and limit as BOOT_CS, writable data access byte */
+ .word 0xffff
+ .word 0x0000
+ .byte 0x00
+ .byte ACC_PL_K | ACC_DATA_W | ACC_P
+ .byte ((SZ_32 | SZ_G) << 4) | 0xf
+ .byte 0x00
+
+.align 16
+/* Operand of the indirect ljmpl in apboot: 32-bit offset, then selector */
+apboot_jmp_offset:
+ .long RELOC(apboot32)
+ .word BOOT_CS
+
+apbootend:
+
+/*
+ * Data used by the AP boot path outside the relocated block: the page
+ * tables filled by apboot32, a null IDT pointer and the 64-bit GDT.
+ *
+ * NOTE(review): "ax" marks this section allocatable and executable but
+ * not writable, although the page tables in it are written at boot, while
+ * .boot.text above is declared without flags - confirm this is intended.
+ */
+.section .boot.data,"ax",@progbits
+
+/*
+ * apboot32 fills AP_p2table..AP_p2table3 and AP_p2ktable1..AP_p2ktable2
+ * with one indexed loop each, so these tables must stay contiguous and in
+ * this order.
+ */
+.align 4096
+AP_p4table: .space 4096
+AP_p3table: .space 4096
+AP_p2table: .space 4096
+AP_p2table1: .space 4096
+AP_p2table2: .space 4096
+AP_p2table3: .space 4096
+AP_p3ktable: .space 4096
+AP_p2ktable1: .space 4096
+AP_p2ktable2: .space 4096
+
+.align 16
+/* Make this long enough to hold 32b null IDT, but
+ * also long enough to inherit 64b null IDT
+ * so we don't have to reload it */
+apboot_idt_ptr:
+ .quad 0
+ .word 0
+
+.code64
+.align 16
+apboot_gdt64_top:
+ /* 6 bytes of padding */
+ /* together with the 2-byte limit this puts the address on an 8-byte boundary */
+ .long 0
+ .word 0
+apboot_gdt64_descr:
+ .word (GDT_TABLE_M64 * 8) - 1
+apboot_gdt64_descr_addr:
+ .quad apboot_gdt64
+/* GDT_TABLE_M64 (14) entries of 8 bytes each */
+apboot_gdt64:
+ /* NULL segment */
+ .quad 0
+
+ /* KERNEL_CS */
+ .quad ((ACC_A | ACC_P | ACC_CODE_R) << SEG_ACCESS_OFS) | (SZ_64 << SEG_FLAGS_OFS)
+
+ /* KERNEL_DS */
+ .quad ((ACC_A | ACC_P | ACC_DATA_W) << SEG_ACCESS_OFS) | (SZ_64 << SEG_FLAGS_OFS)
+
+ /* LDT = 0x18 */
+ .quad 0
+
+ /* TSS = 0x20 */
+ .quad 0
+
+ /* USER_LDT = 0x28 */
+ .quad 0
+
+ /* USER_TSS = 0x30 */
+ .quad 0
+
+ /* LINEAR = 0x38 */
+ .quad 0
+
+ /* FPREGS = 0x40 */
+ .quad 0
+
+ /* USER_GDT = 0x48 and 0x50 */
+ .quad 0
+ .quad 0
+
+ /* USER_TSS64 = 0x58 */
+ .quad 0
+
+ /* USER_TSS64, second slot = 0x60 (presumably the upper half of the 16-byte descriptor) */
+ .quad 0
+
+ /* PERCPU_DS (GS) = 0x68 */
+ .quad ((ACC_A | ACC_P | ACC_DATA_W) << SEG_ACCESS_OFS) | (SZ_64 << SEG_FLAGS_OFS)
+#endif
--
2.45.2