From: Cong Wang <[email protected]>

Extend the kexec subsystem to support multikernel functionality,
allowing different kernel instances to be loaded and executed on
specific CPUs. The implementation introduces:

- New KEXEC_TYPE_MULTIKERNEL type and KEXEC_MULTIKERNEL flag

- multikernel_kick_ap() function for CPU-specific kernel booting

- LINUX_REBOOT_CMD_MULTIKERNEL reboot command with CPU parameter

- Specialized segment loading for multikernel images using memremap

- Integration with the existing kexec infrastructure, bypassing the
  standard machine_kexec_prepare() to avoid machine resets

The multikernel_kexec() function checks that the target CPU is offline
and boots it at the loaded kexec image's start address, so that CPU
runs a different kernel instance. This enables heterogeneous computing
scenarios where different CPUs run specialized kernel variants.
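
As a rough usage sketch (illustrative only, not part of this patch):
assuming a secondary kernel image has already been staged via
kexec_load() with the new KEXEC_MULTIKERNEL flag, with its segments
placed in reserved memory, an offline CPU can then be kicked into that
image through the new reboot command. Since struct
multikernel_boot_args is kernel-private in this patch, the sketch
mirrors it locally, and the target CPU number (3) is arbitrary:

    #include <linux/reboot.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* Mirrored from kernel/reboot.c; not exported via uapi here. */
    struct multikernel_boot_args {
            int cpu;
    };

    int main(void)
    {
            struct multikernel_boot_args args = { .cpu = 3 };

            /* Kick offline CPU 3 into the previously loaded image. */
            return syscall(SYS_reboot, LINUX_REBOOT_MAGIC1,
                           LINUX_REBOOT_MAGIC2,
                           LINUX_REBOOT_CMD_MULTIKERNEL, &args);
    }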

Signed-off-by: Cong Wang <[email protected]>
---
 arch/x86/include/asm/smp.h  |   1 +
 arch/x86/kernel/smpboot.c   | 104 +++++++++++++++++++++++++++
 include/linux/kexec.h       |   6 +-
 include/uapi/linux/kexec.h  |   1 +
 include/uapi/linux/reboot.h |   2 +-
 kernel/kexec.c              |  41 ++++++++++-
 kernel/kexec_core.c         | 135 ++++++++++++++++++++++++++++++++++++
 kernel/reboot.c             |  10 +++
 8 files changed, 294 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 22bfebe6776d..1a59fd0de759 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -107,6 +107,7 @@ void native_smp_prepare_cpus(unsigned int max_cpus);
 void native_smp_cpus_done(unsigned int max_cpus);
 int common_cpu_up(unsigned int cpunum, struct task_struct *tidle);
 int native_kick_ap(unsigned int cpu, struct task_struct *tidle);
+int multikernel_kick_ap(unsigned int cpu, unsigned long kernel_start_address);
 int native_cpu_disable(void);
 void __noreturn hlt_play_dead(void);
 void native_play_dead(void);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 33e166f6ab12..c2844a493ebf 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -833,6 +833,72 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle)
        return 0;
 }
 
+/* Must be called with cpus_read_lock() held. */
+static int do_multikernel_boot_cpu(u32 apicid, int cpu, unsigned long kernel_start_address)
+{
+       unsigned long start_ip = real_mode_header->trampoline_start;
+       int ret;
+
+       pr_info("do_multikernel_boot_cpu(apicid=%u, cpu=%u, kernel_start_address=%lx)\n", apicid, cpu, kernel_start_address);
+#ifdef CONFIG_X86_64
+       /* If 64-bit wakeup method exists, use the 64-bit mode trampoline IP */
+       if (apic->wakeup_secondary_cpu_64)
+               start_ip = real_mode_header->trampoline_start64;
+#endif
+       /* Enter the new kernel image directly instead of start_secondary(). */
+       initial_code = (unsigned long)kernel_start_address;
+
+       if (IS_ENABLED(CONFIG_X86_32)) {
+               early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
+               /* Skipped: initial_stack = idle->thread.sp (no idle task here). */
+       } else if (!(smpboot_control & STARTUP_PARALLEL_MASK)) {
+               smpboot_control = cpu;
+       }
+
+       /* Skip init_espfix_ap(cpu); */
+
+       /* Skip announce_cpu(cpu, apicid); */
+
+       /*
+        * This grunge runs the startup process for
+        * the targeted processor.
+        */
+       if (x86_platform.legacy.warm_reset) {
+
+               pr_debug("Setting warm reset code and vector.\n");
+
+               smpboot_setup_warm_reset_vector(start_ip);
+               /*
+                * Be paranoid about clearing APIC errors.
+                */
+               if (APIC_INTEGRATED(boot_cpu_apic_version)) {
+                       apic_write(APIC_ESR, 0);
+                       apic_read(APIC_ESR);
+               }
+       }
+
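+       /* Make the updated startup fields visible before waking the AP. */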
+       smp_mb();
+
+       /*
+        * Wake up the CPU in different cases:
+        * - Use a method from the APIC driver if one is defined, with wakeup
+        *   straight to 64-bit mode preferred over wakeup to real mode.
+        * Otherwise,
+        * - Use an INIT boot APIC message.
+        */
+       if (apic->wakeup_secondary_cpu_64)
+               ret = apic->wakeup_secondary_cpu_64(apicid, start_ip, cpu);
+       else if (apic->wakeup_secondary_cpu)
+               ret = apic->wakeup_secondary_cpu(apicid, start_ip, cpu);
+       else
+               ret = wakeup_secondary_cpu_via_init(apicid, start_ip, cpu);
+
+       pr_debug("do_multikernel_boot_cpu done\n");
+       /* If the wakeup mechanism failed, cleanup the warm reset vector */
+       if (ret)
+               arch_cpuhp_cleanup_kick_cpu(cpu);
+       return ret;
+}
+
 /*
  * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
  * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
@@ -905,6 +971,44 @@ static int do_boot_cpu(u32 apicid, unsigned int cpu, struct task_struct *idle)
        return ret;
 }
 
+/* Must be called with cpus_read_lock() held. */
+int multikernel_kick_ap(unsigned int cpu, unsigned long kernel_start_address)
+{
+       u32 apicid = apic->cpu_present_to_apicid(cpu);
+       int err;
+
+       lockdep_assert_irqs_enabled();
+
+       pr_info("multikernel: bringing up CPU %u\n", cpu);
+
+       if (apicid == BAD_APICID || !apic_id_valid(apicid)) {
+               pr_err("CPU %u has invalid APIC ID %x. Aborting bringup\n", cpu, apicid);
+               return -EINVAL;
+       }
+
+       if (!test_bit(apicid, phys_cpu_present_map)) {
+               pr_err("CPU %u APIC ID %x is not present. Aborting bringup\n", cpu, apicid);
+               return -EINVAL;
+       }
+
+       /*
+        * Save current MTRR state in case it was changed since early boot
+        * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync:
+        */
+       mtrr_save_state();
+
+       /* the FPU context is blank, nobody can own it */
+       per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL;
+       /* skip common_cpu_up(cpu, tidle); */
+
+       err = do_multikernel_boot_cpu(apicid, cpu, kernel_start_address);
+       if (err)
+               pr_err("do_multikernel_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu);
+
+       return err;
+}
+
 int native_kick_ap(unsigned int cpu, struct task_struct *tidle)
 {
        u32 apicid = apic->cpu_present_to_apicid(cpu);
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 39fe3e6cd282..a3ae3e561109 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -358,9 +358,10 @@ struct kimage {
        unsigned long control_page;
 
        /* Flags to indicate special processing */
-       unsigned int type : 1;
+       unsigned int type : 2;
 #define KEXEC_TYPE_DEFAULT 0
 #define KEXEC_TYPE_CRASH   1
+#define KEXEC_TYPE_MULTIKERNEL 2
        unsigned int preserve_context : 1;
        /* If set, we are using file mode kexec syscall */
        unsigned int file_mode:1;
@@ -434,6 +435,7 @@ extern void machine_kexec(struct kimage *image);
 extern int machine_kexec_prepare(struct kimage *image);
 extern void machine_kexec_cleanup(struct kimage *image);
 extern int kernel_kexec(void);
+extern int multikernel_kexec(int cpu);
 extern struct page *kimage_alloc_control_pages(struct kimage *image,
                                                unsigned int order);
 
@@ -455,7 +457,7 @@ bool kexec_load_permitted(int kexec_image_type);
 #define KEXEC_FLAGS    (KEXEC_ON_CRASH | KEXEC_UPDATE_ELFCOREHDR | KEXEC_CRASH_HOTPLUG_SUPPORT)
 #else
-#define KEXEC_FLAGS    (KEXEC_ON_CRASH | KEXEC_PRESERVE_CONTEXT | KEXEC_UPDATE_ELFCOREHDR | \
-                       KEXEC_CRASH_HOTPLUG_SUPPORT)
+                       KEXEC_CRASH_HOTPLUG_SUPPORT | KEXEC_MULTIKERNEL)
 #endif
 
 /* List of defined/legal kexec file flags */
diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h
index 8958ebfcff94..4ed8660ef95e 100644
--- a/include/uapi/linux/kexec.h
+++ b/include/uapi/linux/kexec.h
@@ -14,6 +14,7 @@
 #define KEXEC_PRESERVE_CONTEXT 0x00000002
 #define KEXEC_UPDATE_ELFCOREHDR        0x00000004
 #define KEXEC_CRASH_HOTPLUG_SUPPORT 0x00000008
+#define KEXEC_MULTIKERNEL      0x00000010
 #define KEXEC_ARCH_MASK                0xffff0000
 
 /*
diff --git a/include/uapi/linux/reboot.h b/include/uapi/linux/reboot.h
index 58e64398efc5..aac2f2f94a98 100644
--- a/include/uapi/linux/reboot.h
+++ b/include/uapi/linux/reboot.h
@@ -34,7 +34,7 @@
 #define        LINUX_REBOOT_CMD_RESTART2       0xA1B2C3D4
 #define        LINUX_REBOOT_CMD_SW_SUSPEND     0xD000FCE2
 #define        LINUX_REBOOT_CMD_KEXEC          0x45584543
-
+#define        LINUX_REBOOT_CMD_MULTIKERNEL    0x4D4B4C49
 
 
 #endif /* _UAPI_LINUX_REBOOT_H */
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 28008e3d462e..49e62f804674 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -16,6 +16,7 @@
 #include <linux/syscalls.h>
 #include <linux/vmalloc.h>
 #include <linux/slab.h>
+#include <linux/memblock.h>
 
 #include "kexec_internal.h"
 
@@ -27,6 +28,7 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
        int ret;
        struct kimage *image;
        bool kexec_on_panic = flags & KEXEC_ON_CRASH;
+       bool multikernel_load = flags & KEXEC_MULTIKERNEL;
 
 #ifdef CONFIG_CRASH_DUMP
        if (kexec_on_panic) {
@@ -37,6 +39,30 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
        }
 #endif
 
+#if 0
+       if (multikernel_load) {
+               /* Check if entry is in a reserved memory region */
+               bool in_reserved_region = false;
+               phys_addr_t start, end;
+               u64 i;
+
+               for_each_reserved_mem_range(i, &start, &end) {
+                       if (entry >= start && entry < end) {
+                               in_reserved_region = true;
+                               break;
+                       }
+               }
+
+               if (!in_reserved_region) {
+                       pr_err("Entry point 0x%lx is not in a reserved memory region\n", entry);
+                       return -EADDRNOTAVAIL;
+               }
+
+               pr_info("multikernel load: entry 0x%lx, nr_segments %lu, flags 0x%lx\n",
+                       entry, nr_segments, flags);
+       }
+#endif
+
        /* Allocate and initialize a controlling structure */
        image = do_kimage_alloc_init();
        if (!image)
@@ -54,10 +80,16 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
        }
 #endif
 
+       if (multikernel_load)
+               image->type = KEXEC_TYPE_MULTIKERNEL;
+
        ret = sanity_check_segment_list(image);
        if (ret)
                goto out_free_image;
 
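+       /*
+        * A multikernel image is entered directly on a spare CPU, so the
+        * control code buffer used for kexec relocation is not needed.
+        */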
+       if (multikernel_load)
+               goto done;
        /*
         * Find a location for the control code buffer, and add it
         * the vector of segments so that it's pages will also be
@@ -79,6 +111,7 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
                }
        }
 
+done:
        *rimage = image;
        return 0;
 out_free_control_pages:
@@ -139,9 +172,11 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
                image->hotplug_support = 1;
 #endif
 
-       ret = machine_kexec_prepare(image);
-       if (ret)
-               goto out;
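+       /*
+        * Multikernel does not replace the running kernel, so skip
+        * machine_kexec_prepare() and its machine reset preparation.
+        */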
+       if (!(flags & KEXEC_MULTIKERNEL)) {
+               ret = machine_kexec_prepare(image);
+               if (ret)
+                       goto out;
+       }
 
        /*
         * Some architecture(like S390) may touch the crash memory before
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 31203f0bacaf..35a66c8dd78b 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -41,6 +41,7 @@
 #include <linux/objtool.h>
 #include <linux/kmsg_dump.h>
 #include <linux/dma-map-ops.h>
+#include <linux/memblock.h>
 
 #include <asm/page.h>
 #include <asm/sections.h>
@@ -211,6 +212,32 @@ int sanity_check_segment_list(struct kimage *image)
        }
 #endif
 
+#if 0
+       if (image->type == KEXEC_TYPE_MULTIKERNEL) {
+               for (i = 0; i < nr_segments; i++) {
+                       unsigned long mstart, mend;
+                       phys_addr_t start, end;
+                       bool in_reserved_region = false;
+                       u64 j;  /* renamed from 'i' to avoid shadowing the segment index */
+
+                       mstart = image->segment[i].mem;
+                       mend = mstart + image->segment[i].memsz - 1;
+                       for_each_reserved_mem_range(j, &start, &end) {
+                               if (mstart >= start && mend <= end) {
+                                       in_reserved_region = true;
+                                       break;
+                               }
+                       }
+
+                       if (!in_reserved_region) {
+                               pr_err("Segment 0x%lx-0x%lx is not in a reserved memory region\n",
+                                      mstart, mend);
+                               return -EADDRNOTAVAIL;
+                       }
+               }
+       }
+#endif
+
        /*
         * The destination addresses are searched from system RAM rather than
         * being allocated from the buddy allocator, so they are not guaranteed
@@ -943,6 +970,84 @@ static int kimage_load_crash_segment(struct kimage *image, int idx)
 }
 #endif
 
+static int kimage_load_multikernel_segment(struct kimage *image, int idx)
+{
+       /*
+        * For multikernel we simply copy the data from user space to its
+        * destination, one page at a time via memremap().
+        */
+       struct kexec_segment *segment = &image->segment[idx];
+       unsigned long maddr;
+       size_t ubytes, mbytes;
+       int result;
+       unsigned char __user *buf = NULL;
+       unsigned char *kbuf = NULL;
+
+       result = 0;
+       if (image->file_mode)
+               kbuf = segment->kbuf;
+       else
+               buf = segment->buf;
+       ubytes = segment->bufsz;
+       mbytes = segment->memsz;
+       maddr = segment->mem;
+       pr_info("Loading multikernel segment: mem=0x%lx, memsz=0x%zx, buf=%px, bufsz=0x%zx\n",
+               maddr, mbytes, buf, ubytes);
+       while (mbytes) {
+               char *ptr;
+               size_t uchunk, mchunk;
+               unsigned long page_addr = maddr & PAGE_MASK;
+               unsigned long page_offset = maddr & ~PAGE_MASK;
+
+               /* Use memremap to map the physical address */
+               ptr = memremap(page_addr, PAGE_SIZE, MEMREMAP_WB);
+               if (!ptr) {
+                       pr_err("Failed to memremap memory at 0x%lx\n", page_addr);
+                       result = -ENOMEM;
+                       goto out;
+               }
+
+               /* Adjust pointer to the offset within the page */
+               ptr += page_offset;
+
+               /* Calculate chunk sizes */
+               mchunk = min_t(size_t, mbytes, PAGE_SIZE - page_offset);
+               uchunk = min(ubytes, mchunk);
+
+               /* Zero the trailing part of the page if needed */
+               if (mchunk > uchunk)
+                       memset(ptr + uchunk, 0, mchunk - uchunk);
+
+               if (uchunk) {
+                       /* For file based kexec, source pages are in kernel memory */
+                       if (image->file_mode)
+                               memcpy(ptr, kbuf, uchunk);
+                       else
+                               result = copy_from_user(ptr, buf, uchunk);
+                       ubytes -= uchunk;
+                       if (image->file_mode)
+                               kbuf += uchunk;
+                       else
+                               buf += uchunk;
+               }
+
+               /* Clean up */
+               memunmap(ptr - page_offset);
+               if (result) {
+                       result = -EFAULT;
+                       goto out;
+               }
+               maddr  += mchunk;
+               mbytes -= mchunk;
+
+               cond_resched();
+       }
+out:
+       return result;
+}
+
 int kimage_load_segment(struct kimage *image, int idx)
 {
        int result = -ENOMEM;
@@ -956,6 +1061,9 @@ int kimage_load_segment(struct kimage *image, int idx)
                result = kimage_load_crash_segment(image, idx);
                break;
 #endif
+       case KEXEC_TYPE_MULTIKERNEL:
+               result = kimage_load_multikernel_segment(image, idx);
+               break;
        }
 
        return result;
@@ -1230,3 +1338,30 @@ int kernel_kexec(void)
        kexec_unlock();
        return error;
 }
+
+int multikernel_kexec(int cpu)
+{
+       int rc;
+
+       pr_info("multikernel kexec: cpu %d\n", cpu);
+
+       if (cpu_online(cpu)) {
+               pr_err("CPU %d is already online and running this kernel instance\n", cpu);
+               return -EBUSY;
+       }
+
+       if (!kexec_trylock())
+               return -EBUSY;
+       if (!kexec_image) {
+               rc = -EINVAL;
+               goto unlock;
+       }
+
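+       /* Serialize against CPU hotplug while kicking the target CPU. */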
+       cpus_read_lock();
+       rc = multikernel_kick_ap(cpu, kexec_image->start);
+       cpus_read_unlock();
+
+unlock:
+       kexec_unlock();
+       return rc;
+}
diff --git a/kernel/reboot.c b/kernel/reboot.c
index ec087827c85c..f3ac703c4695 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -717,6 +717,10 @@ EXPORT_SYMBOL_GPL(kernel_power_off);
 
 DEFINE_MUTEX(system_transition_mutex);
 
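+/* Userspace argument block for LINUX_REBOOT_CMD_MULTIKERNEL */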
+struct multikernel_boot_args {
+       int cpu;
+};
+
 /*
  * Reboot system call: for obvious reasons only root may call it,
  * and even root needs to set up some magic numbers in the registers
@@ -729,6 +733,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
                void __user *, arg)
 {
        struct pid_namespace *pid_ns = task_active_pid_ns(current);
+       struct multikernel_boot_args boot_args;
        char buffer[256];
        int ret = 0;
 
@@ -799,6 +804,11 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
        case LINUX_REBOOT_CMD_KEXEC:
                ret = kernel_kexec();
                break;
+       case LINUX_REBOOT_CMD_MULTIKERNEL:
+               if (copy_from_user(&boot_args, arg, sizeof(boot_args))) {
+                       ret = -EFAULT;
+                       break;
+               }
+               ret = multikernel_kexec(boot_args.cpu);
+               break;
 #endif
 
 #ifdef CONFIG_HIBERNATION
-- 
2.34.1

