Re: Fix rebooting Linux guests in vmd(8)

2022-06-05 Thread Mike Larkin
On Sun, Jun 05, 2022 at 09:25:34AM -0400, Dave Voutila wrote:
> tech@ friends:
>
> tl;dr: testers wanted for fixing Linux guest reboot. If you've got
> Linux guests that no longer reboot properly, please test! For other
> vmd users, please check for any regressions.
>
> Our port of SeaBIOS is configured to enable QEMU features to simplify
> its working with vmd(8). This generally works well.
>
> SeaBIOS provides a reboot routine specifically for QEMU environments.
> One of the reasons is to provide some extra logic for refreshing the
> copy of the BIOS in memory (as if reading from ROM) before attempting
> the reset (first via PCI and falling back to triple-faulting). The way
> SeaBIOS does this appears to be it assumes there's a "pristine copy"
> of the BIOS loaded by the host's emulator to just below the 4GB mark
> in physical memory. (See src/fw/shadow.c in the SeaBIOS source tree.)
>
> This hasn't been a problem until recent Linux kernel changes started
> calling into the BIOS as a way to reboot the guest. (I know at least
> the 5.15 kernel shipped with Alpine does this.)
>
> Since vmd/vmm doesn't create a mapping for that area just below 4GB,
> guests experience a page fault vm-exit and a resulting failure as we
> consider that address part of the MMIO hole and reserved.
>
> This change to vmd(8) loads a second copy of the BIOS, ending at the
> 4GB mark in guest memory. Consequently, vmm(4)'s MMIO memory hole is
> adjusted to end 2MB below 4GB to accomodate SeaBIOS and future
> firmware payloads that may be > 1MB in size. (I believe EDK-II UEFI is
> larger than 1MB...haven't looked in awhile.)
>
> Along the way, I adjusted the use of hardcoded values for 1 MB and 4
> GB to use a more human readable version via #defines.
>
> For testers:
>   0. apply patch
>   1. build, install updated kernel, boot new kernel
>   2. copy or symlink sys/arch/amd64/include/vmmvar.h to
>  /usr/include/amd64/
>   3. build and install vmd(8)
>   4. test!
>
> ~dv
>

Does qemu load 2 copies of the bios or just rely on A20 tricks to make the
bios appear at two addresses?

-ml

>
> diff refs/heads/master refs/heads/vmd-bios4g
> blob - fea4ab52e6db7eff12b913ecde30abf970da0b54
> blob + f06212b18f8ae19b5edc8fa8d64684d7163e35a8
> --- sys/arch/amd64/include/vmmvar.h
> +++ sys/arch/amd64/include/vmmvar.h
> @@ -35,7 +35,7 @@
>  #define VMM_MAX_NICS_PER_VM  4
>
>  #define VMM_PCI_MMIO_BAR_BASE0xF000ULL
> -#define VMM_PCI_MMIO_BAR_END 0xULL
> +#define VMM_PCI_MMIO_BAR_END 0xFFDFULL   /* 2 MiB below 4 GiB */
>  #define VMM_PCI_MMIO_BAR_SIZE0x0001
>  #define VMM_PCI_IO_BAR_BASE  0x1000
>  #define VMM_PCI_IO_BAR_END   0x
> blob - d952ba4d8d0bff700fc09c066ffc284909150417
> blob + c36e17eb5ed4d1799f55fa1af5f7ca158923202e
> --- usr.sbin/vmd/vm.c
> +++ usr.sbin/vmd/vm.c
> @@ -65,6 +65,10 @@
>  #include "vmd.h"
>  #include "vmm.h"
>
> +#define _1_MB(1UL * 1024 * 1024)
> +#define _2_MB(2UL * 1024 * 1024)
> +#define _4_GB(4UL * 1024 * 1024 * 1024)
> +
>  io_fn_t ioports_map[MAX_PORTS];
>
>  int run_vm(int, int[][VM_MAX_BASE_PER_DISK], int *,
> @@ -234,7 +238,7 @@ loadfile_bios(gzFile fp, off_t size, struct vcpu_reg_s
>   return (-1);
>
>   /* The BIOS image must end at 1MB */
> - if ((off = 1048576 - size) < 0)
> + if ((off = _1_MB - size) < 0)
>   return (-1);
>
>   /* Read BIOS image into memory */
> @@ -243,6 +247,16 @@ loadfile_bios(gzFile fp, off_t size, struct vcpu_reg_s
>   return (-1);
>   }
>
> + if (gzseek(fp, 0, SEEK_SET) == -1)
> + return (-1);
> +
> + /* Read a second BIOS copy into memory ending at 4GB */
> + off = _4_GB - size;
> + if (mread(fp, off, size) != (size_t)size) {
> + errno = EIO;
> + return (-1);
> + }
> +
>   log_debug("%s: loaded BIOS image", __func__);
>
>   return (0);
> @@ -872,6 +886,7 @@ void
>  create_memory_map(struct vm_create_params *vcp)
>  {
>   size_t len, mem_bytes;
> + size_t above_1m = 0, above_4g = 0;
>
>   mem_bytes = vcp->vcp_memranges[0].vmr_size;
>   vcp->vcp_nmemranges = 0;
> @@ -893,29 +908,47 @@ create_memory_map(struct vm_create_params *vcp)
>* we need to make sure that vmm(4) permits accesses
>* to it. So allocate guest memory for it.
>*/
> - len = 0x10 - LOWMEM_KB * 1024;
> + len = _1_MB - (LOWMEM_KB * 1024);
>   vcp->vcp_memranges[1].vmr_gpa = LOWMEM_KB * 1024;
>   vcp->vcp_memranges[1].vmr_size = len;
>   mem_bytes -= len;
>
> - /* Make sure that we do not place physical memory into MMIO ranges. */
> - if (mem_bytes > VMM_PCI_MMIO_BAR_BASE - 0x10)
> - len = VMM_PCI_MMIO_BAR_BASE - 0x10;
> - else
> - len = mem_bytes;
> -
> - /* Third memory region: 1MB - (1MB + len) */
> - vcp->vcp_memranges[2].vmr_gpa = 0x10;
> - vcp->vcp_memranges[2].vmr_size = len;
> - mem_bytes -= len;

Fix rebooting Linux guests in vmd(8)

2022-06-05 Thread Dave Voutila
tech@ friends:

tl;dr: testers wanted for fixing Linux guest reboot. If you've got
Linux guests that no longer reboot properly, please test! For other
vmd users, please check for any regressions.

Our port of SeaBIOS is configured to enable QEMU features to simplify
its working with vmd(8). This generally works well.

SeaBIOS provides a reboot routine specifically for QEMU environments.
One of the reasons is to provide some extra logic for refreshing the
copy of the BIOS in memory (as if reading from ROM) before attempting
the reset (first via PCI and falling back to triple-faulting). The way
SeaBIOS does this appears to be it assumes there's a "pristine copy"
of the BIOS loaded by the host's emulator to just below the 4GB mark
in physical memory. (See src/fw/shadow.c in the SeaBIOS source tree.)

This hasn't been a problem until recent Linux kernel changes started
calling into the BIOS as a way to reboot the guest. (I know at least
the 5.15 kernel shipped with Alpine does this.)

Since vmd/vmm doesn't create a mapping for that area just below 4GB,
guests experience a page fault vm-exit and a resulting failure as we
consider that address part of the MMIO hole and reserved.

This change to vmd(8) loads a second copy of the BIOS, ending at the
4GB mark in guest memory. Consequently, vmm(4)'s MMIO memory hole is
adjusted to end 2MB below 4GB to accomodate SeaBIOS and future
firmware payloads that may be > 1MB in size. (I believe EDK-II UEFI is
larger than 1MB...haven't looked in awhile.)

Along the way, I adjusted the use of hardcoded values for 1 MB and 4
GB to use a more human readable version via #defines.

For testers:
  0. apply patch
  1. build, install updated kernel, boot new kernel
  2. copy or symlink sys/arch/amd64/include/vmmvar.h to
 /usr/include/amd64/
  3. build and install vmd(8)
  4. test!

~dv


diff refs/heads/master refs/heads/vmd-bios4g
blob - fea4ab52e6db7eff12b913ecde30abf970da0b54
blob + f06212b18f8ae19b5edc8fa8d64684d7163e35a8
--- sys/arch/amd64/include/vmmvar.h
+++ sys/arch/amd64/include/vmmvar.h
@@ -35,7 +35,7 @@
 #define VMM_MAX_NICS_PER_VM4

 #define VMM_PCI_MMIO_BAR_BASE  0xF000ULL
-#define VMM_PCI_MMIO_BAR_END   0xULL
+#define VMM_PCI_MMIO_BAR_END   0xFFDFULL   /* 2 MiB below 4 GiB */
 #define VMM_PCI_MMIO_BAR_SIZE  0x0001
 #define VMM_PCI_IO_BAR_BASE0x1000
 #define VMM_PCI_IO_BAR_END 0x
blob - d952ba4d8d0bff700fc09c066ffc284909150417
blob + c36e17eb5ed4d1799f55fa1af5f7ca158923202e
--- usr.sbin/vmd/vm.c
+++ usr.sbin/vmd/vm.c
@@ -65,6 +65,10 @@
 #include "vmd.h"
 #include "vmm.h"

+#define _1_MB  (1UL * 1024 * 1024)
+#define _2_MB  (2UL * 1024 * 1024)
+#define _4_GB  (4UL * 1024 * 1024 * 1024)
+
 io_fn_t ioports_map[MAX_PORTS];

 int run_vm(int, int[][VM_MAX_BASE_PER_DISK], int *,
@@ -234,7 +238,7 @@ loadfile_bios(gzFile fp, off_t size, struct vcpu_reg_s
return (-1);

/* The BIOS image must end at 1MB */
-   if ((off = 1048576 - size) < 0)
+   if ((off = _1_MB - size) < 0)
return (-1);

/* Read BIOS image into memory */
@@ -243,6 +247,16 @@ loadfile_bios(gzFile fp, off_t size, struct vcpu_reg_s
return (-1);
}

+   if (gzseek(fp, 0, SEEK_SET) == -1)
+   return (-1);
+
+   /* Read a second BIOS copy into memory ending at 4GB */
+   off = _4_GB - size;
+   if (mread(fp, off, size) != (size_t)size) {
+   errno = EIO;
+   return (-1);
+   }
+
log_debug("%s: loaded BIOS image", __func__);

return (0);
@@ -872,6 +886,7 @@ void
 create_memory_map(struct vm_create_params *vcp)
 {
size_t len, mem_bytes;
+   size_t above_1m = 0, above_4g = 0;

mem_bytes = vcp->vcp_memranges[0].vmr_size;
vcp->vcp_nmemranges = 0;
@@ -893,29 +908,47 @@ create_memory_map(struct vm_create_params *vcp)
 * we need to make sure that vmm(4) permits accesses
 * to it. So allocate guest memory for it.
 */
-   len = 0x10 - LOWMEM_KB * 1024;
+   len = _1_MB - (LOWMEM_KB * 1024);
vcp->vcp_memranges[1].vmr_gpa = LOWMEM_KB * 1024;
vcp->vcp_memranges[1].vmr_size = len;
mem_bytes -= len;

-   /* Make sure that we do not place physical memory into MMIO ranges. */
-   if (mem_bytes > VMM_PCI_MMIO_BAR_BASE - 0x10)
-   len = VMM_PCI_MMIO_BAR_BASE - 0x10;
-   else
-   len = mem_bytes;
-
-   /* Third memory region: 1MB - (1MB + len) */
-   vcp->vcp_memranges[2].vmr_gpa = 0x10;
-   vcp->vcp_memranges[2].vmr_size = len;
-   mem_bytes -= len;
-
-   if (mem_bytes > 0) {
-   /* Fourth memory region for the remaining memory (if any) */
-   vcp->vcp_memranges[3].vmr_gpa = VMM_PCI_MMIO_BAR_END + 1;
-   vcp->vcp_memranges[3].vmr_size = mem_bytes;
-   vcp->vcp_nmemranges = 4;
-   } else
+   /* If we have less than 2MB remaining, still c