Re: amdgpio(4) : preserve pin configuration on resume

2022-06-26 Thread Mike Larkin
On Wed, Apr 20, 2022 at 11:39:00AM +0200, Mark Kettenis wrote:
> > Date: Tue, 19 Apr 2022 22:02:00 -0700
> > From: Mike Larkin 
> >
> > On at least the Asus ROG Zephyrus 14 (2020), the trackpad fails to generate
> > any interrupts after resume. I tracked this down to amdgpio(4) not 
> > generating
> > interrupts after resume, and started looking at missing soft state.
> >
> > This diff preserves the interrupt pin configurations and restores them after
> > resume. This makes the device function properly post-zzz and post-ZZZ.
>
> I think it might make sense to structure this a bit more like
> pchgpio(4).  There we only restore the configuration for pins that are
> "in use" by OpenBSD.
>
> > Note: amdgpio_read_pin does not return the value that was previously written
> > during amdgpio_intr_establish (it always just returns 0x1 if the pin is
> > in use), so I'm just saving the actual value we write during
> > amdgpio_intr_establish and restoring that during resume.
>
> Well, using amdgpio_read_pin() for the purpose of saving the pin
> configuration doesn't make sense.  That function returns the pin input
> state.
>
> What you need to do is to read the register using bus_space_read_4()
> and restore that value.  Again take a look at pchgpio(4).
>
> > Note 2: In xxx_activate() functions, we usually call 
> > config_activate_children
> > but since amdgpio doesn't have any children, I left that out.
>
> I think that's fine.  But you should do the save/restore in
> DVACT_SUSPEND/DVACT_RESUME.  You want to restore the state as early as
> possible such that you don't get spurious interrupts when the BIOS
> leaves GPIO pins misconfigured.  Again, look at pchgpio(4).
>

I reworked this diff and made it look just like pchgpio. But it's a little
simpler than pchgpio since there is less to save/restore.

ok?

-ml

Index: amdgpio.c
===
RCS file: /cvs/src/sys/dev/acpi/amdgpio.c,v
retrieving revision 1.7
diff -u -p -a -u -r1.7 amdgpio.c
--- amdgpio.c   6 Apr 2022 18:59:27 -   1.7
+++ amdgpio.c   26 Jun 2022 13:53:19 -
@@ -48,6 +48,11 @@ struct amdgpio_intrhand {
void *ih_arg;
 };

+struct amdgpio_pincfg {
+   /* Modeled after pchgpio but we only have one value to save/restore */
+   uint32_t        pin_cfg;
+};
+
 struct amdgpio_softc {
struct device sc_dev;
struct acpi_softc *sc_acpi;
@@ -59,6 +64,7 @@ struct amdgpio_softc {
void *sc_ih;

int sc_npins;
+   struct amdgpio_pincfg *sc_pin_cfg;
struct amdgpio_intrhand *sc_pin_ih;

struct acpi_gpio sc_gpio;
@@ -66,9 +72,11 @@ struct amdgpio_softc {

 int    amdgpio_match(struct device *, void *, void *);
 void   amdgpio_attach(struct device *, struct device *, void *);
+int    amdgpio_activate(struct device *, int);

 const struct cfattach amdgpio_ca = {
-   sizeof(struct amdgpio_softc), amdgpio_match, amdgpio_attach
+   sizeof(struct amdgpio_softc), amdgpio_match, amdgpio_attach,
+   NULL, amdgpio_activate
 };

 struct cfdriver amdgpio_cd = {
@@ -86,6 +94,10 @@ void amdgpio_write_pin(void *, int, int)
 void   amdgpio_intr_establish(void *, int, int, int (*)(void *), void *);
 int    amdgpio_pin_intr(struct amdgpio_softc *, int);
 int    amdgpio_intr(void *);
+void   amdgpio_save_pin(struct amdgpio_softc *, int pin);
+void   amdgpio_save(struct amdgpio_softc *);
+void   amdgpio_restore_pin(struct amdgpio_softc *, int pin);
+void   amdgpio_restore(struct amdgpio_softc *);

 int
 amdgpio_match(struct device *parent, void *match, void *aux)
@@ -135,6 +147,8 @@ amdgpio_attach(struct device *parent, st
return;
}

+   sc->sc_pin_cfg = mallocarray(sc->sc_npins, sizeof(*sc->sc_pin_cfg),
+   M_DEVBUF, M_WAITOK);
sc->sc_pin_ih = mallocarray(sc->sc_npins, sizeof(*sc->sc_pin_ih),
M_DEVBUF, M_WAITOK | M_ZERO);

@@ -159,6 +173,58 @@ amdgpio_attach(struct device *parent, st
 unmap:
free(sc->sc_pin_ih, M_DEVBUF, sc->sc_npins * sizeof(*sc->sc_pin_ih));
bus_space_unmap(sc->sc_memt, sc->sc_memh, aaa->aaa_size[0]);
+}
+
+int
+amdgpio_activate(struct device *self, int act)
+{
+   struct amdgpio_softc *sc = (struct amdgpio_softc *)self;
+
+   switch (act) {
+   case DVACT_SUSPEND:
+   amdgpio_save(sc);
+   break;
+   case DVACT_RESUME:
+   amdgpio_restore(sc);
+   break;
+   }
+
+   return 0;
+}
+
+void
+amdgpio_save_pin(struct amdgpio_softc *sc, int pin)
+{
+   sc->sc_pin_cfg[pin].pin_cfg = bus_space_read_4(sc->sc_memt, sc->sc_memh,
+   pin * 4);
+}
+
+void
+amdgpio_save(struct amdgpio_softc *sc)
+{
+   int pin;
+
+   for (pin = 0 ; pin < sc->sc_npins; pin++)
+

Re: rewrite amd64 cache printing

2022-06-25 Thread Mike Larkin
On Fri, Jun 24, 2022 at 07:19:47PM +1000, Jonathan Gray wrote:
> Rewrite amd64 printing of cache details.
> Previously we looked at cpuid 0x80000005 for L1/TLB details
> which Intel documents as reserved.
> And cpuid 0x80000006 for L2 details.
>
> Intel also encodes cache details in cpuid 4.
> AMD have mostly the same encoding with cpuid 0x8000001d
> 0x80000005/0x80000006 is used as a fallback in this diff
>
> The amount of cache visible to the thread is shown
> and not which groups of cpus share a particular cache.
> In the case of Alder Lake P, P cores have 1.25MB L2, each group of
> 4 E cores shares a 2MB L2.

See below.

-ml

>
> cpu0: AMD Ryzen 5 2600X Six-Core Processor, 3593.83 MHz, 17-08-02
> before:
> cpu0: 64KB 64b/line 4-way I-cache, 32KB 64b/line 8-way D-cache, 512KB 
> 64b/line 8-way L2 cache
> cpu0: ITLB 64 4KB entries fully associative, 64 4MB entries fully associative
> cpu0: DTLB 64 4KB entries fully associative, 64 4MB entries fully associative
> after:
> 0x8000001d
> cpu0: 32KB 64b/line 8-way D-cache, 64KB 64b/line 4-way I-cache, 512KB 
> 64b/line 8-way L2 cache, 8MB 64b/line 16-way L3 cache
> 0x80000005 / 0x80000006
> cpu0: 32KB 64b/line 8-way D-cache, 64KB 64b/line 4-way I-cache
> cpu0: 512KB 64b/line 8-way L2 cache
>
> cpu0: Intel(R) Core(TM) i7-5600U CPU @ 2.60GHz, 2494.54 MHz, 06-3d-04
> before:
> cpu0: 256KB 64b/line 8-way L2 cache
> after:
> 4
> cpu0: 32KB 64b/line 8-way D-cache, 32KB 64b/line 8-way I-cache, 256KB 
> 64b/line 8-way L2 cache, 4MB 64b/line 16-way L3 cache
> 0x80000005 / 0x80000006
> cpu1: 256KB 64b/line 8-way L2 cache
>
> cpu0: Intel(R) Core(TM)2 Duo CPU T7250 @ 2.00GHz, 798.17 MHz, 06-0f-0d
> before:
> cpu0: 2MB 64b/line 8-way L2 cache
> after:
> 4
> cpu0: 32KB 64b/line 8-way D-cache, 32KB 64b/line 8-way I-cache, 2MB 64b/line 
> 8-way L2 cache
> 0x80000005 / 0x80000006
> cpu0: 2MB 64b/line 8-way L2 cache
>
> cpu0: 12th Gen Intel(R) Core(TM) i7-1260P, 1995.55 MHz, 06-9a-03
> before:
> cpu0: 1MB 64b/line disabled L2 cache
> cpu8: 2MB 64b/line 16-way L2 cache
> after:
> 4
> cpu0: 48KB 64b/line 12-way D-cache, 32KB 64b/line 8-way I-cache, 1MB 64b/line 
> 10-way L2 cache, 18MB 64b/line 12-way L3 cache
> cpu8: 32KB 64b/line 8-way D-cache, 64KB 64b/line 8-way I-cache, 2MB 64b/line 
> 16-way L2 cache, 18MB 64b/line 12-way L3 cache
> 0x80000005 / 0x80000006
> cpu0: 1MB 64b/line  L2 cache
> cpu8: 2MB 64b/line 16-way L2 cache
>
> diff --git sys/arch/amd64/amd64/cacheinfo.c sys/arch/amd64/amd64/cacheinfo.c
> index 9a672186e9e..a80d1e4f553 100644
> --- sys/arch/amd64/amd64/cacheinfo.c
> +++ sys/arch/amd64/amd64/cacheinfo.c
> @@ -1,32 +1,19 @@
>  /*   $OpenBSD: cacheinfo.c,v 1.9 2020/12/22 03:42:03 jsg Exp $   */
>
> -/*-
> - * Copyright (c) 2000 The NetBSD Foundation, Inc.
> - * All rights reserved.
> +/*
> + * Copyright (c) 2022 Jonathan Gray 
>   *
> - * This code is derived from software contributed to The NetBSD Foundation
> - * by Jason R. Thorpe.
> + * Permission to use, copy, modify, and distribute this software for any
> + * purpose with or without fee is hereby granted, provided that the above
> + * copyright notice and this permission notice appear in all copies.
>   *
> - * Redistribution and use in source and binary forms, with or without
> - * modification, are permitted provided that the following conditions
> - * are met:
> - * 1. Redistributions of source code must retain the above copyright
> - *notice, this list of conditions and the following disclaimer.
> - * 2. Redistributions in binary form must reproduce the above copyright
> - *notice, this list of conditions and the following disclaimer in the
> - *documentation and/or other materials provided with the distribution.
> - *
> - * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
> - * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
> LIMITED
> - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
> - * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
> - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
> - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
> - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
> - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
> - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
> - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
> - * POSSIBILITY OF SUCH DAMAGE.
> + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
> + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
> + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
> + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
> + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
> + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
> + 

Re: Lenovo ThinkCentre M910q fails to suspend

2022-06-17 Thread Mike Larkin
On Fri, Jun 17, 2022 at 08:32:29PM +0100, Edd Barrett wrote:
> Hi Mike,
>
> On Fri, Jun 17, 2022 at 11:55:51AM -0700, Mike Larkin wrote:
> > >  - disabling xhci in ukc: the system fails to boot multi-user. The first
> > >oddness comes where cpus #1-3 fail to "become ready" (as reported by 
> > > dmesg).
> > >It spends a while thinking about these cores not coming up, before
> > >eventually proceeding, but eventually hard resetting. I guess the 
> > > system
> > >really needs xhci to function then...
> >
> > That really makes no sense. can you try the same experiment again using 
> > GENERIC?
>
> Doing `boot bsd.sp -c` and then `disable xhci` means that the system can at
> least boot with no xhci, but sadly it still won't stay in the suspended state.
>
> That might rule out xhci as a source of the issue, maybe.
>
> --
> Best Regards
> Edd Barrett
>
> https://www.theunixzoo.co.uk

Sorry, out of ideas.



Re: Lenovo ThinkCentre M910q fails to suspend

2022-06-17 Thread Mike Larkin
On Fri, Jun 17, 2022 at 06:41:23PM +0100, Edd Barrett wrote:
> Hi Mike,
>
> On Fri, Jun 17, 2022 at 10:20:35AM -0700, Mike Larkin wrote:
> > Oh, didn't read this closely enough the first time. If ZZZ doesn't 
> > instantly wake
> > the machine, then it's one of the two S3 devices described in your next 
> > email.
> > Since one is XHCI, I'd disable xhci in ukc and see if that helps. Or maybe 
> > the
> > other *hci(4)s also.
>
> Alright, so here's some more info.
>
>  - disabling xhci in ukc: the system fails to boot multi-user. The first
>oddness comes where cpus #1-3 fail to "become ready" (as reported by 
> dmesg).
>It spends a while thinking about these cores not coming up, before
>eventually proceeding, but eventually hard resetting. I guess the system
>really needs xhci to function then...

That really makes no sense. Can you try the same experiment again using GENERIC?

>
>  - disabling ehci or ahci: no change to sleep behaviour.
>
>  - disabling usb: no change to sleep behaviour.
>
> I don't see any [XEA]HCI options in the BIOS that I could tweak.
>
> Unless you have any other ideas, I'll try disabling random devices in the hope
> that I can narrow it down... I've already tried the network card, it aint 
> that.
>
> Thanks.
>
> --
> Best Regards
> Edd Barrett
>
> https://www.theunixzoo.co.uk
>



Re: Lenovo ThinkCentre M910q fails to suspend

2022-06-17 Thread Mike Larkin
On Fri, Jun 17, 2022 at 09:14:45AM +0100, Edd Barrett wrote:
> Hi Mike,
>
> On Thu, Jun 16, 2022 at 09:19:50PM -0700, Mike Larkin wrote:
> > From your original dmesg:
> >
> > > acpi0: wakeup devices PEG0(S4) PEGP(S4) PEG1(S4) PEGP(S4) PEG2(S4) 
> > > PEGP(S4) SIO1(S3) RP09(S4) PXSX(S4) RP10(S4) PXSX(S4) RP11(S4) PXSX(S4) 
> > > RP12(S4) PXSX(S4) RP13(S4) [...]
> >
> > Notice the [...] at the end, this is printed after 16 devices. What I'd 
> > suggest
> > is this:
> >
> > 1. remove the code that truncates this list after 16, and note down all the 
> > wake
> > devices.
> >
> > 2. If there are any in S3, try using ZZZ instead of zzz. If the machine 
> > does not instantly
> > wake, it's possible it's because of one of those S3 devices doing the wake 
> > (since ZZZ
> > uses S4).
>
> I'll try removing the truncation then. Bear with me.
>
> In the meantime, notice that the truncated list does include one S3 item
> `SIO1(S3)`. I don't know if that's what we are looking for?
>
> FWIW, I have already tried `ZZZ` on this machine and it does succeed to
> hibernate, but upon wake up, it hangs when decompressing the memory image. I
> left it decompressing a ~50MB image for more than an hour and concluded it had
> got stuck.

Oh, didn't read this closely enough the first time. If ZZZ doesn't instantly 
wake
the machine, then it's one of the two S3 devices described in your next email.
Since one is XHCI, I'd disable xhci in ukc and see if that helps. Or maybe the
other *hci(4)s also.

Now, why ZZZ fails to unpack is some other problem but the instant wake is not
related to that.

-ml

>
> > 3. If everything is S4, well, you're going to have to trace down those 
> > short names
> > like PEGP, PXSX, etc, and disable one at a time until you find the one that 
> > is
> > doing the wake. And it's possible it's none of these and is a fixed function
> > button or something.
>
> One additional piece of info, which may be worthless. I tried a Debian live 
> USB
> stick, to see if Linux was able to sleep this box. It was able to.
>
> I don't know if that rules out the idea of a fixed-function button?
>
> --
> Best Regards
> Edd Barrett
>
> https://www.theunixzoo.co.uk



Re: Lenovo ThinkCentre M910q fails to suspend

2022-06-17 Thread Mike Larkin
On Fri, Jun 17, 2022 at 09:14:45AM +0100, Edd Barrett wrote:
> Hi Mike,
>
> On Thu, Jun 16, 2022 at 09:19:50PM -0700, Mike Larkin wrote:
> > From your original dmesg:
> >
> > > acpi0: wakeup devices PEG0(S4) PEGP(S4) PEG1(S4) PEGP(S4) PEG2(S4) 
> > > PEGP(S4) SIO1(S3) RP09(S4) PXSX(S4) RP10(S4) PXSX(S4) RP11(S4) PXSX(S4) 
> > > RP12(S4) PXSX(S4) RP13(S4) [...]
> >
> > Notice the [...] at the end, this is printed after 16 devices. What I'd 
> > suggest
> > is this:
> >
> > 1. remove the code that truncates this list after 16, and note down all the 
> > wake
> > devices.
> >
> > 2. If there are any in S3, try using ZZZ instead of zzz. If the machine 
> > does not instantly
> > wake, it's possible it's because of one of those S3 devices doing the wake 
> > (since ZZZ
> > uses S4).
>
> I'll try removing the truncation then. Bear with me.
>
> In the meantime, notice that the truncated list does include one S3 item
> `SIO1(S3)`. I don't know if that's what we are looking for?
>
> FWIW, I have already tried `ZZZ` on this machine and it does succeed to
> hibernate, but upon wake up, it hangs when decompressing the memory image. I
> left it decompressing a ~50MB image for more than an hour and concluded it had
> got stuck.
>
> > 3. If everything is S4, well, you're going to have to trace down those 
> > short names
> > like PEGP, PXSX, etc, and disable one at a time until you find the one that 
> > is
> > doing the wake. And it's possible it's none of these and is a fixed function
> > button or something.
>
> One additional piece of info, which may be worthless. I tried a Debian live 
> USB
> stick, to see if Linux was able to sleep this box. It was able to.
>
> I don't know if that rules out the idea of a fixed-function button?
>
> --
> Best Regards
> Edd Barrett
>
> https://www.theunixzoo.co.uk

You're going to have to play trial and error then disabling devices until
you find the one that hangs. Without the hardware in front of me, that's the
best advice I can offer. Sorry.

-ml




Re: Lenovo ThinkCentre M910q fails to suspend

2022-06-16 Thread Mike Larkin
On Thu, Jun 16, 2022 at 08:48:36PM +0100, Edd Barrett wrote:
> On Thu, Jun 16, 2022 at 10:22:16AM -0700, Mike Larkin wrote:
> > did it ever work in the past?
>
> I've only just received the machine, so it's difficult to say.
>
> I've spent the last hour changing various BIOS settings to see if anything
> changes, but alas no. I don't see any sleep-related power options, and any
> fancy power stuff I don't need or recognise, I've disabled. No joy.
>
> I've even updated the BIOS software to no avail. Hrm...
>
> --
> Best Regards
> Edd Barrett
>
> https://www.theunixzoo.co.uk

From your original dmesg:

> acpi0: wakeup devices PEG0(S4) PEGP(S4) PEG1(S4) PEGP(S4) PEG2(S4) PEGP(S4) 
> SIO1(S3) RP09(S4) PXSX(S4) RP10(S4) PXSX(S4) RP11(S4) PXSX(S4) RP12(S4) 
> PXSX(S4) RP13(S4) [...]

Notice the [...] at the end, this is printed after 16 devices. What I'd suggest
is this:

1. remove the code that truncates this list after 16, and note down all the wake
devices.

2. If there are any in S3, try using ZZZ instead of zzz. If the machine does 
not instantly
wake, it's possible it's because of one of those S3 devices doing the wake 
(since ZZZ
uses S4).

3. If everything is S4, well, you're going to have to trace down those short 
names
like PEGP, PXSX, etc, and disable one at a time until you find the one that is
doing the wake. And it's possible it's none of these and is a fixed function
button or something.

good luck

-ml



Re: Lenovo ThinkCentre M910q fails to suspend

2022-06-16 Thread Mike Larkin
On Thu, Jun 16, 2022 at 05:14:53PM +0100, Edd Barrett wrote:
> Hi,
>
> Has anyone managed to get a Lenovo ThinkCentre M910q (or similar) to suspend
> with OpenBSD?
>
> When invoking `zzz` the system prepares to go down, the screen goes blank, but
> then a short while later, the system comes back, as though it was awoken
> straight away.

did it ever work in the past?

>
> Here's the diff between the initial dmesg and the dmesg after this "suspend 
> and
> come back" described above:
>
> ```
> --- dmesg Thu Jun 16 16:53:44 2022
> +++ dmesg.1   Thu Jun 16 16:55:31 2022
> @@ -360,3 +360,22 @@
>  inteldrm0: 1920x1080, 32bpp
>  wsdisplay0 at inteldrm0 mux 1: console (std, vt100 emulation), using wskbd0
>  wsdisplay0: screen 1-5 added (std, vt100 emulation)
> +uhub0 detached
> +uhub0 at usb0 configuration 1 interface 0 "Intel xHCI root hub" rev 
> 3.00/1.00 addr 1
> +drm:pid42656:intel_ddi_sanitize_encoder_pll_mapping *NOTICE* [drm] 
> [ENCODER:94:DDI J/PHY @] is disabled/in DSI mode with an ungated DDI clock, 
> gate it
> +drm:pid42656:intel_ddi_sanitize_encoder_pll_mapping *NOTICE* [drm] 
> [ENCODER:109:DDI J/PHY @] is disabled/in DSI mode with an ungated DDI clock, 
> gate it
> +drm:pid42656:intel_ddi_sanitize_encoder_pll_mapping *NOTICE* [drm] 
> [ENCODER:119:DDI J/PHY @] is disabled/in DSI mode with an ungated DDI clock, 
> gate it
> +uhidev0 at uhub0 port 11 configuration 1 interface 0 "SINO WEALTH Gaming KB" 
> rev 2.00/1.03 addr 2
> +uhidev0: iclass 3/1
> +ukbd0 at uhidev0: 8 variable keys, 6 key codes
> +wskbd1 at ukbd0 mux 1
> +wskbd1: connecting to wsdisplay0
> +uhidev1 at uhub0 port 11 configuration 1 interface 1 "SINO WEALTH Gaming KB" 
> rev 2.00/1.03 addr 2
> +uhidev1: iclass 3/0, 5 report ids
> +ukbd1 at uhidev1 reportid 1: 120 variable keys, 0 key codes
> +wskbd2 at ukbd1 mux 1
> +wskbd2: connecting to wsdisplay0
> +ucc0 at uhidev1 reportid 2: 573 usages, 18 keys, array
> +wskbd3 at ucc0 mux 1
> +wskbd3: connecting to wsdisplay0
> +uhid0 at uhidev1 reportid 5: input=0, output=0, feature=5
> ```
>
> Is it odd that devices come back which we never saw detach?
>
> Repeating the same again, but with the inteldrm driver disabled, in case those
> scary messages have something to do with this:
>
> ```
> --- dmesg Thu Jun 16 17:04:43 2022
> +++ dmesg.1   Thu Jun 16 17:05:01 2022
> @@ -554,3 +554,19 @@
>  softraid0 at root
>  scsibus4 at softraid0: 256 targets
>  root on sd0a (5d59e5562a788986.a) swap on sd0b dump on sd0b
> +wskbd1: disconnecting from wsdisplay0
> +wskbd1 detached
> +ukbd0 detached
> +uhidev0 detached
> +wskbd2: disconnecting from wsdisplay0
> +wskbd2 detached
> +ukbd1 detached
> +wskbd3: disconnecting from wsdisplay0
> +wskbd3 detached
> +ucc0 detached
> +uhid0 detached
> +uhidev1 detached
> +uhub0 detached
> +uhub0 at usb0 configuration 1 interface 0 "Intel xHCI root hub" rev 
> 3.00/1.00 addr 1
> +uhub0: port 11, set config 0 at addr 2 failed
> +uhub0: device problem, disabling port 11
> ```
>
> This time we see more devices detach, but when uhub0 comes back there is a
> problem. curious...
>
> Does anyone have any idea what is going on here? This was due to be my new
> porting box, but I need it to suspend...
>
> (FWIW: the system also fails to come up from ZZZ hibernate, but one thing at a
> time)
>
> Full dmesg (with inteldrm):
>
> ```
> OpenBSD 7.1-current (GENERIC.MP) #582: Mon Jun 13 15:37:01 MDT 2022
> dera...@amd64.openbsd.org:/usr/src/sys/arch/amd64/compile/GENERIC.MP
> real mem = 17044692992 (16255MB)
> avail mem = 16510771200 (15745MB)
> random: good seed from bootblocks
> mpath0 at root
> scsibus0 at mpath0: 256 targets
> mainbus0 at root
> bios0 at mainbus0: SMBIOS rev. 3.0 @ 0xdcd91000 (88 entries)
> bios0: vendor LENOVO version "M1AKT39A" date 07/16/2018
> bios0: LENOVO 10MUS2UG00
> acpi0 at bios0: ACPI 6.1
> acpi0: sleep states S0 S3 S4 S5
> acpi0: tables DSDT FACP APIC FPDT MCFG SSDT FIDT SLIC MSDM SSDT SSDT HPET 
> SSDT UEFI SSDT LPIT WSMT SSDT SSDT DBGP DBG2 DMAR TPM2 LUFT ASF! BGRT
> acpi0: wakeup devices PEG0(S4) PEGP(S4) PEG1(S4) PEGP(S4) PEG2(S4) PEGP(S4) 
> SIO1(S3) RP09(S4) PXSX(S4) RP10(S4) PXSX(S4) RP11(S4) PXSX(S4) RP12(S4) 
> PXSX(S4) RP13(S4) [...]
> acpitimer0 at acpi0: 3579545 Hz, 24 bits
> acpimadt0 at acpi0 addr 0xfee00000: PC-AT compat
> cpu0 at mainbus0: apid 0 (boot processor)
> cpu0: Intel(R) Core(TM) i5-6500T CPU @ 2.50GHz, 2394.42 MHz, 06-5e-03
> cpu0: 
> FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,CFLUSH,DS,ACPI,MMX,FXSR,SSE,SSE2,SS,HTT,TM,PBE,SSE3,PCLMUL,DTES64,MWAIT,DS-CPL,VMX,SMX,EST,TM2,SSSE3,SDBG,FMA3,CX16,xTPR,PDCM,PCID,SSE4.1,SSE4.2,x2APIC,MOVBE,POPCNT,DEADLINE,AES,XSAVE,AVX,F16C,RDRAND,NXE,PAGE1GB,RDTSCP,LONG,LAHF,ABM,3DNOWP,PERF,ITSC,FSGSBASE,TSC_ADJUST,SGX,BMI1,HLE,AVX2,SMEP,BMI2,ERMS,INVPCID,RTM,MPX,RDSEED,ADX,SMAP,CLFLUSHOPT,PT,SRBDS_CTRL,MD_CLEAR,TSXFA,IBRS,IBPB,STIBP,L1DF,SSBD,SENSOR,ARAT,XSAVEOPT,XSAVEC,XGETBV1,XSAVES,MELTDOWN
> cpu0: 256KB 64b/line 8-way L2 cache
> cpu0: smt 0, core 0, 

Re: Fix rebooting Linux guests in vmd(8)

2022-06-05 Thread Mike Larkin
On Sun, Jun 05, 2022 at 09:25:34AM -0400, Dave Voutila wrote:
> tech@ friends:
>
> tl;dr: testers wanted for fixing Linux guest reboot. If you've got
> Linux guests that no longer reboot properly, please test! For other
> vmd users, please check for any regressions.
>
> Our port of SeaBIOS is configured to enable QEMU features to simplify
> its working with vmd(8). This generally works well.
>
> SeaBIOS provides a reboot routine specifically for QEMU environments.
> One of the reasons is to provide some extra logic for refreshing the
> copy of the BIOS in memory (as if reading from ROM) before attempting
> the reset (first via PCI and falling back to triple-faulting). The way
> SeaBIOS does this appears to be it assumes there's a "pristine copy"
> of the BIOS loaded by the host's emulator to just below the 4GB mark
> in physical memory. (See src/fw/shadow.c in the SeaBIOS source tree.)
>
> This hasn't been a problem until recent Linux kernel changes started
> calling into the BIOS as a way to reboot the guest. (I know at least
> the 5.15 kernel shipped with Alpine does this.)
>
> Since vmd/vmm doesn't create a mapping for that area just below 4GB,
> guests experience a page fault vm-exit and a resulting failure as we
> consider that address part of the MMIO hole and reserved.
>
> This change to vmd(8) loads a second copy of the BIOS, ending at the
> 4GB mark in guest memory. Consequently, vmm(4)'s MMIO memory hole is
> adjusted to end 2MB below 4GB to accommodate SeaBIOS and future
> firmware payloads that may be > 1MB in size. (I believe EDK-II UEFI is
> larger than 1MB...haven't looked in awhile.)
>
> Along the way, I adjusted the use of hardcoded values for 1 MB and 4
> GB to use a more human readable version via #defines.
>
> For testers:
>   0. apply patch
>   1. build, install updated kernel, boot new kernel
>   2. copy or symlink sys/arch/amd64/include/vmmvar.h to
>  /usr/include/amd64/
>   3. build and install vmd(8)
>   4. test!
>
> ~dv
>

Does qemu load 2 copies of the bios or just rely on A20 tricks to make the
bios appear at two addresses?

-ml

>
> diff refs/heads/master refs/heads/vmd-bios4g
> blob - fea4ab52e6db7eff12b913ecde30abf970da0b54
> blob + f06212b18f8ae19b5edc8fa8d64684d7163e35a8
> --- sys/arch/amd64/include/vmmvar.h
> +++ sys/arch/amd64/include/vmmvar.h
> @@ -35,7 +35,7 @@
>  #define VMM_MAX_NICS_PER_VM  4
>
>  #define VMM_PCI_MMIO_BAR_BASE    0xF0000000ULL
> -#define VMM_PCI_MMIO_BAR_END     0xFFFFFFFFULL
> +#define VMM_PCI_MMIO_BAR_END     0xFFDFFFFFULL   /* 2 MiB below 4 GiB */
>  #define VMM_PCI_MMIO_BAR_SIZE    0x00010000
>  #define VMM_PCI_IO_BAR_BASE      0x1000
>  #define VMM_PCI_IO_BAR_END       0xFFFF
> blob - d952ba4d8d0bff700fc09c066ffc284909150417
> blob + c36e17eb5ed4d1799f55fa1af5f7ca158923202e
> --- usr.sbin/vmd/vm.c
> +++ usr.sbin/vmd/vm.c
> @@ -65,6 +65,10 @@
>  #include "vmd.h"
>  #include "vmm.h"
>
> +#define _1_MB    (1UL * 1024 * 1024)
> +#define _2_MB    (2UL * 1024 * 1024)
> +#define _4_GB    (4UL * 1024 * 1024 * 1024)
> +
>  io_fn_t ioports_map[MAX_PORTS];
>
>  int run_vm(int, int[][VM_MAX_BASE_PER_DISK], int *,
> @@ -234,7 +238,7 @@ loadfile_bios(gzFile fp, off_t size, struct vcpu_reg_s
>   return (-1);
>
>   /* The BIOS image must end at 1MB */
> - if ((off = 1048576 - size) < 0)
> + if ((off = _1_MB - size) < 0)
>   return (-1);
>
>   /* Read BIOS image into memory */
> @@ -243,6 +247,16 @@ loadfile_bios(gzFile fp, off_t size, struct vcpu_reg_s
>   return (-1);
>   }
>
> + if (gzseek(fp, 0, SEEK_SET) == -1)
> + return (-1);
> +
> + /* Read a second BIOS copy into memory ending at 4GB */
> + off = _4_GB - size;
> + if (mread(fp, off, size) != (size_t)size) {
> + errno = EIO;
> + return (-1);
> + }
> +
>   log_debug("%s: loaded BIOS image", __func__);
>
>   return (0);
> @@ -872,6 +886,7 @@ void
>  create_memory_map(struct vm_create_params *vcp)
>  {
>   size_t len, mem_bytes;
> + size_t above_1m = 0, above_4g = 0;
>
>   mem_bytes = vcp->vcp_memranges[0].vmr_size;
>   vcp->vcp_nmemranges = 0;
> @@ -893,29 +908,47 @@ create_memory_map(struct vm_create_params *vcp)
>* we need to make sure that vmm(4) permits accesses
>* to it. So allocate guest memory for it.
>*/
> - len = 0x100000 - LOWMEM_KB * 1024;
> + len = _1_MB - (LOWMEM_KB * 1024);
>   vcp->vcp_memranges[1].vmr_gpa = LOWMEM_KB * 1024;
>   vcp->vcp_memranges[1].vmr_size = len;
>   mem_bytes -= len;
>
> - /* Make sure that we do not place physical memory into MMIO ranges. */
> - if (mem_bytes > VMM_PCI_MMIO_BAR_BASE - 0x100000)
> - len = VMM_PCI_MMIO_BAR_BASE - 0x100000;
> - else
> - len = mem_bytes;
> -
> - /* Third memory region: 1MB - (1MB + len) */
> - vcp->vcp_memranges[2].vmr_gpa = 0x100000;
> - vcp->vcp_memranges[2].vmr_size = len;
> - mem_bytes -= len;

Re: vmm: remove vm teardown from vcpu run path (testers needed)

2022-06-05 Thread Mike Larkin
On Thu, Jun 02, 2022 at 03:05:16PM -0400, Dave Voutila wrote:
>
> Dave Voutila  writes:
>
> > tech@ et al.:
> >
> > Looking for testers of the following diff for vmm(4). In my efforts to
> > fix some stability issues, I'm taking baby steps tweaking parts of the
> > code to make my upcoming proposal (adding refcnts) easier to swallow.
> >
> > This change removes the calling of vm_teardown from the code path in
> > vm_run after vmm has exited the vm/vcpu and is on its way back to
> > userland/vmd(8).
> >
> > vm_teardown is currently called in 3 areas to destroy/free a vm:
> >
> >   - vm_create: as cleanup in an error path
> >   - vm_terminate: on a vm the ioctl is killing
> >   - vm_run: the run ioctl handler
> >
> > This diff removes that last bullet. It's not needed as vmd will cleanup
> > the vm on child exit, calling vm_terminate. Any non-vmd user of vmm(4)
> > can stop being lazy and use the VMM_IOC_TERM ioctl.
> >
> > Not included in the snippet is the existing final else block that still
> > toggles the vcpu state:
> >
> > } else {
> > vrp->vrp_exit_reason = VM_EXIT_TERMINATED;
> > vcpu->vc_state = VCPU_STATE_TERMINATED;
> > }
> >
> > If testing, please describe *any* difference in shutdown/reboot of vm
> > guests. (n.b. there's a known issue for Linux guests running very recent
> > Linux kernels not being able to reboot. That needs to be addressed in
> > vmd.)
> >
>
> Bumping as the diff has been out for testing and looking for ok's.
>
> -dv
>

ok mlarkin if this helps your subsequent cleanup

> >
> >
> > Index: sys/arch/amd64/amd64/vmm.c
> > ===
> > RCS file: /opt/cvs/src/sys/arch/amd64/amd64/vmm.c,v
> > retrieving revision 1.311
> > diff -u -p -r1.311 vmm.c
> > --- sys/arch/amd64/amd64/vmm.c  20 May 2022 22:42:09 -  1.311
> > +++ sys/arch/amd64/amd64/vmm.c  23 May 2022 11:57:49 -
> > @@ -4495,22 +4495,8 @@ vm_run(struct vm_run_params *vrp)
> > ret = vcpu_run_svm(vcpu, vrp);
> > }
> >
> > -   /*
> > -* We can set the VCPU states here without CAS because once
> > -* a VCPU is in state RUNNING or REQTERM, only the VCPU itself
> > -* can switch the state.
> > -*/
> > atomic_dec_int(&vm->vm_vcpus_running);
> > -   if (vcpu->vc_state == VCPU_STATE_REQTERM) {
> > -   vrp->vrp_exit_reason = VM_EXIT_TERMINATED;
> > -   vcpu->vc_state = VCPU_STATE_TERMINATED;
> > -   if (vm->vm_vcpus_running == 0) {
> > -   rw_enter_write(_softc->vm_lock);
> > -   vm_teardown(vm);
> > -   rw_exit_write(_softc->vm_lock);
> > -   }
> > -   ret = 0;
> > -   } else if (ret == 0 || ret == EAGAIN) {
> > +   if (ret == 0 || ret == EAGAIN) {
> > /* If we are exiting, populate exit data so vmd can help. */
> > vrp->vrp_exit_reason = (ret == 0) ? VM_EXIT_NONE
> > : vcpu->vc_gueststate.vg_exit_reason;
>
>
> --
> -Dave Voutila



Re: move vmm(4) spinout paranoia behind MP_LOCKDEBUG

2022-05-20 Thread Mike Larkin
On Sat, Apr 16, 2022 at 12:09:46PM -0400, Dave Voutila wrote:
> This tucks all the spinout paranoia behind `#ifdef MP_LOCKDEBUG` and
> uses the same approach used in amd64's pmap's TLB shootdown code.
>
> Part of me wants to remove this altogether, but I'm not sure it's
> outlived its usefulness quite yet.
>
> Three areas that busy wait on ipi's are modified:
>
> 1. vmm_start - performs ipi to enable vmm on all cpus
> 2. vmm_stop - performs ipi to disable vmm on all cpus
> 3. vmx_remote_vmclear - performs ipi to vmclear a cpu (only pertinent to
>Intel hosts)
>
> (3) is the most likely to spin out and prior to bumping the spinout to
> the current value (based on __mp_lock_spinout) we had reports from users
> of hitting it on slower/older MP hardware.
>
> For vmm_{start, stop}, I moved the current cpu start/stop routine to
> before performing the ipi broadcast because if we're going to fail to
> (dis)enable vmm we should fail fast. If we fail, there's no need to
> broadcast the ipi. This simplifies the code paths and removes a local
> variable.
>
> All three migrate to infinite busy waits and only have a spinout if
> built with MP_LOCKDEBUG. On a spinout, we enter ddb.
>
> Compiled on amd64 GENERIC, GENERIC.MP, and GENERIC.MP with
> MP_LOCKDEBUG. (This time I won't break GENERIC :)
>
> OK?
>
> -dv

Sorry for the delay. ok mlarkin. I've had this on a few machines for
the better part of a month and haven't seen any problems.

-ml

>
> Index: sys/arch/amd64/amd64/vmm.c
> ===
> RCS file: /opt/cvs/src/sys/arch/amd64/amd64/vmm.c,v
> retrieving revision 1.305
> diff -u -p -r1.305 vmm.c
> --- sys/arch/amd64/amd64/vmm.c28 Mar 2022 06:28:47 -  1.305
> +++ sys/arch/amd64/amd64/vmm.c16 Apr 2022 18:49:01 -
> @@ -43,6 +43,11 @@
>  #include 
>  #include 
>
> +#ifdef MP_LOCKDEBUG
> +#include 
> +extern int __mp_lock_spinout;
> +#endif /* MP_LOCKDEBUG */
> +
>  /* #define VMM_DEBUG */
>
>  void *l1tf_flush_region;
> @@ -1328,17 +1333,26 @@ int
>  vmm_start(void)
>  {
>   struct cpu_info *self = curcpu();
> - int ret = 0;
>  #ifdef MULTIPROCESSOR
>   struct cpu_info *ci;
>   CPU_INFO_ITERATOR cii;
> - int i;
> -#endif
> +#ifdef MP_LOCKDEBUG
> + int nticks;
> +#endif /* MP_LOCKDEBUG */
> +#endif /* MULTIPROCESSOR */
>
>   /* VMM is already running */
>   if (self->ci_flags & CPUF_VMM)
>   return (0);
>
> + /* Start VMM on this CPU */
> + start_vmm_on_cpu(self);
> + if (!(self->ci_flags & CPUF_VMM)) {
> + printf("%s: failed to enter VMM mode\n",
> + self->ci_dev->dv_xname);
> + return (EIO);
> + }
> +
>  #ifdef MULTIPROCESSOR
>   /* Broadcast start VMM IPI */
>   x86_broadcast_ipi(X86_IPI_START_VMM);
> @@ -1346,25 +1360,23 @@ vmm_start(void)
>   CPU_INFO_FOREACH(cii, ci) {
>   if (ci == self)
>   continue;
> - for (i = 10; (!(ci->ci_flags & CPUF_VMM)) && i>0;i--)
> - delay(10);
> - if (!(ci->ci_flags & CPUF_VMM)) {
> - printf("%s: failed to enter VMM mode\n",
> - ci->ci_dev->dv_xname);
> - ret = EIO;
> +#ifdef MP_LOCKDEBUG
> + nticks = __mp_lock_spinout;
> +#endif /* MP_LOCKDEBUG */
> + while (!(ci->ci_flags & CPUF_VMM)) {
> + CPU_BUSY_CYCLE();
> +#ifdef MP_LOCKDEBUG
> + if (--nticks <= 0) {
> + db_printf("%s: spun out", __func__);
> + db_enter();
> + nticks = __mp_lock_spinout;
> + }
> +#endif /* MP_LOCKDEBUG */
>   }
>   }
>  #endif /* MULTIPROCESSOR */
>
> - /* Start VMM on this CPU */
> - start_vmm_on_cpu(self);
> - if (!(self->ci_flags & CPUF_VMM)) {
> - printf("%s: failed to enter VMM mode\n",
> - self->ci_dev->dv_xname);
> - ret = EIO;
> - }
> -
> - return (ret);
> + return (0);
>  }
>
>  /*
> @@ -1376,17 +1388,26 @@ int
>  vmm_stop(void)
>  {
>   struct cpu_info *self = curcpu();
> - int ret = 0;
>  #ifdef MULTIPROCESSOR
>   struct cpu_info *ci;
>   CPU_INFO_ITERATOR cii;
> - int i;
> -#endif
> +#ifdef MP_LOCKDEBUG
> + int nticks;
> +#endif /* MP_LOCKDEBUG */
> +#endif /* MULTIPROCESSOR */
>
>   /* VMM is not running */
>   if (!(self->ci_flags & CPUF_VMM))
>   return (0);
>
> + /* Stop VMM on this CPU */
> + stop_vmm_on_cpu(self);
> + if (self->ci_flags & CPUF_VMM) {
> + printf("%s: failed to exit VMM mode\n",
> + self->ci_dev->dv_xname);
> + return (EIO);
> + }
> +
>  #ifdef MULTIPROCESSOR
>   /* Stop VMM on other CPUs */
>   x86_broadcast_ipi(X86_IPI_STOP_VMM);
> @@ -1394,25 +1415,23 @@ vmm_stop(void)
>

Re: vmm: load vmcs before reading vcpu registers

2022-05-20 Thread Mike Larkin
On Wed, May 18, 2022 at 10:27:11AM -0400, Dave Voutila wrote:
>
> ping...would like to get this in if possible so I can move onto fixing
> some things in vmm.
>

sorry. ok mlarkin

> Dave Voutila  writes:
>
> > tech@,
> >
> > Continuing my vmm/vmd bug hunt, the following diff adapts
> > vcpu_readregs_vmx to optionally load the vmcs on the current cpu. This
> > has gone unnoticed as the ioctl isn't used in typical vmd usage and the
> > usage of vcpu_readregs_vmx in the run ioctl is after the vmcs is already
> > loaded on the current cpu.
> >
> > This fixes `vmctl send` on Intel hosts. (A fix for `vmctl receive` comes
> > next.)
> >
> > Currently, `vmctl send` tries to serialize the vcpu registers as part of
> > serializing the vm state. On an MP machine, it's highly probable that
> > the vmread instructions will fail as they'll be executed on a cpu that
> > doesn't have the vmcs loaded.
> >
> > While here, I noticed the vcpu_writeregs_vmx function doesn't set the
> > vcpu's vmcs state variable to VMCS_CLEARED after running vmclear. This
> > can cause failure to vm-enter as vmm uses that state to determine which
> > of the two Intel instructions to call (vmlaunch or vmresume).
> >
> > ok?
> >
> > -dv
> >
> > Index: sys/arch/amd64/amd64/vmm.c
> > ===
> > RCS file: /opt/cvs/src/sys/arch/amd64/amd64/vmm.c,v
> > retrieving revision 1.308
> > diff -u -p -r1.308 vmm.c
> > --- sys/arch/amd64/amd64/vmm.c  4 May 2022 02:24:26 -   1.308
> > +++ sys/arch/amd64/amd64/vmm.c  8 May 2022 18:37:42 -
> > @@ -140,7 +140,7 @@ int vm_rwregs(struct vm_rwregs_params *,
> >  int vm_mprotect_ept(struct vm_mprotect_ept_params *);
> >  int vm_rwvmparams(struct vm_rwvmparams_params *, int);
> >  int vm_find(uint32_t, struct vm **);
> > -int vcpu_readregs_vmx(struct vcpu *, uint64_t, struct vcpu_reg_state *);
> > +int vcpu_readregs_vmx(struct vcpu *, uint64_t, int, struct vcpu_reg_state 
> > *);
> >  int vcpu_readregs_svm(struct vcpu *, uint64_t, struct vcpu_reg_state *);
> >  int vcpu_writeregs_vmx(struct vcpu *, uint64_t, int, struct vcpu_reg_state 
> > *);
> >  int vcpu_writeregs_svm(struct vcpu *, uint64_t, struct vcpu_reg_state *);
> > @@ -978,7 +978,7 @@ vm_rwregs(struct vm_rwregs_params *vrwp,
> > if (vmm_softc->mode == VMM_MODE_VMX ||
> > vmm_softc->mode == VMM_MODE_EPT)
> > ret = (dir == 0) ?
> > -   vcpu_readregs_vmx(vcpu, vrwp->vrwp_mask, vrs) :
> > +   vcpu_readregs_vmx(vcpu, vrwp->vrwp_mask, 1, vrs) :
> > vcpu_writeregs_vmx(vcpu, vrwp->vrwp_mask, 1, vrs);
> > else if (vmm_softc->mode == VMM_MODE_SVM ||
> > vmm_softc->mode == VMM_MODE_RVI)
> > @@ -1986,6 +1986,7 @@ vcpu_reload_vmcs_vmx(struct vcpu *vcpu)
> >   * Parameters:
> >   *  vcpu: the vcpu to read register values from
> >   *  regmask: the types of registers to read
> > + *  loadvmcs: bit to indicate whether the VMCS has to be loaded first
> >   *  vrs: output parameter where register values are stored
> >   *
> >   * Return values:
> > @@ -1993,7 +1994,7 @@ vcpu_reload_vmcs_vmx(struct vcpu *vcpu)
> >   *  EINVAL: an error reading registers occurred
> >   */
> >  int
> > -vcpu_readregs_vmx(struct vcpu *vcpu, uint64_t regmask,
> > +vcpu_readregs_vmx(struct vcpu *vcpu, uint64_t regmask, int loadvmcs,
> >  struct vcpu_reg_state *vrs)
> >  {
> > int i, ret = 0;
> > @@ -2005,6 +2006,11 @@ vcpu_readregs_vmx(struct vcpu *vcpu, uin
> > struct vcpu_segment_info *sregs = vrs->vrs_sregs;
> > struct vmx_msr_store *msr_store;
> >
> > +   if (loadvmcs) {
> > +   if (vcpu_reload_vmcs_vmx(vcpu))
> > +   return (EINVAL);
> > +   }
> > +
> >  #ifdef VMM_DEBUG
> > /* VMCS should be loaded... */
> > paddr_t pa = 0ULL;
> > @@ -2393,6 +2399,7 @@ out:
> > if (loadvmcs) {
> > if (vmclear(&vcpu->vc_control_pa))
> > ret = EINVAL;
> > +   atomic_swap_uint(&vcpu->vc_vmx_vmcs_state, VMCS_CLEARED);
> > }
> > return (ret);
> >  }
> > @@ -4631,7 +4638,7 @@ vmm_translate_gva(struct vcpu *vcpu, uin
> >
> > if (vmm_softc->mode == VMM_MODE_EPT ||
> > vmm_softc->mode == VMM_MODE_VMX) {
> > -   if (vcpu_readregs_vmx(vcpu, VM_RWREGS_ALL, &vrs))
> > +   if (vcpu_readregs_vmx(vcpu, VM_RWREGS_ALL, 1, &vrs))
> > return (EINVAL);
> > } else if (vmm_softc->mode == VMM_MODE_RVI ||
> > vmm_softc->mode == VMM_MODE_SVM) {
> > @@ -5111,7 +5118,7 @@ vcpu_run_vmx(struct vcpu *vcpu, struct v
> > vcpu->vc_last_pcpu = curcpu();
> >
> > /* Copy the VCPU register state to the exit structure */
> > -   if (vcpu_readregs_vmx(vcpu, VM_RWREGS_ALL, &vcpu->vc_exit.vrs))
> > +   if (vcpu_readregs_vmx(vcpu, VM_RWREGS_ALL, 0, &vcpu->vc_exit.vrs))
> > ret = EINVAL;
> > vcpu->vc_exit.cpl = vmm_get_guest_cpu_cpl(vcpu);
>
>
> --
> -Dave Voutila
>



Re: Picky, but much more efficient arc4random_uniform!

2022-05-15 Thread Mike Larkin
On Sun, May 15, 2022 at 08:40:19PM -0500, Luke Small wrote:
> https://marc.info/?l=openbsd-tech&m=165259528425835&w=2
>
> This one (which is strongly based upon my first of two versions) which I
> submitted after Guenther correctly trashed version 2, doesn’t reuse any
> part of the sample. It picks up a clean new bitfield upon failure.
>
> I think Guenther didn’t, perhaps like yourself, realize I submitted this
> later program. That’s why he said it wasn’t correct. It didn’t occur to me
> at the time of responding to him: “correct correct correct.”
>

You've had several developers tell you this is not going to go in. I'd suggest
"read the room".

If you want this for your own use, just keep it as a local diff. Nobody will
know (or likely care).

-ml

> On Sun, May 15, 2022 at 7:47 PM Damien Miller  wrote:
>
> > On Sat, 14 May 2022, Luke Small wrote:
> >
> > > Look at my code. I don’t even use a modulus operator. I perform hit and
> > > miss with a random bitstream.
> > >
> > > How can I have a bias of something I don’t do? I return a bitstream which
> > > meets the parameters of being a value less than the upper bound. Much
> > like
> > > arc4random_buf().
> > >
> > > If I use arc4random_uniform() repeatedly to create a random distribution
> > of
> > > say numbers less than 0x1000 or even something weird like 0x1300 will the
> > > random distribution be better with arc4random_uniform() or with mine? For
> > > 0x1000 mine will simply pluck 12 bits of random data straight from the
> > > arc4random() (and preserve the remaining 20 bits for later) on the first
> > > try, just like it’s arc4random_buf().
> > >
> > > arc4random_uniform() will perform a modulus of a 32 bit number which adds
> > > data to the bitstream. Does it make it better? Perhaps it makes it harder
> > > to guess the source bits.
> > >
> > > I don’t know; and I’m not going to pretend to be a cryptologist. But I’m
> > > looking at modulo bias.
> > >
> > > I didn’t know what it was, before, but I basically “rejection sample”:
> > >
> > >
> > https://research.kudelskisecurity.com/2020/07/28/the-definitive-guide-to-modulo-bias-and-how-to-avoid-it/
> >
> > No, you aren't:
> >
> > > for (;;) {
> > > if (rand_bits < bits) {
> > > rand_holder |= ((uint64_t)arc4random()) <<
> > > rand_bits;
> > >
> > > /*
> > >  * rand_bits will be a number between 0 and 31
> > here
> > >  * so the 0x20 bit will be empty
> > >  * rand_bits += 32;
> > >  */
> > > rand_bits |= 32;
> > > }
> > >
> > > ret = rand_holder & uuu;
> > > rand_holder >>= bits;
> > > rand_bits -= bits;
> > >
> > > if (ret < upper_bound)
> > > return ret;
> > > }
> >
> > This isn't rejection sampling. This is reusing part of the rejected
> > sample.
> >
> > Think of it like this: you want to uniformly generate a number in the
> > range [2:10] by rolling 2x 6-sided dice. What do you do when you roll
> > 11 or 12? You can't just reroll one of the dice because the other dice
> > is constrained to be have rolled either 5 or 6, and so proceeding with
> > it would force the output to be in the range [6:11] for these ~5.6%
> > of initial rolls. Your output is no longer uniform.
> >
> > BTW the existing code already implements the prefered approach of the
> > article you quoted.
> >
> > -d
>
> --
> -Luke



Re: vmm: give a lonely enum a friend, fixing `vmctl receive`

2022-05-13 Thread Mike Larkin
On Sun, May 08, 2022 at 10:30:46PM -0400, Dave Voutila wrote:
> tech@,
>
> Another vmm/vmd update: fix `vmctl receive` on Intel hosts by adding
> another fault enum value to disambiguate fault reasons.
>
> It's expected that the guest will trigger nested page faults after being
> received by vmd. When you connect to the vm using `vmctl console` and
> interact with the guest, it generates both a page fault and interrupt.
>
> This combo is special because while the page fault will be handled by
> vmm via uvm_fault(9), it will still exit to userland/vmd to handle the
> interrupt.
>
> vmd always checks the vm-exit reason after the return from vmm before
> looping around and servicing interrupts before re-entering vmm. vmd has
> a single userland handler for nested page faults for when we have a
> protection fault. In this case, it reboots the vm. :-(
>
> Since the enum we used for the fault type flag has only one value, vmm
> isn't able to properly convey the type of nested fault. In this case, I
> chose to add a "VEE_FAULT_HANDLED" value to indicate the fault has
> already been handled by vmm and no userland assist is needed. (And
> HANDLED is the same num of characters of PROTECT.)
>
> This prevents the ambiguity and vmd happily skips rebooting the vm.
>
> It's possible this reboot could occur at any point in a vm's lifetime,
> though I think the probability is low, so this is worth fixing
> regardless.
>
> ok?
>

This is ok mlarkin. Thanks!

> -dv
>
>
> Index: sys/arch/amd64/amd64/vmm.c
> ===
> RCS file: /opt/cvs/src/sys/arch/amd64/amd64/vmm.c,v
> retrieving revision 1.308
> diff -u -p -r1.308 vmm.c
> --- sys/arch/amd64/amd64/vmm.c4 May 2022 02:24:26 -   1.308
> +++ sys/arch/amd64/amd64/vmm.c9 May 2022 13:45:02 -
> @@ -5732,14 +5732,16 @@ vmx_fault_page(struct vcpu *vcpu, paddr_
>   int fault_type, ret;
>
>   fault_type = vmx_get_guest_faulttype();
> - if (fault_type == -1) {
> + switch (fault_type) {
> + case -1:
>   printf("%s: invalid fault type\n", __func__);
>   return (EINVAL);
> - }
> -
> - if (fault_type == VM_FAULT_PROTECT) {
> + case VM_FAULT_PROTECT:
>   vcpu->vc_exit.vee.vee_fault_type = VEE_FAULT_PROTECT;
>   return (EAGAIN);
> + default:
> + vcpu->vc_exit.vee.vee_fault_type = VEE_FAULT_HANDLED;
> + break;
>   }
>
>   /* We may sleep during uvm_fault(9), so reload VMCS. */
> Index: sys/arch/amd64/include/vmmvar.h
> ===
> RCS file: /opt/cvs/src/sys/arch/amd64/include/vmmvar.h,v
> retrieving revision 1.75
> diff -u -p -r1.75 vmmvar.h
> --- sys/arch/amd64/include/vmmvar.h   3 May 2022 21:39:19 -   1.75
> +++ sys/arch/amd64/include/vmmvar.h   9 May 2022 13:38:18 -
> @@ -324,7 +324,8 @@ enum {
>  };
>
>  enum {
> - VEE_FAULT_PROTECT
> + VEE_FAULT_HANDLED,
> + VEE_FAULT_PROTECT,
>  };
>
>  enum {
>



Re: vmd: fix rebooting a received vm

2022-05-07 Thread Mike Larkin
On Sat, May 07, 2022 at 07:58:15AM -0400, Dave Voutila wrote:
> tech@:
>
> Now that vmd only accounts for memory in bytes [1], this fix is a lot
> simpler!
>
> If you use the send/receive functionality and "receive" a sent vm, it
> functions as expected. However, if that vm tries to reboot, it causes
> vmd to exit. (An ipc socket is closed in some error handling and
> triggers a code path ending vmd's event loop.)
>
> The problem was two-fold (and describing it is probably longer than the
> diff itself):
>
> 1. Not un-toggling the VM_RECEIVE_STATE bit on the vm after initial
>launch, triggering "received vm" code paths upon vm reboot.
>
>vmd's "parent" and "vmm" processes *both* track known vm's. The "vmm"
>process removes the vm from its list upon a loss of the child process
>(vm reboot), but the "parent" process keeps it in the tailq and
>reuses it, knowing the vm just requires a restart. (It has to resend
>the vm to the "vmm" process, which sees it as a "new" vm, creating a
>new child process.)
>
> 2. A "received vm" comes with pre-defined memory ranges created when it
>initially booted and these are restored before the vm is resumed. The
>problem is vmd overloads the use of these memory ranges, setting the
>number of ranges to 0 and using the first range's size as a way to
>communicate "max memory" for the vm. Since a clean reboot of a vm
>results in the "parent" process triggering the "vm start" paths, it
>assumes it can use that logic to determine the max memory.
>
>Depending on if you only fix (1) above, the vm results in either
>using the default vm memory (512MB) _or_ the size of the first
>range...which is always 640KB.
>
>Contrary to popular belief, 640KB is not enough for everyone,
>especially our vm.
>
> The diff below resolves (1) in vmd.c's vm_stop() and (2) in config.c's
> config_setvm().
>
> The fact this issue has been present for awhile indicates few people use
> or care about the send/receive functionality. I want to keep the
> functionality in place for awhile longer because I've begun to
> experiment with it *and* it's helping me find other bugs in vmd(8) as
> well as vmm(4). (Expect a vmm diff shortly.)
>
> For anyone looking to test [2], the simplest approach is to create a vm
> without a disk just boot the bsd.rd ramdisk while using a memory value
> that's *not* the default 512m:
>
>   # vmctl start -c -b /bsd.rd -m 1g test
>
> Wait for it to give you the installer prompt and then send it to a file:
>
>   # vmctl send test > test.vm
>
> You should have a 1g test.vm file. Restore it:
>
>   # vmctl receive test < test.vm
>
> Connect to the console and reboot:
>
>   # vmctl console test
>   (in vm)# reboot
>
> With the diff: the vm reboots and you end up back at the installer
> prompt. `vmctl stat` shows the correct 1g max mem. Reboot at least one
> more time and confirm the same result.
>
> Without the diff: the vmd parent process will exit taking its children
> with it.
>
> ok?
>

reads ok to me, thanks for the explanation. ok mlarkin

> -dv
>
> [1] https://marc.info/?l=openbsd-tech&m=165151507323339&w=2
>
> [2] note that the vmm issue I found means this will work reliably on AMD
> hosts, but may not on Intel hosts. fix coming soon.
>
> diff refs/heads/master refs/heads/vmd-memrange
> blob - 2750be4f580896325e5a3971667c64d61231db06
> blob + cf076cdc27ceaee6e2cbb9cce5825452f0a6
> --- usr.sbin/vmd/config.c
> +++ usr.sbin/vmd/config.c
> @@ -231,6 +231,7 @@ config_setvm(struct privsep *ps, struct vmd_vm *vm, ui
>   unsigned int unit;
>   struct timeval   tv, rate, since_last;
>   struct vmop_addr_req var;
> + size_t   bytes = 0;
>
>   if (vm->vm_state & VM_STATE_RUNNING) {
>   log_warnx("%s: vm is already running", __func__);
> @@ -518,6 +519,14 @@ config_setvm(struct privsep *ps, struct vmd_vm *vm, ui
>
>   free(tapfds);
>
> + /* Collapse any memranges after the vm was sent to PROC_VMM */
> + if (vcp->vcp_nmemranges > 0) {
> + for (i = 0; i < vcp->vcp_nmemranges; i++)
> + bytes += vcp->vcp_memranges[i].vmr_size;
> > + memset(&vcp->vcp_memranges, 0, sizeof(vcp->vcp_memranges));
> + vcp->vcp_nmemranges = 0;
> + vcp->vcp_memranges[0].vmr_size = bytes;
> + }
>   vm->vm_state |= VM_STATE_RUNNING;
>   return (0);
>
> blob - 4d7e7b5e613723c2166077523dd6e8b9177d6718
> blob + d5d841fd20d9f82e852e3b844ec81d9383713923
> --- usr.sbin/vmd/vmd.c
> +++ usr.sbin/vmd/vmd.c
> @@ -1162,7 +1162,8 @@ vm_stop(struct vmd_vm *vm, int keeptty, const char *ca
>   __func__, ps->ps_title[privsep_process], caller,
>   vm->vm_vmid, keeptty ? ", keeping tty open" : "");
>
> - vm->vm_state &= ~(VM_STATE_RUNNING | VM_STATE_SHUTDOWN);
> + vm->vm_state &= ~(VM_STATE_RECEIVED | VM_STATE_RUNNING
> + | VM_STATE_SHUTDOWN);
>
>   

Re: aml parse error

2022-05-03 Thread Mike Larkin
On Tue, May 03, 2022 at 04:46:55PM +0200, aphekz wrote:
> On Mon, May 02, 2022 at 07:05:24PM -0700, Mike Larkin wrote:
> > On Mon, May 02, 2022 at 11:42:57PM +0200, aphekz wrote:
> > >
> > > May  2 21:29:06 dev /bsd: ### AML PARSE ERROR (0x8f3a): Undefined name: 
> > > OPST
> > > May  2 21:29:06 dev /bsd: error evaluating: \\_SB_.PCI0.LPCB.EC0_._Q14
> > > May  2 21:29:55 dev /bsd: ### AML PARSE ERROR (0x8f3a): Undefined name: 
> > > OPST
> > > May  2 21:29:55 dev /bsd: error evaluating: \\_SB_.PCI0.LPCB.EC0_._Q14
> > > May  2 21:30:00 dev /bsd: ### AML PARSE ERROR (0x8f3a): Undefined name: 
> > > OPST
> > > May  2 21:30:00 dev /bsd: error evaluating: \\_SB_.PCI0.LPCB.EC0_._Q14
> > > May  2 21:30:03 dev /bsd: ### AML PARSE ERROR (0x8f3a): Undefined name: 
> > > OPST
> > > May  2 21:30:03 dev /bsd: error evaluating: \\_SB_.PCI0.LPCB.EC0_._Q14
> > > May  2 21:30:28 dev /bsd: ### AML PARSE ERROR (0x8f3a): Undefined name: 
> > > OPST
> > > May  2 21:30:28 dev /bsd: error evaluating: \\_SB_.PCI0.LPCB.EC0_._Q14
> > > May  2 21:31:40 dev /bsd: ### AML PARSE ERROR (0x8f3a): Undefined name: 
> > > OPST
> > > May  2 21:31:40 dev /bsd: error evaluating: \\_SB_.PCI0.LPCB.EC0_._Q14
> > >
> > > any hint what is going on?  some acpi/temp related stuff?
> > >
> >
> > likely bad AML. BIOS on this machine is over 10 years old, I'd look for a 
> > newer
> > one.
> >
> > -ml
>
> old machine with no newest BIOS available.
>
> i haven't noticed such problem on 7.0-stable, so i'd like to at least

I'd start bisecting diffs then. Shouldn't take too long to find the commit
that broke it.

-ml

> understand what the problem is or might be. would be great to fix it too.
>
> as for now acpidump/iasl says.
>
> External (OPST, UnknownObj)
>
> Method (_Q14, 0, NotSerialized)  // _Qxx: EC Query, xx=0x00-0xFF
> {
> P80H = 0x14
> If ((Zero == OPST))
> {
> OPST = One
> }
> Else
> {
> OPST = Zero
> }
>
> Notify (^^^PEG0.PEGP, 0xDF) // Hardware-Specific
> }
>
> isn't it some acpitz / inteldrm related method?
>
>
> >
> > > --
> > >
> > > OpenBSD 7.1-stable (GENERIC.MP) #0: Mon May  2 20:31:55 CEST 2022
> > > aph...@dev.kroczynski.net:/usr/src/sys/arch/amd64/compile/GENERIC.MP
> > > real mem = 8483532800 (8090MB)
> > > avail mem = 8209141760 (7828MB)
> > > random: good seed from bootblocks
> > > mpath0 at root
> > > scsibus0 at mpath0: 256 targets
> > > mainbus0 at root
> > > bios0 at mainbus0: SMBIOS rev. 2.6 @ 0xf9e10 (66 entries)
> > > bios0: vendor LENOVO version "44CN45WW" date 02/16/2012
> > > bios0: LENOVO HuronRiver Platform
> > > acpi0 at bios0: ACPI 3.0
> > > acpi0: sleep states S0 S1 S3 S4 S5
> > > acpi0: tables DSDT FACP SLIC SSDT ASF! HPET APIC MCFG SSDT SSDT UEFI UEFI 
> > > UEFI
> > > acpi0: wakeup devices P0P1(S4) GLAN(S4) EHC1(S3) EHC2(S3) HDEF(S4) 
> > > RP01(S4) PXSX(S4) RP02(S4) PXSX(S4) RP03(S4) PXSX(S4) RP04(S4) PXSX(S4) 
> > > RP05(S4) PXSX(S4) RP06(S4) [...]
> > > acpitimer0 at acpi0: 3579545 Hz, 24 bits
> > > acpihpet0 at acpi0: 14318179 Hz
> > > acpimadt0 at acpi0 addr 0xfee0: PC-AT compat
> > > cpu0 at mainbus0: apid 0 (boot processor)
> > > cpu0: Intel(R) Celeron(R) CPU B830 @ 1.80GHz, 1796.21 MHz, 06-2a-07
> > > cpu0: 
> > > FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,CFLUSH,DS,ACPI,MMX,FXSR,SSE,SSE2,SS,HTT,TM,PBE,SSE3,PCLMUL,DTES64,MWAIT,DS-CPL,VMX,EST,TM2,SSSE3,CX16,xTPR,PDCM,PCID,SSE4.1,SSE4.2,x2APIC,POPCNT,DEADLINE,XSAVE,NXE,RDTSCP,LONG,LAHF,PERF,ITSC,MD_CLEAR,IBRS,IBPB,STIBP,L1DF,SSBD,SENSOR,ARAT,XSAVEOPT,MELTDOWN
> > > cpu0: 256KB 64b/line 8-way L2 cache
> > > cpu0: smt 0, core 0, package 0
> > > mtrr: Pentium Pro MTRR support, 10 var ranges, 88 fixed ranges
> > > cpu0: apic clock running at 99MHz
> > > cpu0: mwait min=64, max=64, C-substates=0.2.1, IBE
> > > cpu1 at mainbus0: apid 2 (application processor)
> > > cpu1: Intel(R) Celeron(R) CPU B830 @ 1.80GHz, 1795.94 MHz, 06-2a-07
> > > cpu1: 
> > > FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,CFLUSH,DS,ACPI,MMX,FXSR,SSE,SSE2,SS,HTT,TM,PBE,SSE3,PCLMUL,DTES64,MWAIT,DS-CPL,VMX,EST,TM2,SSSE3,CX16,xTPR,PDCM,PCID,SSE4.1,SSE4.2,x2APIC,POPCNT,DEADLINE,XSAVE,NXE,RDTSCP,LONG,LAHF,PERF,ITSC,MD_CLEAR,IBRS,IB

Re: aml parse error

2022-05-02 Thread Mike Larkin
On Mon, May 02, 2022 at 11:42:57PM +0200, aphekz wrote:
>
> May  2 21:29:06 dev /bsd: ### AML PARSE ERROR (0x8f3a): Undefined name: OPST
> May  2 21:29:06 dev /bsd: error evaluating: \\_SB_.PCI0.LPCB.EC0_._Q14
> May  2 21:29:55 dev /bsd: ### AML PARSE ERROR (0x8f3a): Undefined name: OPST
> May  2 21:29:55 dev /bsd: error evaluating: \\_SB_.PCI0.LPCB.EC0_._Q14
> May  2 21:30:00 dev /bsd: ### AML PARSE ERROR (0x8f3a): Undefined name: OPST
> May  2 21:30:00 dev /bsd: error evaluating: \\_SB_.PCI0.LPCB.EC0_._Q14
> May  2 21:30:03 dev /bsd: ### AML PARSE ERROR (0x8f3a): Undefined name: OPST
> May  2 21:30:03 dev /bsd: error evaluating: \\_SB_.PCI0.LPCB.EC0_._Q14
> May  2 21:30:28 dev /bsd: ### AML PARSE ERROR (0x8f3a): Undefined name: OPST
> May  2 21:30:28 dev /bsd: error evaluating: \\_SB_.PCI0.LPCB.EC0_._Q14
> May  2 21:31:40 dev /bsd: ### AML PARSE ERROR (0x8f3a): Undefined name: OPST
> May  2 21:31:40 dev /bsd: error evaluating: \\_SB_.PCI0.LPCB.EC0_._Q14
>
> any hint what is going on?  some acpi/temp related stuff?
>

likely bad AML. BIOS on this machine is over 10 years old, I'd look for a newer
one.

-ml

> --
>
> OpenBSD 7.1-stable (GENERIC.MP) #0: Mon May  2 20:31:55 CEST 2022
> aph...@dev.kroczynski.net:/usr/src/sys/arch/amd64/compile/GENERIC.MP
> real mem = 8483532800 (8090MB)
> avail mem = 8209141760 (7828MB)
> random: good seed from bootblocks
> mpath0 at root
> scsibus0 at mpath0: 256 targets
> mainbus0 at root
> bios0 at mainbus0: SMBIOS rev. 2.6 @ 0xf9e10 (66 entries)
> bios0: vendor LENOVO version "44CN45WW" date 02/16/2012
> bios0: LENOVO HuronRiver Platform
> acpi0 at bios0: ACPI 3.0
> acpi0: sleep states S0 S1 S3 S4 S5
> acpi0: tables DSDT FACP SLIC SSDT ASF! HPET APIC MCFG SSDT SSDT UEFI UEFI UEFI
> acpi0: wakeup devices P0P1(S4) GLAN(S4) EHC1(S3) EHC2(S3) HDEF(S4) RP01(S4) 
> PXSX(S4) RP02(S4) PXSX(S4) RP03(S4) PXSX(S4) RP04(S4) PXSX(S4) RP05(S4) 
> PXSX(S4) RP06(S4) [...]
> acpitimer0 at acpi0: 3579545 Hz, 24 bits
> acpihpet0 at acpi0: 14318179 Hz
> acpimadt0 at acpi0 addr 0xfee0: PC-AT compat
> cpu0 at mainbus0: apid 0 (boot processor)
> cpu0: Intel(R) Celeron(R) CPU B830 @ 1.80GHz, 1796.21 MHz, 06-2a-07
> cpu0: 
> FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,CFLUSH,DS,ACPI,MMX,FXSR,SSE,SSE2,SS,HTT,TM,PBE,SSE3,PCLMUL,DTES64,MWAIT,DS-CPL,VMX,EST,TM2,SSSE3,CX16,xTPR,PDCM,PCID,SSE4.1,SSE4.2,x2APIC,POPCNT,DEADLINE,XSAVE,NXE,RDTSCP,LONG,LAHF,PERF,ITSC,MD_CLEAR,IBRS,IBPB,STIBP,L1DF,SSBD,SENSOR,ARAT,XSAVEOPT,MELTDOWN
> cpu0: 256KB 64b/line 8-way L2 cache
> cpu0: smt 0, core 0, package 0
> mtrr: Pentium Pro MTRR support, 10 var ranges, 88 fixed ranges
> cpu0: apic clock running at 99MHz
> cpu0: mwait min=64, max=64, C-substates=0.2.1, IBE
> cpu1 at mainbus0: apid 2 (application processor)
> cpu1: Intel(R) Celeron(R) CPU B830 @ 1.80GHz, 1795.94 MHz, 06-2a-07
> cpu1: 
> FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,CFLUSH,DS,ACPI,MMX,FXSR,SSE,SSE2,SS,HTT,TM,PBE,SSE3,PCLMUL,DTES64,MWAIT,DS-CPL,VMX,EST,TM2,SSSE3,CX16,xTPR,PDCM,PCID,SSE4.1,SSE4.2,x2APIC,POPCNT,DEADLINE,XSAVE,NXE,RDTSCP,LONG,LAHF,PERF,ITSC,MD_CLEAR,IBRS,IBPB,STIBP,L1DF,SSBD,SENSOR,ARAT,XSAVEOPT,MELTDOWN
> cpu1: 256KB 64b/line 8-way L2 cache
> cpu1: smt 0, core 1, package 0
> ioapic0 at mainbus0: apid 2 pa 0xfec0, version 20, 24 pins
> acpimcfg0 at acpi0
> acpimcfg0: addr 0xf800, bus 0-63
> acpiprt0 at acpi0: bus 0 (PCI0)
> acpiprt1 at acpi0: bus -1 (P0P1)
> acpiprt2 at acpi0: bus 1 (RP01)
> acpiprt3 at acpi0: bus -1 (RP02)
> acpiprt4 at acpi0: bus -1 (RP03)
> acpiprt5 at acpi0: bus 2 (RP04)
> acpiprt6 at acpi0: bus -1 (RP05)
> acpiprt7 at acpi0: bus -1 (RP06)
> acpiprt8 at acpi0: bus -1 (RP07)
> acpiprt9 at acpi0: bus -1 (RP08)
> acpiprt10 at acpi0: bus -1 (PEG0)
> acpiprt11 at acpi0: bus -1 (PEG1)
> acpiprt12 at acpi0: bus -1 (PEG2)
> acpiprt13 at acpi0: bus -1 (PEG3)
> acpiec0 at acpi0
> acpipci0 at acpi0 PCI0: 0x0004 0x0011 0x0001
> acpicmos0 at acpi0
> acpiac0 at acpi0: AC unit online
> acpibat0 at acpi0: BAT0 model "L08L6Y02" serial 44863 type LION oem 
> "4f594e4153"
> "VPC2004" at acpi0 not configured
> "ETD0604" at acpi0 not configured
> acpibtn0 at acpi0: LID0
> acpibtn1 at acpi0: SLPB
> "PNP0C14" at acpi0 not configured
> acpicpu0 at acpi0: C2(500@80 io@0x414), C1(1000@1 halt), PSS
> acpicpu1 at acpi0: C2(500@80 io@0x414), C1(1000@1 halt), PSS
> acpitz0 at acpi0: critical temperature is 98 degC
> acpitz1 at acpi0: critical temperature is 126 degC
> acpivideo0 at acpi0: PEGP
> acpivideo1 at acpi0: GFX0
> acpivout0 at acpivideo1: DD02
> cpu0: using VERW MDS workaround (except on vmm entry)
> cpu0: Enhanced SpeedStep 1796 MHz: speeds: 1800, 1700, 1600, 1500, 1400, 
> 1300, 1200, 1100, 1000, 900, 800 MHz
> pci0 at mainbus0 bus 0
> pchb0 at pci0 dev 0 function 0 "Intel Core 2G Host" rev 0x09
> inteldrm0 at pci0 dev 2 function 0 "Intel HD Graphics 2000" rev 0x09
> drm0 at inteldrm0
> inteldrm0: msi, SANDYBRIDGE, gen 6
> "Intel 6 Series MEI" rev 0x04 

Re: migrate vmd/vmm/vmctl to use bytes, not MBs

2022-05-02 Thread Mike Larkin
On Mon, May 02, 2022 at 04:09:19PM -0400, Dave Voutila wrote:
>
> Dave Voutila  writes:
>
> > tech@,
> >
> > tl;dr: standardize vmd/vmm/vmctl on counting memory in bytes at all
> > times instead of a mix of MiB and bytes.
> >
> > There's some design friction between vmd(8)/vmctl(8) and vmm(4).
> >
> > For instance, the user-facing code deals in MiB, but internally a vm's
> > memory ranges are defined in terms of bytes...but only after being
> > converted at vm launch.
> >
> > Consequently, at different points in vmd's lifecycle, the same struct
> > member for storing a vm's requested memory size contains a value in
> > bytes OR in MiB meaning any code accessing the value needs to be
> > contextually aware of if/when the value must be scaled.
> >
> > Given we dropped vmm(4) on i386 awhile ago, let's make use of 64-bit
> > values! Plus this helps my other queued up changes simpler as they can
> > avoid confusing scaling at points.
> >
> > There *is* some existing code duplication between vmd/vmctl related to
> > parsing user provided memory values via scan_scaled(3), but I'm not
> > looking to consolidate that now.
> >
> > If you're going to test, you'll need to build the kernel and either copy
> > or link the patched vmmvar.h into /usr/include/machine/ before building
> > vmd(8)/vmctl(8). (Don't forget to actually boot the kernel.)
> >
> > Otherwise, looking for ok's so I can continue squashing a few bugs in
> > vmd that will be easier/cleaner to fix once this goes in.
> >
> > While the diff looks long-ish, it shouldn't require deep vmm/vmd
> > knowledge to help review ;)
> >
>
> Updated with a fix (printing wrong limit value) and a tweak (checking a
> size_t == 0 vs < 1). No functional changes so if by chance you already
> applied the previous, please feel free to continue to test.
>
> -dv
>

Thanks. ok mlarkin@

-ml

>
> diff refs/heads/master refs/heads/vmd-bytes
> blob - 765fc19bca559dbfd83cd14c48dee94f86c4b3cc
> blob + 699798c1bbffafe7074fea43755ef7e20f073a90
> --- sys/arch/amd64/amd64/vmm.c
> +++ sys/arch/amd64/amd64/vmm.c
> @@ -1575,7 +1575,7 @@ vm_create_check_mem_ranges(struct vm_create_params *vc
>  {
>   size_t i, memsize = 0;
>   struct vm_mem_range *vmr, *pvmr;
> - const paddr_t maxgpa = (uint64_t)VMM_MAX_VM_MEM_SIZE * 1024 * 1024;
> + const paddr_t maxgpa = VMM_MAX_VM_MEM_SIZE;
>
>   if (vcp->vcp_nmemranges == 0 ||
>   vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
> blob - 94bb172832d4c2847b1e83ebb9cc05538db6ac80
> blob + 012a023943b9fbc70339166889070ff0b4619046
> --- sys/arch/amd64/include/vmmvar.h
> +++ sys/arch/amd64/include/vmmvar.h
> @@ -31,7 +31,7 @@
>  #define VMM_MAX_KERNEL_PATH  128
>  #define VMM_MAX_VCPUS512
>  #define VMM_MAX_VCPUS_PER_VM 64
> -#define VMM_MAX_VM_MEM_SIZE  32768
> +#define VMM_MAX_VM_MEM_SIZE  32L * 1024 * 1024 * 1024/* 32 GiB */
>  #define VMM_MAX_NICS_PER_VM  4
>
>  #define VMM_PCI_MMIO_BAR_BASE0xF000ULL
> blob - 0f7e4329a00d54a64fe41e1fb2bd2afcbaa9d68a
> blob + c54aebcb982fdc14cc7a02910301d561e6623e4d
> --- usr.sbin/vmctl/main.c
> +++ usr.sbin/vmctl/main.c
> @@ -404,24 +404,39 @@ parse_network(struct parse_result *res, char *word)
>  int
>  parse_size(struct parse_result *res, char *word)
>  {
> - long long val = 0;
> + char result[FMT_SCALED_STRSIZE];
> > + long long val = 0;
>
>   if (word != NULL) {
> >   if (scan_scaled(word, &val) != 0) {
> - warn("invalid size: %s", word);
> + warn("invalid memory size: %s", word);
>   return (-1);
>   }
>   }
>
>   if (val < (1024 * 1024)) {
> - warnx("size must be at least one megabyte");
> + warnx("memory size must be at least 1M");
>   return (-1);
> - } else
> - res->size = val / 1024 / 1024;
> + }
>
> - if ((res->size * 1024 * 1024) != val)
> - warnx("size rounded to %lld megabytes", res->size);
> + if (val > VMM_MAX_VM_MEM_SIZE) {
> + if (fmt_scaled(VMM_MAX_VM_MEM_SIZE, result) == 0)
> + warnx("memory size too large (limit is %s)", result);
> + else
> + warnx("memory size too large");
> + return (-1);
> + }
>
> + /* Round down to the megabyte. */
> + res->size = (val / (1024 * 1024)) * (1024 * 1024);
> +
> + if (res->size != (size_t)val) {
> + if (fmt_scaled(res->size, result) == 0)
> + warnx("memory size rounded to %s", result);
> + else
> + warnx("memory size rounded to %zu bytes", res->size);
> + }
> +
>   return (0);
>  }
>
> blob - 4c0b62fc6e16adbeb5cf951dcafbaebdbc356da8
> blob + 15e6dd89ec15fa2501dcf6539c9ae9d90879ba56
> --- usr.sbin/vmctl/vmctl.c
> +++ usr.sbin/vmctl/vmctl.c
> @@ -73,7 +73,7 @@ struct imsgbuf *ibuf;
>   *  ENOMEM if a memory allocation failure occurred.
>   */
>  int
> 

Re: DPTF sensors driver

2022-04-25 Thread Mike Larkin
On Sun, Apr 24, 2022 at 08:00:50PM -0500, joshua stein wrote:
> Any interest in this?
>
> acpidptfs0 at acpi0: SEN2, sensor "Sensor 2 USB2"
> acpidptfs1 at acpi0: SEN4, sensor "Sensor 4 Ambience"
> acpidptfs2 at acpi0: SEN1, sensor "Thermistor CPU SOC"
> acpidptfs3 at acpi0: SEN3, sensor "Sensor 3 SSD"
> acpidptfs4 at acpi0: SEN5, sensor "Thermistor USB Type-C"
>
> hw.sensors.acpidptfs0.temp0=32.05 degC (Sensor 2 USB2)
> hw.sensors.acpidptfs1.temp0=26.05 degC (Sensor 4 Ambience)
> hw.sensors.acpidptfs2.temp0=35.05 degC (Thermistor CPU SOC)
> hw.sensors.acpidptfs3.temp0=35.05 degC (Sensor 3 SSD)
> hw.sensors.acpidptfs4.temp0=29.05 degC (Thermistor USB Type-C)
>

I like it, one question below.

otherwise ok mlarkin

-ml

>
>
> commit 959656ab8227367705adc45d73f5b6d47d552ac3
> Author: joshua stein 
> Date:   Mon Aug 9 12:45:15 2021 -0500
>
> acpidptfs: Add a driver for Dynamic Platform and Thermal Framework sensors
>
> diff --git sys/arch/amd64/conf/GENERIC sys/arch/amd64/conf/GENERIC
> index ecbf4d82305..3fc30b1e941 100644
> --- sys/arch/amd64/conf/GENERIC
> +++ sys/arch/amd64/conf/GENERIC
> @@ -85,6 +85,7 @@ acpihid*at acpi?
>  ipmi0at acpi? disable
>  ccpmic*  at iic?
>  tipmic*  at iic?
> +acpidptfs*   at acpi?
>
>  mpbios0  at bios0
>
> diff --git sys/dev/acpi/acpidptfs.c sys/dev/acpi/acpidptfs.c
> new file mode 100644
> index 000..c863c8d1f97
> --- /dev/null
> +++ sys/dev/acpi/acpidptfs.c
> @@ -0,0 +1,173 @@
> +/* $OpenBSD$ */
> +/*
> + * Copyright (c) 2021 joshua stein 
> + *
> + * Permission to use, copy, modify, and distribute this software for any
> + * purpose with or without fee is hereby granted, provided that the above
> + * copyright notice and this permission notice appear in all copies.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
> + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
> + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
> + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
> + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
> + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
> + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
> + */
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#include 
> +
> +struct acpidptfs_softc {
> + struct device   sc_dev;
> +
> + struct acpi_softc   *sc_acpi;
> + struct aml_node *sc_devnode;
> +
> + int sc_devtype;
> +
> + struct ksensor  sc_sensor;
> + struct ksensordev   sc_sensdev;
> +};
> +
> +#define ACPIDPTFS_TYPE_SENSOR0x03
> +#define ACPIDPTFS_TYPE_CHARGER   0x0B
> +#define ACPIDPTFS_TYPE_BATTERY   0x0C
> +
> +int  acpidptfs_match(struct device *, void *, void *);
> +void acpidptfs_attach(struct device *, struct device *, void *);
> +void acpidptfs_sensor_add(struct acpidptfs_softc *);
> +int  acpidptfs_notify(struct aml_node *, int, void *);
> +void acpidptfs_update(struct acpidptfs_softc *);
> +
> +struct cfattach acpidptfs_ca = {
> + sizeof(struct acpidptfs_softc),
> + acpidptfs_match,
> + acpidptfs_attach,
> + NULL,
> +};
> +
> +struct cfdriver acpidptfs_cd = {
> + NULL, "acpidptfs", DV_DULL
> +};
> +
> +const char *acpidptfs_hids[] = {
> + "INT3403",
> + "INTC1043",
> + "INTC1046",
> + NULL
> +};
> +
> +int
> +acpidptfs_match(struct device *parent, void *match, void *aux)
> +{
> + struct acpi_attach_args *aaa = aux;
> + struct cfdata *cf = match;
> +
> + return acpi_matchhids(aaa, acpidptfs_hids, cf->cf_driver->cd_name);
> +}
> +
> +void
> +acpidptfs_attach(struct device *parent, struct device *self, void *aux)
> +{
> + struct acpidptfs_softc *sc = (struct acpidptfs_softc *)self;
> + struct acpi_attach_args *aa = aux;
> + int64_t res;
> +
> + sc->sc_acpi = (struct acpi_softc *)parent;
> + sc->sc_devnode = aa->aaa_node;
> + sc->sc_devtype = -1;
> +
> + printf(": %s", sc->sc_devnode->name);
> +
> + if (aml_evalinteger((struct acpi_softc *)parent, aa->aaa_node,
> + "_TMP", 0, NULL, ) == 0)
> + sc->sc_devtype = ACPIDPTFS_TYPE_SENSOR;
> + else if (aml_evalinteger((struct acpi_softc *)parent, aa->aaa_node,
> + "PTYP", 0, NULL, ) == 0)
> + sc->sc_devtype = res;
> +
> + switch (sc->sc_devtype) {
> + case ACPIDPTFS_TYPE_SENSOR:
> + acpidptfs_sensor_add(sc);
> + break;
> + case ACPIDPTFS_TYPE_CHARGER:
> + /* TODO */
> + printf(", charger\n");
> + break;
> + case ACPIDPTFS_TYPE_BATTERY:
> + /* TODO */
> + printf(", battery\n");
> + break;
> + default:
> + printf(", unknown type\n");
> + 

Re: amdgpio(4) : preserve pin configuration on resume

2022-04-20 Thread Mike Larkin
On Wed, Apr 20, 2022 at 11:39:00AM +0200, Mark Kettenis wrote:
> > Date: Tue, 19 Apr 2022 22:02:00 -0700
> > From: Mike Larkin 
> >
> > On at least the Asus ROG Zephyrus 14 (2020), the trackpad fails to generate
> > any interrupts after resume. I tracked this down to amdgpio(4) not 
> > generating
> > interrupts after resume, and started looking at missing soft state.
> >
> > This diff preserves the interrupt pin configurations and restores them after
> > resume. This makes the device function properly post-zzz and post-ZZZ.
>
> I think it might make sense to structure this a bit more like
> pchgpio(4).  There we only restore the configuration for pins that are
> "in use" by OpenBSD.
>
> > Note: amdgpio_read_pin does not return the value that was previously written
> > during amdgpio_intr_establish (it always just returns 0x1 if the pin is
> > in use), so I'm just saving the actual value we write during
> > amdgpio_intr_establish and restoring that during resume.
>
> Well, using amdgpio_read_pin() for the purpose of saving the pin
> configuration doesn't make sense.  That function returns the pin input
> state.
>
> What you need to do is to read the register using bus_space_read_4()
> and restore that value.  Again take a look at pchgpio(4).
>
> > Note 2: In xxx_activate() functions, we usually call 
> > config_activate_children
> > but since amdgpio doesn't have any children, I left that out.
>
> I think that's fine.  But you should do the save/restore in
> DVACT_SUSPEND/DVACT_RESUME.  You want to restore the state as early as
> possible such that you don't get spurious interrupts when the BIOS
> leaves GPIO pins misconfigured.  Again, look at pchgpio(4).
>

Will take a look, thanks!

-ml

> >
> > ok?
> >
> > -ml
> >
> >
> > diff a82721d2c9ea32a8f6043a3e06b2a7f8280ef68b /export/bin/src/OpenBSD/g14
> > blob - 1d0cd5fcede71f0495a271a9d06fc9c0ecb16412
> > file + sys/dev/acpi/amdgpio.c
> > --- sys/dev/acpi/amdgpio.c
> > +++ sys/dev/acpi/amdgpio.c
> > @@ -62,13 +62,17 @@ struct amdgpio_softc {
> > struct amdgpio_intrhand *sc_pin_ih;
> >
> > struct acpi_gpio sc_gpio;
> > +
> > +   uint32_t *sc_pincfg;
> >  };
> >
> >  intamdgpio_match(struct device *, void *, void *);
> >  void   amdgpio_attach(struct device *, struct device *, void *);
> > +intamdgpio_activate(struct device *, int);
> >
> >  const struct cfattach amdgpio_ca = {
> > -   sizeof(struct amdgpio_softc), amdgpio_match, amdgpio_attach
> > +   sizeof(struct amdgpio_softc), amdgpio_match, amdgpio_attach, NULL,
> > +   amdgpio_activate
> >  };
> >
> >  struct cfdriver amdgpio_cd = {
> > @@ -98,6 +102,24 @@ amdgpio_match(struct device *parent, void *match, void
> > return acpi_matchhids(aaa, amdgpio_hids, cf->cf_driver->cd_name);
> >  }
> >
> > +int
> > +amdgpio_activate(struct device *self, int act)
> > +{
> > +   struct amdgpio_softc *sc = (struct amdgpio_softc *)self;
> > +   int rv = 0, i;
> > +
> > +   switch (act) {
> > +   case DVACT_WAKEUP:
> > +   for (i = 0; i < sc->sc_npins; i++) {
> > +   if (sc->sc_pincfg[i])
> > +   bus_space_write_4(sc->sc_memt, sc->sc_memh,
> > +   i * 4, sc->sc_pincfg[i]);
> > +   }
> > +   }
> > +
> > +   return (rv);
> > +}
> > +
> >  void
> >  amdgpio_attach(struct device *parent, struct device *self, void *aux)
> >  {
> > @@ -152,6 +174,8 @@ amdgpio_attach(struct device *parent, struct device *s
> > sc->sc_node->gpio = &sc->sc_gpio;
> >
> > printf(", %d pins\n", sc->sc_npins);
> > +   sc->sc_pincfg = malloc(sc->sc_npins * sizeof(uint32_t), M_DEVBUF,
> > +   M_WAITOK | M_ZERO);
> >
> > acpi_register_gpio(sc->sc_acpi, sc->sc_node);
> > return;
> > @@ -210,6 +234,8 @@ amdgpio_intr_establish(void *cookie, int pin, int flag
> > reg |= AMDGPIO_CONF_ACTBOTH;
> > reg |= (AMDGPIO_CONF_INT_MASK | AMDGPIO_CONF_INT_EN);
> > bus_space_write_4(sc->sc_memt, sc->sc_memh, pin * 4, reg);
> > +
> > +   sc->sc_pincfg[pin] = reg;
> >  }
> >
> >  int
> >
> >
>



amdgpio(4) : preserve pin configuration on resume

2022-04-19 Thread Mike Larkin
On at least the Asus ROG Zephyrus 14 (2020), the trackpad fails to generate
any interrupts after resume. I tracked this down to amdgpio(4) not generating
interrupts after resume, and started looking at missing soft state.

This diff preserves the interrupt pin configurations and restores them after
resume. This makes the device function properly post-zzz and post-ZZZ.

Note: amdgpio_read_pin does not return the value that was previously written
during amdgpio_intr_establish (it always just returns 0x1 if the pin is
in use), so I'm just saving the actual value we write during
amdgpio_intr_establish and restoring that during resume.

Note 2: In xxx_activate() functions, we usually call config_activate_children
but since amdgpio doesn't have any children, I left that out.

ok?

-ml


diff a82721d2c9ea32a8f6043a3e06b2a7f8280ef68b /export/bin/src/OpenBSD/g14
blob - 1d0cd5fcede71f0495a271a9d06fc9c0ecb16412
file + sys/dev/acpi/amdgpio.c
--- sys/dev/acpi/amdgpio.c
+++ sys/dev/acpi/amdgpio.c
@@ -62,13 +62,17 @@ struct amdgpio_softc {
struct amdgpio_intrhand *sc_pin_ih;

struct acpi_gpio sc_gpio;
+
+   uint32_t *sc_pincfg;
 };

 intamdgpio_match(struct device *, void *, void *);
 void   amdgpio_attach(struct device *, struct device *, void *);
+intamdgpio_activate(struct device *, int);

 const struct cfattach amdgpio_ca = {
-   sizeof(struct amdgpio_softc), amdgpio_match, amdgpio_attach
+   sizeof(struct amdgpio_softc), amdgpio_match, amdgpio_attach, NULL,
+   amdgpio_activate
 };

 struct cfdriver amdgpio_cd = {
@@ -98,6 +102,24 @@ amdgpio_match(struct device *parent, void *match, void
return acpi_matchhids(aaa, amdgpio_hids, cf->cf_driver->cd_name);
 }

+int
+amdgpio_activate(struct device *self, int act)
+{
+   struct amdgpio_softc *sc = (struct amdgpio_softc *)self;
+   int rv = 0, i;
+
+   switch (act) {
+   case DVACT_WAKEUP:
+   for (i = 0; i < sc->sc_npins; i++) {
+   if (sc->sc_pincfg[i])
+   bus_space_write_4(sc->sc_memt, sc->sc_memh,
+   i * 4, sc->sc_pincfg[i]);
+   }
+   }
+
+   return (rv);
+}
+
 void
 amdgpio_attach(struct device *parent, struct device *self, void *aux)
 {
@@ -152,6 +174,8 @@ amdgpio_attach(struct device *parent, struct device *s
sc->sc_node->gpio = &sc->sc_gpio;

printf(", %d pins\n", sc->sc_npins);
+   sc->sc_pincfg = malloc(sc->sc_npins * sizeof(uint32_t), M_DEVBUF,
+   M_WAITOK | M_ZERO);

acpi_register_gpio(sc->sc_acpi, sc->sc_node);
return;
@@ -210,6 +234,8 @@ amdgpio_intr_establish(void *cookie, int pin, int flag
reg |= AMDGPIO_CONF_ACTBOTH;
reg |= (AMDGPIO_CONF_INT_MASK | AMDGPIO_CONF_INT_EN);
bus_space_write_4(sc->sc_memt, sc->sc_memh, pin * 4, reg);
+
+   sc->sc_pincfg[pin] = reg;
 }

 int



Re: vmd writes corrupt qcow2 images

2022-04-18 Thread Mike Larkin
On Mon, Apr 18, 2022 at 12:21:39PM -0400, Dave Voutila wrote:
>
> "Thomas L."  writes:
>
> > Hi,
> >
> > I recently tried to use qemu-img with qcow2 images of my VMs and
> > qemu-img finds them corrupted. I can reproduce the issue in the
> > following way (on -current, but is the same on -stable; tried different
> > hosts to exclude hardware errors):
> >
> > marsden# vmctl create -s 300G test.qcow2
> > vmctl: qcow2 imagefile created
> > marsden# qemu-img check test.qcow2
> > No errors were found on the image.
> > Image end offset: 262144
> > marsden# vmctl start -cL -B net -b /bsd.rd -d test.qcow2 test
> 
> > marsden# qemu-img check test.qcow2
> > ERROR cluster 32769 refcount=0 reference=1
> >
> > 1 errors were found on the image.
> > Data may be corrupted, or further writes to the image may corrupt it.
> > 39422/4915200 = 0.80% allocated, 3.31% fragmented, 0.00% compressed clusters
> > Image end offset: 2610888704
>
> I've been able to reproduce the above, but interestingly if I "repair"
> the qcow2 image using:
>
> $ qemu-img -r all test.qcow2
>
> It then reports 0 errors. Booting up the vm and installing packages like
> python3 and git and then cloning some git repos to work the disk, it
> still reports 0 errors.
>
> Interestingly, if I install something like an Alpine 3.13 guest (just an
> iso I had handy), qemu-img reports 0 errors.
>
> I also looked at a handful of qcow2 images on one of my workstations and
> seems only an Alpine and NixOS guest do not have this "corruption"
> report. However, I've never experienced any noticeable abnormalities.
>
> Since I see this from non-OpenBSD guests (Debian, for instance) it
> doesn't seem to be our vioblk(4) driver and is probably vmd(8)'s qcow2
> implementation..
>
> Anyone familiar enough with qcow2 that might make heads or tails of
> this? (cc'd ori@ as he was the original implementer.)
>
> -dv
>

Yeah, Ori would be the right person to answer. If not I can take a look if I
find the time.



Re: VMM avoid duplication and reduce atack surface with octboot(4)

2022-03-22 Thread Mike Larkin
On Wed, Mar 23, 2022 at 04:27:40AM +, Alexis wrote:
> Indeed I understood both octboot and vmm seabios/uefi initialazation process.
>
> But has its done with kexec and linuxboot coreboot payload octboot could be 
> ported to act in the same way. Explaining now again for the 3rd time, to 
> avoid stack duplication and decrease atack surface. Basically only openbsd 
> kernel network drivers etc would be needed not all equivalent seabios/uefi 
> ones ...
>
> But the problem here is one of ego because Philipe for one to understand 
> that, people need to check link provided before falling into unbased 
> assumptions. I also understand this is a major project nontheless this was 
> the idea and vision I intended to share with original post
>
> Cheers regardeless, happy to answer further queations on it as well

Your diff to implement this new functionality would be welcome on tech@, as are
all vmm diffs.

-ml



Re: VMM avoid duplication and reduce atack surface with octboot(4)

2022-03-22 Thread Mike Larkin
On Wed, Mar 23, 2022 at 12:58:41AM +, Alexis wrote:
> Have vmm/vmd core developers ever thought of using octboot as a way to
> use openbsd as a bootloader to avoid stack duplication, and attached attack
> surface reduction. Avoiding to maintain 2 stacks, seabios/uefi and host
> vm, could be a simple way to improve vmm for HVM or other nonOpenBSD
> system
>
> Cheers in advance for feedback.

wat



Re: initial 11ac support for iwm(4)

2022-03-17 Thread Mike Larkin
On Wed, Mar 16, 2022 at 11:17:47PM +0100, Stefan Sperling wrote:
> On Wed, Mar 16, 2022 at 04:11:41PM +0100, Stefan Sperling wrote:
> > This patch adds initial 11ac support to the iwm(4) driver.
> > It allows use of 80 MHz channels and VHT MCS.
>
> Updated patch. Fixes a fatal firmware error on devices which
> do not support MIMO, such as the 3160.
>

Works great here on

iwm0 at pci3 dev 0 function 0 "Intel AC 7265" rev 0x59, msi
iwm0: hw rev 0x210, fw ver 17.3216344376.0

One thing I did notice is that after ZZZ/un-ZZZ, it won't move out of 11n mode
until I ifconfig iwm0 down / up. Then it picks up the VHT rate again.

-ml



Re: initial iwx(4) 11ac patch for testing

2022-03-09 Thread Mike Larkin
On Wed, Mar 09, 2022 at 01:07:47PM +0100, Stefan Sperling wrote:
> This patch adds initial 11ac support to the iwx(4) driver.
> This means that 80MHz channels can be used. No other 11ac features
> are enabled yet.
>
> This is not yet a patch which could be committed. Apart from debug
> prints which need to go, there is a known issue found by dv@ where
> this patch causes a firmware error, sysassert 0x20101A25. The reason
> for this is not known.
> It would help to get more testing to see if more clues can be found
> based on where this error pops up. I cannot reproduce the error myself.
>
> When sending feedback, please be clear about which iwx(4) device and
> which access point has been tested. Thanks!
>
> The patch works for me on AX200 and AX201 with a pepwave AC one mini AP,
> although throughput is not much different to 11n 40MHz with this AP.
>

I tried this on my iwx. The performance increase is noticeable. It looks
like it negotiated VHT-MCS9. I did see some 80MHz dmesg printfs but I'm
not sure how to interpret them. LMK if you want more info.

Before

[ ID] Interval   Transfer Bitrate
[  6]   0.00-1.00   sec  20.5 MBytes   172 Mbits/sec
[  6]   1.00-2.00   sec  22.6 MBytes   190 Mbits/sec
[  6]   2.00-3.00   sec  22.4 MBytes   187 Mbits/sec
[  6]   3.00-4.00   sec  22.6 MBytes   189 Mbits/sec
[  6]   4.00-5.00   sec  23.0 MBytes   193 Mbits/sec
[  6]   5.00-6.01   sec  22.5 MBytes   188 Mbits/sec
[  6]   6.01-7.00   sec  22.4 MBytes   188 Mbits/sec
[  6]   7.00-8.00   sec  22.2 MBytes   186 Mbits/sec
[  6]   8.00-9.00   sec  22.9 MBytes   193 Mbits/sec
[  6]   9.00-10.00  sec  22.9 MBytes   192 Mbits/sec
- - - - - - - - - - - - - - - - - - - - - - - - -
[ ID] Interval   Transfer Bitrate
[  6]   0.00-10.00  sec   224 MBytes   188 Mbits/sec  sender
[  6]   0.00-10.25  sec   224 MBytes   183 Mbits/sec  receiver


After


[ ID] Interval   Transfer Bitrate
[  6]   0.00-1.00   sec  35.7 MBytes   299 Mbits/sec
[  6]   1.00-2.00   sec  33.1 MBytes   278 Mbits/sec
[  6]   2.00-3.00   sec  36.1 MBytes   303 Mbits/sec
[  6]   3.00-4.00   sec  35.4 MBytes   296 Mbits/sec
[  6]   4.00-5.00   sec  36.5 MBytes   307 Mbits/sec
[  6]   5.00-6.00   sec  36.8 MBytes   309 Mbits/sec
[  6]   6.00-7.00   sec  37.2 MBytes   312 Mbits/sec
[  6]   7.00-8.00   sec  38.2 MBytes   321 Mbits/sec
[  6]   8.00-9.00   sec  34.0 MBytes   285 Mbits/sec
[  6]   9.00-10.00  sec  38.3 MBytes   322 Mbits/sec
- - - - - - - - - - - - - - - - - - - - - - - - -
[ ID] Interval   Transfer Bitrate
[  6]   0.00-10.00  sec   361 MBytes   303 Mbits/sec  sender
[  6]   0.00-10.20  sec   361 MBytes   297 Mbits/sec  receiver


iwx0 at pci0 dev 20 function 3 "Intel Wi-Fi 6 AX201" rev 0x20, msix
iwx0: hw rev 0x350, fw ver 67.8f59b80b.0

iwx0: flags=808847 mtu 
1500
lladdr x
index 1 priority 4 llprio 3
groups: wlan egress
media: IEEE802.11 autoselect (VHT-MCS9 mode 11ac)
status: active
ieee80211: join x chan 149 bssid x 77% wpakey wpaprotos wpa2 
wpaakms psk wpaciphers ccmp wpagroupcipher ccmp



Re: Properly check if ACPI devices are enabled

2022-01-27 Thread Mike Larkin
On Mon, Jan 24, 2022 at 12:15:58PM -0800, Philip Guenther wrote:
> On Mon, Jan 24, 2022 at 11:41 AM Mark Kettenis 
> wrote:
>
> > > Date: Mon, 24 Jan 2022 20:19:46 +0100
> > > From: Anton Lindqvist 
> > >
> > > On Mon, Jan 24, 2022 at 05:31:49PM +0100, Mark Kettenis wrote:
> > > > Currently we attach ACPI devices that are present in a machine.
> > > > However, in some cases ACPI devices can be present, but not enabled.
> > > > Attaching a device driver to devices that are not enabled is not a
> > > > good idea since reading and writing from/to its registers will fail
> > > > and the driver will malfunction in interesting ways.  Such as a com(4)
> > > > serial port that is misdetected and hangs the kernel when it is
> > > > actually opened.
> > > >
> > > > The diff below makes sure we only enable devices that are actually
> > > > enabled.  This may cause some devices to disappear in OpenBSD.
> > > > However those devices should have been unusable anyway, so that isn't
> > > > an issue.
> > > >
> > > > ok?
> > >
> > > According to the ACPI specification[1]:
> > >
> > > > A device can only decode its hardware resources if both bits 0 and 1
> > are
> > > > set. If the device is not present (bit [0] cleared) or not enabled (bit
> > > > [1] cleared), then the device must not decode its resources.
> >
> > Just before that it says:
> >
> >   If bit [0] is cleared, then bit 1 must also be cleared (in other
> >   words, a device that is not present cannot be enabled).
> >
> > > Should we therefore check for presence of both STA_PRESENT and
> > > STA_ENABLED?
> >
> > So according to the ACPI specification we don't need to do that.
> > Should we do it just to be safe?
> >
>
> Unless you're taking money bets about this being the one thing in the ACPI
> spec that some vendor won't screw up, doing both seems "can't be worse; can
> be better".
>
> Philip

A bit late to the party here, but we should all remember the HP bios with the
FACS table version of "1" and not 1.  (0x31 and not 0x01).

So yes, someone will indeed screw it up if it's possible to screw it up.



Re: dt: make vmm tracepoints amd64 only

2022-01-17 Thread Mike Larkin
On Mon, Jan 17, 2022 at 10:10:26AM -0300, Crystal Kolipe wrote:
> On Mon, Jan 17, 2022 at 01:00:44PM +, Klemens Nanni wrote:
> > These don't hurt on !VMM architectures but I was still surprised to see
> > them on e.g. sparc64:
> >
> > # arch -s ; btrace -l | grep vmm
> > sparc64
> > tracepoint:vmm:guest_enter
> > tracepoint:vmm:guest_exit
> >
> > Like some network drivers, we could use __amd64__ to limit those to
> > amd64 and save a few bits in all other kernels.
>
> Don't we want this on i386 too?
>

pd@ removed i386 vmm(4) years ago. It was sorta pointless running VMs
on a host that can only support 4GB physmem.



Re: Silence vmd rtc_update_rega non-32KHz timebase spam

2022-01-15 Thread Mike Larkin
On Wed, Dec 08, 2021 at 07:45:50PM -0600, Brian Conway wrote:
> Ping with complete diff. Thanks.
>
> Brian Conway
>

Catching up on old emails. Committed. Thanks.

-ml

> diff --git usr.sbin/vmd/mc146818.c usr.sbin/vmd/mc146818.c
> index e3599c685..001c1a055 100644
> --- usr.sbin/vmd/mc146818.c
> +++ usr.sbin/vmd/mc146818.c
> @@ -34,7 +34,6 @@
>  #include "vmd.h"
>  #include "vmm.h"
>
> -#define MC_DIVIDER_MASK 0xe0
>  #define MC_RATE_MASK 0xf
>
>  #define NVRAM_CENTURY 0x32
> @@ -236,10 +235,6 @@ rtc_reschedule_per(void)
>  static void
>  rtc_update_rega(uint32_t data)
>  {
> -if ((data & MC_DIVIDER_MASK) != MC_BASE_32_KHz)
> -log_warnx("%s: set non-32KHz timebase not supported",
> -__func__);
> -
>  rtc.regs[MC_REGA] = data;
>  if ((rtc.regs[MC_REGA] ^ data) & 0x0f)
>  vm_pipe_send(&dev_pipe, MC146818_RESCHEDULE_PER);
>
>
> On Thu, Nov 18, 2021 at 8:02 AM Brian Conway  wrote:
> >
> > Per https://marc.info/?l=openbsd-misc=159113575425726 , mlarkin@
> > suggested someone can remove it. It's still pretty spammy at the
> > current time for me.
> >
> > Brian Conway
> > Software Engineer, Owner
> > RCE Software, LLC
> >
> > diff --git usr.sbin/vmd/mc146818.c usr.sbin/vmd/mc146818.c
> > index e3599c68504..17cf21221e5 100644
> > --- usr.sbin/vmd/mc146818.c
> > +++ usr.sbin/vmd/mc146818.c
> > @@ -236,10 +236,6 @@ rtc_reschedule_per(void)
> >  static void
> >  rtc_update_rega(uint32_t data)
> >  {
> > -if ((data & MC_DIVIDER_MASK) != MC_BASE_32_KHz)
> > -log_warnx("%s: set non-32KHz timebase not supported",
> > -__func__);
> > -
> >  rtc.regs[MC_REGA] = data;
> >  if ((rtc.regs[MC_REGA] ^ data) & 0x0f)
> >  vm_pipe_send(&dev_pipe, MC146818_RESCHEDULE_PER);
>



Re: mpsafe dwxe(4)

2022-01-04 Thread Mike Larkin
On Mon, Jan 03, 2022 at 09:24:15PM +1000, Jonathan Matthew wrote:
> This is almost identical to the changes I made to dwge(4) recently, since
> these drivers are very closely related.  Unfortunately the only machine I
> have with dwxe(4) in it is armv7, so I can't test this properly, but it
> does still work there.
>
> Could someone with an arm64 allwinner board try this out more extensively?
>

Tested on my sopine with dwxe(4) on GENERIC.MP and it seems to work fine.

>
> Index: if_dwxe.c
> ===
> RCS file: /cvs/src/sys/dev/fdt/if_dwxe.c,v
> retrieving revision 1.19
> diff -u -p -r1.19 if_dwxe.c
> --- if_dwxe.c 24 Oct 2021 17:52:26 -  1.19
> +++ if_dwxe.c 3 Jan 2022 11:21:19 -
> @@ -275,6 +275,7 @@ struct dwxe_softc {
>   bus_space_tag_t sc_iot;
>   bus_space_handle_t  sc_ioh;
>   bus_dma_tag_t   sc_dmat;
> + void*sc_ih;
>
>   struct arpcom   sc_ac;
>  #define sc_lladdrsc_ac.ac_enaddr
> @@ -287,7 +288,6 @@ struct dwxe_softc {
>   struct dwxe_buf *sc_txbuf;
>   struct dwxe_desc*sc_txdesc;
>   int sc_tx_prod;
> - int sc_tx_cnt;
>   int sc_tx_cons;
>
>   struct dwxe_dmamem  *sc_rxring;
> @@ -322,7 +322,7 @@ uint32_t dwxe_read(struct dwxe_softc *,
>  void dwxe_write(struct dwxe_softc *, bus_addr_t, uint32_t);
>
>  int  dwxe_ioctl(struct ifnet *, u_long, caddr_t);
> -void dwxe_start(struct ifnet *);
> +void dwxe_start(struct ifqueue *);
>  void dwxe_watchdog(struct ifnet *);
>
>  int  dwxe_media_change(struct ifnet *);
> @@ -345,7 +345,7 @@ void  dwxe_rx_proc(struct dwxe_softc *);
>  void dwxe_up(struct dwxe_softc *);
>  void dwxe_down(struct dwxe_softc *);
>  void dwxe_iff(struct dwxe_softc *);
> -int  dwxe_encap(struct dwxe_softc *, struct mbuf *, int *);
> +int  dwxe_encap(struct dwxe_softc *, struct mbuf *, int *, int *);
>
>  void dwxe_reset(struct dwxe_softc *);
>  void dwxe_stop_dma(struct dwxe_softc *);
> @@ -431,8 +431,9 @@ dwxe_attach(struct device *parent, struc
>   ifp = &sc->sc_ac.ac_if;
>   ifp->if_softc = sc;
>   ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
> + ifp->if_xflags = IFXF_MPSAFE;
>   ifp->if_ioctl = dwxe_ioctl;
> - ifp->if_start = dwxe_start;
> + ifp->if_qstart = dwxe_start;
>   ifp->if_watchdog = dwxe_watchdog;
>   ifq_set_maxlen(&ifp->if_snd, DWXE_NTXDESC - 1);
>   bcopy(sc->sc_dev.dv_xname, ifp->if_xname, IFNAMSIZ);
> @@ -460,8 +461,10 @@ dwxe_attach(struct device *parent, struc
>   if_attach(ifp);
>   ether_ifattach(ifp);
>
> - fdt_intr_establish(faa->fa_node, IPL_NET, dwxe_intr, sc,
> - sc->sc_dev.dv_xname);
> + sc->sc_ih = fdt_intr_establish(faa->fa_node, IPL_NET | IPL_MPSAFE,
> + dwxe_intr, sc, sc->sc_dev.dv_xname);
> + if (sc->sc_ih == NULL)
> + printf("%s: can't establish interrupt\n", sc->sc_dev.dv_xname);
>  }
>
>  void
> @@ -584,11 +587,12 @@ dwxe_lladdr_write(struct dwxe_softc *sc)
>  }
>
>  void
> -dwxe_start(struct ifnet *ifp)
> +dwxe_start(struct ifqueue *ifq)
>  {
> + struct ifnet *ifp = ifq->ifq_if;
>   struct dwxe_softc *sc = ifp->if_softc;
>   struct mbuf *m;
> - int error, idx;
> + int error, idx, left, used;
>
>   if (!(ifp->if_flags & IFF_RUNNING))
>   return;
> @@ -600,27 +604,29 @@ dwxe_start(struct ifnet *ifp)
>   return;
>
>   idx = sc->sc_tx_prod;
> - while ((sc->sc_txdesc[idx].sd_status & DWXE_TX_DESC_CTL) == 0) {
> - m = ifq_deq_begin(&ifp->if_snd);
> - if (m == NULL)
> + left = sc->sc_tx_cons;
> + if (left <= idx)
> + left += DWXE_NTXDESC;
> + left -= idx;
> + used = 0;
> +
> + for (;;) {
> + if (used + DWXE_NTXSEGS + 1 > left) {
> + ifq_set_oactive(ifq);
>   break;
> + }
>
> - error = dwxe_encap(sc, m, &idx);
> - if (error == ENOBUFS) {
> - ifq_deq_rollback(&ifp->if_snd, m);
> - ifq_set_oactive(&ifp->if_snd);
> + m = ifq_dequeue(ifq);
> + if (m == NULL)
>   break;
> - }
> +
> + error = dwxe_encap(sc, m, &idx, &used);
>   if (error == EFBIG) {
> - ifq_deq_commit(&ifp->if_snd, m);
>   m_freem(m); /* give up: drop it */
>   ifp->if_oerrors++;
>   continue;
>   }
>
> - /* Now we are committed to transmit the packet. */
> - ifq_deq_commit(&ifp->if_snd, m);
> -
>  #if NBPFILTER > 0
>   if (ifp->if_bpf)
>   bpf_mtap(ifp->if_bpf, m, BPF_DIRECTION_OUT);
> @@ -632,6 +638,9 @@ dwxe_start(struct ifnet *ifp)
>
>   /* Set a timeout in case the chip goes out to lunch. */
>   

Re: Remove gtp from amd64 GENERIC kernel

2022-01-03 Thread Mike Larkin
On Mon, Jan 03, 2022 at 06:43:29PM -0800, Greg Steuck wrote:
> Crystal Kolipe  writes:
>
> > The gpt driver was completely deleted from the tree in 2016, and removed 
> > from the i386 GENERIC config in revision 1.819.
> >
> > It has, however, remained in the amd64 GENERIC config commented out,
> > which seems like an oversight.
>
> I agree, thanks!
>
> >
> > This patch removes it from amd64 GENERIC:
>
> OK gnezdo, if somebody wants to commit. Or tell me that I should.
>

ok mlarkin if not done already

> >
> > --- GENERIC.origMon Jan  3 08:42:52 2022
> > +++ GENERIC Mon Jan  3 08:45:14 2022
> > @@ -670,11 +670,7 @@
> >
> >  bktr0  at pci?
> >
> > -# FM-Radio devices
> > -#gtp*  at pci? # Gemtek/Guillemot Radio PCI Radio Card
> > -
> >  # FM-Radio support
> > -#radio*at gtp?
> >  radio* at bktr?
> >
> >  #wdt0  at pci? # Ind Computer Source PCI-WDT50x driver
>



Re: vmm(4): restore vmcs after sleep points [vmx 2/3]

2021-12-03 Thread Mike Larkin
On Mon, Nov 29, 2021 at 08:41:22PM -0500, Dave Voutila wrote:
>
> Dave Voutila  writes:
>
> > This diff removes instability from VMX-based hosts by either removing
> > the possibility of the process sleeping while the VMCS is active or
> > reloading it if we had no choice.
> >
> > A mutex is added to help guard the VMCS state so testing with witness
> > has helped verify the diff.
> >
>
> Removed the mutex as it has served its purpose in ferreting out some
> sleep points.
>
> > The rwlock on the cpu originally used in the remote vmclear routine is
> > changed to a mutex accordingly.
> >
>
> Reverted this. This update doesn't change the rwlock to a mutex...it's
> fine if we sleep while we wait for a remote clear as it doesn't matter
> which CPU we wake up on as we're about to reload the VMCS anyways.
>
> > This diff does not remote possible calls to printf(9) via the DPRINTF
> > macro as that's part of the next diff.
> >
>
> Moot at this point.
>
> > One area of note: in vmx_load_pdptes() there's a XXX to call out that
> > because of the printf(9) call on failure to km_alloc that the VMCS is
> > potentially no longer valid. The upcoming diff to swap out printf(9) for
> > log(9) will remove that.
> >
>
> Revisited the above now that we're holding off on this printf -> log
> changeover.
>
> It was in the previous diff as well, but just to point out this removes
> the KERNEL_LOCK dance around uvm_fault. We were only doing this on Intel
> hosts as it wasn't understood (at that time) what was causing the VMCS
> corruption. AMD hosts haven't done this during nested page fault exit
> handling since my work to unlock vmm(4) at k2k21.
>
> ok?
>

ok mlarkin, and thanks for tracking these down.

-ml

>
> blob - 8e588f7dcbd1cec2e61e7b7292ee32ff4eb9a2e1
> blob + ac91b74fd4d5da774808ad1c78d75469ff89b458
> --- sys/arch/amd64/amd64/vmm.c
> +++ sys/arch/amd64/amd64/vmm.c
> @@ -3028,12 +3028,22 @@ vcpu_reset_regs_vmx(struct vcpu *vcpu, struct vcpu_reg
>   IA32_VMX_ACTIVATE_SECONDARY_CONTROLS, 1)) {
>   if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS,
>   IA32_VMX_ENABLE_VPID, 1)) {
> - if (vmm_alloc_vpid(&vpid)) {
> +
> + /* We may sleep during allocation, so reload VMCS. */
> + vcpu->vc_last_pcpu = curcpu();
> + ret = vmm_alloc_vpid(&vpid);
> + if (vcpu_reload_vmcs_vmx(vcpu)) {
> + printf("%s: failed to reload vmcs\n", __func__);
> + ret = EINVAL;
> + goto exit;
> + }
> + if (ret) {
>   DPRINTF("%s: could not allocate VPID\n",
>   __func__);
>   ret = EINVAL;
>   goto exit;
>   }
> +
>   if (vmwrite(VMCS_GUEST_VPID, vpid)) {
>   DPRINTF("%s: error setting guest VPID\n",
>   __func__);
> @@ -5549,7 +5559,7 @@ svm_handle_np_fault(struct vcpu *vcpu)
>   *
>   * Return Values:
>   *  0: if successful
> - *  EINVAL: if fault type could not be determined
> + *  EINVAL: if fault type could not be determined or VMCS reload fails
>   *  EAGAIN: if a protection fault occurred, ie writing to a read-only page
>   *  errno: if uvm_fault(9) fails to wire in the page
>   */
> @@ -5569,10 +5579,14 @@ vmx_fault_page(struct vcpu *vcpu, paddr_t gpa)
>   return (EAGAIN);
>   }
>
> - KERNEL_LOCK();
> + /* We may sleep during uvm_fault(9), so reload VMCS. */
> + vcpu->vc_last_pcpu = curcpu();
>   ret = uvm_fault(vcpu->vc_parent->vm_map, gpa, VM_FAULT_WIRE,
>   PROT_READ | PROT_WRITE | PROT_EXEC);
> - KERNEL_UNLOCK();
> + if (vcpu_reload_vmcs_vmx(vcpu)) {
> + printf("%s: failed to reload vmcs\n", __func__);
> + return (EINVAL);
> + }
>
>   if (ret)
>   printf("%s: uvm_fault returns %d, GPA=0x%llx, rip=0x%llx\n",
> @@ -5962,7 +5976,16 @@ vmx_load_pdptes(struct vcpu *vcpu)
>
>   ret = 0;
>
> - cr3_host_virt = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any, &kp_none, 
> &kd_waitok);
> + /* We may sleep during km_alloc(9), so reload VMCS. */
> + vcpu->vc_last_pcpu = curcpu();
> + cr3_host_virt = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any, &kp_none,
> + &kd_waitok);
> + if (vcpu_reload_vmcs_vmx(vcpu)) {
> + printf("%s: failed to reload vmcs\n", __func__);
> + ret = EINVAL;
> + goto exit;
> + }
> +
>   if (!cr3_host_virt) {
>   printf("%s: can't allocate address for guest CR3 mapping\n",
>   __func__);
> @@ -5998,7 +6021,15 @@ vmx_load_pdptes(struct vcpu *vcpu)
>
>  exit:
>   pmap_kremove(cr3_host_virt, PAGE_SIZE);
> +
> + /* km_free(9) might sleep, so we need to reload VMCS. */
> + vcpu->vc_last_pcpu = curcpu();
>

Re: vmm(4): bump vmclear spinout [vmx 1/3]

2021-11-28 Thread Mike Larkin
On Sun, Nov 28, 2021 at 10:32:47PM -0500, Dave Voutila wrote:
> Smallest of the VMX/VMCS stability diffs. This bumps the spinout to be
> the same number of ticks used by the mplock debug. This is needed on
> older/slower hosts.
>
> ok?
>
> -dv
>
> diff e8c587551f20ba6fdaa0f483ea768aade9f66f7d 
> 981a8cfd4e1dfe412e9c72fb5b47e7e46813bfbb
> blob - a7b21ec75899c81f076143fbe59f14279334ea09
> blob + e335a1dc5e8a400b4bbf49cac2ec8853dffcdae3
> --- sys/arch/amd64/amd64/vmm.c
> +++ sys/arch/amd64/amd64/vmm.c
> @@ -1373,7 +1373,7 @@ vmclear_on_cpu(struct cpu_info *ci)
>  static int
>  vmx_remote_vmclear(struct cpu_info *ci, struct vcpu *vcpu)
>  {
> - int ret = 0, nticks = 10;
> + int ret = 0, nticks = 2;
>
>   mtx_enter(&ci->ci_vmcs_mtx);
>   atomic_swap_ulong(&ci->ci_vmcs_pa, vcpu->vc_control_pa);

ok mlarkin



Re: vmm(4): copyout guest regs, irqready on VM_EXIT_NONE

2021-11-21 Thread Mike Larkin
On Sat, Nov 20, 2021 at 09:14:31PM -0500, Dave Voutila wrote:
> The below diff fixes an issue reported by kn@ on bugs@ [1]. joshe@ also
> observed the issue and confirmed the below diff resolves it.
>
> The symptoms were quite odd: errors from fdc(4) during an OpenBSD guest
> booting under vmm(4)/vmd(8). We don't emulate a floppy disk drive!!!
>
> I introduced a bug in r1.287 [2] when simplifying parts of
> vcpu_run_{svm,vmx} by letting the functions return 0 instead of
> voluntarily yielding. The edge case I didn't account for is if after a
> vmexit for an IN instruction, the io port address isn't one emulated by
> vmd(8) in userland, vmm(4) will perform the emulation (not the bug) by
> writing the appropriate number of 0xff bytes to AL/AX/EAX. IF the
> scheduler would like us to yield, we return setting a vrp exit code of
> VM_EXIT_NONE (since we aren't asking userland/vmd to help with any
> emulation).
>
> vmd(8) correctly handles this exit, but vmm(4) never copies out the
> current vcpu registers and irqready state. When vmd(8) runs the vcpu
> again, the vcpu's guest state still has a vmexit related to the IO
> operation and presumes vmd(8) modified RAX and overwrites the vcpu's
> RAX before re-entering the guest.
>
> This behavior occurs on both Intel and AMD. To confirm, I added some
> printfs to fdc(4) and specifically checked when the dma reads returned
> something other than 0xff on instances of both types of host. (Since
> it's probabilistic, it's not uncommon to see it happen only 3-4 times
> out of the 100k bus reads out_fdc() attempts, but it seems more
> reproducible on older hardware.)
>
> ok?
>
> -dv
>

ok mlarkin if not already committed

> [1] https://marc.info/?l=openbsd-bugs&m=163682062027764&w=2
> [2] 
> https://cvsweb.openbsd.org/cgi-bin/cvsweb/src/sys/arch/amd64/amd64/vmm.c.diff?r1=1.286&r2=1.287
>
>
> Index: sys/arch/amd64/amd64/vmm.c
> ===
> RCS file: /opt/cvs/src/sys/arch/amd64/amd64/vmm.c,v
> retrieving revision 1.294
> diff -u -p -r1.294 vmm.c
> --- sys/arch/amd64/amd64/vmm.c26 Oct 2021 16:29:49 -  1.294
> +++ sys/arch/amd64/amd64/vmm.c20 Nov 2021 21:46:07 -
> @@ -4301,9 +4301,10 @@ vm_run(struct vm_run_params *vrp)
>   rw_exit_write(&vmm_softc->vm_lock);
>   }
>   ret = 0;
> - } else if (ret == EAGAIN) {
> + } else if (ret == 0 || ret == EAGAIN) {
>   /* If we are exiting, populate exit data so vmd can help. */
> - vrp->vrp_exit_reason = vcpu->vc_gueststate.vg_exit_reason;
> + vrp->vrp_exit_reason = (ret == 0) ? VM_EXIT_NONE
> + : vcpu->vc_gueststate.vg_exit_reason;
>   vrp->vrp_irqready = vcpu->vc_irqready;
>   vcpu->vc_state = VCPU_STATE_STOPPED;
>
> @@ -4312,9 +4313,6 @@ vm_run(struct vm_run_params *vrp)
>   ret = EFAULT;
>   } else
>   ret = 0;
> - } else if (ret == 0) {
> - vrp->vrp_exit_reason = VM_EXIT_NONE;
> - vcpu->vc_state = VCPU_STATE_STOPPED;
>   } else {
>   vrp->vrp_exit_reason = VM_EXIT_TERMINATED;
>   vcpu->vc_state = VCPU_STATE_TERMINATED;



Re: vmd(8): fix broken bootorder for cdrom

2021-11-04 Thread Mike Larkin
On Thu, Nov 04, 2021 at 08:09:16PM +0100, Jan Klemkow wrote:
> On Thu, Nov 04, 2021 at 10:43:46AM -0400, Dave Voutila wrote:
> > Jan Klemkow  writes:
> > > This fix [1] in seabios breaks our "boot device cdrom" feature.
> > >
> > > # vmctl start -Lc -d disk.img -r cd70.iso -B cdrom vm
> > > ...
> > > No bootable device.  Retrying in 60 seconds.
> > >
> > > # vmctl start -Lc -d disk.img -r cd70.iso vm
> > > doas vmctl start -c -r cd70.iso vm
> > > ...
> > > CD-ROM: E0
> > > Loading /7.0/AMD64/CDBOOT
> > > probing: pc0 com0 mem[638K 510M a20=on]
> > > disk: cd0
> > >>> OpenBSD/amd64 CDBOOT 3.53
> > > boot>
> > >
> > > The diff below, fixes the lun number of the bootorder string for cdrom.
> > >
> > > OK?
> >
> > This change definitely fixes -B cdrom, but -B disk seems broken as well.
> >
> > ok dv to fix the -B cdrom issue, but do you also have an idea how to fix
> > the -B disk option?
>
> The diff below, fixes the -B disk option. BUT...
>
> The bootorder for disk and cdrom are hard coded strings for a runtime
> dynamic PCI bus.  The current disk bootorder string just works, if there
> is no network device configured.  With the diff below, it will work, if
> there is just one network device.  The current cdrom bootorder string
> just works, with one network and one disk device, or with no network and
> two disk devices.
>
> One example of vmd(8)'s PCI bus:
>
> PCI: init bdf=00:00.0 id=0b5d:0666/* VMM Host */
> PCI: init bdf=00:01.0 id=1af4:1005/* Virtio RNG */
> PCI: init bdf=00:02.0 id=1af4:1000/* Virtio Network */
> PCI: init bdf=00:03.0 id=1af4:1001/* Virtio Storage (disk) */
> PCI: init bdf=00:04.0 id=1af4:1004/* Virtio SCSI (cdrom) */
> PCI: init bdf=00:05.0 id=0b5d:0777/* VMM Control */
>
> We should assemble dynamic bootorder strings, which fits to our dynamic
> assembled PCI bus.  This would be a general solution for this problem.
>
> For now, this diff will fix the -B disk option for the most common case
> of one NIC.  Which fits to the cdrom bootorder string for one NIC, one
> disk and a cdrom.
>
> OK?
>
> bye,
> Jan
>
> Index: fw_cfg.c
> ===
> RCS file: /cvs/src/usr.sbin/vmd/fw_cfg.c,v
> retrieving revision 1.4
> diff -u -p -r1.4 fw_cfg.c
> --- fw_cfg.c  4 Nov 2021 17:50:05 -   1.4
> +++ fw_cfg.c  4 Nov 2021 18:48:48 -
> @@ -77,7 +77,7 @@ fw_cfg_init(struct vmop_create_params *v
>
>   switch (vmc->vmc_bootdevice) {
>   case VMBOOTDEV_DISK:
> - bootorder = "/pci@i0cf8/*@2\nHALT";
> + bootorder = "/pci@i0cf8/*@3\nHALT";
>   break;
>   case VMBOOTDEV_CDROM:
>   bootorder = "/pci@i0cf8/*@4/*@0/*@0,4100\nHALT";
>

Thanks. ok mlarkin@

And I do agree that we probably need dynamic bootorder strings.

-ml



Re: More pchgpio(4)

2021-10-21 Thread Mike Larkin
On Tue, Oct 12, 2021 at 01:19:55PM -0700, Mike Larkin wrote:
> On Sun, Oct 10, 2021 at 11:42:31PM +0200, Mark Kettenis wrote:
> > > Date: Sat, 9 Oct 2021 22:27:52 +0200 (CEST)
> > > From: Mark Kettenis 
> > >
> > > > Date: Sat, 9 Oct 2021 20:55:10 +0200 (CEST)
> > > > From: Mark Kettenis 
> > > >
> > > > This time adding support for Sunrisepoint-H and Sunrisepoint-LP.
> > > > Because of all the failed attempts by Intel to get their 10nm process
> > > > under control, this may cover Intel Mobile CPUs marketed as 6th, 7th,
> > > > 8th, 9th and 10th generation.  So if you have a Laptop that isn't at
> > > > least 5 years old, give this a try if pchgpio(4) doesn't attach.  This
> > > > may fix all sorts of issues with keyboards, touchpads or
> > > > suspend/resume.
> > > >
> > > > ok?
> > >
> > > Updated diff that masks unhandled interrupts like we do in amdgpio(4).
> >
> > And another update to fix a typo in the pin groups for Sunrisepoint-LP.
> >
> >
>
> Thanks. I'll give this a try but it might take me a couple days since I'm
> traveling.
>
> -ml
>

Sorry for taking so long to get to this. The new diff below didn't seem to make
any difference, the Sgo3 is still very very slow with this, same interrupt 
storm.

-ml

> > Index: dev/acpi/pchgpio.c
> > ===
> > RCS file: /cvs/src/sys/dev/acpi/pchgpio.c,v
> > retrieving revision 1.8
> > diff -u -p -r1.8 pchgpio.c
> > --- dev/acpi/pchgpio.c  29 Sep 2021 22:03:33 -  1.8
> > +++ dev/acpi/pchgpio.c  10 Oct 2021 21:40:45 -
> > @@ -107,13 +107,76 @@ struct cfdriver pchgpio_cd = {
> >  };
> >
> >  const char *pchgpio_hids[] = {
> > +   "INT344B",
> > "INT3450",
> > +   "INT3451",
> > +   "INT345D",
> > "INT34BB",
> > "INT34C5",
> > "INT34C6",
> > NULL
> >  };
> >
> > +/* Sunrisepoint-LP */
> > +
> > +const struct pchgpio_group spt_lp_groups[] =
> > +{
> > +   /* Community 0 */
> > +   { 0, 0, 0, 23, 0 }, /* GPP_A */
> > +   { 0, 1, 24, 47, 24 },   /* GPP_B */
> > +
> > +   /* Community 1 */
> > +   { 1, 0, 48, 71, 48 },   /* GPP_C */
> > +   { 1, 1, 72, 95, 72 },   /* GPP_D */
> > +   { 1, 2, 96, 119, 96 },  /* GPP_E */
> > +
> > +   /* Community 3 */
> > +   { 2, 0, 120, 143, 120 },/* GPP_F */
> > +   { 2, 1, 144, 151, 144 },/* GPP_G */
> > +};
> > +
> > +const struct pchgpio_device spt_lp_device =
> > +{
> > +   .pad_size = 16,
> > +   .gpi_is = 0x100,
> > +   .gpi_ie = 0x120,
> > +   .groups = spt_lp_groups,
> > +   .ngroups = nitems(spt_lp_groups),
> > +   .npins = 176,
> > +};
> > +
> > +/* Sunrisepoint-H */
> > +
> > +const struct pchgpio_group spt_h_groups[] =
> > +{
> > +   /* Community 0 */
> > +   { 0, 0, 0, 23, 0 }, /* GPP_A */
> > +   { 0, 1, 24, 47, 24 },   /* GPP_B */
> > +
> > +   /* Community 1 */
> > +   { 1, 0, 48, 71, 48 },   /* GPP_C */
> > +   { 1, 1, 72, 95, 72 },   /* GPP_D */
> > +   { 1, 2, 96, 108, 96 },  /* GPP_E */
> > +   { 1, 3, 109, 132, 120 },/* GPP_F */
> > +   { 1, 4, 133, 156, 144 },/* GPP_G */
> > +   { 1, 5, 157, 180, 168 },/* GPP_H */
> > +
> > +   /* Community 3 */
> > +   { 2, 0, 181, 191, 192 },/* GPP_I */
> > +};
> > +
> > +const struct pchgpio_device spt_h_device =
> > +{
> > +   .pad_size = 16,
> > +   .gpi_is = 0x100,
> > +   .gpi_ie = 0x120,
> > +   .groups = spt_h_groups,
> > +   .ngroups = nitems(spt_h_groups),
> > +   .npins = 224,
> > +};
> > +
> > +/* Cannon Lake-H */
> > +
> >  const struct pchgpio_group cnl_h_groups[] =
> >  {
> > /* Community 0 */
> > @@ -146,6 +209,8 @@ const struct pchgpio_device cnl_h_device
> > .npins = 384,
> >  };
> >
> > +/* Cannon Lake-LP */
> > +
> >  const struct pchgpio_group cnl_lp_groups[] =
> >  {
> > /* Community 0 */
> > @@ -173,6 +238,8 @@ const struct pchgpio_device cnl_lp_devic
> > .npins = 320,
> >  };
> >
> > +/* Tiger Lake-LP */
> > +
> >  const struct pchgpio_group tgl_lp_groups[] =
> >  {
> > /* Community 0 */
> >

Re: More pchgpio(4)

2021-10-12 Thread Mike Larkin
On Sun, Oct 10, 2021 at 11:42:31PM +0200, Mark Kettenis wrote:
> > Date: Sat, 9 Oct 2021 22:27:52 +0200 (CEST)
> > From: Mark Kettenis 
> >
> > > Date: Sat, 9 Oct 2021 20:55:10 +0200 (CEST)
> > > From: Mark Kettenis 
> > >
> > > This time adding support for Sunrisepoint-H and Sunrisepoint-LP.
> > > Because of all the failed attempts by Intel to get their 10nm process
> > > under control, this may cover Intel Mobile CPUs marketed as 6th, 7th,
> > > 8th, 9th and 10th generation.  So if you have a Laptop that isn't at
> > > least 5 years old, give this a try if pchgpio(4) doesn't attach.  This
> > > may fix all sorts of issues with keyboards, touchpads or
> > > suspend/resume.
> > >
> > > ok?
> >
> > Updated diff that masks unhandled interrupts like we do in amdgpio(4).
>
> And another update to fix a typo in the pin groups for Sunrisepoint-LP.
>
>

Thanks. I'll give this a try but it might take me a couple days since I'm
traveling.

-ml

> Index: dev/acpi/pchgpio.c
> ===
> RCS file: /cvs/src/sys/dev/acpi/pchgpio.c,v
> retrieving revision 1.8
> diff -u -p -r1.8 pchgpio.c
> --- dev/acpi/pchgpio.c29 Sep 2021 22:03:33 -  1.8
> +++ dev/acpi/pchgpio.c10 Oct 2021 21:40:45 -
> @@ -107,13 +107,76 @@ struct cfdriver pchgpio_cd = {
>  };
>
>  const char *pchgpio_hids[] = {
> + "INT344B",
>   "INT3450",
> + "INT3451",
> + "INT345D",
>   "INT34BB",
>   "INT34C5",
>   "INT34C6",
>   NULL
>  };
>
> +/* Sunrisepoint-LP */
> +
> +const struct pchgpio_group spt_lp_groups[] =
> +{
> + /* Community 0 */
> + { 0, 0, 0, 23, 0 }, /* GPP_A */
> + { 0, 1, 24, 47, 24 },   /* GPP_B */
> +
> + /* Community 1 */
> + { 1, 0, 48, 71, 48 },   /* GPP_C */
> + { 1, 1, 72, 95, 72 },   /* GPP_D */
> + { 1, 2, 96, 119, 96 },  /* GPP_E */
> +
> + /* Community 3 */
> + { 2, 0, 120, 143, 120 },/* GPP_F */
> + { 2, 1, 144, 151, 144 },/* GPP_G */
> +};
> +
> +const struct pchgpio_device spt_lp_device =
> +{
> + .pad_size = 16,
> + .gpi_is = 0x100,
> + .gpi_ie = 0x120,
> + .groups = spt_lp_groups,
> + .ngroups = nitems(spt_lp_groups),
> + .npins = 176,
> +};
> +
> +/* Sunrisepoint-H */
> +
> +const struct pchgpio_group spt_h_groups[] =
> +{
> + /* Community 0 */
> + { 0, 0, 0, 23, 0 }, /* GPP_A */
> + { 0, 1, 24, 47, 24 },   /* GPP_B */
> +
> + /* Community 1 */
> + { 1, 0, 48, 71, 48 },   /* GPP_C */
> + { 1, 1, 72, 95, 72 },   /* GPP_D */
> + { 1, 2, 96, 108, 96 },  /* GPP_E */
> + { 1, 3, 109, 132, 120 },/* GPP_F */
> + { 1, 4, 133, 156, 144 },/* GPP_G */
> + { 1, 5, 157, 180, 168 },/* GPP_H */
> +
> + /* Community 3 */
> + { 2, 0, 181, 191, 192 },/* GPP_I */
> +};
> +
> +const struct pchgpio_device spt_h_device =
> +{
> + .pad_size = 16,
> + .gpi_is = 0x100,
> + .gpi_ie = 0x120,
> + .groups = spt_h_groups,
> + .ngroups = nitems(spt_h_groups),
> + .npins = 224,
> +};
> +
> +/* Cannon Lake-H */
> +
>  const struct pchgpio_group cnl_h_groups[] =
>  {
>   /* Community 0 */
> @@ -146,6 +209,8 @@ const struct pchgpio_device cnl_h_device
>   .npins = 384,
>  };
>
> +/* Cannon Lake-LP */
> +
>  const struct pchgpio_group cnl_lp_groups[] =
>  {
>   /* Community 0 */
> @@ -173,6 +238,8 @@ const struct pchgpio_device cnl_lp_devic
>   .npins = 320,
>  };
>
> +/* Tiger Lake-LP */
> +
>  const struct pchgpio_group tgl_lp_groups[] =
>  {
>   /* Community 0 */
> @@ -205,6 +272,8 @@ const struct pchgpio_device tgl_lp_devic
>   .npins = 360,
>  };
>
> +/* Tiger Lake-H */
> +
>  const struct pchgpio_group tgl_h_groups[] =
>  {
>   /* Community 0 */
> @@ -242,7 +311,10 @@ const struct pchgpio_device tgl_h_device
>  };
>
>  struct pchgpio_match pchgpio_devices[] = {
> + { "INT344B", _lp_device },
>   { "INT3450", _h_device },
> + { "INT3451", _h_device },
> + { "INT345D", _h_device },
>   { "INT34BB", _lp_device },
>   { "INT34C5", _lp_device },
>   { "INT34C6", _h_device },
> @@ -473,11 +545,38 @@ pchgpio_intr_establish(void *cookie, int
>  }
>
>  int
> +pchgpio_intr_handle(struct pchgpio_softc *sc, int group, int bit)
> +{
> + uint32_t enable;
> + int gpiobase, pin, handled = 0;
> + uint8_t bank, bar;
> +
> + bar = sc->sc_device->groups[group].bar;
> + bank = sc->sc_device->groups[group].bank;
> + gpiobase = sc->sc_device->groups[group].gpiobase;
> +
> + pin = gpiobase + bit;
> + if (sc->sc_pin_ih[pin].ih_func) {
> + sc->sc_pin_ih[pin].ih_func(sc->sc_pin_ih[pin].ih_arg);
> + handled = 1;
> + } else {
> + /* Mask unhandled interrupt. */
> + enable = bus_space_read_4(sc->sc_memt[bar], sc->sc_memh[bar],
> + 

Re: vmm(4): set global vcpu limit to 512

2021-09-11 Thread Mike Larkin
On Sat, Sep 11, 2021 at 01:44:33PM -0400, Dave Voutila wrote:
> Syzbot recently discovered that since we don't have any bounds in place
> for number of vms or vcpus it's possible to completely exhaust kernel
> memory or at least put the system in a state where malloc(9) or
> km_alloc(9) fail in systems (e.g. DRM, unveil, etc.) resulting in
> panics. Actually, it first discovered some lock ordering issues, but
> once those were fixed this issue surfaced via the reproducer [1].
>
> I chose 512 as a conservative bound based on the idea that vcpu's have a
> few wired pages of memory each for various VMX/SVM things like VMCS/VMCB
> structures.
>
> Given we also wire guest memory on a page fault and only support 1 vcpu
> per guest currently, it's highly unlikely someone is successfully
> running 512 guests. Once we finish fixing the tlb issues forcing us to
> wire or implement SMP, we can revisit this number.
>
> I checked with openbsd.amsterdam and this is well over their current
> densities. (If anyone *IS* somehow running > 512 guests as of this
> moment, please speak up.)
>
> ok?
>
> [1] https://syzkaller.appspot.com/text?tag=ReproC=11f507de30
>

ok mlarkin

> Index: sys/arch/amd64/amd64/vmm.c
> ===
> RCS file: /cvs/src/sys/arch/amd64/amd64/vmm.c,v
> retrieving revision 1.292
> diff -u -p -r1.292 vmm.c
> --- sys/arch/amd64/amd64/vmm.c5 Sep 2021 16:36:34 -   1.292
> +++ sys/arch/amd64/amd64/vmm.c11 Sep 2021 17:36:28 -
> @@ -99,6 +99,9 @@ struct vmm_softc {
>
>   int mode;
>
> + size_t  vcpu_ct;
> + size_t  vcpu_max;
> +
>   struct rwlock   vm_lock;
>   size_t  vm_ct;  /* number of in-memory VMs */
>   size_t  vm_idx; /* next unique VM index */
> @@ -368,6 +371,8 @@ vmm_attach(struct device *parent, struct
>   sc->nr_svm_cpus = 0;
>   sc->nr_rvi_cpus = 0;
>   sc->nr_ept_cpus = 0;
> + sc->vcpu_ct = 0;
> + sc->vcpu_max = VMM_MAX_VCPUS;
>   sc->vm_ct = 0;
>   sc->vm_idx = 0;
>
> @@ -1498,6 +1503,15 @@ vm_create(struct vm_create_params *vcp,
>   if (vcp->vcp_ncpus != 1)
>   return (EINVAL);
>
> + rw_enter_write(&vmm_softc->vm_lock);
> + if (vmm_softc->vcpu_ct + vcp->vcp_ncpus > vmm_softc->vcpu_max) {
> + printf("%s: maximum vcpus (%lu) reached\n", __func__,
> + vmm_softc->vcpu_max);
> + rw_exit_write(&vmm_softc->vm_lock);
> + return (ENOMEM);
> + }
> + vmm_softc->vcpu_ct += vcp->vcp_ncpus;
> +
>   vm = pool_get(_pool, PR_WAITOK | PR_ZERO);
>   SLIST_INIT(&vm->vm_vcpu_list);
>   rw_init(&vm->vm_vcpu_lock, "vcpu_list");
> @@ -1509,8 +1523,6 @@ vm_create(struct vm_create_params *vcp,
>   vm->vm_memory_size = memsize;
>   strncpy(vm->vm_name, vcp->vcp_name, VMM_MAX_NAME_LEN - 1);
>
> - rw_enter_write(&vmm_softc->vm_lock);
> -
>   if (vm_impl_init(vm, p)) {
>   printf("failed to init arch-specific features for vm %p\n", vm);
>   vm_teardown(vm);
> @@ -3784,6 +3796,7 @@ vm_teardown(struct vm *vm)
>   SLIST_REMOVE(&vm->vm_vcpu_list, vcpu, vcpu, vc_vcpu_link);
>   vcpu_deinit(vcpu);
>   pool_put(_pool, vcpu);
> + vmm_softc->vcpu_ct--;
>   }
>
>   vm_impl_deinit(vm);
> Index: sys/arch/amd64/include/vmmvar.h
> ===
> RCS file: /cvs/src/sys/arch/amd64/include/vmmvar.h,v
> retrieving revision 1.73
> diff -u -p -r1.73 vmmvar.h
> --- sys/arch/amd64/include/vmmvar.h   31 Aug 2021 17:40:59 -  1.73
> +++ sys/arch/amd64/include/vmmvar.h   11 Sep 2021 17:36:28 -
> @@ -29,6 +29,7 @@
>  #define VMM_MAX_PATH_CDROM   128
>  #define VMM_MAX_NAME_LEN 64
>  #define VMM_MAX_KERNEL_PATH  128
> +#define VMM_MAX_VCPUS512
>  #define VMM_MAX_VCPUS_PER_VM 64
>  #define VMM_MAX_VM_MEM_SIZE  32768
>  #define VMM_MAX_NICS_PER_VM  4
>



Re: updated patch for iwx(4) Tx aggregation

2021-09-11 Thread Mike Larkin
On Sat, Sep 11, 2021 at 02:04:32PM +0200, Stefan Sperling wrote:
> On Fri, Sep 10, 2021 at 06:49:49PM +0200, Stefan Sperling wrote:
> > Here is another attempt at adding Tx aggregation to iwx(4).
> > This patch is based on the latest state in CVS (if_iwx.c r1.107, which
> > I have committed a minute ago). Sync your tree before applying this patch.
> >
> > Compared to previous iterations of this patch, I have fixed bugs which
> > caused fatal firmware errors and which made traffic stall after roaming.
> >
> > This patch could still make 7.0 release if it gets sufficient test coverage.
> > Please run with this and report any regressions. Thanks!
> >
> > So far, tested by me on AX200 and AX201 against a Pepwave 11ac AP.
> > I have so far not seen any fatal firmware errors, and roaming between 2GHz
> > and 5GHz channels offered by the same AP seems to work reliably.
> > Throughput goes up to 100 Mbit/s max.
>
> The previous version had a problem where it did not take frames
> off the Tx ring when they were done. It is possible that this
> could lead to memory corruption (seen by mlarkin).
>
> Please run this updated patch instead.
>
> And please enable 'ifconfig iwx0 debug' while testing this patch.
> Problem reports will be a lot more useful with debug enabled :)
>

This diff seems to resolve the NFS related corruption I was seeing.

Thanks!

-ml

> diff refs/heads/iwx-resume2 refs/heads/iwx-txagg
> blob - 4cfc91b7f4819a1a9f50fdaac339a78f67d1ab5a
> blob + 9e31a8d0bb5c9ba1fad3614fe6dcb5ebdcd33403
> --- sys/dev/pci/if_iwx.c
> +++ sys/dev/pci/if_iwx.c
> @@ -318,18 +318,16 @@ int iwx_ampdu_rx_start(struct ieee80211com *, 
> struct i
>   uint8_t);
>  void iwx_ampdu_rx_stop(struct ieee80211com *, struct ieee80211_node *,
>   uint8_t);
> +int  iwx_ampdu_tx_start(struct ieee80211com *, struct ieee80211_node *,
> + uint8_t);
>  void iwx_rx_ba_session_expired(void *);
>  void iwx_rx_bar_frame_release(struct iwx_softc *, struct iwx_rx_packet *,
>   struct iwx_rx_data *, struct mbuf_list *);
>  void iwx_reorder_timer_expired(void *);
>  void iwx_sta_rx_agg(struct iwx_softc *, struct ieee80211_node *, uint8_t,
>   uint16_t, uint16_t, int, int);
> -#ifdef notyet
> -int  iwx_ampdu_tx_start(struct ieee80211com *, struct ieee80211_node *,
> +void iwx_sta_tx_agg_start(struct iwx_softc *, struct ieee80211_node *,
>   uint8_t);
> -void iwx_ampdu_tx_stop(struct ieee80211com *, struct ieee80211_node *,
> - uint8_t);
> -#endif
>  void iwx_ba_task(void *);
>
>  int  iwx_set_mac_addr_from_csr(struct iwx_softc *, struct iwx_nvm_data *);
> @@ -355,10 +353,13 @@ int iwx_ccmp_decap(struct iwx_softc *, struct mbuf 
> *,
>   struct ieee80211_node *, struct ieee80211_rxinfo *);
>  void iwx_rx_frame(struct iwx_softc *, struct mbuf *, int, uint32_t, int, int,
>   uint32_t, struct ieee80211_rxinfo *, struct mbuf_list *);
> -void iwx_rx_tx_cmd_single(struct iwx_softc *, struct iwx_rx_packet *,
> - struct iwx_node *);
> +void iwx_clear_tx_desc(struct iwx_softc *, struct iwx_tx_ring *, int);
> +void iwx_txd_done(struct iwx_softc *, struct iwx_tx_data *);
> +void iwx_tx_ba_move_window(struct ieee80211com *, int, struct mbuf *);
> +void iwx_txq_advance(struct iwx_softc *, struct iwx_tx_ring *, int);
>  void iwx_rx_tx_cmd(struct iwx_softc *, struct iwx_rx_packet *,
>   struct iwx_rx_data *);
> +void iwx_clear_oactive(struct iwx_softc *, struct iwx_tx_ring *);
>  void iwx_rx_bmiss(struct iwx_softc *, struct iwx_rx_packet *,
>   struct iwx_rx_data *);
>  int  iwx_binding_cmd(struct iwx_softc *, struct iwx_node *, uint32_t);
> @@ -382,8 +383,11 @@ void iwx_cmd_done(struct iwx_softc *, int, int, int);
>  const struct iwx_rate *iwx_tx_fill_cmd(struct iwx_softc *, struct iwx_node *,
>   struct ieee80211_frame *, struct iwx_tx_cmd_gen2 *);
>  void iwx_tx_update_byte_tbl(struct iwx_tx_ring *, int, uint16_t, uint16_t);
> -int  iwx_tx(struct iwx_softc *, struct mbuf *, struct ieee80211_node *, int);
> -int  iwx_flush_tx_path(struct iwx_softc *);
> +int  iwx_tx(struct iwx_softc *, struct mbuf *, struct ieee80211_node *);
> +int  iwx_flush_sta_tids(struct iwx_softc *, int, uint16_t);
> +int  iwx_wait_tx_queues_empty(struct iwx_softc *);
> +int  iwx_drain_sta(struct iwx_softc *sc, struct iwx_node *, int);
> +int  iwx_flush_sta(struct iwx_softc *, struct iwx_node *);
>  int  iwx_beacon_filter_send_cmd(struct iwx_softc *,
>   struct iwx_beacon_filter_cmd *);
>  int  iwx_update_beacon_abort(struct iwx_softc *, struct iwx_node *, int);
> @@ -396,6 +400,7 @@ int   iwx_disable_beacon_filter(struct iwx_softc *);
>  int  iwx_add_sta_cmd(struct iwx_softc *, struct iwx_node *, int);
>  int  iwx_add_aux_sta(struct iwx_softc *);
>  int  iwx_rm_sta_cmd(struct iwx_softc *, struct iwx_node *);
> +int  iwx_rm_sta(struct iwx_softc *, struct iwx_node *);
>  int  iwx_fill_probe_req(struct iwx_softc *, struct iwx_scan_probe_req *);
>  int  

Re: let iwx(4) resume in the acpi thread

2021-09-10 Thread Mike Larkin
On Fri, Sep 10, 2021 at 11:12:45AM +0200, Stefan Sperling wrote:
> On Fri, Sep 10, 2021 at 10:59:25AM +0200, Stefan Sperling wrote:
> > On Fri, Sep 10, 2021 at 10:58:47AM +0200, Stefan Sperling wrote:
> > > All those changes are shown below. My device is still happy with this.
> > > I will reply with a new full diff against -current next.
> >
> > Full diff:
>
> Just realized that because iwx_resume() can no longer return an error
> we can make it a void function. And simplify the DVACT_RESUME/DVACT_WAKEUP
> case statements in iwx_activate() accordingly.
>
> Sorry about sending too many diffs :)
>

LGTM, ok mlarkin

> diff c9db663b670f8930f62c8f20c36e84d72697f036 refs/heads/iwx-resume2
> blob - 51063c862bfc0cf2dc9fbe3f41628bbdbdf3486e
> blob + 4cfc91b7f4819a1a9f50fdaac339a78f67d1ab5a
> --- sys/dev/pci/if_iwx.c
> +++ sys/dev/pci/if_iwx.c
> @@ -489,7 +489,8 @@ void  iwx_attach_hook(struct device *);
>  void iwx_attach(struct device *, struct device *, void *);
>  void iwx_init_task(void *);
>  int  iwx_activate(struct device *, int);
> -int  iwx_resume(struct iwx_softc *);
> +void iwx_resume(struct iwx_softc *);
> +int  iwx_wakeup(struct iwx_softc *);
>
>  #if NBPFILTER > 0
>  void iwx_radiotap_attach(struct iwx_softc *);
> @@ -1913,11 +1914,8 @@ int
>  iwx_check_rfkill(struct iwx_softc *sc)
>  {
>   uint32_t v;
> - int s;
>   int rv;
>
> - s = splnet();
> -
>   /*
>* "documentation" is not really helpful here:
>*  27: HW_RF_KILL_SW
> @@ -1933,7 +1931,6 @@ iwx_check_rfkill(struct iwx_softc *sc)
>   sc->sc_flags &= ~IWX_FLAG_RFKILL;
>   }
>
> - splx(s);
>   return rv;
>  }
>
> @@ -1986,8 +1983,6 @@ iwx_restore_interrupts(struct iwx_softc *sc)
>  void
>  iwx_disable_interrupts(struct iwx_softc *sc)
>  {
> - int s = splnet();
> -
>   if (!sc->sc_msix) {
>   IWX_WRITE(sc, IWX_CSR_INT_MASK, 0);
>
> @@ -2000,8 +1995,6 @@ iwx_disable_interrupts(struct iwx_softc *sc)
>   IWX_WRITE(sc, IWX_CSR_MSIX_HW_INT_MASK_AD,
>   sc->sc_hw_init_mask);
>   }
> -
> - splx(s);
>  }
>
>  void
> @@ -7822,16 +7815,6 @@ iwx_init_hw(struct iwx_softc *sc)
>   struct ieee80211com *ic = &sc->sc_ic;
>   int err, i;
>
> - err = iwx_preinit(sc);
> - if (err)
> - return err;
> -
> - err = iwx_start_hw(sc);
> - if (err) {
> - printf("%s: could not initialize hardware\n", DEVNAME(sc));
> - return err;
> - }
> -
>   err = iwx_run_init_mvm_ucode(sc, 0);
>   if (err)
>   return err;
> @@ -7984,6 +7967,16 @@ iwx_init(struct ifnet *ifp)
>   KASSERT(sc->task_refs.refs == 0);
>   refcnt_init(&sc->task_refs);
>
> + err = iwx_preinit(sc);
> + if (err)
> + return err;
> +
> + err = iwx_start_hw(sc);
> + if (err) {
> + printf("%s: could not initialize hardware\n", DEVNAME(sc));
> + return err;
> + }
> +
>   err = iwx_init_hw(sc);
>   if (err) {
>   if (generation == sc->sc_generation)
> @@ -9281,7 +9274,10 @@ iwx_attach(struct device *parent, struct device *self,
>   return;
>   }
>
> - /* Clear device-specific "PCI retry timeout" register (41h). */
> + /*
> +  * We disable the RETRY_TIMEOUT register (0x41) to keep
> +  * PCI Tx retries from interfering with C3 CPU state.
> +  */
>   reg = pci_conf_read(sc->sc_pct, sc->sc_pcitag, 0x40);
>   pci_conf_write(sc->sc_pct, sc->sc_pcitag, 0x40, reg & ~0xff00);
>
> @@ -9568,12 +9564,15 @@ iwx_init_task(void *arg1)
>   splx(s);
>  }
>
> -int
> +void
>  iwx_resume(struct iwx_softc *sc)
>  {
>   pcireg_t reg;
>
> - /* Clear device-specific "PCI retry timeout" register (41h). */
> + /*
> +  * We disable the RETRY_TIMEOUT register (0x41) to keep
> +  * PCI Tx retries from interfering with C3 CPU state.
> +  */
>   reg = pci_conf_read(sc->sc_pct, sc->sc_pcitag, 0x40);
>   pci_conf_write(sc->sc_pct, sc->sc_pcitag, 0x40, reg & ~0xff00);
>
> @@ -9588,8 +9587,34 @@ iwx_resume(struct iwx_softc *sc)
>   }
>
>   iwx_disable_interrupts(sc);
> +}
>
> - return iwx_start_hw(sc);
> +int
> +iwx_wakeup(struct iwx_softc *sc)
> +{
> + struct ieee80211com *ic = &sc->sc_ic;
> + struct ifnet *ifp = &sc->sc_ic.ic_if;
> + int err;
> +
> + refcnt_init(&sc->task_refs);
> +
> + err = iwx_start_hw(sc);
> + if (err)
> + return err;
> +
> + err = iwx_init_hw(sc);
> + if (err)
> + return err;
> +
> + ifq_clr_oactive(&ifp->if_snd);
> + ifp->if_flags |= IFF_RUNNING;
> +
> + if (ic->ic_opmode == IEEE80211_M_MONITOR)
> + ieee80211_new_state(ic, IEEE80211_S_RUN, -1);
> + else
> + ieee80211_begin_scan(ifp);
> +
> + return 0;
>  }
>
>  int
> @@ -9608,15 +9633,15 @@ iwx_activate(struct device *self, int act)
>   }
>   break;
>   case DVACT_RESUME:
> - err = 

Re: iwx(4) firmware memory fixes

2021-09-09 Thread Mike Larkin
On Wed, Sep 08, 2021 at 02:08:36PM +0200, Stefan Sperling wrote:
> Add a missing call to iwx_ctxt_info_free_fw_img() in an error path
> of iwx_ctxt_info_init() which should always free on error.
>
> Also, free firmware paging DMA memory in case loading firmware has failed.
> If we don't free paging on error we hit KASSERT(dram->paging == NULL)
> in iwx_init_fw_sec() once we try to load firmware again.  I have hit
> this while debugging firmware load failures during suspend/resume.
>
> (Ideally, we would re-allocate firmware image and paging memory only
> after re-loading a potentially different fw image, but this can be
> fixed later.)
>
> ok?
>

ok mlarkin

> diff 50816b19557cd9c29c50f92eebbe32098a494bd3 
> 055f053850bb0f3af81ea3aa7c4f705a85cfcb76
> blob - f7d69707ed0a98dfcd7717c9c82faac3af4f39d7
> blob + 51063c862bfc0cf2dc9fbe3f41628bbdbdf3486e
> --- sys/dev/pci/if_iwx.c
> +++ sys/dev/pci/if_iwx.c
> @@ -914,8 +914,10 @@ iwx_ctxt_info_init(struct iwx_softc *sc, const struct
>   IWX_WRITE(sc, IWX_CSR_CTXT_INFO_BA + 4, paddr >> 32);
>
>   /* kick FW self load */
> - if (!iwx_nic_lock(sc))
> + if (!iwx_nic_lock(sc)) {
> + iwx_ctxt_info_free_fw_img(sc);
>   return EBUSY;
> + }
>   iwx_write_prph(sc, IWX_UREG_CPU_INIT_RUN, 1);
>   iwx_nic_unlock(sc);
>
> @@ -3364,8 +3366,10 @@ iwx_load_firmware(struct iwx_softc *sc)
>
>   /* wait for the firmware to load */
>   err = tsleep_nsec(>sc_uc, 0, "iwxuc", SEC_TO_NSEC(1));
> - if (err || !sc->sc_uc.uc_ok)
> + if (err || !sc->sc_uc.uc_ok) {
>   printf("%s: could not load firmware, %d\n", DEVNAME(sc), err);
> + iwx_ctxt_info_free_paging(sc);
> + }
>
>   iwx_ctxt_info_free_fw_img(sc);
>
>



Re: let iwx(4) resume in the acpi thread

2021-09-09 Thread Mike Larkin
On Wed, Sep 08, 2021 at 03:25:20PM +0200, Stefan Sperling wrote:
> On Wed, Sep 08, 2021 at 02:19:00PM +0200, Stefan Sperling wrote:
> > This patch applies on top of all the other iwx(4) diffs I've sent today.
> > It makes iwx(4) initialize the device completely in the acpi thread.
> >
> > We now prepare the device for loading firmware during DVACT_RESUME,
> > and load firmware from host memory into the device during DVACT_WAKEUP.
> >
> > Previously, DVACT_WAKEUP would schedule the init_task which resets the
> > device, undoing work done during DVACT_RESUME, and starts all over again.
> >
> > ok?
>
> The previous version had a bug: It resumed the device even while the
> interface was marked down. Fixed patch below.
>

It looks like DVACT_RESUME invokes iwx_resume which does a not-trivial amount
of chip repair/bringup. If you are satisfied this is safe, ok mlarkin@

-ml

> diff 055f053850bb0f3af81ea3aa7c4f705a85cfcb76 
> c734175f035f120197d6be7df1987cb81e535d3e
> blob - 51063c862bfc0cf2dc9fbe3f41628bbdbdf3486e
> blob + 26f8a7fa85aa48a054d79e7a175e35bfe96a447b
> --- sys/dev/pci/if_iwx.c
> +++ sys/dev/pci/if_iwx.c
> @@ -490,6 +490,7 @@ void  iwx_attach(struct device *, struct device *, 
> void
>  void iwx_init_task(void *);
>  int  iwx_activate(struct device *, int);
>  int  iwx_resume(struct iwx_softc *);
> +int  iwx_wakeup(struct iwx_softc *);
>
>  #if NBPFILTER > 0
>  void iwx_radiotap_attach(struct iwx_softc *);
> @@ -7822,16 +7823,6 @@ iwx_init_hw(struct iwx_softc *sc)
>   struct ieee80211com *ic = &sc->sc_ic;
>   int err, i;
>
> - err = iwx_preinit(sc);
> - if (err)
> - return err;
> -
> - err = iwx_start_hw(sc);
> - if (err) {
> - printf("%s: could not initialize hardware\n", DEVNAME(sc));
> - return err;
> - }
> -
>   err = iwx_run_init_mvm_ucode(sc, 0);
>   if (err)
>   return err;
> @@ -7984,6 +7975,16 @@ iwx_init(struct ifnet *ifp)
>   KASSERT(sc->task_refs.refs == 0);
>   refcnt_init(&sc->task_refs);
>
> + err = iwx_preinit(sc);
> + if (err)
> + return err;
> +
> + err = iwx_start_hw(sc);
> + if (err) {
> + printf("%s: could not initialize hardware\n", DEVNAME(sc));
> + return err;
> + }
> +
>   err = iwx_init_hw(sc);
>   if (err) {
>   if (generation == sc->sc_generation)
> @@ -9593,6 +9594,30 @@ iwx_resume(struct iwx_softc *sc)
>  }
>
>  int
> +iwx_wakeup(struct iwx_softc *sc)
> +{
> + struct ieee80211com *ic = &sc->sc_ic;
> + struct ifnet *ifp = &sc->sc_ic.ic_if;
> + int err;
> +
> + refcnt_init(&sc->task_refs);
> +
> + err = iwx_init_hw(sc);
> + if (err)
> + return err;
> +
> + ifq_clr_oactive(&ifp->if_snd);
> + ifp->if_flags |= IFF_RUNNING;
> +
> + if (ic->ic_opmode == IEEE80211_M_MONITOR)
> + ieee80211_new_state(ic, IEEE80211_S_RUN, -1);
> + else
> + ieee80211_begin_scan(ifp);
> +
> + return 0;
> +}
> +
> +int
>  iwx_activate(struct device *self, int act)
>  {
>   struct iwx_softc *sc = (struct iwx_softc *)self;
> @@ -9608,15 +9633,27 @@ iwx_activate(struct device *self, int act)
>   }
>   break;
>   case DVACT_RESUME:
> + if ((ifp->if_flags & (IFF_UP | IFF_RUNNING)) != IFF_UP)
> + break;
> + sc->sc_flags &= ~IWX_FLAG_SHUTDOWN;
>   err = iwx_resume(sc);
> - if (err)
> + if (err) {
>   printf("%s: could not initialize hardware\n",
>   DEVNAME(sc));
> + sc->sc_flags |= IWX_FLAG_SHUTDOWN;
> + }
>   break;
>   case DVACT_WAKEUP:
> - /* Hardware should be up at this point. */
> - if (iwx_set_hw_ready(sc))
> - task_add(systq, &sc->init_task);
> + if ((ifp->if_flags & (IFF_UP | IFF_RUNNING)) != IFF_UP)
> + break;
> + if (sc->sc_flags & IWX_FLAG_SHUTDOWN)
> + sc->sc_flags &= ~IWX_FLAG_SHUTDOWN;
> + else {
> + err = iwx_wakeup(sc);
> + if (err)
> + printf("%s: could not initialize hardware\n",
> + DEVNAME(sc));
> + }
>   break;
>   }
>
>
>
>



Re: riscv64/trap.c debug printfs

2021-09-03 Thread Mike Larkin
On Fri, Sep 03, 2021 at 04:38:55PM +0200, Jeremie Courreges-Anglas wrote:
>
> This one is a bit too chatty whenever you run a program under egdb.
> But the other printfs in this file seem ok, thus I'm not touching them.
>
> ok?
>
>
> Index: trap.c
> ===
> RCS file: /d/cvs/src/sys/arch/riscv64/riscv64/trap.c,v
> retrieving revision 1.16
> diff -u -p -p -u -r1.16 trap.c
> --- trap.c26 Jul 2021 22:13:19 -  1.16
> +++ trap.c3 Sep 2021 14:25:31 -
> @@ -159,7 +159,6 @@ do_trap_user(struct trapframe *frame)
>   trapsignal(p, SIGILL, 0, ILL_ILLTRP, sv);
>   break;
>   case EXCP_BREAKPOINT:
> - printf("BREAKPOINT\n");
>   sv.sival_ptr = (void *)frame->tf_stval;
>   trapsignal(p, SIGTRAP, 0, TRAP_BRKPT, sv);
>   break;
>
> --
> jca | PGP : 0x1524E7EE / 5135 92C1 AD36 5293 2BDF  DDCC 0DFA 74AE 1524 E7EE
>

ok mlarkin



Re: Incorrect IPL when pool_get(9) is called under rwlock

2021-09-01 Thread Mike Larkin
On Wed, Sep 01, 2021 at 08:53:35AM +0200, Martin Pieuchot wrote:
> syzkaller reported [0] the following lock ordering issue:
>
> db{0}> trace
> db_enter() at db_enter+0x18 sys/arch/amd64/amd64/db_interface.c:440
> panic(82464b8f) at panic+0x177 sys/kern/subr_prf.c:202
> witness_checkorder(82838c20,9,0) at witness_checkorder+0x11eb 
> sys/kern/subr_witness.c:833
> __mp_lock(82838a18) at __mp_lock+0xa1 read_rflags 
> machine/cpufunc.h:195 [inline]
> __mp_lock(82838a18) at __mp_lock+0xa1 intr_disable 
> machine/cpufunc.h:216 [inline]
> __mp_lock(82838a18) at __mp_lock+0xa1 sys/kern/kern_lock.c:142
> intr_handler(80002123ad80,80255d80) at intr_handler+0x5e 
> sys/arch/amd64/amd64/intr.c:532
> Xintr_ioapic_edge20_untramp() at Xintr_ioapic_edge20_untramp+0x18f
> Xspllower() at Xspllower+0x19
> mtx_enter_try(829b8d10) at mtx_enter_try+0x100
> mtx_enter(829b8d10) at mtx_enter+0x4b sys/kern/kern_lock.c:266
> pool_get(829b8d10,9) at pool_get+0xbf sys/kern/subr_pool.c:581
> vm_create(80b29000,8000211922a8) at vm_create+0x261 
> sys/arch/amd64/amd64/vmm.c:1526
> vmmioctl(a00,c5005601,80b29000,1,8000211922a8) at vmmioctl+0x1f2
> VOP_IOCTL(fd806e213830,c5005601,80b29000,1,fd807f7d8840,8000211922a8)
>  at VOP_IOCTL+0x9a sys/kern/vfs_vops.c:295
> vn_ioctl(fd806e4aca28,c5005601,80b29000,8000211922a8) at 
> vn_ioctl+0xba sys/kern/vfs_vnops.c:531
> sys_ioctl(8000211922a8,80002123b398,80002123b3e0) at 
> sys_ioctl+0x4a2
>
>
> The issue is that pool_get(9) at line 1526 is done after grabbing the
> `vm_lock'.  If an interrupt needing the KERNEL_LOCK() occurs at that
> moment the above mentionned lock ordering problem could cause a
> deadlock.
>
> To prevent such issue we generally mark the pool with IPL_MPFLOOR.
>
> [0] 
> https://syzkaller.appspot.com/bug?id=c73756cc996a58a625da35fbaa90ba6b9e0c60dc
>

ok mlarkin@

> Index: arch/amd64/amd64/vmm.c
> ===
> RCS file: /cvs/src/sys/arch/amd64/amd64/vmm.c,v
> retrieving revision 1.287
> diff -u -p -r1.287 vmm.c
> --- arch/amd64/amd64/vmm.c31 Aug 2021 17:40:59 -  1.287
> +++ arch/amd64/amd64/vmm.c1 Sep 2021 06:45:38 -
> @@ -430,7 +430,7 @@ vmm_attach(struct device *parent, struct
>
>   pool_init(_pool, sizeof(struct vm), 0, IPL_NONE, PR_WAITOK,
>   "vmpool", NULL);
> - pool_init(_pool, sizeof(struct vcpu), 64, IPL_NONE, PR_WAITOK,
> + pool_init(_pool, sizeof(struct vcpu), 64, IPL_MPFLOOR, PR_WAITOK,
>   "vcpupl", NULL);
>
>   vmm_softc = sc;



Re: Incorrect IPL when pool_get(9) is called under rwlock

2021-09-01 Thread Mike Larkin
On Wed, Sep 01, 2021 at 08:53:35AM +0200, Martin Pieuchot wrote:
> syzkaller reported [0] the following lock ordering issue:
>
> db{0}> trace
> db_enter() at db_enter+0x18 sys/arch/amd64/amd64/db_interface.c:440
> panic(82464b8f) at panic+0x177 sys/kern/subr_prf.c:202
> witness_checkorder(82838c20,9,0) at witness_checkorder+0x11eb 
> sys/kern/subr_witness.c:833
> __mp_lock(82838a18) at __mp_lock+0xa1 read_rflags 
> machine/cpufunc.h:195 [inline]
> __mp_lock(82838a18) at __mp_lock+0xa1 intr_disable 
> machine/cpufunc.h:216 [inline]
> __mp_lock(82838a18) at __mp_lock+0xa1 sys/kern/kern_lock.c:142
> intr_handler(80002123ad80,80255d80) at intr_handler+0x5e 
> sys/arch/amd64/amd64/intr.c:532
> Xintr_ioapic_edge20_untramp() at Xintr_ioapic_edge20_untramp+0x18f
> Xspllower() at Xspllower+0x19
> mtx_enter_try(829b8d10) at mtx_enter_try+0x100
> mtx_enter(829b8d10) at mtx_enter+0x4b sys/kern/kern_lock.c:266
> pool_get(829b8d10,9) at pool_get+0xbf sys/kern/subr_pool.c:581
> vm_create(80b29000,8000211922a8) at vm_create+0x261 
> sys/arch/amd64/amd64/vmm.c:1526
> vmmioctl(a00,c5005601,80b29000,1,8000211922a8) at vmmioctl+0x1f2
> VOP_IOCTL(fd806e213830,c5005601,80b29000,1,fd807f7d8840,8000211922a8)
>  at VOP_IOCTL+0x9a sys/kern/vfs_vops.c:295
> vn_ioctl(fd806e4aca28,c5005601,80b29000,8000211922a8) at 
> vn_ioctl+0xba sys/kern/vfs_vnops.c:531
> sys_ioctl(8000211922a8,80002123b398,80002123b3e0) at 
> sys_ioctl+0x4a2
>
>
> The issue is that pool_get(9) at line 1526 is done after grabbing the
> `vm_lock'.  If an interrupt needing the KERNEL_LOCK() occurs at that
> moment the above mentionned lock ordering problem could cause a
> deadlock.
>
> To prevent such issue we generally mark the pool with IPL_MPFLOOR.
>
> [0] 
> https://syzkaller.appspot.com/bug?id=c73756cc996a58a625da35fbaa90ba6b9e0c60dc
>

Thanks, will take a look. This was introduced yesterday with the new vcpu 
locking
diff.

-ml

> Index: arch/amd64/amd64/vmm.c
> ===
> RCS file: /cvs/src/sys/arch/amd64/amd64/vmm.c,v
> retrieving revision 1.287
> diff -u -p -r1.287 vmm.c
> --- arch/amd64/amd64/vmm.c31 Aug 2021 17:40:59 -  1.287
> +++ arch/amd64/amd64/vmm.c1 Sep 2021 06:45:38 -
> @@ -430,7 +430,7 @@ vmm_attach(struct device *parent, struct
>
>   pool_init(_pool, sizeof(struct vm), 0, IPL_NONE, PR_WAITOK,
>   "vmpool", NULL);
> - pool_init(_pool, sizeof(struct vcpu), 64, IPL_NONE, PR_WAITOK,
> + pool_init(_pool, sizeof(struct vcpu), 64, IPL_MPFLOOR, PR_WAITOK,
>   "vcpupl", NULL);
>
>   vmm_softc = sc;
>



Re: ddb: machine sysregs for amd64

2021-08-31 Thread Mike Larkin
On Tue, Aug 31, 2021 at 06:30:40PM +1000, Alex Wilson wrote:
> Hi,
>
> This is a short diff to add "machine sysregs" to ddb on amd64 (plus it also
> prints out gsbase/kgsbase). This command is available on i386 but not amd64.
> I swear I remember discussing this with mlarkin at some point but I couldn't
> find a previous patch for it on tech@. If I missed it somehow, I am super
> sorry, and please hit me with the search stick.
>
> This command is mostly useful if you're futzing with page tables or GDT/IDT
> setup etc, but it's also useful for sanity-checking state generally
> sometimes, and quite useful for teaching demos showing how it all works
> (which is the main reason I want it right now).
>

Thanks, I'll commit this.

-ml

>
>
> Index: sys/arch/amd64//amd64/db_interface.c
> ===
> RCS file: /cvs/./src/sys/arch/amd64/amd64/db_interface.c,v
> retrieving revision 1.35
> diff -u -p -r1.35 db_interface.c
> --- sys/arch/amd64//amd64/db_interface.c  6 Nov 2019 07:34:35 -   
> 1.35
> +++ sys/arch/amd64//amd64/db_interface.c  31 Aug 2021 08:12:06 -
> @@ -46,6 +46,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>
>  #include 
>  #include 
> @@ -160,6 +161,45 @@ db_ktrap(int type, int code, db_regs_t *
>   return (1);
>  }
>
> +void
> +db_sysregs_cmd(db_expr_t addr, int have_addr, db_expr_t count, char *modif)
> +{
> + int64_t idtr, gdtr;
> + uint64_t cr;
> + uint16_t ldtr, tr;
> + uint64_t gsb;
> +
> + __asm__ volatile("sidt %0" : "=m" (idtr));
> + db_printf("idtr:   0x%08llx/%04llx\n", idtr >> 16, idtr & 0xffff);
> +
> + __asm__ volatile("sgdt %0" : "=m" (gdtr));
> + db_printf("gdtr:   0x%08llx/%04llx\n", gdtr >> 16, gdtr & 0xffff);
> +
> + __asm__ volatile("sldt %0" : "=g" (ldtr));
> + db_printf("ldtr:   0x%04x\n", ldtr);
> +
> + __asm__ volatile("str %0" : "=g" (tr));
> + db_printf("tr: 0x%04x\n", tr);
> +
> + __asm__ volatile("movq %%cr0,%0" : "=r" (cr));
> + db_printf("cr0:0x%016llx\n", cr);
> +
> + __asm__ volatile("movq %%cr2,%0" : "=r" (cr));
> + db_printf("cr2:0x%016llx\n", cr);
> +
> + __asm__ volatile("movq %%cr3,%0" : "=r" (cr));
> + db_printf("cr3:0x%016llx\n", cr);
> +
> + __asm__ volatile("movq %%cr4,%0" : "=r" (cr));
> + db_printf("cr4:0x%016llx\n", cr);
> +
> + gsb = rdmsr(MSR_GSBASE);
> + db_printf("gsb:0x%016llx\n", gsb);
> +
> + gsb = rdmsr(MSR_KERNELGSBASE);
> + db_printf("kgsb:   0x%016llx\n", gsb);
> +}
> +
>
>  #ifdef MULTIPROCESSOR
>  void
> @@ -368,6 +408,7 @@ struct db_command db_machine_command_tab
>   { "startcpu",   db_startproc_cmd,   0,  0 },
>   { "stopcpu",db_stopproc_cmd,0,  0 },
>   { "ddbcpu", db_ddbproc_cmd, 0,  0 },
> + { "sysregs",db_sysregs_cmd, 0,  0 },
>  #endif
>  #if NACPI > 0
>   { "acpi",   NULL,   0,  db_acpi_cmds },
>



Re: vmd(8): simplify vcpu logic, removing uart & net reads

2021-07-15 Thread Mike Larkin
On Sun, Jul 11, 2021 at 08:10:42AM -0400, Dave Voutila wrote:
>
> Ping...looking for OK. Would like to get this committed this week.
>

Sorry this took so long. ok mlarkin.

Thanks to the numerous testers who ran with this for the past few
weeks.

> Dave Voutila writes:
>
> > Looking for an OK for this one now. Anyone?
> >
> > Dave Voutila  writes:
> >
> >> Dave Voutila writes:
> >>
> >>> Looking for some broader testing of the following diff. It cleans up
> >>> some complicated logic predominantly left over from the early days of
> >>> vmd prior to its having a dedicated device thread.
> >>
> >> Still looking for tester feedback. I've been running this diff while
> >> hosting multiple guests continously (OpenBSD-current, Alpine 3.14,
> >> Debian 10.10, Ubuntu 20.04) with no issues.
> >>
> >> I know a few folks have told me they've applied the diff and have not
> >> seen issues.
> >
> > I've had positive reports from 4 people. Thanks everyone that tested and
> > provided feedback!
> >
> >>
> >> I'll prod for OK next week, so if you've tested the diff please let me
> >> know!
> >
> > OK to commit?
> >
> >>
> >>>
> >>> In summary, this diff:
> >>>
> >>> - Removes vionet "rx pending" state handling and removes the code path
> >>>   for the vcpu thread to possibly take control of the virtio net device
> >>>   and attempt a read of the underlying tap(4). (virtio.{c,h}, vm.c)
> >>>
> >>> - Removes ns8250 "rcv pending" state handling and removes the code path
> >>>   for the vcpu thread to read the pty via com_rcv(). (ns8250.{c,h})
> >>>
> >>> In both of the above cases, the event handling thread will be notified
> >>> of readable data and deal with it.
> >>>
> >>> Why remove them? The logic is overly complicated and hard to reason
> >>> about for zero gain. (This diff results in no intended functional
> >>> change.) Plus, some of the above logic I helped add to deal with the
> >>> race conditions and state corruption over a year ago. The logic was
> >>> needed once upon a time, but shouldn't be needed at present.
> >>>
> >>> I've had positive testing feedback from abieber@ so far with at least
> >>> the ns8250/uart diff, but want to cast a broader net here with both
> >>> before either part is committed. I debated splitting these up, but
> >>> they're thematically related.
> >>>
> >>> -dv
> >>>
> >>> Index: virtio.c
> >>> ===
> >>> RCS file: /cvs/src/usr.sbin/vmd/virtio.c,v
> >>> retrieving revision 1.91
> >>> diff -u -p -r1.91 virtio.c
> >>> --- virtio.c  21 Jun 2021 02:38:18 -  1.91
> >>> +++ virtio.c  23 Jun 2021 11:28:03 -
> >>> @@ -1254,12 +1254,12 @@ static int
> >>>  vionet_rx(struct vionet_dev *dev)
> >>>  {
> >>>   char buf[PAGE_SIZE];
> >>> - int hasdata, num_enq = 0, spc = 0;
> >>> + int num_enq = 0, spc = 0;
> >>>   struct ether_header *eh;
> >>>   ssize_t sz;
> >>>
> >>>   do {
> >>> - sz = read(dev->fd, buf, sizeof buf);
> >>> + sz = read(dev->fd, buf, sizeof(buf));
> >>>   if (sz == -1) {
> >>>   /*
> >>>* If we get EAGAIN, No data is currently available.
> >>> @@ -1270,21 +1270,17 @@ vionet_rx(struct vionet_dev *dev)
> >>>   "device");
> >>>   } else if (sz > 0) {
> >>>   eh = (struct ether_header *)buf;
> >>> - if (!dev->lockedmac || sz < ETHER_HDR_LEN ||
> >>> + if (!dev->lockedmac ||
> >>>   ETHER_IS_MULTICAST(eh->ether_dhost) ||
> >>>   memcmp(eh->ether_dhost, dev->mac,
> >>>   sizeof(eh->ether_dhost)) == 0)
> >>>   num_enq += vionet_enq_rx(dev, buf, sz, );
> >>>   } else if (sz == 0) {
> >>>   log_debug("process_rx: no data");
> >>> - hasdata = 0;
> >>>   break;
> >>>   }
> >>> + } while (spc > 0 && sz > 0);
> >>>
> >>> - hasdata = fd_hasdata(dev->fd);
> >>> - } while (spc && hasdata);
> >>> -
> >>> - dev->rx_pending = hasdata;
> >>>   return (num_enq);
> >>>  }
> >>>
> >>> @@ -1301,16 +1297,6 @@ vionet_rx_event(int fd, short kind, void
> >>>
> >>>   mutex_lock(&dev->mutex);
> >>>
> >>> - /*
> >>> -  * We already have other data pending to be received. The data that
> >>> -  * has become available now will be enqueued to the vionet_dev
> >>> -  * later.
> >>> -  */
> >>> - if (dev->rx_pending) {
> >>> - mutex_unlock(&dev->mutex);
> >>> - return;
> >>> - }
> >>> -
> >>>   if (vionet_rx(dev) > 0) {
> >>>   /* XXX: vcpu_id */
> >>>   vcpu_assert_pic_irq(dev->vm_id, 0, dev->irq);
> >>> @@ -1320,40 +1306,6 @@ vionet_rx_event(int fd, short kind, void
> >>>  }
> >>>
> >>>  /*
> >>> - * vionet_process_rx
> >>> - *
> >>> - * Processes any remaining pending receivable data for a vionet device.
> >>> - * Called on VCPU exit. Although we poll on the tap file descriptor of
> >>> - * a vionet_dev in a 

Re: vmd: spurious VM restarts

2021-06-26 Thread Mike Larkin
On Sat, Jun 26, 2021 at 03:26:55PM +0200, Thomas L. wrote:
> On Wed, 7 Apr 2021 17:00:00 -0700
> Mike Larkin  wrote:
> > Depends on the exact content that got swapped out (as we didn't handle
> > TLB flushes correctly), so a crash was certainly a possibility.
> > That's why I wanted to see the VMM_DEBUG output.
> >
> > In any case, Thomas should try -current and see if this problem is
> > even reproducible.
> >
> > -ml
>
> I've been running -current with VMM_DEBUG since Apr 14 and the problem
> has not reproduced, instead I see spurious stops now. Output in
> /var/log/messages on the occasion is:
>
> Jun 19 03:31:16 golem vmd[95337]: vcpu_run_loop: vm 8 / vcpu 0 run ioctl 
> failed: Invalid argument
> Jun 19 03:31:16 golem /bsd: vcpu_run_vmx: can't read procbased ctls on exit
> Jun 19 03:31:17 golem /bsd: vmm_free_vpid: freed VPID/ASID 8
>
> There's also a lot of probably unrelated messages for all the VMs:
>
> Jun 19 01:31:10 golem vmd[66318]: vionet_enq_rx: descriptor too small for 
> packet data
>
> I realize that this is an old version, so this might be an already
> fixed bug. I can upgrade to a newer snapshot, but the bug shows about
> once per month, so by the time it shows it will be an old version
> again.
>
> Kind regards,
>
> Thomas
>

you probably want a newer snap, dv@ fixed some things in this area recently.



Re: vmd(8): add barebones vioblk GET_ID support

2021-06-17 Thread Mike Larkin
On Thu, Jun 17, 2021 at 12:07:10PM -0400, Dave Voutila wrote:
>
> Dave Voutila writes:
>
> > The virtio spec has had a de facto command for drivers to read the
> > serial number off a virtual block device. QEMU introduced this feature
> > years ago. Last November, the virtio governing group voted in favor of
> > adopting it officially into v1.2 (the next virtio spec) [1].
> >
> > The below diff adds the basics of handling the request returning an
> > empty serial number. (Serial numbers are limited to 20 bytes.) This
> > stops vmd from complaining about "unsupported command 0x8" when guests
> > send this command type.
>
> Got some feedback off-list from claudio@ that I think is sound. Instead
> of providing an "empty" serial id/number, simply return an UNSUPP status
> to indicate we don't support the value.
>
> I think this approach better than the approach I was suggesting that was
> based off QEMU's design of defaulting to "". (FreeBSD's Bhyve generates
> a serial like "BHYVE-1122-3344-5566" where the suffix is some truncated
> md5 of the backing filename. I'm not a fan of this approach.)
>
> >
> > secdata_desc{,idx} variables are renamed to just data_desc{,idx} to
> > semantically match the change since they're used for more than sector
> > data.
>
> I undid this renaming for now to reduce noise.
>
> > This is primarily part of my work to clean up and bring vmd's virtio
> > implementation more up to date and to align to our own
> > v{io,ioblk,ioscsi,etc.}(4) current capabilities. (vioblk(4) doesn't
> > support this yet, but Linux guests use it frequently.)
>
> While adding the BLK_ID support, I also switched the FLUSH/FLUSH_OUT
> response to be VIRTIO_BLK_S_UNSUPP as well since the device does not
> negotiate that feature. Any request from the guest to "flush" currently
> doesn't do anything (some hypervisors will fsync(2) the underlying fd)
> but for now I'm correcting the response code.
>
> I also noticed and added read/write checks prior to calls to
> {read,write}_mem. The virtio spec says a device MUST not write to a
> read-only descriptor and SHOULD NOT read a write-only descriptor with an
> exception being made for debugging. (See 2.6.5.1 Device requirements:
> The Virtqueue Descriptor Table.)
>
> Next steps in this are of code will be to properly implement the missing
> VIRTIO_BLK_S_IOERR results for failed i/o. Right now the device bails
> processing the command and doesn't reply to the driver, which is not
> conforming with virtio spec.
>
> > OK?
>
> Any other feedback? OK?
>

ok mlarkin

> >
> > -dv
> >
> > [1] https://www.oasis-open.org/committees/ballot.php?id=3536
>
>
> Index: virtio.c
> ===
> RCS file: /cvs/src/usr.sbin/vmd/virtio.c,v
> retrieving revision 1.89
> diff -u -p -r1.89 virtio.c
> --- virtio.c  16 Jun 2021 16:55:02 -  1.89
> +++ virtio.c  17 Jun 2021 15:57:56 -
> @@ -517,6 +517,11 @@ vioblk_notifyq(struct vioblk_dev *dev)
>   }
>
>   /* Read command from descriptor ring */
> + if (cmd_desc->flags & VRING_DESC_F_WRITE) {
> + log_warnx("vioblk: unexpected writable cmd descriptor "
> + "%d", cmd_desc_idx);
> + goto out;
> + }
>   if (read_mem(cmd_desc->addr, &cmd, sizeof(cmd))) {
>   log_warnx("vioblk: command read_mem error @ 0x%llx",
>   cmd_desc->addr);
> @@ -541,6 +546,13 @@ vioblk_notifyq(struct vioblk_dev *dev)
>   struct ioinfo *info;
>   const uint8_t *secdata;
>
> + if ((secdata_desc->flags & VRING_DESC_F_WRITE)
> + == 0) {
> + log_warnx("vioblk: unwritable data "
> + "descriptor %d", secdata_desc_idx);
> + goto out;
> + }
> +
>   info = vioblk_start_read(dev,
>   cmd.sector + secbias, secdata_desc->len);
>
> @@ -607,6 +619,13 @@ vioblk_notifyq(struct vioblk_dev *dev)
>   do {
>   struct ioinfo *info;
>
> + if (secdata_desc->flags & VRING_DESC_F_WRITE) {
> + log_warnx("wr vioblk: unexpected "
> + "writable data descriptor %d",
> + secdata_desc_idx);
> + goto out;
> + }
> +
>   info = vioblk_start_write(dev,
>   cmd.sector + secbias,
>   secdata_desc->addr, secdata_desc->len);
> @@ -654,7 +673,35 @@ vioblk_notifyq(struct vioblk_dev *dev)
>   ds_desc_idx = 

Re: Document missing pledge promises

2021-06-11 Thread Mike Larkin
On Fri, Jun 11, 2021 at 09:16:46AM -0600, Theo de Raadt wrote:
> Dave Voutila  wrote:
>
> > Theo de Raadt writes:
> >
> > > Regarding the vmm chunk -- as I said in my other reply, these
> > > explanations are too precise.  They risk becoming outdated as things
> > > change.  Furthermore, some of those ioctl may work in one way, but not
> > > another way.  Which would be too complicated to describe also.  I urge
> > > simple messaging:
> > >
> > > .It Va vmm
> > > Operations required by
> > > .Xr vmd 8 .
> > >
> > > It is accurate.  If someone later wanted to use those operations, they
> > > would figure it out by reading kernel and vmd source.
> >
> > I agree simpler is better. The actual ioctls are documented in vmm.4 and
> > this is currently an all-or-nothing thing. You either get to perform all
> > operations on the vmm(4) device or none.
>
> What you just said is the truth.  But once you put it in a manual page,
> in the future the code may change, and some ioctl might be exposed
> without "vmm". it is better to be vague.
>

Agreed, simpler is better in the pledge docs.



Re: Update vmctl(8) to use TERMINATE_VM_EVENTs

2021-06-10 Thread Mike Larkin
On Thu, Jun 10, 2021 at 09:19:45AM -0400, Dave Voutila wrote:
>
> Still looking for an OK or feedback on the below. This is finishing work
> to fixes made previously to vmd(8)/vmctl(8) regarding vm
> stopping/running state corruption when using vmctl(8) to wait for a vm
> to stop.
>

Sorry for the delay. ok mlarkin@ with one comment below.

-ml

> Dave Voutila writes:
>
> > ping
> >
> > Dave Voutila writes:
> >
> >> Dave Voutila writes:
> >>
> >>> The conclusion of my previous fixes to vmd(8) [1] changes the event
> >>> handling in vmctl(8) to support receiving IMSG_VMDOP_TERMINATE_VM_EVENTs
> >>> from the control process. (This removes a XXX comment from vmd.)
> >>>
> >>> For clarity, the messaging logic was changed previously:
> >>>
> >>> - ...TERMINATE_VM_RESPONSE conveying success/failure of the request to
> >>>   terminate a guest regardless of waiting for termination
> >>> - ...TERMINATE_VM_EVENT conveying the actual termination of a guest
> >>>
> >>> This diff finishes bringing that logic from vmd(8) to vmctl(8).
> >>>
> >>> OK?
> >>
> >> Ping. Looking to close this gap.
> >>
> >> Note: this diff does preserve some errno abuse in vmd & vmctl that I'm
> >> working on separately.
> >>
> >>>
> >>> -dv
> >>>
> >>>
> >>> Index: usr.sbin/vmd/control.c
> >>> ===
> >>> RCS file: /cvs/src/usr.sbin/vmd/control.c,v
> >>> retrieving revision 1.35
> >>> diff -u -p -r1.35 control.c
> >>> --- usr.sbin/vmd/control.c26 Apr 2021 22:58:27 -  1.35
> >>> +++ usr.sbin/vmd/control.c30 Apr 2021 12:31:22 -
> >>> @@ -154,9 +154,8 @@ control_dispatch_vmd(int fd, struct priv
> >>>   if (notify->ctl_vmid != vmr.vmr_id)
> >>>   continue;
> >>>   if ((c = control_connbyfd(notify->ctl_fd)) != NULL) {
> >>> - /* XXX vmctl expects *_RESPONSE, not *_EVENT */
> >>> - imsg_compose_event(>iev,
> >>> - IMSG_VMDOP_TERMINATE_VM_RESPONSE,
> >>> + /* Forward to the vmctl(8) client */
> >>> + imsg_compose_event(>iev, imsg->hdr.type,
> >>>   0, 0, -1, imsg->data, IMSG_DATA_SIZE(imsg));
> >>>   TAILQ_REMOVE(_notify_q, notify, entry);
> >>>   free(notify);
> >>> Index: usr.sbin/vmctl/vmctl.c
> >>> ===
> >>> RCS file: /cvs/src/usr.sbin/vmctl/vmctl.c,v
> >>> retrieving revision 1.77
> >>> diff -u -p -r1.77 vmctl.c
> >>> --- usr.sbin/vmctl/vmctl.c22 Mar 2021 18:50:11 -  1.77
> >>> +++ usr.sbin/vmctl/vmctl.c30 Apr 2021 12:31:22 -
> >>> @@ -461,7 +461,7 @@ terminate_vm(uint32_t terminate_id, cons
> >>>   * terminate_vm_complete
> >>>   *
> >>>   * Callback function invoked when we are expecting an
> >>> - * IMSG_VMDOP_TERMINATE_VM_RESPONSE message indicating the completion of
> >>> + * IMSG_VMDOP_TERMINATE_VM_EVENT message indicating the completion of

It looks like this function has cases for both IMSG_VMDOP_TERMINATE_VM_RESPONSE
*and* _EVENT. Should the comment be phrased accordingly? If I read this
correctly, the comment would only state this function handles _EVENT messages.

> >>>   * a terminate vm operation.
> >>>   *
> >>>   * Parameters:
> >>> @@ -484,41 +484,50 @@ terminate_vm_complete(struct imsg *imsg,
> >>>   struct vmop_result *vmr;
> >>>   int res;
> >>>
> >>> - if (imsg->hdr.type == IMSG_VMDOP_TERMINATE_VM_RESPONSE) {
> >>> + switch (imsg->hdr.type) {
> >>> + case IMSG_VMDOP_TERMINATE_VM_RESPONSE:
> >>> + IMSG_SIZE_CHECK(imsg, );
> >>>   vmr = (struct vmop_result *)imsg->data;
> >>>   res = vmr->vmr_result;
> >>> - if (res) {
> >>> - switch (res) {
> >>> - case VMD_VM_STOP_INVALID:
> >>> - fprintf(stderr,
> >>> - "cannot stop vm that is not running\n");
> >>> - *ret = EINVAL;
> >>> - break;
> >>> - case ENOENT:
> >>> - fprintf(stderr, "vm not found\n");
> >>> - *ret = EIO;
> >>> - break;
> >>> - case EINTR:
> >>> - fprintf(stderr, "interrupted call\n");
> >>> - *ret = EIO;
> >>> - break;
> >>> - default:
> >>> - errno = res;
> >>> - fprintf(stderr, "failed: %s\n",
> >>> - strerror(res));
> >>> - *ret = EIO;
> >>> - }
> >>> - } else if (flags & VMOP_WAIT) {
> >>> +
> >>> + switch (res) {
> >>> + case 0:
> >>> + fprintf(stderr, "requested to shutdown vm %d\n",
> >>> + vmr->vmr_id);

Re: limit MSR_INT_PEN_MSG use to < family 16h

2021-06-10 Thread Mike Larkin
On Wed, Jun 09, 2021 at 10:35:48PM -0700, Mike Larkin wrote:
> On Thu, Jun 10, 2021 at 03:19:43PM +1000, Jonathan Gray wrote:
> > Ilya Voronin sent a diff to misc to limit MSR_INT_PEN_MSG use to
> > < AMD family 17h prompted by a problem with an AWS t3a instance.
> >
> > https://marc.info/?l=openbsd-misc&m=162120066715633&w=2
> >
> > Digging some more the 16h bkdgs have it as RAZ/non-functional as well.
> > Bits are documented in 15h.
> >
> > BKDG for AMD Family 16h Models 00h-0Fh Processors
> > MSRC001_0055 Interrupt Pending
> > 63:0 RAZ.
> >
> > BKDG for AMD Family 16h Models 30h-3Fh Processors
> > MSRC001_0055 Interrupt Pending
> > 63:0 RAZ
> >
> > PPR for AMD Family 17h Model 71h B0
> > MSRC001_0055 [Reserved.] (Core::X86::Msr::IntPend)
> > Read-only. Reset: Fixed,___h.
> >
> > Change the test to use extended family id while here.
> >
>
> I'd be ok with this if someone reported that it works on a bare metal EPYC,
> since the fix here is for a virtualized environment (and we don't know what
> AWS is doing here).
>
> -ml
>

Seeing that people have tested this on a few machines of the right era,
ok mlarkin@.

>
> > Index: sys/arch/amd64/amd64/lapic.c
> > ===
> > RCS file: /cvs/src/sys/arch/amd64/amd64/lapic.c,v
> > retrieving revision 1.57
> > diff -u -p -r1.57 lapic.c
> > --- sys/arch/amd64/amd64/lapic.c6 Sep 2020 20:50:00 -   1.57
> > +++ sys/arch/amd64/amd64/lapic.c19 May 2021 09:16:37 -
> > @@ -299,8 +299,7 @@ lapic_set_lvt(void)
> >  *Family 0Fh Processors"
> >  *   #32559 revision 3.00
> >  */
> > -   if ((cpu_id & 0x0f00) == 0x0f00 &&
> > -   (cpu_id & 0x0fff) >= 0x0004) {
> > +   if (ci->ci_family >= 0xf && ci->ci_family < 0x16) {
> > uint64_t msr;
> >
> > msr = rdmsr(MSR_INT_PEN_MSG);
> > Index: sys/arch/i386/i386/lapic.c
> > ===
> > RCS file: /cvs/src/sys/arch/i386/i386/lapic.c,v
> > retrieving revision 1.47
> > diff -u -p -r1.47 lapic.c
> > --- sys/arch/i386/i386/lapic.c  30 Jul 2018 14:19:12 -  1.47
> > +++ sys/arch/i386/i386/lapic.c  19 May 2021 09:19:41 -
> > @@ -160,8 +160,7 @@ lapic_set_lvt(void)
> >  *Family 0Fh Processors"
> >  *   #32559 revision 3.00
> >  */
> > -   if ((cpu_id & 0x0f00) == 0x0f00 &&
> > -   (cpu_id & 0x0fff) >= 0x0004) {
> > +   if (ci->ci_family >= 0xf && ci->ci_family < 0x16) {
> > uint64_t msr;
> >
> > msr = rdmsr(MSR_INT_PEN_MSG);
> >
> >
>



Re: limit MSR_INT_PEN_MSG use to < family 16h

2021-06-09 Thread Mike Larkin
On Thu, Jun 10, 2021 at 03:19:43PM +1000, Jonathan Gray wrote:
> Ilya Voronin sent a diff to misc to limit MSR_INT_PEN_MSG use to
> < AMD family 17h prompted by a problem with an AWS t3a instance.
>
> https://marc.info/?l=openbsd-misc&m=162120066715633&w=2
>
> Digging some more the 16h bkdgs have it as RAZ/non-functional as well.
> Bits are documented in 15h.
>
> BKDG for AMD Family 16h Models 00h-0Fh Processors
> MSRC001_0055 Interrupt Pending
> 63:0 RAZ.
>
> BKDG for AMD Family 16h Models 30h-3Fh Processors
> MSRC001_0055 Interrupt Pending
> 63:0 RAZ
>
> PPR for AMD Family 17h Model 71h B0
> MSRC001_0055 [Reserved.] (Core::X86::Msr::IntPend)
> Read-only. Reset: Fixed,___h.
>
> Change the test to use extended family id while here.
>

I'd be ok with this if someone reported that it works on a bare metal EPYC,
since the fix here is for a virtualized environment (and we don't know what
AWS is doing here).

-ml


> Index: sys/arch/amd64/amd64/lapic.c
> ===
> RCS file: /cvs/src/sys/arch/amd64/amd64/lapic.c,v
> retrieving revision 1.57
> diff -u -p -r1.57 lapic.c
> --- sys/arch/amd64/amd64/lapic.c  6 Sep 2020 20:50:00 -   1.57
> +++ sys/arch/amd64/amd64/lapic.c  19 May 2021 09:16:37 -
> @@ -299,8 +299,7 @@ lapic_set_lvt(void)
>*Family 0Fh Processors"
>*   #32559 revision 3.00
>*/
> - if ((cpu_id & 0x0f00) == 0x0f00 &&
> - (cpu_id & 0x0fff) >= 0x0004) {
> + if (ci->ci_family >= 0xf && ci->ci_family < 0x16) {
>   uint64_t msr;
>
>   msr = rdmsr(MSR_INT_PEN_MSG);
> Index: sys/arch/i386/i386/lapic.c
> ===
> RCS file: /cvs/src/sys/arch/i386/i386/lapic.c,v
> retrieving revision 1.47
> diff -u -p -r1.47 lapic.c
> --- sys/arch/i386/i386/lapic.c30 Jul 2018 14:19:12 -  1.47
> +++ sys/arch/i386/i386/lapic.c19 May 2021 09:19:41 -
> @@ -160,8 +160,7 @@ lapic_set_lvt(void)
>*Family 0Fh Processors"
>*   #32559 revision 3.00
>*/
> - if ((cpu_id & 0x0f00) == 0x0f00 &&
> - (cpu_id & 0x0fff) >= 0x0004) {
> + if (ci->ci_family >= 0xf && ci->ci_family < 0x16) {
>   uint64_t msr;
>
>   msr = rdmsr(MSR_INT_PEN_MSG);
>
>



Re: vmm(4): use monotonic base for pvclock

2021-06-01 Thread Mike Larkin
On Tue, Jun 01, 2021 at 08:03:43PM -0500, Scott Cheloha wrote:
> The documentation for the Linux pvclock is pretty sparse but I am
> pretty sure we want to use a monotonic base for ti_system_time.  We
> also have a function for converting a timespec into a 64-bit count of
> nanoseconds we can use.
>
> We may as well also use rdtsc_lfence() to ensure consistent behavior.
>
> ... this is still not quite right because the VM expects the pvclock
> to have a fixed frequency, but we have no interface to reading a raw
> timestamp.  Something to add in the future, maybe.
>
> Index: vmm.c
> ===
> RCS file: /cvs/src/sys/arch/amd64/amd64/vmm.c,v
> retrieving revision 1.284
> diff -u -p -r1.284 vmm.c
> --- vmm.c 18 May 2021 00:05:20 -  1.284
> +++ vmm.c 2 Jun 2021 00:57:31 -
> @@ -7294,8 +7294,8 @@ vmm_init_pvclock(struct vcpu *vcpu, padd
>  int
>  vmm_update_pvclock(struct vcpu *vcpu)
>  {
> + struct timespec now;
>   struct pvclock_time_info *pvclock_ti;
> - struct timespec tv;
>   struct vm *vm = vcpu->vc_parent;
>   paddr_t pvclock_hpa, pvclock_gpa;
>
> @@ -7309,10 +7309,9 @@ vmm_update_pvclock(struct vcpu *vcpu)
>   pvclock_ti->ti_version =
>   (++vcpu->vc_pvclock_version << 1) | 0x1;
>
> - pvclock_ti->ti_tsc_timestamp = rdtsc();
> - nanotime(&tv);
> - pvclock_ti->ti_system_time =
> - tv.tv_sec * 1000000000L + tv.tv_nsec;
> + pvclock_ti->ti_tsc_timestamp = rdtsc_lfence();
> + nanouptime(&now);
> + pvclock_ti->ti_system_time = TIMESPEC_TO_NSEC(&now);
>   pvclock_ti->ti_tsc_shift = 12;
>   pvclock_ti->ti_tsc_to_system_mul =
>   vcpu->vc_pvclock_system_tsc_mul;
>

This probably needs to be tested on a wide variety (and versions) of Linux
guests. I've found in the past that different kernel versions do different
things and behave differently.

Did you test a few Linux guest VMs? Did this work across all of them?

-ml



Re: vio.4: mention support provided by vmd(8)

2021-05-24 Thread Mike Larkin
On Sun, May 23, 2021 at 09:50:46PM -0400, Dave Voutila wrote:
> Seems only right that vio.4 mention it's the driver used for the virtio
> networking device provided by vmd(8).
>
> OK?
>

ok mlarkin

>
> Index: vio.4
> ===
> RCS file: /cvs/src/share/man/man4/vio.4,v
> retrieving revision 1.15
> diff -u -p -r1.15 vio.4
> --- vio.4 24 Sep 2015 13:11:48 -  1.15
> +++ vio.4 24 May 2021 01:48:44 -
> @@ -27,7 +27,8 @@ The
>  .Nm
>  driver provides support for the
>  .Xr virtio 4
> -network interface provided by bhyve, KVM, QEMU, and VirtualBox.
> +network interface provided by bhyve, KVM, QEMU, VirtualBox, and
> +.Xr vmd 8 .
>  .Pp
>  Setting the bit 0x2 in the flags disables the RingEventIndex feature.
>  This can be tried as a workaround for possible bugs in host implementations 
> of
>



Re: vmd(8): add MTU feature support to vionet device

2021-05-24 Thread Mike Larkin
On Mon, May 24, 2021 at 08:25:04AM +0200, Claudio Jeker wrote:
> On Sun, May 23, 2021 at 10:25:38PM -0400, Dave Voutila wrote:
> > The following diff adds in virtio 1.1's VIRTIO_NET_F_MTU feature support
> > to vmd(8)'s virtio networking device. This allows for communicating an MTU
> > to the guest driver and then enforcing it in the emulated device.
> >
> > When the feature is offered, per Virtio v1.1, 5.1.4.1 [1]:
> >
> > "The device MUST NOT pass received packets that exceed mtu (plus low
> > level ethernet header length) size with gso_type NONE or ECN after
> > VIRTIO_NET_F_MTU has been successfully negotiated."
> >
> > (GSO is not supported or negotiated, so it's always NONE. This is
> > primarly because the vmd vionet device also doesn't support or negotiate
> > checksum offloading.)
> >
> > The prior logic in place simply checked the packet was of a allowable
> > size, which meant the largest IP packet (65535) plus an ethernet header.
> >
> > If testing the diff, you can change the VIONET_MTU definition to
> > something other than 1500 and check that a non-OpenBSD guest defaults to
> > using the value and forbids setting it higher. This is easy in an Alpine
> > or Debian Linux guest using:
> >
> > a) to view the mtu: ip link
> > b) to set the mtu: sudo ip link set dev  mtu 
> >
> > For example:
> >
> >   dave@debian:~$ sudo ip link set dev enp0s2 mtu 1501
> >   Error: mtu greater than device maximum.
> >
> > Since the diff lacks context of the goto, it jumps to section that
> > advances to the next ring
> >
> > Currently, vio(4) does not negotiate this feature and won't obey it. I'm
> > working on that separately.
> >
> > OK? Feedback?
> >
> > [1] 
> > https://docs.oasis-open.org/virtio/virtio/v1.1/cs01/virtio-v1.1-cs01.html#x1-204
> >
> > Index: virtio.c
> > ===
> > RCS file: /cvs/src/usr.sbin/vmd/virtio.c,v
> > retrieving revision 1.87
> > diff -u -p -r1.87 virtio.c
> > --- virtio.c18 May 2021 11:06:43 -  1.87
> > +++ virtio.c24 May 2021 01:31:22 -
> > @@ -60,6 +60,7 @@ int nr_vioblk;
> >
> >  #define MAXPHYS(64 * 1024) /* max raw I/O transfer size */
> >
> > +#define VIRTIO_NET_F_MTU   (1<<3)
> >  #define VIRTIO_NET_F_MAC   (1<<5)
> >
> >  #define VMMCI_F_TIMESYNC   (1<<0)
> > @@ -1046,6 +1047,26 @@ virtio_net_io(int dir, uint16_t reg, uin
> > *data = dev->mac[reg -
> > VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI];
> > break;
> > +   case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 10:
> > +   if (sz == 2) {
> > +   *data = VIONET_MTU;
> > +   } else if (sz == 1) {
> > +   *data &= 0xFF00;
> > +   *data |= (uint32_t)(VIONET_MTU) & 0xFF;
> > +   } else {
> > +   log_warnx("%s: illegal read of vionet_mtu",
> > +   __progname);
> > +   }
> > +   break;
> > +   case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 11:
> > +   if (sz == 1) {
> > +   *data &= 0xFF00;
> > +   *data = (uint32_t)(VIONET_MTU >> 8) & 0xFF;
> > +   } else {
> > +   log_warnx("%s: illegal read of vionet_mtu",
> > +   __progname);
> > +   }
> > +   break;
>
> Is it possible to get proper defines for these two options?
> This VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 11 is ugly.
>

We could fix the + 11 part, but about the best we could do would be something
like the following:

VIRTIO_CONFIG_NET_MTU
VIRTIO_CONFIG_NET_MTU + 1
VIRTIO_CONFIG_NET_MTU + 2
VIRTIO_CONFIG_NET_MTU + 3

Since this is a pci config space access and I've seen Linux use 1, 2, and 4 byte
accesses. But, yes, we could improve the actual name.

Once dv@ gets this in I'll go back and redo the other devices (since we do a
similar thing for those as well).

-ml

> > case VIRTIO_CONFIG_DEVICE_FEATURES:
> > *data = dev->cfg.device_feature;
> > break;
> > @@ -1437,7 +1458,7 @@ vionet_notify_tx(struct vionet_dev *dev)
> > size_t pktsz, chunk_size = 0;
> > ssize_t dhcpsz;
> > int ret, num_enq, ofs, spc;
> > -   char *vr, *pkt, *dhcppkt;
> > +   char *vr, *pkt = NULL, *dhcppkt;
> > struct vring_desc *desc, *pkt_desc, *hdr_desc;
> > struct vring_avail *avail;
> > struct vring_used *used;
> > @@ -1505,12 +1526,13 @@ vionet_notify_tx(struct vionet_dev *dev)
> > /* Remove virtio header descriptor len */
> > pktsz -= hdr_desc->len;
> >
> > -   /* Only allow buffer len < max IP packet + Ethernet header */
> > -   if (pktsz > IP_MAXPACKET + ETHER_HDR_LEN) {
> > +   /* Drop frames larger than our MTU + ethernet header */
> > +   if 

Re: vmd(8): skip inspecting non-udp packets on local ifs

2021-05-23 Thread Mike Larkin
On Sat, May 22, 2021 at 10:20:37AM -0400, Dave Voutila wrote:
> tech@ & krw (since your code in question was imported to vmd),
>
> I found strange behavior running tcpbench(1) to measure the connection
> between a vmd guest and my host, as well as guest-to-guest. In short,
> it's some bogus logic in how vmd tries to intercept dhcp/bootp on local
> interfaces. Diff at the bottom addresses the issue, some background:
>
> Running tcpbench(1) for ~20-30s on my machine, vmd (with -v debug
> logging) barfs a bunch of lines like:
>
>   5 udp packets in 5 too long - dropped
>
> The tcpbench(1) throughput stalls out at that point and reports 0 Mbps
> avg bandwidth measurements.
>
> If anyone wants to reproduce, use an OpenBSD guest and just run:
>
>[host]$ tcpbench -s
>   [guest]$ tcpbench -t 180 100.64.x.2
>
> Where 'x' is the appropriate value for your guest's local interface.
>
> reyk@ imported packet.c from dhclient(8), but there's no validation that
> the packet being inspected is an IP/UDP packet vs. IP/TCP, leading to
> bogus logic related to inspecing UDP header attributes. In dhclient(8),
> the decode_udp_ip_header function is used in a place where a bpf capture
> buffer has already made sure it's a UDP packet (see sbin/dhclient/bpf.c).
>
> In addition, there was a lot of stateful counting and checking we just
> don't need in vmd(8), so I've ripped that out as well. It makes no sense
> in this context.
>
> OK?
>

ok mlarkin

>
> Index: packet.c
> ===
> RCS file: /cvs/src/usr.sbin/vmd/packet.c,v
> retrieving revision 1.1
> diff -u -p -r1.1 packet.c
> --- packet.c  19 Apr 2017 15:38:32 -  1.1
> +++ packet.c  22 May 2021 14:15:09 -
> @@ -220,12 +220,6 @@ decode_udp_ip_header(unsigned char *buf,
>   unsigned char *data;
>   u_int32_t ip_len;
>   u_int32_t sum, usum;
> - static unsigned int ip_packets_seen;
> - static unsigned int ip_packets_bad_checksum;
> - static unsigned int udp_packets_seen;
> - static unsigned int udp_packets_bad_checksum;
> - static unsigned int udp_packets_length_checked;
> - static unsigned int udp_packets_length_overflow;
>   int len;
>
>   /* Assure that an entire IP header is within the buffer. */
> @@ -236,17 +230,11 @@ decode_udp_ip_header(unsigned char *buf,
>   return (-1);
>
>   ip = (struct ip *)(buf + offset);
> - ip_packets_seen++;
> + if (ip->ip_p != IPPROTO_UDP)
> + return (-1);
>
>   /* Check the IP header checksum - it should be zero. */
>   if (wrapsum(checksum(buf + offset, ip_len, 0)) != 0) {
> - ip_packets_bad_checksum++;
> - if (ip_packets_seen > 4 && ip_packets_bad_checksum != 0 &&
> - (ip_packets_seen / ip_packets_bad_checksum) < 2) {
> - log_info("%u bad IP checksums seen in %u packets",
> - ip_packets_bad_checksum, ip_packets_seen);
> - ip_packets_seen = ip_packets_bad_checksum = 0;
> - }
>   return (-1);
>   }
>
> @@ -274,7 +262,6 @@ decode_udp_ip_header(unsigned char *buf,
>   if (buflen < offset + ip_len + sizeof(*udp))
>   return (-1);
>   udp = (struct udphdr *)(buf + offset + ip_len);
> - udp_packets_seen++;
>
>   /* Assure that the entire UDP packet is within the buffer. */
>   if (buflen < offset + ip_len + ntohs(udp->uh_ulen))
> @@ -286,20 +273,8 @@ decode_udp_ip_header(unsigned char *buf,
>* UDP header and the data. If the UDP checksum field is zero,
>* we're not supposed to do a checksum.
>*/
> - udp_packets_length_checked++;
>   len = ntohs(udp->uh_ulen) - sizeof(*udp);
>   if ((len < 0) || (len + data > buf + buflen)) {
> - udp_packets_length_overflow++;
> - if (udp_packets_length_checked > 4 &&
> - udp_packets_length_overflow != 0 &&
> - (udp_packets_length_checked /
> - udp_packets_length_overflow) < 2) {
> - log_info("%u udp packets in %u too long - dropped",
> - udp_packets_length_overflow,
> - udp_packets_length_checked);
> - udp_packets_length_overflow =
> - udp_packets_length_checked = 0;
> - }
>   return (-1);
>   }
>   if (len + data != buf + buflen)
> @@ -313,15 +288,7 @@ decode_udp_ip_header(unsigned char *buf,
>   2 * sizeof(ip->ip_src),
>   IPPROTO_UDP + (u_int32_t)ntohs(udp->uh_ulen);
>
> - udp_packets_seen++;
>   if (usum && usum != sum) {
> - udp_packets_bad_checksum++;
> - if (udp_packets_seen > 4 && udp_packets_bad_checksum != 0 &&
> - (udp_packets_seen / udp_packets_bad_checksum) < 2) {
> - log_info("%u bad udp checksums in %u packets",
> - 

Re: vmm(4): Mask TSC_ADJUST cpu feature

2021-05-20 Thread Mike Larkin
On Thu, May 20, 2021 at 07:36:23AM -0400, Dave Voutila wrote:
> We don't currently emulate all TSC related features yet. While hacking
> on other issues, I've found some more obnoxious guests (*cough* debian
> *cough*) constantly try to read the IA32_TSC_ADJUST msr every second,
> not getting the hint when we inject #GP. This floods the kernel message
> buffer with things like:
>
>   vmx_handle_rdmsr: unsupported rdmsr (msr=0x3b), injecting #GP
>
> (The above debug logging exists to help find msr's we're not supporting
> that guests are poking, so I guess you can say it's working as intended
> [1].)
>
> If and when we add more TSC capabilities to vmm we can always unmask.
>
> Ok?
>
> [1] https://marc.info/?l=openbsd-tech&m=161739346822128&w=2
>
> Index: sys/arch/amd64/include/vmmvar.h
> ===
> RCS file: /cvs/src/sys/arch/amd64/include/vmmvar.h,v
> retrieving revision 1.71
> diff -u -p -r1.71 vmmvar.h
> --- sys/arch/amd64/include/vmmvar.h   5 Apr 2021 18:26:46 -   1.71
> +++ sys/arch/amd64/include/vmmvar.h   16 May 2021 16:55:06 -
> @@ -637,6 +637,7 @@ struct vm_mprotect_ept_params {
>
>  /*
>   * SEFF flags - copy from host minus:
> + *  TSC_ADJUST (SEFF0EBX_TSC_ADJUST)
>   *  SGX (SEFF0EBX_SGX)
>   *  HLE (SEFF0EBX_HLE)
>   *  INVPCID (SEFF0EBX_INVPCID)
> @@ -655,7 +656,8 @@ struct vm_mprotect_ept_params {
>   *  PT (SEFF0EBX_PT)
>   *  AVX512VBMI (SEFF0ECX_AVX512VBMI)
>   */
> -#define VMM_SEFF0EBX_MASK ~(SEFF0EBX_SGX | SEFF0EBX_HLE | SEFF0EBX_INVPCID | 
> \
> +#define VMM_SEFF0EBX_MASK ~(SEFF0EBX_TSC_ADJUST | SEFF0EBX_SGX | \
> +SEFF0EBX_HLE | SEFF0EBX_INVPCID | \
>  SEFF0EBX_RTM | SEFF0EBX_PQM | SEFF0EBX_MPX | \
>  SEFF0EBX_PCOMMIT | SEFF0EBX_PT | \
>  SEFF0EBX_AVX512F | SEFF0EBX_AVX512DQ | \

Yep, if we don't implement it we should not be advertising support for it.

ok mlarkin.



Re: patch: new fix for vmctl create

2021-05-12 Thread Mike Larkin
On Mon, Mar 15, 2021 at 08:21:56AM +, James Cook wrote:
> Hi tech@,
>
> The below patch removes calls to realpath(3) when looking up a qcow2
> base image. Previous thread:
> https://marc.info/?t=16156249642&r=1&w=2
>
> In short, the calls were failing inside vmctl, because of unveil. The
> other thread has alternative solutions but I think this is simplest.
>
> I included a regression test demonstrating the vmctl bug, in case
> there's interest. I tested vmd manually as described in the other
> thread.
>
> I also added a check in case dirname(3) fails --- I don't think it
> currently can, but better safe than sorry, I figure. (Noticed by Dave
> in the other thread.)
>
> - James
>

After looking at this a bit, we decided to remove the unveil parts around
the base images, since the realpath removal below would also affect vmd.

dv@ just committed that. Thanks for the diff and research!

>
> diff --git a/regress/usr.sbin/Makefile b/regress/usr.sbin/Makefile
> index 60e2178d3c7..146f9c9f322 100644
> --- a/regress/usr.sbin/Makefile
> +++ b/regress/usr.sbin/Makefile
> @@ -15,6 +15,7 @@ SUBDIR += rpki-client
>  SUBDIR += snmpd
>  SUBDIR += switchd
>  SUBDIR += syslogd
> +SUBDIR += vmctl
>
>  .if ${MACHINE} == "amd64" || ${MACHINE} == "i386"
>  SUBDIR += vmd
> diff --git a/regress/usr.sbin/vmctl/Makefile b/regress/usr.sbin/vmctl/Makefile
> new file mode 100644
> index 000..8fa87f0f6f0
> --- /dev/null
> +++ b/regress/usr.sbin/vmctl/Makefile
> @@ -0,0 +1,34 @@
> +# $OpenBSD$
> +
> +REGRESS_TARGETS = run-regress-convert-with-base-path
> +
> +run-regress-convert-with-base-path:
> + # non-relative base path
> + rm -f *.qcow2
> + vmctl create -s 1m base.qcow2
> + vmctl create -b ${PWD}/base.qcow2 source.qcow2
> + vmctl create -i source.qcow2 dest.qcow2
> +
> + # relative base path; two base images
> + rm -f *.qcow2
> + vmctl create -s 1m base0.qcow2
> + vmctl create -b base0.qcow2 base1.qcow2
> + vmctl create -b base1.qcow2 source.qcow2
> + vmctl create -i source.qcow2 dest.qcow2
> +
> + # copy from a different directory
> + rm -rf dir *.qcow2
> + vmctl create -s 1m base.qcow2
> + vmctl create -b base.qcow2 source.qcow2
> + mkdir dir
> + cd dir; vmctl create -i ../source.qcow2 dest.qcow2
> +
> + # base accessed through symlink
> + rm -rf dir sym *.qcow2
> + mkdir dir
> + cd dir; vmctl create -s 1m base.qcow2
> + cd dir; vmctl create -b base.qcow2 source.qcow2
> + ln -s dir sym
> + vmctl create -i sym/source.qcow2 dest.qcow2
> +
> +.include 
> diff --git a/usr.sbin/vmd/vioqcow2.c b/usr.sbin/vmd/vioqcow2.c
> index 34d0f116cc4..be8609f1644 100644
> --- a/usr.sbin/vmd/vioqcow2.c
> +++ b/usr.sbin/vmd/vioqcow2.c
> @@ -145,8 +145,8 @@ virtio_qcow2_init(struct virtio_backing *file, off_t 
> *szp, int *fd, size_t nfd)
>  ssize_t
>  virtio_qcow2_get_base(int fd, char *path, size_t npath, const char *dpath)
>  {
> + char pathbuf[PATH_MAX];
>   char dpathbuf[PATH_MAX];
> - char expanded[PATH_MAX];
>   struct qcheader header;
>   uint64_t backingoff;
>   uint32_t backingsz;
> @@ -180,27 +180,23 @@ virtio_qcow2_get_base(int fd, char *path, size_t npath, 
> const char *dpath)
>* rather than relative to the directory vmd happens to be running in,
>* since this is the only userful interpretation.
>*/
> - if (path[0] == '/') {
> - if (realpath(path, expanded) == NULL ||
> - strlcpy(path, expanded, npath) >= npath) {
> - log_warnx("unable to resolve %s", path);
> + if (path[0] != '/') {
> + if (strlcpy(pathbuf, path, sizeof(pathbuf)) >=
> + sizeof(pathbuf)) {
> + log_warnx("path too long: %s", path);
>   return -1;
>   }
> - } else {
>   if (strlcpy(dpathbuf, dpath, sizeof(dpathbuf)) >=
>   sizeof(dpathbuf)) {
>   log_warnx("path too long: %s", dpath);
>   return -1;
>   }
> - s = dirname(dpathbuf);
> - if (snprintf(expanded, sizeof(expanded),
> - "%s/%s", s, path) >= (int)sizeof(expanded)) {
> - log_warnx("path too long: %s/%s", s, path);
> + if ((s = dirname(dpathbuf)) == NULL) {
> + log_warn("dirname");
>   return -1;
>   }
> - if (npath < PATH_MAX ||
> - realpath(expanded, path) == NULL) {
> - log_warnx("unable to resolve %s", path);
> + if (snprintf(path, npath, "%s/%s", s, pathbuf) >= (int)npath) {
> + log_warnx("path too long: %s/%s", s, path);
>   return -1;
>   }
>   }
>



Re: potentially uninitialized string printed by vmd

2021-05-11 Thread Mike Larkin
On Mon, Mar 15, 2021 at 09:29:29AM +, James Cook wrote:
> > The array "base" which is passed to log_warnx might be uninitialized:
> > virtio_get_base doesn't necessarily touch it if it returns -1. Maybe it
> > would be better just omit base from the output, e.g.
> >
> > log_warnx("vm \"%s\" unable to read "
> > "base for disk %s", vcp->vcp_name,
> > vcp->vcp_disks[i]);
>
> Here it is as a patch.
>
> - James
>
> diff --git a/usr.sbin/vmd/config.c b/usr.sbin/vmd/config.c
> index 9ef5dca626e..3ce82052e4a 100644
> --- a/usr.sbin/vmd/config.c
> +++ b/usr.sbin/vmd/config.c
> @@ -393,8 +393,8 @@ config_setvm(struct privsep *ps, struct vmd_vm *vm, 
> uint32_t peerid, uid_t uid)
>   break;
>   if (n == -1) {
>   log_warnx("vm \"%s\" unable to read "
> - "base %s for disk %s", vcp->vcp_name,
> - base, vcp->vcp_disks[i]);
> + "base for disk %s", vcp->vcp_name,
> + vcp->vcp_disks[i]);
>   goto fail;
>   }
>   (void)strlcpy(path, base, sizeof(path));
>

Committed. I was going through old emails and found this. Sorry this took so
long.

Thanks!



Re: vmd(8): init debug logging before using logging

2021-05-03 Thread Mike Larkin
On Mon, May 03, 2021 at 08:50:36PM -0400, Dave Voutila wrote:
> If debug state in the logging routines isn't set, messages from
> fatal{,x} and warn{,x} don't get flushed to stderr, so running vmd
> un-daemonized can result in the process exiting at startup with no
> discernable message reason other than the ambiguous exit code (1).
>
> OK?
>
> Index: vmd.c
> ===
> RCS file: /cvs/src/usr.sbin/vmd/vmd.c,v
> retrieving revision 1.123
> diff -u -p -r1.123 vmd.c
> --- vmd.c 26 Apr 2021 22:58:27 -  1.123
> +++ vmd.c 4 May 2021 00:44:56 -
> @@ -802,6 +802,9 @@ main(int argc, char **argv)
>   if (env->vmd_noaction && !env->vmd_debug)
>   env->vmd_debug = 1;
>
> + log_init(env->vmd_debug, LOG_DAEMON);
> + log_setverbose(env->vmd_verbose);
> +
>   /* check for root privileges */
>   if (env->vmd_noaction == 0) {
>   if (geteuid())
> @@ -835,9 +838,6 @@ main(int argc, char **argv)
>
>   /* Configuration will be parsed after forking the children */
>   env->vmd_conffile = conffile;
> -
> - log_init(env->vmd_debug, LOG_DAEMON);
> - log_setverbose(env->vmd_verbose);
>
>   if (env->vmd_noaction)
>   ps->ps_noaction = 1;
>

ok mlarkin



Re: added support for precompressed static files on httpd(so sorry about my other email that was unreadable)

2021-05-01 Thread Mike Larkin
On Sat, May 01, 2021 at 09:26:39PM +, alloca wrote:
> This patch adds a serve_gzip option. When enabled, If the client requests 
> path, then serve path.gz if it exists and the client accepts 
> Content-Encoding: gzip.
>
>


man style


> diff -up httpd.orig/config.c httpd/config.c
> --- httpd.orig/config.c Sat May  1 15:03:11 2021
> +++ httpd/config.c Sat May  1 15:45:43 2021
> @@ -568,12 +568,12 @@ config_getserver_config(struct httpd *env, struct serv
> >default_type, sizeof(struct media_type));
> }
>
> - f = SRVFLAG_PATH_REWRITE|SRVFLAG_NO_PATH_REWRITE;
> +/* f = SRVFLAG_PATH_REWRITE|SRVFLAG_NO_PATH_REWRITE;
> if ((srv_conf->flags & f) == 0) {
> srv_conf->flags |= parent->flags & f;
> (void)strlcpy(srv_conf->path, parent->path,
> sizeof(srv_conf->path));
> - }
> + } */
>
> f = SRVFLAG_SERVER_HSTS;
> srv_conf->flags |= parent->flags & f;
> diff -up httpd.orig/httpd.conf.5 httpd/httpd.conf.5
> --- httpd.orig/httpd.conf.5 Sat May  1 15:03:11 2021
> +++ httpd/httpd.conf.5 Sat May  1 16:02:44 2021
> @@ -397,6 +397,13 @@ a browser's preload list.
> Signal to the receiving user agent that this host and all sub domains
> of the host's domain should be considered HSTS hosts.
> .El
> +.It Ic serve_gzip
> +If the client requests
> +.Nm path ,
> +then serve
> +.Nm path.gz
> +if it exists and the client accepts
> +.Nm Content-Encoding: gzip .
> .It Ic listen on Ar address Oo Ic tls Oc Ic port Ar number
> Set the listen address and port.
> This statement can be specified multiple times.
> diff -up httpd.orig/httpd.h httpd/httpd.h
> --- httpd.orig/httpd.h Sat May  1 15:03:11 2021
> +++ httpd/httpd.h Sat May  1 15:41:58 2021
> @@ -390,17 +390,17 @@ SPLAY_HEAD(client_tree, client);
> #define SRVFLAG_SERVER_MATCH 0x0020
> #define SRVFLAG_SERVER_HSTS 0x0040
> #define SRVFLAG_DEFAULT_TYPE 0x0080
> -#define SRVFLAG_PATH_REWRITE 0x0100
> -#define SRVFLAG_NO_PATH_REWRITE 0x0200
> +/* #define SRVFLAG_PATH_REWRITE 0x0100
> +#define SRVFLAG_NO_PATH_REWRITE 0x0200 */
> #define SRVFLAG_LOCATION_FOUND 0x4000
> #define SRVFLAG_LOCATION_NOT_FOUND 0x8000
> -
> +#define SRVFLAG_SERVER_GZIP 0x0100
> #define SRVFLAG_BITS \
> "\10\01INDEX\02NO_INDEX\03AUTO_INDEX\04NO_AUTO_INDEX" \
> "\05ROOT\06LOCATION\07FCGI\10NO_FCGI\11LOG\12NO_LOG" \
> "\14SYSLOG\15NO_SYSLOG\16TLS\17ACCESS_LOG\20ERROR_LOG" \
> "\21AUTH\22NO_AUTH\23BLOCK\24NO_BLOCK\25LOCATION_MATCH" \
> - "\26SERVER_MATCH\27SERVER_HSTS\30DEFAULT_TYPE\31PATH\32NO_PATH" \
> + "\26SERVER_MATCH\27SERVER_HSTS\30DEFAULT_TYPE\31SERVER_GZIP" \
> "\37LOCATION_FOUND\40LOCATION_NOT_FOUND"
>
> #define TCPFLAG_NODELAY 0x01
> @@ -684,7 +684,7 @@ int server_headers(struct client *, void *,
> int (*)(struct client *, struct kv *, void *), void *);
> int server_writeresponse_http(struct client *);
> int server_response_http(struct client *, unsigned int,
> - struct media_type *, off_t, time_t);
> + struct media_type *, off_t, time_t, int);
> void server_reset_http(struct client *);
> void server_close_http(struct client *);
> int server_response(struct httpd *, struct client *);
> diff -up httpd.orig/parse.y httpd/parse.y
> --- httpd.orig/parse.y Sat May  1 15:03:11 2021
> +++ httpd/parse.y Sat May  1 15:48:31 2021
> @@ -138,7 +138,7 @@ typedef struct {
> %token COMBINED CONNECTION DHE DIRECTORY ECDHE ERR FCGI INDEX IP KEY LIFETIME
> %token LISTEN LOCATION LOG LOGDIR MATCH MAXIMUM NO NODELAY OCSP ON PORT 
> PREFORK
> %token PROTOCOLS REQUESTS ROOT SACK SERVER SOCKET STRIP STYLE SYSLOG TCP 
> TICKET
> -%token TIMEOUT TLS TYPE TYPES HSTS MAXAGE SUBDOMAINS DEFAULT PRELOAD REQUEST
> +%token TIMEOUT TLS TYPE TYPES HSTS SERVE_GZIP MAXAGE SUBDOMAINS DEFAULT 
> PRELOAD REQUEST
> %token ERROR INCLUDE AUTHENTICATE WITH BLOCK DROP RETURN PASS REWRITE
> %token CA CLIENT CRL OPTIONAL PARAM FORWARDED FOUND NOT
> %token  STRING
> @@ -644,6 +644,9 @@ serveroptsl : LISTEN ON STRING opttls port {
> }
> srv->srv_conf.flags |= SRVFLAG_SERVER_HSTS;
> }
> + | SERVE_GZIP {
> + srv->srv_conf.flags |= SRVFLAG_SERVER_GZIP;
> + }
> ;
>
> optfound : /* empty */ { $$ = 0; }
> @@ -925,23 +928,7 @@ requestflags_l : requestflags optcommanl requestflags_
> | requestflags optnl
> ;
>
> -requestflags : REWRITE STRING {
> - if (strlcpy(srv->srv_conf.path, $2,
> - sizeof(srv->srv_conf.path)) >=
> - sizeof(srv->srv_conf.path)) {
> - yyerror("request path too long");
> - free($2);
> - YYERROR;
> - }
> - free($2);
> - srv->srv_conf.flags |= SRVFLAG_PATH_REWRITE;
> - srv->srv_conf.flags &= ~SRVFLAG_NO_PATH_REWRITE;
> - }
> - | NO REWRITE {
> - srv->srv_conf.flags |= SRVFLAG_NO_PATH_REWRITE;
> - srv->srv_conf.flags &= ~SRVFLAG_PATH_REWRITE;
> - }
> - | STRIP NUMBER {
> +requestflags :  STRIP NUMBER {
> if ($2 < 0 || $2 > INT_MAX) {
> yyerror("invalid strip number");
> YYERROR;
> @@ -1431,6 +1418,7 @@ lookup(char *s)
> { "rewrite", REWRITE },
> { "root", ROOT },
> { "sack", SACK },
> + { "serve_gzip", SERVE_GZIP },
> { "server", SERVER },
> { "socket", SOCKET },
> { 

Re: vmd(8): remove duplicate struct definition

2021-04-29 Thread Mike Larkin
On Thu, Apr 29, 2021 at 03:24:42PM -0400, Dave Voutila wrote:
> Found this while running ctags(1)... vioqcow2.c has struct qcheader
> already defined at L53 (which stylistically is where it should be).
>
> This diff just removes the duplicate definition inside
> virtio_qcow2_create().
>
> OK?
>
>
> Index: vioqcow2.c
> ===
> RCS file: /cvs/src/usr.sbin/vmd/vioqcow2.c,v
> retrieving revision 1.14
> diff -u -p -r1.14 vioqcow2.c
> --- vioqcow2.c19 Oct 2020 19:06:49 -  1.14
> +++ vioqcow2.c29 Apr 2021 19:17:11 -
> @@ -634,27 +634,7 @@ int
>  virtio_qcow2_create(const char *imgfile_path,
>  const char *base_path, long imgsize)
>  {
> - struct qcheader {
> - char magic[4];
> - uint32_t version;
> - uint64_t backingoff;
> - uint32_t backingsz;
> - uint32_t clustershift;
> - uint64_t disksz;
> - uint32_t cryptmethod;
> - uint32_t l1sz;
> - uint64_t l1off;
> - uint64_t refoff;
> - uint32_t refsz;
> - uint32_t snapcount;
> - uint64_t snapsz;
> - /* v3 additions */
> - uint64_t incompatfeatures;
> - uint64_t compatfeatures;
> - uint64_t autoclearfeatures;
> - uint32_t reforder;
> - uint32_t headersz;
> - } __packed hdr, basehdr;
> + struct qcheader hdr, basehdr;
>   int fd, ret;
>   ssize_t base_len;
>   uint64_t l1sz, refsz, disksz, initsz, clustersz;
>

sure



Re: km_alloc(9) for i386 pmap

2021-04-23 Thread Mike Larkin
On Fri, Apr 23, 2021 at 08:07:43PM +0200, Martin Pieuchot wrote:
> Diff below convert the last uses of uvm_km_alloc(9) and uvm_km_zalloc(9)
> to km_alloc(9).
>
> One of the allocations below uses `kp_pageable' instead of `kp_zero'
> because the mapping for `pm_pdir_intel' is lost when PAE is enabled
> and need to be re-established when a fault happens.  This is consistent
> with what currently happens with uvm_km_zalloc().  Thanks to hshoexer@
> for the analysis.
>
> Fixing this is left as an exercise for the reader.  I'm currently
> concerned by getting rid of the old allocators.
>
> ok?
>

Reads ok. ok mlarkin

> Index: arch/i386/i386/pmap.c
> ===
> RCS file: /cvs/src/sys/arch/i386/i386/pmap.c,v
> retrieving revision 1.211
> diff -u -p -r1.211 pmap.c
> --- arch/i386/i386/pmap.c 11 Mar 2021 11:16:57 -  1.211
> +++ arch/i386/i386/pmap.c 23 Apr 2021 17:36:57 -
> @@ -1365,7 +1365,7 @@ void
>  pmap_pinit_pd_86(struct pmap *pmap)
>  {
>   /* allocate PDP */
> - pmap->pm_pdir = uvm_km_alloc(kernel_map, NBPG);
> + pmap->pm_pdir = (vaddr_t)km_alloc(NBPG, &kv_any, &kd_dirty, &kd_waitok);
>   if (pmap->pm_pdir == 0)
>   panic("pmap_pinit_pd_86: kernel_map out of virtual space!");
>   pmap_extract(pmap_kernel(), (vaddr_t)pmap->pm_pdir,
> @@ -1397,7 +1397,8 @@ pmap_pinit_pd_86(struct pmap *pmap)
>* execution, one that lacks all kernel mappings.
>*/
>   if (cpu_meltdown) {
> - pmap->pm_pdir_intel = uvm_km_zalloc(kernel_map, NBPG);
> + pmap->pm_pdir_intel = (vaddr_t)km_alloc(NBPG, &kv_any, &kd_zero,
> + &kd_waitok);
>   if (pmap->pm_pdir_intel == 0)
>   panic("%s: kernel_map out of virtual space!", __func__);
>
> @@ -1449,11 +1450,12 @@ pmap_destroy(struct pmap *pmap)
>   uvm_pagefree(pg);
>   }
>
> - uvm_km_free(kernel_map, pmap->pm_pdir, pmap->pm_pdirsize);
> + km_free((void *)pmap->pm_pdir, pmap->pm_pdirsize, &kv_any, &kd_dirty);
>   pmap->pm_pdir = 0;
>
>   if (pmap->pm_pdir_intel) {
> - uvm_km_free(kernel_map, pmap->pm_pdir_intel, pmap->pm_pdirsize);
> + km_free((void *)pmap->pm_pdir_intel, pmap->pm_pdirsize,
> + &kv_any, &kd_dirty);
>   pmap->pm_pdir_intel = 0;
>   }
>
> @@ -2522,8 +2524,9 @@ pmap_enter_special_86(vaddr_t va, paddr_
>   __func__, va);
>
>   if (!pmap->pm_pdir_intel) {
> - if ((pmap->pm_pdir_intel = uvm_km_zalloc(kernel_map, NBPG))
> - == 0)
> + pmap->pm_pdir_intel = (vaddr_t)km_alloc(NBPG, &kv_any, &kd_zero,
> + &kd_waitok);
> + if (pmap->pm_pdir_intel == 0)
>   panic("%s: kernel_map out of virtual space!", __func__);
>   if (!pmap_extract(pmap, pmap->pm_pdir_intel,
>   &pmap->pm_pdirpa_intel))
> Index: arch/i386/i386/pmapae.c
> ===
> RCS file: /cvs/src/sys/arch/i386/i386/pmapae.c,v
> retrieving revision 1.60
> diff -u -p -r1.60 pmapae.c
> --- arch/i386/i386/pmapae.c   23 Sep 2020 15:13:26 -  1.60
> +++ arch/i386/i386/pmapae.c   23 Apr 2021 17:59:05 -
> @@ -738,7 +738,7 @@ pmap_bootstrap_pae(void)
>   (uint32_t)VM_PAGE_TO_PHYS(ptppg));
>   }
>   }
> - uvm_km_free(kernel_map, (vaddr_t)pd, NBPG);
> + km_free(pd, NBPG, &kv_any, &kd_dirty);
>   DPRINTF("%s: freeing PDP 0x%x\n", __func__, (uint32_t)pd);
>   }
>
> @@ -944,7 +944,8 @@ pmap_pinit_pd_pae(struct pmap *pmap)
>   paddr_t pdidx[4];
>
>   /* allocate PDP */
> - pmap->pm_pdir = uvm_km_alloc(kernel_map, 4 * NBPG);
> + pmap->pm_pdir = (vaddr_t)km_alloc(4 * NBPG, &kv_any, &kd_dirty,
> + &kd_waitok);
>   if (pmap->pm_pdir == 0)
>   panic("pmap_pinit_pd_pae: kernel_map out of virtual space!");
>   /* page index is in the pmap! */
> @@ -997,7 +998,8 @@ pmap_pinit_pd_pae(struct pmap *pmap)
>   if (cpu_meltdown) {
>   int i;
>
> - if ((va = uvm_km_zalloc(kernel_map, 4 * NBPG)) == 0)
> + va = (vaddr_t)km_alloc(4 * NBPG, &kv_any, &kd_zero, &kd_nowait);
> + if (va == 0)
>   panic("%s: kernel_map out of virtual space!", __func__);
>   if (!pmap_extract(pmap_kernel(),
>   (vaddr_t)&pmap->pm_pdidx_intel, &pmap->pm_pdirpa_intel))
> @@ -1936,7 +1938,20 @@ pmap_enter_special_pae(vaddr_t va, paddr
>   __func__, va);
>
>   if (!pmap->pm_pdir_intel) {
> - if ((vapd = uvm_km_zalloc(kernel_map, 4 * NBPG)) == 0)
> +#if notyet
> + /*
> +  * XXX mapping is established via pmap_kenter() and lost
> +  * after enabling PAE.
> +  */
> + vapd = (vaddr_t)km_alloc(4 * NBPG, &kv_any, &kd_zero,
> + &kd_waitok);
> 

Re: umm_map returns unaligned address?

2021-04-23 Thread Mike Larkin
On Fri, Apr 23, 2021 at 01:55:14PM +0200, Alessandro Pistocchi wrote:
> Hi all,
>
> I am fairly new to openbsd so if this is something obvious that I missed
> please be understanding.
>
> I am adding a syscall to openbsd 6.8. I am working on a raspberry pi.
>
> During the syscall I allocate some memory that I want to share between the
> kernel
> and the calling process.
>
> When it's time to wrap up and unmap the memory, I unmap it both from the
> kernel
> map and from the process map.
>
> The unmapping from the process map goes fine, the unmapping from the kernel
> map
> fails by saying that the virtual address in kernel map is not aligned to
> the page size
> ( it's actually 4 bytes off ).
>
> What have I missed? I assumed that umm_map would return a page aligned
> virtual
> address for the kernel mapping as well.
>
> Here is my code for creating the shared memory chunk:
>
> 
> // memory_size is a multiple of page size
> uvm_object = uao_create(memory_size, 0);
> if(!uvm_object) return;
>
> // TODO(ale): make sure that this memory cannot be swapped out
>
> uao_reference(uvm_object)
> if(uvm_map(kernel_map, (vaddr_t *)&memory, round_page(memory_size),
> uvm_object,
>0, 0, UVM_MAPFLAG(PROT_READ | PROT_WRITE, PROT_READ | PROT_WRITE,
>MAP_INHERIT_SHARED, MADV_NORMAL, 0))) {
> uao_detach(uvm_object);
> uvm_object = 0;
> return;
> }
>
> uao_reference(uvm_object);
> if(uvm_map(&p->p_vmspace->vm_map, &memory_in_proc_space,
> round_page(memory_size), uvm_object,
>0, 0, UVM_MAPFLAG(PROT_READ | PROT_WRITE, PROT_READ | PROT_WRITE,
>MAP_INHERIT_NONE, MADV_NORMAL, 0))) {
> memory = 0;
> uao_detach(uvm_object);
> uao_detach(uvm_object);
> uvm_object = 0;
> return;
> }
> 
>
> Thanks,
> A

Please share the whole diff, this snippet above lacks context.



Re: vmd: spurious VM restarts

2021-04-07 Thread Mike Larkin
On Wed, Apr 07, 2021 at 07:47:28PM -0400, Dave Voutila wrote:
>
> Thomas L. writes:
>
> >> > Thomas: I looked at your host dmesg and your provided vm.conf. It
> >> > looks like 11 vm's with the default 512M memory and one (minecraft)
> >> > with 8G. Your host seems to have only 16GB of memory, some of which
> >> > is probably unavailable as it's used by the integrated gpu. I'm
> >> > wondering if you are effectively oversubscribing your memory here.
> >> >
> >> > I know we currently don't support swapping guest memory out, but not
> >> > sure what happens if we don't have the physical memory to fault a
> >> > page in and wire it.
> >> >
> >>
> >> Something else gets swapped out.
> >
> > Wire == Can't swap out?
>
> Yes.
>
> > top shows 15G real memory available. That should be enough (8G + 11 *
> > 0.5G = 13.5G), or is this inherently risky with 6.8?
>
> With 6.8, the guests might have memory swapped out and worst case you'll
> see some performance issues. That shouldn't cause unexpected
> termination.
>

Depends on the exact content that got swapped out (as we didn't handle
TLB flushes correctly), so a crash was certainly a possibility. That's why
I wanted to see the VMM_DEBUG output.

In any case, Thomas should try -current and see if this problem is even
reproducible.

-ml

> > I can try -current as suggested in the other mail. Is this a likely
> > cause or should I run with VMM_DEBUG for further investigation? Is
> > "somewhat slower" from VMM_DEBUG still usable? I don't need full
> > performance, but ~month downtime until the problem shows again would be
> > too much.
>
> A fix is more likely to land in -current if an issue can be
> identified. Since the issue doesn't sound like it's easily reproducible
> yet, VMM_DEBUG is the best bet for having the information you'd need to
> share when the issue occurs.
>
> >> > Even without a custom kernel with VMM_DEBUG, if it's a uvm_fault
> >> > issue you should see a message in the kernel buffer. Something like:
> >> >
> >> >   vmx_fault_page: uvm_fault returns N, GPA=0x, rip=0x
> >> >
> >> > mlarkin: thoughts on my hypothesis? Am I wildly off course?
> >> >
> >> > -dv
> >> >
> >>
> >> Yeah I was trying to catch the big dump when a VM resets. That would
> >> tell us if the vm caused the reset or if vmd(8) crashed for some
> >> reason.
> >
> > But if vmd crashed it wouldn't restart automatically or does it?
> > All VMs down from vmd crashing would have been noticed.
> > That kernel message would have shown in the dmesg too, wouldn't it?
> >
>
> There are multiple factors. First is vmd(8) is multi-process and a vm's
> process can die without impacting others. Second is the vcpu could be
> reset making the guest "reboot." There are numerous reasons these things
> could happen, hence needing debug logging.
>
> -dv
>



Re: vmd: spurious VM restarts

2021-04-07 Thread Mike Larkin
On Wed, Apr 07, 2021 at 09:23:14AM -0400, Dave Voutila wrote:
>
> Dave Voutila writes:
>
> > Mike Larkin writes:
> >
> >> On Wed, Apr 07, 2021 at 12:22:23AM +0200, Thomas L. wrote:
> >>> On Tue, 6 Apr 2021 14:28:09 -0700
> >>> Mike Larkin  wrote:
> >>>
> >>> > On Tue, Apr 06, 2021 at 09:15:10PM +0200, Thomas L. wrote:
> >>> > > On Tue, 6 Apr 2021 11:11:01 -0700
> >>> > > Mike Larkin  wrote:
> >>> > > > Anything in the host's dmesg?
> >>> > >
> >>> >
> >>> > *host* dmesg. I think you misread what I was after...
> >>>
> >>> The dmesg of the host was already attached to the first mail below the
> >>> vm.conf (I mistakenly called the host hypervisor, which I realize now is
> >>> not accurate). I figured since it was already attached, that
> >>> you must mean the VM, compounding the confusion ...
> >>>
> >>> Kind regards,
> >>>
> >>> Thomas
> >>>
> >>
> >> I see.
> >>
> >> You'll probably need to build a kernel with VMM_DEBUG and save that output 
> >> and
> >> send it to me once a VM crashes. Note: it will generate a lot of output and
> >> probably make things somewhat slower.
> >>
> >> -ml
> >
> > Thomas: I looked at your host dmesg and your provided vm.conf. It looks
> > like 11 vm's with the default 512M memory and one (minecraft) with
> > 8G. Your host seems to have only 16GB of memory, some of which is
> > probably unavailable as it's used by the integrated gpu. I'm wondering
> > if you are effectively oversubscribing your memory here.
> >
> > I know we currently don't support swapping guest memory out, but not
> > sure what happens if we don't have the physical memory to fault a page
> > in and wire it.
>
> Looked a bit further and since your host is running 6.8 it doesn't have
> wiring memory logic, but I'd still be cautious about oversubscribing
> memory.
>

Yep. Try -current and see if this can be reproduced.

> >
> > Even without a custom kernel with VMM_DEBUG, if it's a uvm_fault issue
> > you should see a message in the kernel buffer. Something like:
> >
> >   vmx_fault_page: uvm_fault returns N, GPA=0x, rip=0x
> >
>
> You can also run vmd(8) with debug logging (-v or -vv) and maybe capture
> these events. Like with vmm(4) logging, it can be excessively verbose.
>
> > mlarkin: thoughts on my hypothesis? Am I wildly off course?
> >
> > -dv
>



Re: vmd: spurious VM restarts

2021-04-07 Thread Mike Larkin
On Wed, Apr 07, 2021 at 07:26:41AM -0400, Dave Voutila wrote:
>
> Mike Larkin writes:
>
> > On Wed, Apr 07, 2021 at 12:22:23AM +0200, Thomas L. wrote:
> >> On Tue, 6 Apr 2021 14:28:09 -0700
> >> Mike Larkin  wrote:
> >>
> >> > On Tue, Apr 06, 2021 at 09:15:10PM +0200, Thomas L. wrote:
> >> > > On Tue, 6 Apr 2021 11:11:01 -0700
> >> > > Mike Larkin  wrote:
> >> > > > Anything in the host's dmesg?
> >> > >
> >> >
> >> > *host* dmesg. I think you misread what I was after...
> >>
> >> The dmesg of the host was already attached to the first mail below the
> >> vm.conf (I mistakenly called the host hypervisor, which I realize now is
> >> not accurate). I figured since it was already attached, that
> >> you must mean the VM, compounding the confusion ...
> >>
> >> Kind regards,
> >>
> >> Thomas
> >>
> >
> > I see.
> >
> > You'll probably need to build a kernel with VMM_DEBUG and save that output 
> > and
> > send it to me once a VM crashes. Note: it will generate a lot of output and
> > probably make things somewhat slower.
> >
> > -ml
>
> Thomas: I looked at your host dmesg and your provided vm.conf. It looks
> like 11 vm's with the default 512M memory and one (minecraft) with
> 8G. Your host seems to have only 16GB of memory, some of which is
> probably unavailable as it's used by the integrated gpu. I'm wondering
> if you are effectively oversubscribing your memory here.
>
> I know we currently don't support swapping guest memory out, but not
> sure what happens if we don't have the physical memory to fault a page
> in and wire it.
>

Something else gets swapped out.

> Even without a custom kernel with VMM_DEBUG, if it's a uvm_fault issue
> you should see a message in the kernel buffer. Something like:
>
>   vmx_fault_page: uvm_fault returns N, GPA=0x, rip=0x
>
> mlarkin: thoughts on my hypothesis? Am I wildly off course?
>
> -dv
>

Yeah I was trying to catch the big dump when a VM resets. That would tell
us if the vm caused the reset or if vmd(8) crashed for some reason.



Re: vmd: spurious VM restarts

2021-04-06 Thread Mike Larkin
On Wed, Apr 07, 2021 at 12:22:23AM +0200, Thomas L. wrote:
> On Tue, 6 Apr 2021 14:28:09 -0700
> Mike Larkin  wrote:
>
> > On Tue, Apr 06, 2021 at 09:15:10PM +0200, Thomas L. wrote:
> > > On Tue, 6 Apr 2021 11:11:01 -0700
> > > Mike Larkin  wrote:
> > > > Anything in the host's dmesg?
> > >
> >
> > *host* dmesg. I think you misread what I was after...
>
> The dmesg of the host was already attached to the first mail below the
> vm.conf (I mistakenly called the host hypervisor, which I realize now is
> not accurate). I figured since it was already attached, that
> you must mean the VM, compounding the confusion ...
>
> Kind regards,
>
> Thomas
>

I see.

You'll probably need to build a kernel with VMM_DEBUG and save that output and
send it to me once a VM crashes. Note: it will generate a lot of output and
probably make things somewhat slower.

-ml



Re: vmd: spurious VM restarts

2021-04-06 Thread Mike Larkin
On Tue, Apr 06, 2021 at 09:15:10PM +0200, Thomas L. wrote:
> On Tue, 6 Apr 2021 11:11:01 -0700
> Mike Larkin  wrote:
> > Anything in the host's dmesg?
>

*host* dmesg. I think you misread what I was after...

> Below is the dmesg and latest syslog from one of the VMs.
>
> OpenBSD 6.8 (GENERIC) #1: Tue Nov  3 09:04:47 MST 2020
> r...@syspatch-68-amd64.openbsd.org:/usr/src/sys/arch/amd64/compile/GENERIC
> real mem = 520085504 (495MB)
> avail mem = 489435136 (466MB)
> random: good seed from bootblocks
> mpath0 at root
> scsibus0 at mpath0: 256 targets
> mainbus0 at root
> bios0 at mainbus0: SMBIOS rev. 2.4 @ 0xf3f40 (10 entries)
> bios0: vendor SeaBIOS version "1.11.0p3-OpenBSD-vmm" date 01/01/2011
> bios0: OpenBSD VMM
> acpi at bios0 not configured
> cpu0 at mainbus0: (uniprocessor)
> cpu0: Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz, 3403.18 MHz, 06-3a-09
> cpu0: 
> FPU,VME,DE,PSE,TSC,MSR,PAE,CX8,SEP,PGE,CMOV,PAT,PSE36,CFLUSH,MMX,FXSR,SSE,SSE2,SSE3,PCLMUL,SSSE3,CX16,SSE4.1,SSE4.2,POPCNT,AES,XSAVE,AVX,F16C,RDRAND,HV,NXE,LONG,LAHF,ITSC,FSGSBASE,SMEP,ERMS,MD_CLEAR,MELTDOWN
> cpu0: 256KB 64b/line 8-way L2 cache
> cpu0: smt 0, core 0, package 0
> cpu0: using VERW MDS workaround
> pvbus0 at mainbus0: OpenBSD
> pvclock0 at pvbus0
> pci0 at mainbus0 bus 0
> pchb0 at pci0 dev 0 function 0 "OpenBSD VMM Host" rev 0x00
> virtio0 at pci0 dev 1 function 0 "Qumranet Virtio RNG" rev 0x00
> viornd0 at virtio0
> virtio0: irq 3
> virtio1 at pci0 dev 2 function 0 "Qumranet Virtio Network" rev 0x00
> vio0 at virtio1: address fe:e1:ba:d0:00:04
> virtio1: irq 5
> virtio2 at pci0 dev 3 function 0 "Qumranet Virtio Storage" rev 0x00
> vioblk0 at virtio2
> scsibus1 at vioblk0: 1 targets
> sd0 at scsibus1 targ 0 lun 0: 
> sd0: 307200MB, 512 bytes/sector, 629145600 sectors
> virtio2: irq 6
> virtio3 at pci0 dev 4 function 0 "OpenBSD VMM Control" rev 0x00
> vmmci0 at virtio3
> virtio3: irq 7
> isa0 at mainbus0
> isadma0 at isa0
> com0 at isa0 port 0x3f8/8 irq 4: ns8250, no fifo
> com0: console
> vscsi0 at root
> scsibus2 at vscsi0: 256 targets
> softraid0 at root
> scsibus3 at softraid0: 256 targets
> root on sd0a (c14ce37920a910f7.a) swap on sd0b dump on sd0b
> WARNING: / was not properly unmounted
>
> Apr  6 14:39:33 schleuder /bsd: OpenBSD 6.8 (GENERIC) #1: Tue Nov  3 09:04:47 
> MST 2020
> Apr  6 14:39:33 schleuder /bsd: 
> r...@syspatch-68-amd64.openbsd.org:/usr/src/sys/arch/amd64/compile/GENERIC
> Apr  6 14:39:33 schleuder /bsd: real mem = 520085504 (495MB)
> Apr  6 14:39:33 schleuder /bsd: avail mem = 489435136 (466MB)
> Apr  6 14:39:33 schleuder /bsd: random: good seed from bootblocks
> Apr  6 14:39:33 schleuder /bsd: mpath0 at root
> Apr  6 14:39:33 schleuder /bsd: scsibus0 at mpath0: 256 targets
> Apr  6 14:39:33 schleuder /bsd: mainbus0 at root
> Apr  6 14:39:33 schleuder /bsd: bios0 at mainbus0: SMBIOS rev. 2.4 @ 0xf3f40 
> (10 entries)
> Apr  6 14:39:33 schleuder /bsd: bios0: vendor SeaBIOS version 
> "1.11.0p3-OpenBSD-vmm" date 01/01/2011
> Apr  6 14:39:33 schleuder /bsd: bios0: OpenBSD VMM
> Apr  6 14:39:33 schleuder /bsd: acpi at bios0 not configured
> Apr  6 14:39:33 schleuder /bsd: cpu0 at mainbus0: (uniprocessor)
> Apr  6 14:39:33 schleuder /bsd: cpu0: Intel(R) Core(TM) i7-3770 CPU @ 
> 3.40GHz, 3403.18 MHz, 06-3a-09
> Apr  6 14:39:33 schleuder /bsd: cpu0: 
> FPU,VME,DE,PSE,TSC,MSR,PAE,CX8,SEP,PGE,CMOV,PAT,PSE36,CFLUSH,MMX,FXSR,SSE,SSE2,SSE3,PCLMUL,SSSE3,CX16,SSE4.1,SSE4.2,POPCNT,AES,XSAVE,AVX,F16C,RDRAND,HV,NXE,LONG,LAHF,ITSC,FSGSBASE,SMEP,ERMS,MD_CLEAR,MELTDOWN
> Apr  6 14:39:33 schleuder /bsd: cpu0: 256KB 64b/line 8-way L2 cache
> Apr  6 14:39:33 schleuder /bsd: cpu0: smt 0, core 0, package 0
> Apr  6 14:39:33 schleuder /bsd: cpu0: using VERW MDS workaround
> Apr  6 14:39:33 schleuder /bsd: pvbus0 at mainbus0: OpenBSD
> Apr  6 14:39:33 schleuder /bsd: pvclock0 at pvbus0
> Apr  6 14:39:33 schleuder /bsd: pci0 at mainbus0 bus 0
> Apr  6 14:39:33 schleuder /bsd: pchb0 at pci0 dev 0 function 0 "OpenBSD VMM 
> Host" rev 0x00
> Apr  6 14:39:33 schleuder /bsd: virtio0 at pci0 dev 1 function 0 "Qumranet 
> Virtio RNG" rev 0x00
> Apr  6 14:39:33 schleuder /bsd: viornd0 at virtio0
> Apr  6 14:39:33 schleuder /bsd: virtio0: irq 3
> Apr  6 14:39:33 schleuder /bsd: virtio1 at pci0 dev 2 function 0 "Qumranet 
> Virtio Network" rev 0x00
> Apr  6 14:39:33 schleuder /bsd: vio0 at virtio1: address fe:e1:ba:d0:00:04
> Apr  6 14:39:33 schleuder /bsd: virtio1: irq 5
> Apr  6 14:39:33 schleuder /bsd: virtio2 at pci0 dev 3 function 0 "Qumranet 
> Virtio Storage" rev 0x00
> Apr  6 14:39:33 schleuder /bsd: vioblk0 at virtio2
> Apr  6 

Re: vmd(8): send correct response on unpause error

2021-04-06 Thread Mike Larkin
On Fri, Apr 02, 2021 at 07:14:34PM -0400, Dave Voutila wrote:
> If vmctl(8) sends an unpause request for a vm that doesn't exist, vmd(8)
> should be responding with the IMSG_VMDOP_UNPAUSE_VM_RESPONSE imsg_type
> with an ENOENT error code. (Similarly if the request comes from a user
> without permissions to unpause, the error is EPERM but the imsg_type is
> wrong.)
>
> Since the handling for pause/unpause are the same code path, vmd(8) is
> sending an IMSG_VMDOP_PAUSE_VM_RESPONSE in these situations (i.e. on an
> error unpausing).
>
> The below diff sets the cmd correctly based on the imsg being
> processed.
>
> For context, case statement in this switch block looks like:
>
>   case IMSG_VMDOP_PAUSE_VM:
>   case IMSG_VMDOP_UNPAUSE_VM:
>   IMSG_SIZE_CHECK(imsg, &vid);
>   memcpy(&vid, imsg->data, sizeof(vid));
> ..
>
> OK?
>
> -dv
>

This is ok mlarkin@ if it wasn't already committed.

-ml

>
> Index: usr.sbin/vmd/vmd.c
> ===
> RCS file: /cvs/src/usr.sbin/vmd/vmd.c,v
> retrieving revision 1.121
> diff -u -p -r1.121 vmd.c
> --- usr.sbin/vmd/vmd.c29 Mar 2021 23:37:01 -  1.121
> +++ usr.sbin/vmd/vmd.c2 Apr 2021 23:06:47 -
> @@ -203,20 +203,26 @@ vmd_dispatch_control(int fd, struct priv
>   if (vid.vid_id == 0) {
>   if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
>   res = ENOENT;
> - cmd = IMSG_VMDOP_PAUSE_VM_RESPONSE;
> + cmd = imsg->hdr.type == IMSG_VMDOP_PAUSE_VM
> + ? IMSG_VMDOP_PAUSE_VM_RESPONSE
> + : IMSG_VMDOP_UNPAUSE_VM_RESPONSE;
>   break;
>   } else {
>   vid.vid_id = vm->vm_vmid;
>   }
>   } else if ((vm = vm_getbyid(vid.vid_id)) == NULL) {
>   res = ENOENT;
> - cmd = IMSG_VMDOP_PAUSE_VM_RESPONSE;
> + cmd = imsg->hdr.type == IMSG_VMDOP_PAUSE_VM
> + ? IMSG_VMDOP_PAUSE_VM_RESPONSE
> + : IMSG_VMDOP_UNPAUSE_VM_RESPONSE;
>   break;
>   }
>   if (vm_checkperm(vm, &vm->vm_params.vmc_owner,
>   vid.vid_uid) != 0) {
>   res = EPERM;
> - cmd = IMSG_VMDOP_PAUSE_VM_RESPONSE;
> + cmd = imsg->hdr.type == IMSG_VMDOP_PAUSE_VM
> + ? IMSG_VMDOP_PAUSE_VM_RESPONSE
> + : IMSG_VMDOP_UNPAUSE_VM_RESPONSE;
>   break;
>   }
>   proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
>



Re: vmd: spurious VM restarts

2021-04-06 Thread Mike Larkin
On Tue, Apr 06, 2021 at 07:47:52PM +0200, Thomas L. wrote:
> Hi,
>
> I'm running OpenBSD 6.8 as hypervisor with multiple OpenBSD VMs.
> Regularly, it happens that all VM are restarted, not at the same time
> but clustered. The indication that this happend is reduced uptime on the
> VMs, some services that fail to come up again and the following logs:
>
> # grep vmd /var/log/daemon
> Apr  1 18:10:35 golem vmd[31367]: wiki: started vm 12 successfully, tty 
> /dev/ttyp0
> Apr  6 13:24:52 golem vmd[31367]: matrix: started vm 13 successfully, tty 
> /dev/ttypb
> Apr  6 13:25:55 golem vmd[31367]: matrix: started vm 13 successfully, tty 
> /dev/ttypb
> Apr  6 13:26:45 golem vmd[18933]: vmd: LSR UART write 0x8203d260 unsupported
> Apr  6 13:26:45 golem vmd[31367]: ticketfrei: started vm 5 successfully, tty 
> /dev/ttyp5
> Apr  6 14:22:34 golem vmd[31367]: www: started vm 4 successfully, tty 
> /dev/ttyp4
> Apr  6 14:33:54 golem vmd[31367]: kibicara: started vm 8 successfully, tty 
> /dev/ttyp8
> Apr  6 14:35:02 golem vmd[31367]: vpn: started vm 3 successfully, tty 
> /dev/ttyp3
> Apr  6 14:36:38 golem vmd[31367]: relay: started vm 1 successfully, tty 
> /dev/ttyp1
> Apr  6 14:37:51 golem vmd[31367]: schleuder: started vm 2 successfully, tty 
> /dev/ttyp2
> Apr  6 14:40:34 golem vmd[31367]: mumble: started vm 6 successfully, tty 
> /dev/ttyp6
> Apr  6 14:41:58 golem vmd[31367]: minecraft: started vm 9 successfully, tty 
> /dev/ttyp9
>
> The restarts seem to be non-graceful, since the matrix vm needed manual
> fsck on /var. Going back over the logs this seems to happen about every
> month (not all restarts are this phenomenon, but Mar 8/10 and Feb
> 17/20/22 seem like it):
>
> # zgrep vmd /var/log/daemon.0.gz
> Mar  8 19:43:07 golem vmd[31367]: wiki: started vm 12 successfully, tty 
> /dev/ttyp0
> Mar  8 19:43:37 golem vmd[31367]: ticketfrei: started vm 5 successfully, tty 
> /dev/ttyp5
> Mar 10 09:21:20 golem vmd[31367]: www: started vm 4 successfully, tty 
> /dev/ttyp4
> Mar 10 09:24:13 golem vmd[31367]: kibicara: started vm 8 successfully, tty 
> /dev/ttyp8
> Mar 10 09:26:13 golem vmd[31367]: vpn: started vm 3 successfully, tty 
> /dev/ttyp3
> Mar 10 09:28:40 golem vmd[31367]: gitea: started vm 7 successfully, tty 
> /dev/ttyp7
> Mar 10 09:29:01 golem vmd[31367]: relay: started vm 1 successfully, tty 
> /dev/ttyp1
> Mar 10 09:31:29 golem vmd[31367]: schleuder: started vm 2 successfully, tty 
> /dev/ttyp2
> Mar 10 09:34:02 golem vmd[31367]: mumble: started vm 6 successfully, tty 
> /dev/ttyp6
> Mar 10 09:35:44 golem vmd[31367]: minecraft: started vm 9 successfully, tty 
> /dev/ttyp9
> Mar 13 01:46:37 golem vmd[31367]: gitea: started vm 7 successfully, tty 
> /dev/ttyp7
> golem# zgrep vmd /var/log/daemon.1.gz
> Feb 17 21:18:45 golem vmd[31367]: matrix: started vm 13 successfully, tty 
> /dev/ttypc
> Feb 20 08:32:28 golem vmd[31367]: wiki: started vm 12 successfully, tty 
> /dev/ttyp0
> Feb 20 08:33:14 golem vmd[31367]: ticketfrei: started vm 5 successfully, tty 
> /dev/ttyp5
> Feb 20 08:35:20 golem vmd[31367]: www: started vm 4 successfully, tty 
> /dev/ttyp4
> Feb 20 11:09:01 golem vmd[31367]: kibicara: started vm 8 successfully, tty 
> /dev/ttyp8
> Feb 20 11:10:18 golem vmd[31367]: vpn: started vm 3 successfully, tty 
> /dev/ttyp3
> Feb 20 11:11:52 golem vmd[31367]: gitea: started vm 7 successfully, tty 
> /dev/ttyp7
> Feb 22 00:51:03 golem vmd[31367]: relay: started vm 1 successfully, tty 
> /dev/ttyp1
> Feb 22 00:52:44 golem vmd[31367]: schleuder: started vm 2 successfully, tty 
> /dev/ttyp2
> Feb 22 00:53:59 golem vmd[31367]: mumble: started vm 6 successfully, tty 
> /dev/ttyp6
> Feb 22 00:54:45 golem vmd[31367]: minecraft: started vm 9 successfully, tty 
> /dev/ttyp9
> Feb 24 23:01:50 golem vmd[31367]: vmd_sighdlr: reload requested with SIGHUP
> Feb 24 23:01:51 golem vmd[31367]: test: started vm 10 successfully, tty 
> /dev/ttypa
> Feb 24 23:01:51 golem vmd[52735]: test: unsupported refcount size
> Feb 24 23:06:27 golem vmd[31367]: vmd_sighdlr: reload requested with SIGHUP
> Feb 24 23:06:27 golem vmd[1230]: test: unsupported refcount size
> Feb 24 23:06:27 golem vmd[31367]: matrix: started vm 13 successfully, tty 
> /dev/ttypb
> Feb 24 23:06:27 golem vmd[31367]: test: started vm 10 successfully, tty 
> /dev/ttypc
> Feb 24 23:10:20 golem vmd[31367]: matrix: started vm 13 successfully, tty 
> /dev/ttypb
>
> vm.conf and dmesg of the hypervisor are below. How would I go
> about debugging this?
>
> Kind regards,
>
> Thomas
>

Anything in the host's dmesg?

>
> switch internal {
>   interface bridge0
>   locked lladdr
>   group internal
> }
>
>
> vm relay {
>   disk /data/vmd/relay.qcow2
>   interface {
>   switch internal
>   lladdr fe:e1:ba:d0:00:03
>   }
> }
>
> vm schleuder {
>   disk /data/vmd/schleuder.qcow2
>   interface {
>   switch internal
>   lladdr fe:e1:ba:d0:00:04
>   }
> }
>
> vm vpn {
>   disk 

Re: amd64: add MSR_TSC_ADJUST

2021-04-06 Thread Mike Larkin
On Mon, Apr 05, 2021 at 07:37:51PM -0500, Scott Cheloha wrote:
> Intel calls it "IA32_TSC_ADJUST".  Is "MSR_TSC_ADJUST" fine or should
> it be "MSR_IA32_TSC_ADJUST"?
>
> We have a feature flag for this one already, SEFF0EBX_TSC_ADJUST.
>
> Index: specialreg.h
> ===
> RCS file: /cvs/src/sys/arch/amd64/include/specialreg.h,v
> retrieving revision 1.89
> diff -u -p -r1.89 specialreg.h
> --- specialreg.h  29 Mar 2021 12:39:02 -  1.89
> +++ specialreg.h  6 Apr 2021 00:31:58 -
> @@ -352,6 +352,7 @@
>  #define MSR_EBC_FREQUENCY_ID0x02c   /* Pentium 4 only */
>  #define  MSR_TEST_CTL0x033
>  #define MSR_IA32_FEATURE_CONTROL 0x03a
> +#define MSR_TSC_ADJUST   0x03b
>  #define MSR_SPEC_CTRL0x048   /* Speculation Control IBRS / 
> STIBP */
>  #define SPEC_CTRL_IBRS   (1ULL << 0)
>  #define SPEC_CTRL_STIBP  (1ULL << 1)
>

This seems fine to me. ok mlarkin



Re: monotonic time going back by wrong skews

2021-04-05 Thread Mike Larkin
On Sat, Apr 03, 2021 at 10:21:02PM -0500, Scott Cheloha wrote:
> On Fri, Apr 02, 2021 at 10:37:36AM -0700, Mike Larkin wrote:
> > On Thu, Apr 01, 2021 at 06:43:30PM -0500, Scott Cheloha wrote:
> > >
> > > [...]
> > >
> > > Hmmm.  Being able to work around this would be nice.
> > >
> > > FreeBSD has code that uses WRMSR to synchronize the TSC:
> > >
> > > https://cgit.freebsd.org/src/commit/sys/x86/x86/tsc.c?id=b2c63698d4b81576e0c8842263ee86e86cd34e76
> > >
> > > My guess is that support for writing the TSC is not implemented by
> > > every hypervisor, so we would need to be very careful in deciding when
> > > to try it.  Otherwise we end up with protection faults and other crap
> > > we don't want.
> > >
> >
> > We implemented rdmsr_safe for things like this. We could probably do the 
> > same
> > for wrmsr.
>
> Like this?
>
> Sorry if this is not idiomatic.  I don't write much assembly.
>
> I tested this a bit on my laptop.  Stuff like:
>
>   wrmsr_safe(MSR_TSC, rdtsc() + 100);
>
> Which seems to desync the normally synchronized TSCs here.
>
> Unclear what the rules are for RETGUARD.  I just copied what was in
> rdmsr_safe().  We're not using R10 so we can use R10?
>
> -Scott
>
> Index: include/cpufunc.h
> ===
> RCS file: /cvs/src/sys/arch/amd64/include/cpufunc.h,v
> retrieving revision 1.36
> diff -u -p -r1.36 cpufunc.h
> --- include/cpufunc.h 13 Sep 2020 11:53:16 -  1.36
> +++ include/cpufunc.h 4 Apr 2021 03:16:48 -
> @@ -398,6 +398,7 @@ struct cpu_info_full;
>  void cpu_enter_pages(struct cpu_info_full *);
>
>  int rdmsr_safe(u_int msr, uint64_t *);
> +int wrmsr_safe(uint32_t msr, uint64_t);
>
>  #endif /* _KERNEL */
>
> Index: amd64/locore.S
> ===
> RCS file: /cvs/src/sys/arch/amd64/amd64/locore.S,v
> retrieving revision 1.122
> diff -u -p -r1.122 locore.S
> --- amd64/locore.S3 Nov 2020 18:19:31 -   1.122
> +++ amd64/locore.S4 Apr 2021 03:16:48 -
> @@ -1154,6 +1154,30 @@ NENTRY(rdmsr_resume)
>   ret
>  END(rdmsr_safe)
>
> +/* int wrmsr_safe(uint32_t msr, uint64_t val) */
> +ENTRY(wrmsr_safe)
> + RETGUARD_SETUP(wrmsr_safe, r10)
> +
> + movl%edi,   %ecx/* uint32_t msr */
> +
> + movl%esi,   %eax/* uint64_t val */
> + sarq$32,%rsi
> + movl%esi,   %edx
> +
> + .globl  wrmsr_safe_fault
> +wrmsr_safe_fault:
> + wrmsr
> +
> + xorq%rax,   %rax
> + RETGUARD_CHECK(rdmsr_safe, r10)
> + ret
> +
> +NENTRY(wrmsr_resume)
> + movq$0x1,   %rax
> + RETGUARD_CHECK(wrmsr_safe, r10)
> + ret
> +END(wrmsr_safe)
> +
>  #if NXEN > 0
>   /* Hypercall page needs to be page aligned */
>   .text
>

You will need the handler case in vector.S also (like we did for rdmsr_safe).

(Sorry if this reply hits the list twice; mailer error on previous attempt).

-ml



Re: vmm.4: document supported ioctls

2021-04-02 Thread Mike Larkin
On Fri, Apr 02, 2021 at 03:24:55AM +0200, Klemens Nanni wrote:
> On Thu, Apr 01, 2021 at 08:34:37PM -0400, Dave Voutila wrote:
> > I've updated the diff using your feedback. See below.
> Thanks, OK kn
>

ok mlarkin@ also



Re: monotonic time going back by wrong skews

2021-04-02 Thread Mike Larkin
On Thu, Apr 01, 2021 at 06:43:30PM -0500, Scott Cheloha wrote:
> On Thu, Apr 01, 2021 at 03:41:24PM -0400, Josh Rickmar wrote:
> > On Thu, Apr 01, 2021 at 03:22:00PM -0400, Josh Rickmar wrote:
> > > On Thu, Apr 01, 2021 at 02:15:48PM -0500, Scott Cheloha wrote:
> > > > On Sat, Mar 27, 2021 at 02:20:21AM +, Stefmorino wrote:
> > > > > > Feel free to share your raw data.
> > > > >
> > > > > Also includes some standard sendbug dumps: https://0x0.st/-qng.tgz
> > > >
> > > > Thanks!
> > > >
> > > > TL;DR:
> > > >
> > > > Two things:
> > > >
> > > > 1. Could you check whether Linux will use the TSC as a clocksource on
> > > >this machine?  The dmesg output on any given distribution should
> > > >contain lines about the TSC.
> > > >
> > > >[...]
> > > >
> > > Hey, thanks for the reminder to try this out with Linux.  Will give it
> > > a shot shortly.
> > >
> > > As for the BIOS, 1.58 is the current version (found here):
> > >
> > > https://support.lenovo.com/us/en/downloads/ds503790
> > >
> > > This same issue was happening with all older BIOS versions that I have
> > > used as well.
>
> Okay, not great news, but at least the behavior is consistent.
>
> > Seems Linux doesn't like it either:
> >
> > localhost:~# dmesg | egrep -i 'tsc|clocksource'
> > [0.00] tsc: Fast TSC calibration using PIT
> > [0.00] tsc: Detected 1996.173 MHz processor
> > [0.043227] clocksource: refined-jiffies: mask: 0x max_cycles: 
> > 0x, max_idle_ms: 6370452778343963 ns
> > [0.114728] clocksource: hpet: mask: 0x max_cycles: 0x, 
> > max_idle_ns: 133484873504 ns
> > [0.131435] clocksource: tsc-early: mask: 0x max_cycles: 
> > 0x398c1ebcd00, max_idle_ns: 881590807727 ns
> > [0.244772] TSC synchronization [CPU#0 -> CPU#1]:
> > [0.244772] Measured 7296391160 warp between CPUs, turning off TSC clock.
> > [0.244772] tsc: Marking TSC unstable due to check_tsc_sync_source_failed
> > [0.252185] clocksource: jiffies: mask: 0x max_cycles: 
> > 0x, max_idle_ns: 6370867519511994 ns
> > [0.316884] clocksource: Switched to clocksource hpet
> > [0.335046] clocksource: acpi_pm: mask: 0xff max_cycles: 0xff, 
> > max_idle_ns: 2085701024 ns
>
> Hmmm.  Being able to work around this would be nice.
>
> FreeBSD has code that uses WRMSR to synchronize the TSC:
>
> https://cgit.freebsd.org/src/commit/sys/x86/x86/tsc.c?id=b2c63698d4b81576e0c8842263ee86e86cd34e76
>
> My guess is that support for writing the TSC is not implemented by
> every hypervisor, so we would need to be very careful in deciding when
> to try it.  Otherwise we end up with protection faults and other crap
> we don't want.
>

We implemented rdmsr_safe for things like this. We could probably do the same
for wrmsr.

-ml

> Doing this via TSC_ADJUST (instead of writing the TSC directly) is
> nicer because you just check for the CPUID level and bit.  No
> guesswork.  But we can't in your case because, as I said, no
> TSC_ADJUST support on your CPU.
>



Re: vmctl: off-by-one error handling mixing -a with a VM id

2021-03-29 Thread Mike Larkin
On Fri, Mar 26, 2021 at 07:24:32AM -0400, Dave Voutila wrote:
>
> Theo Buehler writes:
>
> > On Thu, Mar 25, 2021 at 08:07:53PM +0100, Preben Guldberg wrote:
> >> Dave Voutila wrote:
> >> > Preben Guldberg writes:
> >> > > The patch below addresses an off-by-one error reading argv when
> >> > > generating the error message.
> >>
> >> > > I personally find it clearer if the condition of mixing -a with an id
> >> > > is highlighted. I included a suggestion in the patch below.
> >>
> >> > Since -a and providing an id are mutually exclusive, I think it's more
> >> > helpful to print usage information via ctl_usage(res->ctl). From the
> >> > usage details, it's self explanatory what's wrong.
> >>
> >> >   usage:  vmctl [-v] stop [-fw] [id | -a]
> >>
> >> The updated diff below would do just that:
> >>
> >> % vmctl stop -a testvm
> >> usage:  vmctl [-v] stop [-fw] [id | -a]
> >
> > Yes, your diff would do that.
> >
> > However, I think the current logic is both wrong and the wrong way
> > around.  I believe the following is much clearer. It doesn't have a dead
> > else branch and it deletes 'ret', so it doesn't use it uninitialized when
> > checking 'res->action == CMD_STOPALL && ret != -1' (e.g. 'vmctl stop -a').
> > Since the diff is slightly messy, this is the result:
> >
> > if (res->action == CMD_STOPALL) {
> > if (argc != 0)
> > ctl_usage(res->ctl);
> > } else {
> > if (argc != 1)
> > ctl_usage(res->ctl);
> > if (parse_vmid(res, argv[0], 0) == -1)
> > errx(1, "invalid id: %s", argv[0]);
> > }
> >
> > return (vmmaction(res));
>
> I like this a lot better. The only thing to note is the only code path I
> can identify that will result in "invalid id" is using '-' as the
> id...parse_vmid prints warnings itself for other use cases. Having the
> errx here though is a nice guard if someone changes parse_vmid in the future.
>
> OK dv@
>

also ok mlarkin@

> >
> > Index: main.c
> > ===
> > RCS file: /cvs/src/usr.sbin/vmctl/main.c,v
> > retrieving revision 1.62
> > diff -u -p -r1.62 main.c
> > --- main.c  3 Jan 2020 05:32:00 -   1.62
> > +++ main.c  25 Mar 2021 19:23:16 -
> > @@ -927,7 +927,7 @@ ctl_start(struct parse_result *res, int
> >  int
> >  ctl_stop(struct parse_result *res, int argc, char *argv[])
> >  {
> > -   int  ch, ret;
> > +   int  ch;
> >
> > while ((ch = getopt(argc, argv, "afw")) != -1) {
> > switch (ch) {
> > @@ -948,20 +948,15 @@ ctl_stop(struct parse_result *res, int a
> > argc -= optind;
> > argv += optind;
> >
> > -   if (argc == 0) {
> > -   if (res->action != CMD_STOPALL)
> > +   if (res->action == CMD_STOPALL) {
> > +   if (argc != 0)
> > ctl_usage(res->ctl);
> > -   } else if (argc > 1)
> > -   ctl_usage(res->ctl);
> > -   else if (argc == 1)
> > -   ret = parse_vmid(res, argv[0], 0);
> > -   else
> > -   ret = -1;
> > -
> > -   /* VM id is only expected without the -a flag */
> > -   if ((res->action != CMD_STOPALL && ret == -1) ||
> > -   (res->action == CMD_STOPALL && ret != -1))
> > -   errx(1, "invalid id: %s", argv[1]);
> > +   } else {
> > +   if (argc != 1)
> > +   ctl_usage(res->ctl);
> > +   if (parse_vmid(res, argv[0], 0) == -1)
> > +   errx(1, "invalid id: %s", argv[0]);
> > +   }
> >
> > return (vmmaction(res));
> >  }
>
>
> --
> -Dave Voutila
>



Re: patch: vmm(4) IA32_EPT_VPID_CAP_XO_TRANSLATIONS specified incorrectly.

2021-03-29 Thread Mike Larkin
On Sat, Mar 27, 2021 at 10:15:27AM -0400, Dave Voutila wrote:
>
> Adam Steen writes:
>
> > Hi
> >
> > IA32_EPT_VPID_CAP_XO_TRANSLATIONS is specified incorrectly, see the
> > patch below.
>
> Adam's diff looks correct to me based on reading Intel SDM Vol 3D,
> Appendix A.10 (VPID and EPT Capabilities) [1]:
>
>   The IA32_VMX_EPT_VPID_CAP MSR (index 48CH) reports information about
>   the capabilities of the logical processor with regard to
>   virtual-processor identifiers (VPIDs, Section 28.1) and extended page
>   tables (EPT, Section 28.2):
>
> * If bit 0 is read as 1, the processor supports execute-only
> translations by EPT. This support allows software to configure EPT
> paging-structure entries in which bits 1:0 are clear (indicating
> that data accesses are not allowed) and bit 2 is set (indicating
> that instruction fetches are allowed).
>
> ...
>
> IA32_EPT_VPID_CAP_XO_TRANSLATIONS is only referenced in vmm.c.
>
> I've updated the diff so it applies cleanly, but didn't change the name
> of the capability as it's more accurate with "TRANSLATIONS" included
> imo.
>
> OK?
>
> >
> > Cheers
> > Adam
> >
> > On Fri, Feb 26, 2021 at 01:08:17PM +0800, Adam Steen wrote:
> >> Hi
> >>
> >> IA32_EPT_VPID_CAP_XO_TRANSLATIONS is specified as 0x0 and not (1ULL << 0)
> >> ie 0 and not bit 0 as on.
> >>
> >> Please see the attach diff to correct this and rename
> >> IA32_EPT_VPID_CAP_XO_TRANSLATIONS to IA32_EPT_VPID_CAP_XO to reduce
> >> wordyness.
> >>
> >> Cheers
> >> Adam
> >>
>
> [1] 
> https://software.intel.com/content/www/us/en/develop/download/intel-64-and-ia-32-architectures-sdm-volume-3d-system-programming-guide-part-4.html
>
> -Dave
>
>
> Index: sys/arch/amd64/include/specialreg.h
> ===
> RCS file: /cvs/src/sys/arch/amd64/include/specialreg.h,v
> retrieving revision 1.88
> diff -u -p -r1.88 specialreg.h
> --- sys/arch/amd64/include/specialreg.h   13 Sep 2020 05:57:28 -  
> 1.88
> +++ sys/arch/amd64/include/specialreg.h   27 Mar 2021 14:14:13 -
> @@ -957,7 +957,7 @@
>  #define IA32_VMX_TRUE_ENTRY_CTLS 0x490
>  #define IA32_VMX_VMFUNC  0x491
>
> -#define IA32_EPT_VPID_CAP_XO_TRANSLATIONS0x0
> +#define IA32_EPT_VPID_CAP_XO_TRANSLATIONS(1ULL << 0)
>  #define IA32_EPT_VPID_CAP_PAGE_WALK_4(1ULL << 6)
>  #define IA32_EPT_VPID_CAP_WB (1ULL << 14)
>  #define IA32_EPT_VPID_CAP_AD_BITS(1ULL << 21)
>

ok mlarkin@ if you want to commit this.



Re: vmm(4): fix boot issue for 9front guests

2021-03-29 Thread Mike Larkin
On Sun, Mar 28, 2021 at 09:28:11AM -0400, Bryan Steele wrote:
> On Sun, Mar 28, 2021 at 08:38:13AM -0400, Dave Voutila wrote:
> > abieber@ found the latest 9front release ends up in a boot loop if
> > hosted on an AMD system. I tracked it down to 9front (oddly) trying to
> > read the PAT msr prior to writing it. [1] The problem is vmm(4)'s msr
> > handling for svm injects #GP exceptions into the guest for most msr
> > reads (since we don't emulate more than a few).
> >
> > For those (two? few? dozen?) 9front users of AMD hardware and -current,
> > can you try the below diff?
> >
> > vmm(4)'s vmx msr handlers ignores this instruction and only logs the
> > rdmsr information if the kernel is built with VMM_DEBUG. vmm(4) will
> > advance the instruction pointer regardless and it's up to the guest to
> > deal with any resulting issues.
> >
> > The diff syncs the logic between the svm and vmx msr vm-exit handlers by
> > injecting #GP *ONLY* on attempts to read the SMBASE msr.
> >
> > For context, this is the vmx rdmsr handler's (vmx_handle_rdmsr) logic:
> >
> > switch (*rcx) {
> > case MSR_SMBASE:
> > /*
> >  * 34.15.6.3 - Saving Guest State (SMM)
> >  *
> >  * Unsupported, so inject #GP and return without
> >  * advancing %rip.
> >  */
> > ret = vmm_inject_gp(vcpu);
> > return (ret);
> > }
> >
> > It is *not* a design for emulating PAT access and manipulation by a
> > guest.
> >
> > (As an aside, OpenBSD doesn't bother reading the msr [2] before writing
> > to it, neither does Linux. Why is 9front special? ¯\_(ツ)_/¯)
> >
> > -Dave
> >
> > [1] https://code.9front.org/hg/plan9front/rev/10cd3e23a8c1
> > [2] 
> > https://github.com/openbsd/src/blob/36fd90dcf1acf2ddb4ef5dbabe5313b3a8d46ee2/sys/arch/amd64/amd64/cpu.c#L1145-L1168
> >

IIRC I had to advertise PAT support or some guest OS didn't work. I can't recall
what OS that was though (this was years ago).

I'd say just allow reading of the host PAT but discard all writes. Do the same 
on
both SVM and VMX, for now. See if this helps 9front.

Bonus points: there are rules for accessing and manipulating the PAT in a guest,
we could probably emulate that if desired.

-ml

> >
> > Index: sys/arch/amd64/amd64/vmm.c
> > ===
> > RCS file: /cvs/src/sys/arch/amd64/amd64/vmm.c,v
> > retrieving revision 1.278
> > diff -u -p -r1.278 vmm.c
> > --- sys/arch/amd64/amd64/vmm.c  11 Mar 2021 11:16:55 -  1.278
> > +++ sys/arch/amd64/amd64/vmm.c  28 Mar 2021 00:45:08 -
> > @@ -6545,10 +6545,16 @@ svm_handle_msr(struct vcpu *vcpu)
> > *rax = 0;
> > *rdx = 0;
> > break;
> > -   default:
> > -   DPRINTF("%s: guest read msr 0x%llx, injecting "
> > -   "#GP\n", __func__, *rcx);
> > +   case MSR_SMBASE:
> > +   /* Unsupported, inject #GP w/o advancing %rip */
> > ret = vmm_inject_gp(vcpu);
> > return (ret);
> > +#ifdef VMM_DEBUG
> > +   default:
> > +   /* Log the access to identify unknown MSRs */
> > +   DPRINTF("%s: rdmsr exit, msr=0x%llx, data "
> > +   "returned to guest=0x%llx:0x%llx\n",
> > +   __func__, *rcx, *rdx, *rax);
> > +#endif /* VMM_DEBUG */
> > }
> > }
>
> I'm not sure this is correct, doesn't this mean that registers will
> contain whatever garbage that was in them beforehand? Without
> injecting #GP, how does the guest kernel know the MSR read failed?
>
> I was initially concerned as this touches the codepath pd@ fixed last
> Feb where MSR reads were being passed through to the host, but still
> I think that injecting the #GP for unsupported MSR reads is right.
>
> -Bryan.
>



Re: UVM return(val)

2021-03-23 Thread Mike Larkin
On Tue, Mar 23, 2021 at 01:52:20PM +0100, Martin Pieuchot wrote:
> Diff below convert multiple "return(val)" and "return (val)" to
> "return val".  I only changed those that help decrease the size
> of the diff with NetBSD or didn't change anything.
>
> ok?
>

I read through these and agree this should not change any behaviour.

ok mlarkin if this helps you move forward by improving diffability.

> Index: uvm/uvm_amap.c
> ===
> RCS file: /cvs/src/sys/uvm/uvm_amap.c,v
> retrieving revision 1.88
> diff -u -p -r1.88 uvm_amap.c
> --- uvm/uvm_amap.c20 Mar 2021 10:24:21 -  1.88
> +++ uvm/uvm_amap.c23 Mar 2021 12:14:26 -
> @@ -342,7 +342,7 @@ amap_alloc1(int slots, int waitf, int la
>   amap = pool_get(_small_amap_pool[slots - 1],
>   pwaitf | PR_ZERO);
>   if (amap == NULL)
> - return(NULL);
> + return NULL;
>
>   amap->am_lock = NULL;
>   amap->am_ref = 1;
> @@ -355,7 +355,7 @@ amap_alloc1(int slots, int waitf, int la
>
>   if (UVM_AMAP_SMALL(amap)) {
>   amap->am_small.ac_nslot = slots;
> - return (amap);
> + return amap;
>   }
>
>   amap->am_ncused = 0;
> @@ -392,14 +392,14 @@ amap_alloc1(int slots, int waitf, int la
>   }
>   }
>
> - return(amap);
> + return amap;
>
>  fail1:
>   free(amap->am_buckets, M_UVMAMAP, buckets * sizeof(*amap->am_buckets));
>   TAILQ_FOREACH_SAFE(chunk, >am_chunks, ac_list, tmp)
>   pool_put(_amap_chunk_pool, chunk);
>   pool_put(_amap_pool, amap);
> - return (NULL);
> + return NULL;
>  }
>
>  static void
> @@ -423,7 +423,7 @@ amap_alloc(vaddr_t sz, int waitf, int la
>
>   AMAP_B2SLOT(slots, sz); /* load slots */
>   if (slots > INT_MAX)
> - return (NULL);
> + return NULL;
>
>   amap = amap_alloc1(slots, waitf, lazyalloc);
>   if (amap != NULL) {
> @@ -431,7 +431,7 @@ amap_alloc(vaddr_t sz, int waitf, int la
>   amap_list_insert(amap);
>   }
>
> - return(amap);
> + return amap;
>  }
>
>
> Index: uvm/uvm_anon.c
> ===
> RCS file: /cvs/src/sys/uvm/uvm_anon.c,v
> retrieving revision 1.53
> diff -u -p -r1.53 uvm_anon.c
> --- uvm/uvm_anon.c20 Mar 2021 10:24:21 -  1.53
> +++ uvm/uvm_anon.c23 Mar 2021 12:01:03 -
> @@ -67,7 +67,7 @@ uvm_analloc(void)
>   anon->an_page = NULL;
>   anon->an_swslot = 0;
>   }
> - return(anon);
> + return anon;
>  }
>
>  /*
> Index: uvm/uvm_aobj.c
> ===
> RCS file: /cvs/src/sys/uvm/uvm_aobj.c,v
> retrieving revision 1.92
> diff -u -p -r1.92 uvm_aobj.c
> --- uvm/uvm_aobj.c20 Mar 2021 10:24:21 -  1.92
> +++ uvm/uvm_aobj.c23 Mar 2021 12:17:00 -
> @@ -211,7 +211,7 @@ uao_find_swhash_elt(struct uvm_aobj *aob
>*/
>   LIST_FOREACH(elt, swhash, list) {
>   if (elt->tag == page_tag)
> - return(elt);
> + return elt;
>   }
>
>   if (!create)
> @@ -234,7 +234,7 @@ uao_find_swhash_elt(struct uvm_aobj *aob
>   LIST_INSERT_HEAD(swhash, elt, list);
>   elt->tag = page_tag;
>
> - return(elt);
> + return elt;
>  }
>
>  /*
> @@ -248,7 +248,7 @@ uao_find_swslot(struct uvm_aobj *aobj, i
>* if noswap flag is set, then we never return a slot
>*/
>   if (aobj->u_flags & UAO_FLAG_NOSWAP)
> - return(0);
> + return 0;
>
>   /*
>* if hashing, look in hash table.
> @@ -258,15 +258,15 @@ uao_find_swslot(struct uvm_aobj *aobj, i
>   uao_find_swhash_elt(aobj, pageidx, FALSE);
>
>   if (elt)
> - return(UAO_SWHASH_ELT_PAGESLOT(elt, pageidx));
> + return UAO_SWHASH_ELT_PAGESLOT(elt, pageidx);
>   else
> - return(0);
> + return 0;
>   }
>
>   /*
>* otherwise, look in the array
>*/
> - return(aobj->u_swslots[pageidx]);
> + return aobj->u_swslots[pageidx];
>  }
>
>  /*
> @@ -289,7 +289,7 @@ uao_set_swslot(struct uvm_object *uobj,
>*/
>   if (aobj->u_flags & UAO_FLAG_NOSWAP) {
>   if (slot == 0)
> - return(0);  /* a clear is ok */
> + return 0;   /* a clear is ok */
>
>   /* but a set is not */
>   printf("uao_set_swslot: uobj = %p\n", uobj);
> @@ -309,7 +309,7 @@ uao_set_swslot(struct uvm_object *uobj,
>   uao_find_swhash_elt(aobj, pageidx, slot ? TRUE : FALSE);
>   if (elt == NULL) {
>   KASSERT(slot == 0);
> - return (0);
> + return 0;
>   }
>
>   oldslot = 

Re: Remove booting from kernels in raw/qcow2 images in vmd(8)

2021-03-17 Thread Mike Larkin
On Wed, Mar 17, 2021 at 10:29:32PM +0100, Klemens Nanni wrote:
> On Sun, Mar 14, 2021 at 11:00:22AM -0400, Dave Voutila wrote:
> > Any takers?
> Yes, I plan to commit the updated diff at the end until friday
> unless someone objects.
>

no objection, thanks everyone.

ok mlarkin

> > Here's an updated diff also removes some logic in config.c related to
> > checking the value sent by vmctl(8)'s -b flag to see if it's the same as
> > the root disk image (-d).
> Both your first and this diff fail to apply, see inline.
>
> I've fixed both and tested the following diff (incl. mail to myself and
> apply from there).
>
> > Index: Makefile
> > ===
> > RCS file: /cvs/src/usr.sbin/vmd/Makefile,v
> > retrieving revision 1.24
> > diff -u -p -u -p -r1.24 Makefile
> > --- Makefile23 Sep 2020 19:18:18 -  1.24
> > +++ Makefile14 Mar 2021 14:56:06 -
> > @@ -5,7 +5,7 @@
> >  PROG=  vmd
> >  SRCS=  vmd.c control.c log.c priv.c proc.c config.c vmm.c
> >  SRCS+= vm.c loadfile_elf.c pci.c virtio.c i8259.c mc146818.c
> > -SRCS+= ns8250.c i8253.c vmboot.c ufs.c disklabel.c dhcp.c 
> > packet.c
> > +SRCS+= ns8250.c i8253.c dhcp.c packet.c
> You remove disklabel.c here but not with `cvs rm';
> fixed in the diff below.
>
> >  SRCS+= parse.y atomicio.c vioscsi.c vioraw.c vioqcow2.c 
> > fw_cfg.c
> >
> >  CFLAGS+=   -Wall -I${.CURDIR}
>
> > Index: loadfile_elf.c
> > ===
> > RCS file: /cvs/src/usr.sbin/vmd/loadfile_elf.c,v
> > retrieving revision 1.36
> > diff -u -p -u -p -r1.36 loadfile_elf.c
> > --- loadfile_elf.c  26 Oct 2020 04:04:31 -  1.36
> > +++ loadfile_elf.c  14 Mar 2021 14:56:06 -
>
> > @@ -414,15 +407,6 @@ push_bootargs(bios_memmap_t *memmap, siz
> > memcpy([i + 3], , sizeof(bios_consdev_t));
> > i += consdev_sz / sizeof(int);
> >
> > -   if (bootmac) {
> > -   bootmac_sz = 3 * sizeof(int) + (sizeof(bios_bootmac_t) + 3) & 
> > ~3;
> > -   ba[i] = 0x7;   /* bootmac */
> > -   ba[i + 1] = bootmac_sz;
> > -   ba[i + 2] = bootmac_sz;
> > -   memcpy([i + 3], bootmac, sizeof(bios_bootmac_t));
> > -   i += bootmac_sz / sizeof(int);
> > -   }
> This line in the file ends with a single whitespace, but your diff does
> not have it;
> fixed in the diff below.
>
> > -
> > ba[i++] = 0x; /* BOOTARG_END */
> >
> > write_mem(BOOTARGS_PAGE, ba, PAGE_SIZE);
>
>
>
> Index: Makefile
> ===
> RCS file: /cvs/src/usr.sbin/vmd/Makefile,v
> retrieving revision 1.24
> diff -u -p -r1.24 Makefile
> --- Makefile  23 Sep 2020 19:18:18 -  1.24
> +++ Makefile  17 Mar 2021 21:04:06 -
> @@ -5,7 +5,7 @@
>  PROG=vmd
>  SRCS=vmd.c control.c log.c priv.c proc.c config.c vmm.c
>  SRCS+=   vm.c loadfile_elf.c pci.c virtio.c i8259.c mc146818.c
> -SRCS+=   ns8250.c i8253.c vmboot.c ufs.c disklabel.c dhcp.c 
> packet.c
> +SRCS+=   ns8250.c i8253.c dhcp.c packet.c
>  SRCS+=   parse.y atomicio.c vioscsi.c vioraw.c vioqcow2.c 
> fw_cfg.c
>
>  CFLAGS+= -Wall -I${.CURDIR}
> Index: config.c
> ===
> RCS file: /cvs/src/usr.sbin/vmd/config.c,v
> retrieving revision 1.59
> diff -u -p -r1.59 config.c
> --- config.c  28 Feb 2021 22:56:09 -  1.59
> +++ config.c  17 Mar 2021 21:04:06 -
> @@ -216,7 +216,7 @@ config_setvm(struct privsep *ps, struct
>   struct vmop_create_params *vmc = >vm_params;
>   struct vm_create_params *vcp = >vmc_params;
>   unsigned int i, j;
> - int  fd = -1, vmboot = 0;
> + int  fd = -1;
>   int  kernfd = -1;
>   int *tapfds = NULL;
>   int  cdromfd = -1;
> @@ -295,16 +295,8 @@ config_setvm(struct privsep *ps, struct
>
>   if (!(vm->vm_state & VM_STATE_RECEIVED)) {
>   if (strlen(vcp->vcp_kernel)) {
> - /*
> -  * Boot kernel from disk image if path matches the
> -  * root disk.
> -  */
> - if (vcp->vcp_ndisks &&
> - strcmp(vcp->vcp_kernel, vcp->vcp_disks[0]) == 0)
> - vmboot = 1;
>   /* Open external kernel for child */
> - else if ((kernfd =
> - open(vcp->vcp_kernel, O_RDONLY)) == -1) {
> + if ((kernfd = open(vcp->vcp_kernel, O_RDONLY)) == -1) {
>   log_warn("%s: can't open kernel or BIOS "
>   "boot image %s", __func__, vcp->vcp_kernel);
>   

Re: vmm crash on 6.9-beta

2021-03-13 Thread Mike Larkin
On Wed, Mar 10, 2021 at 08:30:32PM +0100, Mischa wrote:
> On 10 Mar at 18:59, Mike Larkin  wrote:
> > On Wed, Mar 10, 2021 at 03:08:21PM +0100, Mischa wrote:
> > > Hi All,
> > >
> > > Currently I am running 6.9-beta on one of my hosts to test 
> > > veb(4)/vport(4).
> > >
> > > root@server14:~ # sysctl kern.version
> > > kern.version=OpenBSD 6.9-beta (GENERIC.MP) #385: Mon Mar  8 12:57:12 MST 
> > > 2021
> > > dera...@amd64.openbsd.org:/usr/src/sys/arch/amd64/compile/GENERIC.MP
> > >
> > > In order to add some load to the system I created 41 additional VMs based 
> > > on a single qcow2 base image.
> > > A couple of those VMs crashed with the following ddb output.
> > >
> > > ddb> show panic
> > > ffs_valloc: dup alloc
> > > ddb> trace
> > > db_enter() at db_enter+0x10
> > > panic(81dc0709) at panic+0x12a
> > > ffs_inode_alloc(fd80269831e0,8180,fd803f7bb540,800014e1e3e8) 
> > > at ffs
> > > _inode_alloc+0x442
> > > ufs_makeinode(8180,fd8026a386a0,800014e1e6e0,800014e1e730) at 
> > > ufs_m
> > > akeinode+0x7f
> > > ufs_create(800014e1e490) at ufs_create+0x3c
> > > VOP_CREATE(fd8026a386a0,800014e1e6e0,800014e1e730,800014e1e4f0)
> > >  at VOP_CREATE+0x4a
> > > vn_open(800014e1e6b0,10602,180) at vn_open+0x182
> > > doopenat(800014e8a518,ff9c,70e0e92a500,10601,1b6,800014e1e8b0)
> > >  at d
> > > oopenat+0x1d0
> > > syscall(800014e1e920) at syscall+0x315
> > > Xsyscall() at Xsyscall+0x128
> > > end of kernel
> > > end trace frame: 0x7f7e5000, count: -10
> > >
> > > Mischa
> > >
> >
> > Probably not vmm(4) related but thanks for reporting!
>
> Could it be qcow2 related? or is this general disk? At least that is what I 
> think ffs_ is. :)
>
> Mischa
>

likely completely unrelated to anything vmd(8) is doing.



Re: Remove booting from kernels in raw/qcow2 images in vmd(8)

2021-03-11 Thread Mike Larkin
On Thu, Mar 11, 2021 at 06:11:03PM -0500, Dave Voutila wrote:
> tl;dr: tedu vmboot.{c,h}, ufs.c from vmd(8) to remove broken ability to
> exract and boot a kernel image from a raw or qcow2 disk image
>
> The following diff removes the ability to boot directly from a disk
> image containing a FFS filesystem. No new functionality is added. It's
> still possible to boot via a kernel image or with either disk or iso
> images via seabios. (PXE booting should still work via a kernel image,
> but I haven't tested it personally.)
>
> Why remove this?
>
> - since 6.7 switched to FFS2 as the default filesystem for new installs,
>   the ability for vmd(8) to load a kernel and boot.conf from a disk
>   image directly (without seabios) has been broken. tb@ apparently sent
>   a diff to update support for FFS2 awhile back, but it never made it
>   into the tree.
>
> - on 5th Jan 2021, new ramdisks for amd64 have started shipping gzip'd,
>   breaking the ability to load the bsd.rd directly as a kernel image for
>   a vmd(8) guest without first uncompressing the image
>
> Why not fix it?
>
> - using bios (via seabios) works
>
> - the FFS2 change happened ten months ago and afaict few if any have
>   complained about the breakage, so I'm not sure the value in fixing
>   it. vmctl(8) is still vague about supporting it per its man page and
>   you still have to pass the disk image twice as a -b and -d arg if
>   you're trying to avoid using seabios to boot an OpenBSD guest.
>
> - Josh Rickmar reported the gzip issue on bugs@ and provided patches to
>   add in support for compressed ramdisks and kernel images. In doing so,
>   we found the easiest way to add gzip kernel image support was to drop
>   support for FFS images since they require a call to fmemopen(3) while
>   all the other logic uses fopen(3)/fdopen(3) calls and a file
>   descriptor. I think it would be easier to get his patches into vmd(8)
>   if they don't have to account for extracting kernels from disk
>   images.
>
> I can understand an argument to shy away from relying on seabios for
> booting, but given it's readily available via fw_update(1) and is part
> of the default behavior, I'd imagine most won't miss this feature.
>
> If people ARE using direct booting of raw/qcow2 images (without using
> seabios) please speak up and instead I can look into dusting off tb@'s
> old diff.
>

reyk@ wrote that ffs module for vmd but since he has not stepped up to
maintain it after the ffs2 switch, I vote to remove it. If someone wants
to come back and fixup ffs2 support with the tb@ diff we can look at that
when said person steps up.

ok mlarkin

> --
> -Dave Voutila
>
>
> Index: Makefile
> ===
> RCS file: /cvs/src/usr.sbin/vmd/Makefile,v
> retrieving revision 1.24
> diff -u -p -u -p -r1.24 Makefile
> --- Makefile  23 Sep 2020 19:18:18 -  1.24
> +++ Makefile  11 Mar 2021 22:10:08 -
> @@ -5,7 +5,7 @@
>  PROG=vmd
>  SRCS=vmd.c control.c log.c priv.c proc.c config.c vmm.c
>  SRCS+=   vm.c loadfile_elf.c pci.c virtio.c i8259.c mc146818.c
> -SRCS+=   ns8250.c i8253.c vmboot.c ufs.c disklabel.c dhcp.c 
> packet.c
> +SRCS+=   ns8250.c i8253.c dhcp.c packet.c
>  SRCS+=   parse.y atomicio.c vioscsi.c vioraw.c vioqcow2.c 
> fw_cfg.c
>
>  CFLAGS+= -Wall -I${.CURDIR}
> Index: loadfile.h
> ===
> RCS file: /cvs/src/usr.sbin/vmd/loadfile.h,v
> retrieving revision 1.12
> diff -u -p -u -p -r1.12 loadfile.h
> --- loadfile.h16 May 2019 21:16:04 -  1.12
> +++ loadfile.h11 Mar 2021 22:10:08 -
> @@ -73,8 +73,6 @@
>  #define PML2_PAGE 0x13000
>  #define NPTE_PG (PAGE_SIZE / sizeof(uint64_t))
>
> -int loadfile_elf(FILE *, struct vm_create_params *,
> -struct vcpu_reg_state *, uint32_t, uint32_t, unsigned int);
> +int loadfile_elf(FILE *, struct vm_create_params *, struct vcpu_reg_state *);
>
>  size_t mread(FILE *, paddr_t, size_t);
> -
> Index: loadfile_elf.c
> ===
> RCS file: /cvs/src/usr.sbin/vmd/loadfile_elf.c,v
> retrieving revision 1.36
> diff -u -p -u -p -r1.36 loadfile_elf.c
> --- loadfile_elf.c26 Oct 2020 04:04:31 -  1.36
> +++ loadfile_elf.c11 Mar 2021 22:10:10 -
> @@ -118,8 +118,8 @@ static void setsegment(struct mem_segmen
>  static int elf32_exec(FILE *, Elf32_Ehdr *, u_long *, int);
>  static int elf64_exec(FILE *, Elf64_Ehdr *, u_long *, int);
>  static size_t create_bios_memmap(struct vm_create_params *, bios_memmap_t *);
> -static uint32_t push_bootargs(bios_memmap_t *, size_t, bios_bootmac_t *);
> -static size_t push_stack(uint32_t, uint32_t, uint32_t, uint32_t);
> +static uint32_t push_bootargs(bios_memmap_t *, size_t);
> +static size_t push_stack(uint32_t, uint32_t);
>  static void push_gdt(void);
>  static void push_pt_32(void);
>  

Re: vmm crash on 6.9-beta

2021-03-10 Thread Mike Larkin
On Wed, Mar 10, 2021 at 03:08:21PM +0100, Mischa wrote:
> Hi All,
>
> Currently I am running 6.9-beta on one of my hosts to test veb(4)/vport(4).
>
> root@server14:~ # sysctl kern.version
> kern.version=OpenBSD 6.9-beta (GENERIC.MP) #385: Mon Mar  8 12:57:12 MST 2021
> dera...@amd64.openbsd.org:/usr/src/sys/arch/amd64/compile/GENERIC.MP
>
> In order to add some load to the system I created 41 additional VMs based on 
> a single qcow2 base image.
> A couple of those VMs crashed with the following ddb output.
>
> ddb> show panic
> ffs_valloc: dup alloc
> ddb> trace
> db_enter() at db_enter+0x10
> panic(81dc0709) at panic+0x12a
> ffs_inode_alloc(fd80269831e0,8180,fd803f7bb540,800014e1e3e8) at 
> ffs
> _inode_alloc+0x442
> ufs_makeinode(8180,fd8026a386a0,800014e1e6e0,800014e1e730) at 
> ufs_m
> akeinode+0x7f
> ufs_create(800014e1e490) at ufs_create+0x3c
> VOP_CREATE(fd8026a386a0,800014e1e6e0,800014e1e730,800014e1e4f0)
>  at VOP_CREATE+0x4a
> vn_open(800014e1e6b0,10602,180) at vn_open+0x182
> doopenat(800014e8a518,ff9c,70e0e92a500,10601,1b6,800014e1e8b0) at 
> d
> oopenat+0x1d0
> syscall(800014e1e920) at syscall+0x315
> Xsyscall() at Xsyscall+0x128
> end of kernel
> end trace frame: 0x7f7e5000, count: -10
>
> Mischa
>

Probably not vmm(4) related but thanks for reporting!

-ml



Re: veb(4) support for vmd(8)?

2021-02-26 Thread Mike Larkin
On Sat, Feb 27, 2021 at 09:44:03AM +1000, David Gwynne wrote:
>
>
> > On 27 Feb 2021, at 7:50 am, Klemens Nanni  wrote:
> >
> > On Sat, Feb 27, 2021 at 07:30:56AM +1000, David Gwynne wrote:
> >> i think this is enough to let vmd wire guests up to veb interfaces.
> > But please update vm.conf(5) to mention veb(4) and vport(4) in as well
> > SWITCH CONFIGURATION.
>
> How would you fit wording about vport(4) in?
>
> >
> > OK kn
>

Do we want to just talk only about veb/vport and remove all the old discussion
around bridge/vether?



Re: uvm_fault: Comments & style cleanup

2021-02-15 Thread Mike Larkin
On Mon, Feb 15, 2021 at 01:15:33PM +0100, Martin Pieuchot wrote:
> On 15/02/21(Mon) 11:47, Martin Pieuchot wrote:
> > Diff below includes non-functional changes:
> >
> > - Sync comments with NetBSD including locking details.
> > - Remove superfluous parenthesis and spaces.
> > - Add brackets, even if questionable, to reduce diff with NetBSD
> > - Use for (;;) instead of while(1)
> > - Rename a variable from 'result' into 'error'.
> > - Move uvm_fault() and uvm_fault_upper_lookup()
> > - Add an locking assert in uvm_fault_upper_lookup()
>
> Updated diff on top of recent fix, still ok?
>

I reviewed the diff and agree it introduces no functional changes. If you are
still looking for oks and it helps you with the locking work, ok mlarkin.

-ml

> Index: uvm/uvm_fault.c
> ===
> RCS file: /cvs/src/sys/uvm/uvm_fault.c,v
> retrieving revision 1.114
> diff -u -p -r1.114 uvm_fault.c
> --- uvm/uvm_fault.c   15 Feb 2021 12:12:54 -  1.114
> +++ uvm/uvm_fault.c   15 Feb 2021 12:14:08 -
> @@ -55,11 +55,11 @@
>   *read/write1 write>1  read/write   +-cow_write/zero
>   * | | ||
>   *  +--|--+   +--|--+ +-+   +  |  + | +-+
> - * amap |  V  |   |  --->new|  || |  ^  |
> + * amap |  V  |   |  -> new |  || |  ^  |
>   *  +-+   +-+ +-+   +  |  + | +--|--+
>   * |||
>   *  +-+   +-+   +--|--+ | +--|--+
> - * uobj | d/c |   | d/c |   |  V  | +|  |
> + * uobj | d/c |   | d/c |   |  V  | ++  |
>   *  +-+   +-+   +-+   +-+
>   *
>   * d/c = don't care
> @@ -69,7 +69,7 @@
>   *
>   *   case [1]: upper layer fault [anon active]
>   * 1A: [read] or [write with anon->an_ref == 1]
> - *   I/O takes place in top level anon and uobj is not touched.
> + *   I/O takes place in upper level anon and uobj is not touched.
>   * 1B: [write with anon->an_ref > 1]
>   *   new anon is alloc'd and data is copied off ["COW"]
>   *
> @@ -89,7 +89,7 @@
>   * the code is structured as follows:
>   *
>   * - init the "IN" params in the ufi structure
> - *   ReFault:
> + *   ReFault: (ERESTART returned to the loop in uvm_fault)
>   * - do lookups [locks maps], check protection, handle needs_copy
>   * - check for case 0 fault (error)
>   * - establish "range" of fault
> @@ -136,8 +136,8 @@
>   *by multiple map entries, and figuring out what should wait could be
>   *complex as well...).
>   *
> - * we use alternative 2 currently.   maybe alternative 3 would be useful
> - * in the future.XXX keep in mind for future consideration//rechecking.
> + * we use alternative 2.  given that we are multi-threaded now we may want
> + * to reconsider the choice.
>   */
>
>  /*
> @@ -177,7 +177,7 @@ uvmfault_anonflush(struct vm_anon **anon
>   int lcv;
>   struct vm_page *pg;
>
> - for (lcv = 0 ; lcv < n ; lcv++) {
> + for (lcv = 0; lcv < n; lcv++) {
>   if (anons[lcv] == NULL)
>   continue;
>   KASSERT(rw_lock_held(anons[lcv]->an_lock));
> @@ -222,14 +222,14 @@ uvmfault_init(void)
>  /*
>   * uvmfault_amapcopy: clear "needs_copy" in a map.
>   *
> + * => called with VM data structures unlocked (usually, see below)
> + * => we get a write lock on the maps and clear needs_copy for a VA
>   * => if we are out of RAM we sleep (waiting for more)
>   */
>  static void
>  uvmfault_amapcopy(struct uvm_faultinfo *ufi)
>  {
> -
> - /* while we haven't done the job */
> - while (1) {
> + for (;;) {
>   /* no mapping?  give up. */
>   if (uvmfault_lookup(ufi, TRUE) == FALSE)
>   return;
> @@ -258,36 +258,46 @@ uvmfault_amapcopy(struct uvm_faultinfo *
>   * uvmfault_anonget: get data in an anon into a non-busy, non-released
>   * page in that anon.
>   *
> - * => we don't move the page on the queues [gets moved later]
> - * => if we allocate a new page [we_own], it gets put on the queues.
> - *either way, the result is that the page is on the queues at return time
> + * => Map, amap and thus anon should be locked by caller.
> + * => If we fail, we unlock everything and error is returned.
> + * => If we are successful, return with everything still locked.
> + * => We do not move the page on the queues [gets moved later].  If we
> + *allocate a new page [we_own], it gets put on the queues.  Either way,
> + *the result is that the page is on the queues at return time
>   */
>  int
>  uvmfault_anonget(struct uvm_faultinfo *ufi, struct vm_amap *amap,
>  struct vm_anon *anon)
>  {
> - boolean_t we_own;   /* we own anon's page? */
> -   

Re: Increase timeout length for VMs trying to fully shutdown

2021-01-05 Thread Mike Larkin
On Tue, Jan 05, 2021 at 12:49:29PM -0700, Tracey Emery wrote:
> Hello tech@,
>
> Some of us have been having shutdown issues with our VMs on OpenBSDAms.
> I tracked down the problem to too short of a timeout for the shutdown
> event.
>
> If there are an additional 1 or 2 package daemons running on the instance,
> the timeout triggers before the VM has shutdown the package daemons and
> properly synced the disks, resulting in a dirty startup.
>
> I've increased the timeout to 2 minutes instead of 30 seconds. My test
> VM on my laptop with 7 additional package daemons succeeded in 60
> seconds, but that might not be fast enough for slower disks.
>
> Am I being conservative enough with this number? Should it be another
> minute or two?
>
> Thoughts? Ok?
>
> --
>
> Tracey Emery
>
> diff 7a6bb14936050379800deb10d4a137c4d2d4a3c4 /usr/src
> blob - 9a64973ab998accb810d56c386c1bb92c204ab20
> file + usr.sbin/vmd/virtio.h
> --- usr.sbin/vmd/virtio.h
> +++ usr.sbin/vmd/virtio.h
> @@ -38,7 +38,7 @@
>
>  /* VMM Control Interface shutdown timeout (in seconds) */
>  #define VMMCI_TIMEOUT3
> -#define VMMCI_SHUTDOWN_TIMEOUT   30
> +#define VMMCI_SHUTDOWN_TIMEOUT   120
>
>  /* All the devices we support have either 1, 2 or 3 queues */
>  /* viornd - 1 queue
>

I took a look through the code. I'd say this bump is fine, there is no
side effect aside from just waiting for the VM to shutdown, *except*
possibly when waiting for the host to shut down (/etc/rc in the shutdown
path), that might take longer if some VMs get stuck in their shutdown
code. But if you got impatient, you could always ^C at that point...

ok mlarkin

-ml



Re: PATCH: Fix PCI Config Space union size on VMM

2020-09-09 Thread Mike Larkin
On Mon, Sep 07, 2020 at 06:03:00PM -0500, Jordan Hargrave wrote:
> This code fixes the pci device union for accessing PCI config space >= 0x40
>
> Running pcidump -xxx in a virtual machine would return garbage data due to 
> union overlap
>

Thanks, looks good from my perspective.

-ml

> On Mon, Sep 07, 2020 at 05:52:55PM -0500, Jordan Hargrave wrote:
> > Index: pci.h
> > ===
> > RCS file: /cvs/src/usr.sbin/vmd/pci.h,v
> > retrieving revision 1.7
> > diff -u -p -u -r1.7 pci.h
> > --- pci.h   17 Sep 2017 23:07:56 -  1.7
> > +++ pci.h   7 Sep 2020 22:48:09 -
> > @@ -32,43 +32,44 @@ typedef int (*pci_iobar_fn_t)(int dir, u
> >  void *, uint8_t);
> >  typedef int (*pci_mmiobar_fn_t)(int dir, uint32_t ofs, uint32_t *data);
> >
> > -union pci_dev {
> > -   uint32_t pd_cfg_space[PCI_CONFIG_SPACE_SIZE / 4];
> >
> > -   struct {
> > -   uint16_t pd_vid;
> > -   uint16_t pd_did;
> > -   uint16_t pd_cmd;
> > -   uint16_t pd_status;
> > -   uint8_t pd_rev;
> > -   uint8_t pd_prog_if;
> > -   uint8_t pd_subclass;
> > -   uint8_t pd_class;
> > -   uint8_t pd_cache_size;
> > -   uint8_t pd_lat_timer;
> > -   uint8_t pd_header_type;
> > -   uint8_t pd_bist;
> > -   uint32_t pd_bar[PCI_MAX_BARS];
> > -   uint32_t pd_cardbus_cis;
> > -   uint16_t pd_subsys_vid;
> > -   uint16_t pd_subsys_id;
> > -   uint32_t pd_exp_rom_addr;
> > -   uint8_t pd_cap;
> > -   uint32_t pd_reserved0 : 24;
> > -   uint32_t pd_reserved1;
> > -   uint8_t pd_irq;
> > -   uint8_t pd_int;
> > -   uint8_t pd_min_grant;
> > -   uint8_t pd_max_grant;
> > +struct pci_dev {
> > +   union {
> > +   uint32_t pd_cfg_space[PCI_CONFIG_SPACE_SIZE / 4];
> > +   struct {
> > +   uint16_t pd_vid;
> > +   uint16_t pd_did;
> > +   uint16_t pd_cmd;
> > +   uint16_t pd_status;
> > +   uint8_t pd_rev;
> > +   uint8_t pd_prog_if;
> > +   uint8_t pd_subclass;
> > +   uint8_t pd_class;
> > +   uint8_t pd_cache_size;
> > +   uint8_t pd_lat_timer;
> > +   uint8_t pd_header_type;
> > +   uint8_t pd_bist;
> > +   uint32_t pd_bar[PCI_MAX_BARS];
> > +   uint32_t pd_cardbus_cis;
> > +   uint16_t pd_subsys_vid;
> > +   uint16_t pd_subsys_id;
> > +   uint32_t pd_exp_rom_addr;
> > +   uint8_t pd_cap;
> > +   uint32_t pd_reserved0 : 24;
> > +   uint32_t pd_reserved1;
> > +   uint8_t pd_irq;
> > +   uint8_t pd_int;
> > +   uint8_t pd_min_grant;
> > +   uint8_t pd_max_grant;
> > +   } __packed;
> > +   };
> > +   uint8_t pd_bar_ct;
> > +   pci_cs_fn_t pd_csfunc;
> >
> > -   uint8_t pd_bar_ct;
> > -   pci_cs_fn_t pd_csfunc;
> > -
> > -   uint8_t pd_bartype[PCI_MAX_BARS];
> > -   uint32_t pd_barsize[PCI_MAX_BARS];
> > -   void *pd_barfunc[PCI_MAX_BARS];
> > -   void *pd_bar_cookie[PCI_MAX_BARS];
> > -   } __packed;
> > +   uint8_t pd_bartype[PCI_MAX_BARS];
> > +   uint32_t pd_barsize[PCI_MAX_BARS];
> > +   void *pd_barfunc[PCI_MAX_BARS];
> > +   void *pd_bar_cookie[PCI_MAX_BARS];
> >  };
> >
> >  struct pci {
> > @@ -79,7 +80,7 @@ struct pci {
> > uint32_t pci_addr_reg;
> > uint32_t pci_data_reg;
> >
> > -   union pci_dev pci_devices[PCI_CONFIG_MAX_DEV];
> > +   struct pci_dev pci_devices[PCI_CONFIG_MAX_DEV];
> >  };
> >
> >  void pci_handle_address_reg(struct vm_run_params *);
> >
>



Re: amd64: add tsc_delay(), a TSC-based delay(9) implementation

2020-08-25 Thread Mike Larkin
On Tue, Aug 25, 2020 at 12:12:36PM -0700, Mike Larkin wrote:
> On Mon, Aug 24, 2020 at 01:55:45AM +0200, Mark Kettenis wrote:
> > > Date: Sun, 23 Aug 2020 18:11:12 -0500
> > > From: Scott Cheloha 
> > >
> > > Hi,
> > >
> > > Other BSDs use the TSC to implement delay(9) if the TSC is constant
> > > and invariant.  Here's a patch to add something similar to our kernel.
> >
> > If the TSC is fine as a timecounter it should be absolutely fine for
> > use as delay().  And we could even use it if the TSC isn't synchronized
> > between CPUs.
> >
> > >
> > > This patch (or something equivalent) is a prerequisite to running the
> > > lapic timer in oneshot or TSC deadline mode.  Using the lapic timer to
> > > implement delay(9) when it isn't running in periodic mode is too
> > > complicated.  However, using the i8254 for delay(9) is too slow.  We
> > > need an alternative.
> >
> > Hmm, but what are we going to use on machines where the TSC isn't
> > constant/invariant?
> >
> > In what respect is the i8254 too slow?  Does it take more than a
> > microsecond to read it?
> >
>
> It's 3 outb/inb pairs to ensure you get the reading correct. So that could
> be quite a long time (as cheloha@ points out). Also, that's 6 VM exits if
> running virtually (I realize that's not the main use case here but just
> saying...)
>
> IIRC the 3 in/out pairs are the latch command followed by reading the LSB/MSB
> of the counter. It's not MMIO like the HPET or ACPI timer.
>
> And as cheloha@ also points out, it is highly likely that none of us have a
> real i8254 anymore, much of this is probably implemented in some EC somewhere
> and it's unlikely the developer of said EC put a lot of effort into optimizing
> the implementation of a legacy device like this.
>
> On the topic of virtualization:
>
> while (rdtsc() - start < want)
>  rdtsc();
>

I just realized the original diff didn't do two rdtscs. It did a pause inside 
the
loop. So the effect is not *as* bad as I described but it's still *somewhat* 
bad.

PS - pause loop exiting can be enabled to improve performance in this situation.

> ..produces two VM exits (generally, on most hypervisors) since the TSC is
> usually time corrected. That's a lot of exits, and it gets worse on faster
> machines. I don't have a better idea, however. There may be a PV clock option
> that is more optimized in some scenarios.
>
> -ml
>
>
> > We could use the HPET I suppose, which may be a bit better.
> >
> > > As for the patch, it works for me here, though I'd appreciate a few
> > > tests.  I admit that comparing function pointers is ugly, but I think
> > > this is as simple as it can be without implementing some sort of
> > > framework for "registering" delay(9) implementations and comparing
> > > them and selecting the "best" implementation.
> >
> > What about:
> >
> > if (delay_func == NULL)
> > delay_func = lapic_delay;
> >
> > > I'm not sure I put the prototypes in the right headers.  We don't have
> > > a tsc.h but cpuvar.h looks sorta-correct for tsc_delay().
> >
> > I think cpuvar.h is fine since it has other TSC-related stuff.
> > However, with my suggestion above you can drop that.
> >
> > > FreeBSD's x86/delay.c may be of note:
> > >
> > > https://github.com/freebsd/freebsd/blob/ed96335a07b688c39e16db8856232e5840bc22ac/sys/x86/x86/delay.c
> > >
> > > Thoughts?
> > >
> > > Index: amd64/tsc.c
> > > ===
> > > RCS file: /cvs/src/sys/arch/amd64/amd64/tsc.c,v
> > > retrieving revision 1.20
> > > diff -u -p -r1.20 tsc.c
> > > --- amd64/tsc.c   23 Aug 2020 21:38:47 -  1.20
> > > +++ amd64/tsc.c   23 Aug 2020 22:59:25 -
> > > @@ -26,6 +26,7 @@
> > >
> > >  #include 
> > >  #include 
> > > +#include 
> > >
> > >  #define RECALIBRATE_MAX_RETRIES  5
> > >  #define RECALIBRATE_SMI_THRESHOLD5
> > > @@ -252,7 +253,8 @@ tsc_timecounter_init(struct cpu_info *ci
> > >   tsc_timecounter.tc_quality = -1000;
> > >   tsc_timecounter.tc_user = 0;
> > >   tsc_is_invariant = 0;
> > > - }
> > > + } else
> > > + delay_func = tsc_delay;
> > >
> > >   tc_init(&tsc_timecounter);
> > >  }
> > > @@ -342,4 +344,15 @@ tsc_sync_ap(struct cpu_info *ci)
> &

Re: amd64: add tsc_delay(), a TSC-based delay(9) implementation

2020-08-25 Thread Mike Larkin
On Mon, Aug 24, 2020 at 12:29:15AM -0500, Scott Cheloha wrote:
> On Sun, Aug 23, 2020 at 11:45:22PM -0500, Scott Cheloha wrote:
> >
> > [...]
> >
> > > > This patch (or something equivalent) is a prerequisite to running the
> > > > lapic timer in oneshot or TSC deadline mode.  Using the lapic timer to
> > > > implement delay(9) when it isn't running in periodic mode is too
> > > > complicated.  However, using the i8254 for delay(9) is too slow.  We
> > > > need an alternative.
> > >
> > > Hmm, but what are we going to use on machines where the TSC isn't
> > > constant/invariant?
> >
> > Probably fall back on the i8254?  Unless someone wants to add yet
> > another delay(9) implementation to amd64...
> >
> > > In what respect is the i8254 too slow?  Does it take more than a
> > > microsecond to read it?
> >
> > On my machine, the portion of gettick() *within* the mutex runs in ~19
> > microseconds.
> >
> > That's before any overhead from mtx_enter(9).  I think having multiple
> > threads in delay(9) should be relatively rare, but you have to keep
> > that in mind.
> >
> > No idea what the overhead would look like on real hardware.  I'm
> > pretty sure my i8254 is emulated.
> >
> > > We could use the HPET I suppose, which may be a bit better.
> >
> > It's better.  No mutex.  On my machine it takes ~11 microseconds.
> > It's a start.
>
> Hmmm, now I'm worried I have screwed something up or misconfigured
> something.
>
> It doesn't seem right that it would take 20K cycles to read the HPET
> on this machine.
>
> Am I way off?  Or is 20K actually a reasonable number?
>

There have been reports of the HPET being really slow on some machines.
IIRC this is why we ended up getting a tsc timecounter a number of years
ago. Someone (reyk@?) found his skylake had a super slow HPET and that
ended up being part of the impetus to to a tsc timecounter.

Also, 20k cycles is totally expected if you are on a VM (not sure if
this is the case).


> For comparison, lapic_gettick() completes in... 80 nanoseconds (?) on
> the same machine.  Relevant sysctls:
>

LAPIC memory page accesses go to the CPU. It's not always the case that
the HPET does the same (they may be accessed via PCI). Also, in a VM,
on new CPUs, LAPIC virtualization can be enabled which means no exits
for LAPIC accesses. So, yeah, these numbers you are seeing aren't surprising.

> $ sysctl hw.{model,setperf,perfpolicy} machdep.{tscfreq,invarianttsc}
> hw.model=Intel(R) Core(TM) i7-8650U CPU @ 1.90GHz
> hw.setperf=100
> hw.perfpolicy=high
> machdep.tscfreq=211200
> machdep.invarianttsc=1
>
> ... if it really takes that long, then "high precision" is a bit of a
> misnomer.
>



Re: amd64: add tsc_delay(), a TSC-based delay(9) implementation

2020-08-25 Thread Mike Larkin
On Mon, Aug 24, 2020 at 01:55:45AM +0200, Mark Kettenis wrote:
> > Date: Sun, 23 Aug 2020 18:11:12 -0500
> > From: Scott Cheloha 
> >
> > Hi,
> >
> > Other BSDs use the TSC to implement delay(9) if the TSC is constant
> > and invariant.  Here's a patch to add something similar to our kernel.
>
> If the TSC is fine as a timecounter it should be absolutely fine for
> use as delay().  And we could even use it if the TSC isn't synchronized
> between CPUs.
>
> >
> > This patch (or something equivalent) is a prerequisite to running the
> > lapic timer in oneshot or TSC deadline mode.  Using the lapic timer to
> > implement delay(9) when it isn't running in periodic mode is too
> > complicated.  However, using the i8254 for delay(9) is too slow.  We
> > need an alternative.
>
> Hmm, but what are we going to use on machines where the TSC isn't
> constant/invariant?
>
> In what respect is the i8254 too slow?  Does it take more than a
> microsecond to read it?
>

It's 3 outb/inb pairs to ensure you get the reading correct. So that could
be quite a long time (as cheloha@ points out). Also, that's 6 VM exits if
running virtually (I realize that's not the main use case here but just
saying...)

IIRC the 3 in/out pairs are the latch command followed by reading the LSB/MSB
of the counter. It's not MMIO like the HPET or ACPI timer.

And as cheloha@ also points out, it is highly likely that none of us have a
real i8254 anymore, much of this is probably implemented in some EC somewhere
and it's unlikely the developer of said EC put a lot of effort into optimizing
the implementation of a legacy device like this.

On the topic of virtualization:

while (rdtsc() - start < want)
 rdtsc();

..produces two VM exits (generally, on most hypervisors) since the TSC is
usually time corrected. That's a lot of exits, and it gets worse on faster
machines. I don't have a better idea, however. There may be a PV clock option
that is more optimized in some scenarios.

-ml


> We could use the HPET I suppose, which may be a bit better.
>
> > As for the patch, it works for me here, though I'd appreciate a few
> > tests.  I admit that comparing function pointers is ugly, but I think
> > this is as simple as it can be without implementing some sort of
> > framework for "registering" delay(9) implementations and comparing
> > them and selecting the "best" implementation.
>
> What about:
>
>   if (delay_func == NULL)
>   delay_func = lapic_delay;
>
> > I'm not sure I put the prototypes in the right headers.  We don't have
> > a tsc.h but cpuvar.h looks sorta-correct for tsc_delay().
>
> I think cpuvar.h is fine since it has other TSC-related stuff.
> However, with my suggestion above you can drop that.
>
> > FreeBSD's x86/delay.c may be of note:
> >
> > https://github.com/freebsd/freebsd/blob/ed96335a07b688c39e16db8856232e5840bc22ac/sys/x86/x86/delay.c
> >
> > Thoughts?
> >
> > Index: amd64/tsc.c
> > ===
> > RCS file: /cvs/src/sys/arch/amd64/amd64/tsc.c,v
> > retrieving revision 1.20
> > diff -u -p -r1.20 tsc.c
> > --- amd64/tsc.c 23 Aug 2020 21:38:47 -  1.20
> > +++ amd64/tsc.c 23 Aug 2020 22:59:25 -
> > @@ -26,6 +26,7 @@
> >
> >  #include 
> >  #include 
> > +#include 
> >
> >  #define RECALIBRATE_MAX_RETRIES5
> >  #define RECALIBRATE_SMI_THRESHOLD  5
> > @@ -252,7 +253,8 @@ tsc_timecounter_init(struct cpu_info *ci
> > tsc_timecounter.tc_quality = -1000;
> > tsc_timecounter.tc_user = 0;
> > tsc_is_invariant = 0;
> > -   }
> > +   } else
> > +   delay_func = tsc_delay;
> >
> > tc_init(&tsc_timecounter);
> >  }
> > @@ -342,4 +344,15 @@ tsc_sync_ap(struct cpu_info *ci)
> >  {
> > tsc_post_ap(ci);
> > tsc_post_ap(ci);
> > +}
> > +
> > +void
> > +tsc_delay(int usecs)
> > +{
> > +   uint64_t interval, start;
> > +
> > > +   interval = (uint64_t)usecs * tsc_frequency / 1000000;
> > +   start = rdtsc_lfence();
> > +   while (rdtsc_lfence() - start < interval)
> > +   CPU_BUSY_CYCLE();
> >  }
> > Index: amd64/lapic.c
> > ===
> > RCS file: /cvs/src/sys/arch/amd64/amd64/lapic.c,v
> > retrieving revision 1.55
> > diff -u -p -r1.55 lapic.c
> > --- amd64/lapic.c   3 Aug 2019 14:57:51 -   1.55
> > +++ amd64/lapic.c   23 Aug 2020 22:59:25 -
> > @@ -41,6 +41,7 @@
> >  #include 
> >  #include 
> >  #include 
> > +#include 
> >  #include 
> >  #include 
> >  #include 
> > @@ -569,7 +570,8 @@ skip_calibration:
> >  * Now that the timer's calibrated, use the apic timer routines
> >  * for all our timing needs..
> >  */
> > -   delay_func = lapic_delay;
> > +   if (delay_func != tsc_delay)
> > +   delay_func = lapic_delay;
> > initclock_func = lapic_initclocks;
> > }
> >  }
> > Index: include/cpuvar.h
> > 

Re: kernel crash in setrunqueue

2020-07-29 Thread Mike Larkin
On Wed, Jul 29, 2020 at 10:14:11PM +0200, Mark Kettenis wrote:
> > Date: Wed, 29 Jul 2020 13:03:43 -0700
> > From: Mike Larkin 
> >
> > Hi,
> >
> >  I'm seeing crashes on amd64 GENERIC.MP on a few VMs recently. This happens
> > on GENERIC.MP regardless of whether or not the VM has one cpu or more than
> > one. It does not happen on GENERIC kernels.
> >
> >  The crash will happen fairly quickly after the kernel starts executing
> > processes. Sometimes it crashes instantly, sometimes it lasts for a minute
> > or two. It rarely makes it to the login prompt. The problem is 100%
> > reproducible on two different VMs I have, running on two different
> > hypervisors (Hyper-V and ESXi6.7U2).
> >
> >  I first started noticing the problem on the 24th July snap, but TBH these
> > machines were not frequently updated, so the previous snap I had installed
> > might have been a couple months old. Whatever older snap was on them before
> > worked fine.
> >
> >  Since this is happening on two different machines with two different VMs,
> > I'm gonna rule out hardware issues.
> >
> >  Crash:
> >
> > kernel: protection fault trap, code=0
> > Stopped at  setrunqueue+0xa2:   addl$0x1,0x288(%r13)
> >
> >  Trace:
> > ddb{2}> trace
> > setrunqueue(27b3d6c24c3fab80, 800015e874e0,32) at setrunqueue+0xa2
> > sched_barrier_task(800015f1a168) at sched_barrier_task+0x6c
> > taskq_thread(82121548) at taskq_thread+0x8d
> > end trace frame: 0x0, count: -3
> >
> >  Registers:
> > ddb{2}> sh r
> > rdi 0x821ee728  sched_lock
> > rsi 0x800014cc6ff0
> > rbp 0x800015ea0e40
> > rbx  0
> > rdx   0x23ca94  acpi_pdirpa+0x2288fc
> > rcx0xc
> > rax0xc
> > r8   0x202
> > r9 0x2
> > r10  0
> > r11 0x57f79bf6968709d8
> > r12 0x800015e874e0
> > r13 0x27b3d6c24c3fab80
> > r14   0x32
> > r15 0x27b3d6c24c3fab80
> > rip 0x81b9df22  setrunqueue+0xa2
> > cs 0x8
> > rflags 0x10207  __ALIGN_SIZE+0xf207
> > rsp 0x800015ea0df0
> > ss0x10
> >
> >
> > The offending instruction is in kern_sched.c:260:
> >
> > spc->spc_nrun++;
> >
> > ... which indicates 'spc' is trash (and it is, based on %r13 above). In my
> > tests, %r13 always is this same trash value. That comes from 'ci', which is
> > either passed in or chosen by sched_choosecpu. Neither of these functions
> > have changed recently, so I'm guessing this corruption is coming from 
> > something
> > else.
> >
> >  Anyone have ideas where to start looking? I suppose I could start 
> > bisecting,
> > but does anyone know of any changes that would affect this area?
> >
> >  I can send dmesgs if needed, but these are pretty standard VMs,
> > nothing fancy configured in them. 4 CPUs, 8GB RAM, etc.
>
> They're VMs and it turns out that many of the "PV" drivers are/were
> using the intr_barrier() interface the wrong way.
>
> For Hyper-V, see my reply in the "Panic on boot with Hyper-V since Jun
> 17 snapshot" thread on bugs@ from earlier today.
>
> Cheers,
>
> Mark
>

Thanks. I don't subscribe to bugs@ anymore, so that's why I likely missed it.

-ml



Re: kernel crash in setrunqueue

2020-07-29 Thread Mike Larkin
On Wed, Jul 29, 2020 at 01:03:43PM -0700, Mike Larkin wrote:
> Hi,
>
>  I'm seeing crashes on amd64 GENERIC.MP on a few VMs recently. This happens
> on GENERIC.MP regardless of whether or not the VM has one cpu or more than
> one. It does not happen on GENERIC kernels.
>
>  The crash will happen fairly quickly after the kernel starts executing
> processes. Sometimes it crashes instantly, sometimes it lasts for a minute
> or two. It rarely makes it to the login prompt. The problem is 100%
> reproducible on two different VMs I have, running on two different
> hypervisors (Hyper-V and ESXi6.7U2).
>
>  I first started noticing the problem on the 24th July snap, but TBH these
> machines were not frequently updated, so the previous snap I had installed
> might have been a couple months old. Whatever older snap was on them before
> worked fine.
>
>  Since this is happening on two different machines with two different VMs,
> I'm gonna rule out hardware issues.
>
>  Crash:
>
> kernel: protection fault trap, code=0
> Stopped atsetrunqueue+0xa2:   addl$0x1,0x288(%r13)
>
>  Trace:
> ddb{2}> trace
> setrunqueue(27b3d6c24c3fab80, 800015e874e0,32) at setrunqueue+0xa2
> sched_barrier_task(800015f1a168) at sched_barrier_task+0x6c
> taskq_thread(82121548) at taskq_thread+0x8d
> end trace frame: 0x0, count: -3
>
>  Registers:
> ddb{2}> sh r
> rdi   0x821ee728  sched_lock
> rsi   0x800014cc6ff0
> rbp   0x800015ea0e40
> rbx0
> rdx 0x23ca94  acpi_pdirpa+0x2288fc
> rcx  0xc
> rax  0xc
> r8 0x202
> r9   0x2
> r100
> r11   0x57f79bf6968709d8
> r12   0x800015e874e0
> r13   0x27b3d6c24c3fab80
> r14 0x32
> r15   0x27b3d6c24c3fab80
> rip   0x81b9df22  setrunqueue+0xa2
> cs   0x8
> rflags   0x10207  __ALIGN_SIZE+0xf207
> rsp   0x800015ea0df0
> ss  0x10
>
>
> The offending instruction is in kern_sched.c:260:
>
>   spc->spc_nrun++;
>
> ... which indicates 'spc' is trash (and it is, based on %r13 above). In my
> tests, %r13 always is this same trash value. That comes from 'ci', which is
> either passed in or chosen by sched_choosecpu. Neither of these functions
> have changed recently, so I'm guessing this corruption is coming from 
> something
> else.
>
>  Anyone have ideas where to start looking? I suppose I could start bisecting,
> but does anyone know of any changes that would affect this area?
>
>  I can send dmesgs if needed, but these are pretty standard VMs, nothing fancy
> configured in them. 4 CPUs, 8GB RAM, etc.
>
> -ml
>

Also I should note that the problem happens with snaps as well as kernels built
from source (-current), so this isn't likely something that is in snaps but not
yet in tree.

-ml



kernel crash in setrunqueue

2020-07-29 Thread Mike Larkin
Hi,

 I'm seeing crashes on amd64 GENERIC.MP on a few VMs recently. This happens
on GENERIC.MP regardless of whether or not the VM has one cpu or more than
one. It does not happen on GENERIC kernels.

 The crash will happen fairly quickly after the kernel starts executing
processes. Sometimes it crashes instantly, sometimes it lasts for a minute
or two. It rarely makes it to the login prompt. The problem is 100%
reproducible on two different VMs I have, running on two different
hypervisors (Hyper-V and ESXi6.7U2).

 I first started noticing the problem on the 24th July snap, but TBH these
machines were not frequently updated, so the previous snap I had installed
might have been a couple months old. Whatever older snap was on them before
worked fine.

 Since this is happening on two different machines with two different VMs,
I'm gonna rule out hardware issues.

 Crash:

kernel: protection fault trap, code=0
Stopped at  setrunqueue+0xa2:   addl$0x1,0x288(%r13)

 Trace:
ddb{2}> trace
setrunqueue(27b3d6c24c3fab80, 800015e874e0,32) at setrunqueue+0xa2
sched_barrier_task(800015f1a168) at sched_barrier_task+0x6c
taskq_thread(82121548) at taskq_thread+0x8d
end trace frame: 0x0, count: -3

 Registers:
ddb{2}> sh r
rdi 0x821ee728  sched_lock
rsi 0x800014cc6ff0
rbp 0x800015ea0e40
rbx  0
rdx   0x23ca94  acpi_pdirpa+0x2288fc
rcx0xc
rax0xc
r8   0x202
r9 0x2
r10  0
r11 0x57f79bf6968709d8
r12 0x800015e874e0
r13 0x27b3d6c24c3fab80
r14   0x32
r15 0x27b3d6c24c3fab80
rip 0x81b9df22  setrunqueue+0xa2
cs 0x8
rflags 0x10207  __ALIGN_SIZE+0xf207
rsp 0x800015ea0df0
ss0x10


The offending instruction is in kern_sched.c:260:

spc->spc_nrun++;

... which indicates 'spc' is trash (and it is, based on %r13 above). In my
tests, %r13 always is this same trash value. That comes from 'ci', which is
either passed in or chosen by sched_choosecpu. Neither of these functions
have changed recently, so I'm guessing this corruption is coming from something
else.

 Anyone have ideas where to start looking? I suppose I could start bisecting,
but does anyone know of any changes that would affect this area?

 I can send dmesgs if needed, but these are pretty standard VMs, nothing fancy
configured in them. 4 CPUs, 8GB RAM, etc.

-ml



Re: Edgerouter 4 available for any OpenBSD dev that needs an octeon

2020-07-29 Thread Mike Larkin
On Tue, Jul 28, 2020 at 06:16:01PM -0700, Mike Larkin wrote:
> Someone (can't recall who) gave me an ER4. I found it while cleaning
> out my closet. Since I'm not active anymore, if any openbsd developer
> wants it, reach out to me privately and I'll see about sending it
> to you.
>
> Thanks.
>
> -ml
>

Thanks everyone, this is heading to an OpenBSD developer.

-ml



Edgerouter 4 available for any OpenBSD dev that needs an octeon

2020-07-28 Thread Mike Larkin
Someone (can't recall who) gave me an ER4. I found it while cleaning
out my closet. Since I'm not active anymore, if any openbsd developer
wants it, reach out to me privately and I'll see about sending it
to you.

Thanks.

-ml



Re: amd64: lapic: refactor lapic timer programming

2020-07-06 Thread Mike Larkin
On Fri, Jul 03, 2020 at 07:41:45PM -0500, Scott Cheloha wrote:
> Hi,
>
> I want to run the lapic timer in one-shot mode on amd64 as we do with
> other interrupt clocks on other platforms.  I aim to make the clock
> interrupt code MD where possible.
>
> However, nobody is going to test my MD clock interrupt work unless
> amd64 is ready to use it.  amd64 doesn't run in oneshot mode so there
> is preliminary work to do first.
>
> --
>
> Before we can run the lapic timer in one-shot mode we need to simplify
> the process of actually programming it.
>
> This patch refactors all lapic timer programming into a single
> routine.  We don't use any divisor other than 1 so I don't see a need
> to make it a parameter to lapic_timer_arm().  We can add TSC deadline
> support later if someone wants it.
>
> The way we program the timer differs from how e.g. Darwin and FreeBSD
> and Linux do it.  They write:
>
>  - lvtt (mode + vector + (maybe) mask)
>  - dcr
>  - icr
>
> while we do:
>
>  - lvtt (mode + mask)
>  - dcr
>  - icr
>  - (maybe) lvtt (mode + vector)
>
> I don't see a reason to arm the timer with four writes instead of
> three, so in this patch I use the three-write ordering.
>
> Am I missing something?  Do I need to disable interrupts before I
> reprogram the timer?
>

This reads ok to me. I am not aware of any requirements to disable
interrupts while reprogramming the timer.

-ml

> -Scott
>
> Index: lapic.c
> ===
> RCS file: /cvs/src/sys/arch/amd64/amd64/lapic.c,v
> retrieving revision 1.55
> diff -u -p -r1.55 lapic.c
> --- lapic.c   3 Aug 2019 14:57:51 -   1.55
> +++ lapic.c   4 Jul 2020 00:40:26 -
> @@ -413,6 +413,42 @@ u_int32_t lapic_frac_usec_per_cycle;
>  u_int64_t lapic_frac_cycle_per_usec;
>  u_int32_t lapic_delaytab[26];
>
> +void lapic_timer_arm(uint32_t, int, uint32_t);
> +void lapic_timer_arm_once(int, uint32_t);
> +void lapic_timer_arm_period(int, uint32_t);
> +
> +/*
> + * Start the local apic countdown timer.
> + *
> + * First set the mode, vector, and (maybe) the mask.
> + * then set the divisor,
> + * and finally set the cycle count.
> + */
> +void
> +lapic_timer_arm(uint32_t mode, int masked, uint32_t cycles)
> +{
> + uint32_t lvtt;
> +
> + lvtt = mode | LAPIC_TIMER_VECTOR;
> + lvtt |= (masked) ? LAPIC_LVTT_M : 0;
> +
> + lapic_writereg(LAPIC_LVTT, lvtt);
> + lapic_writereg(LAPIC_DCR_TIMER, LAPIC_DCRT_DIV1);
> + lapic_writereg(LAPIC_ICR_TIMER, cycles);
> +}
> +
> +void
> +lapic_timer_arm_once(int masked, uint32_t cycles)
> +{
> + lapic_timer_arm(LAPIC_LVTT_TM_ONESHOT, masked, cycles);
> +}
> +
> +void
> +lapic_timer_arm_period(int masked, uint32_t cycles)
> +{
> + lapic_timer_arm(LAPIC_LVTT_TM_PERIODIC, masked, cycles);
> +}
> +
>  void
>  lapic_clockintr(void *arg, struct intrframe frame)
>  {
> @@ -430,17 +466,7 @@ lapic_clockintr(void *arg, struct intrfr
>  void
>  lapic_startclock(void)
>  {
> - /*
> -  * Start local apic countdown timer running, in repeated mode.
> -  *
> -  * Mask the clock interrupt and set mode,
> -  * then set divisor,
> -  * then unmask and set the vector.
> -  */
> - lapic_writereg(LAPIC_LVTT, LAPIC_LVTT_TM|LAPIC_LVTT_M);
> - lapic_writereg(LAPIC_DCR_TIMER, LAPIC_DCRT_DIV1);
> - lapic_writereg(LAPIC_ICR_TIMER, lapic_tval);
> - lapic_writereg(LAPIC_LVTT, LAPIC_LVTT_TM|LAPIC_TIMER_VECTOR);
> + lapic_timer_arm_period(0, lapic_tval);
>  }
>
>  void
> @@ -498,9 +524,7 @@ lapic_calibrate_timer(struct cpu_info *c
>* Configure timer to one-shot, interrupt masked,
>* large positive number.
>*/
> - lapic_writereg(LAPIC_LVTT, LAPIC_LVTT_M);
> - lapic_writereg(LAPIC_DCR_TIMER, LAPIC_DCRT_DIV1);
> - lapic_writereg(LAPIC_ICR_TIMER, 0x80000000);
> + lapic_timer_arm_once(1, 0x80000000);
>
>   s = intr_disable();
>
> @@ -540,10 +564,7 @@ skip_calibration:
>   lapic_tval = (lapic_per_second * 2) / hz;
>   lapic_tval = (lapic_tval / 2) + (lapic_tval & 0x1);
>
> - lapic_writereg(LAPIC_LVTT, LAPIC_LVTT_TM | LAPIC_LVTT_M |
> - LAPIC_TIMER_VECTOR);
> - lapic_writereg(LAPIC_DCR_TIMER, LAPIC_DCRT_DIV1);
> - lapic_writereg(LAPIC_ICR_TIMER, lapic_tval);
> + lapic_timer_arm_period(0, lapic_tval);
>
>   /*
>* Compute fixed-point ratios between cycles and
>



Re: 11n Tx aggregation for iwm(4)

2020-06-26 Thread Mike Larkin
On Fri, Jun 26, 2020 at 09:01:03PM -0700, Mike Larkin wrote:
> On Fri, Jun 26, 2020 at 02:45:53PM +0200, Stefan Sperling wrote:
> > This patch adds support for 11n Tx aggregation to iwm(4).
> >
> > Please help with testing if you can by running the patch and using wifi
> > as usual. Nothing should change, except that Tx speed may potentially
> > improve. If you have time to run before/after performance measurements with
> > tcpbench or such, that would be nice. But it's not required for testing.
> >
> > If Tx aggregation is active then netstat will show a non-zero output block 
> > ack
> > agreement counter:
> >
> > $ netstat -W iwm0 | grep 'output block'
> > 3 new output block ack agreements
> > 0 output block ack agreements timed out
> >
> > It would be great to get at least one test for all the chipsets the driver
> > supports: 7260, 7265, 3160, 3165, 3168, 8260, 8265, 9260, 9560
> > The behaviour of the access point also matters a great deal. It won't
> > hurt to test the same chipset against several different access points.
> >
> > I have tested this version on 8265 only so far. I've run older revisions
> > of this patch on 7265 so I'm confident that this chip will work, too.
> > So far, the APs I have tested against are athn(4) in 11a mode and in 11n
> > mode with the 'nomimo' nwflag, and a Sagemcom 11ac AP. All on 5Ghz channels.
>
> I tested this on my T490 Thinkpad:
>
> iwm0 at pci0 dev 20 function 3 "Intel Dual Band Wireless AC 9560" rev 0x30, 
> msix
> iwm0: hw rev 0x310, fw ver 34.3125811985.0
>
> It ended up having a heck of a time connecting to anything, most/all
> connections ended up timing out or just taking a really long time to complete.
>
> I looked in dmesg, and found a stream of fatal firmware errors and other
> errors (see end of this email).
>
> My iwm-firmware was updated before I tried the new kernel:
>
> -innsmouth- ~> pkg_info iwm-firmware
> Information for inst:iwm-firmware-20191022p1
>
> Comment:
> firmware binary images for iwm(4) driver
>
> Description:
> Firmware binary images for use with the iwm(4) driver.
>
> Maintainer: The OpenBSD ports mailing-list 
>
> WWW: https://wireless.wiki.kernel.org/en/users/Drivers/iwlwifi
>

PS, I did see 5 new output block ack agreements when I was running the diff,
so apparently at least it is doing ... something?

-ml

>
>
> I still have the kernel around if you want me to test something else. There
> is nothing in this tree except this Txagg diff. LMK if you need any more
> info.
>
> OpenBSD 6.7-current (GENERIC.MP) #1: Fri Jun 26 14:01:06 PDT 2020
> 
> mlar...@innsmouth.int.azathoth.net:/u/bin/src/OpenBSD/openbsd/sys/arch/amd64/compile/GENERIC.MP
> real mem = 51260506112 (48885MB)
> avail mem = 49691906048 (47389MB)
> random: good seed from bootblocks
> mpath0 at root
> scsibus0 at mpath0: 256 targets
> mainbus0 at root
> bios0 at mainbus0: SMBIOS rev. 3.1 @ 0x604f5000 (67 entries)
> bios0: vendor LENOVO version "N2IET61W (1.39 )" date 05/16/2019
> bios0: LENOVO 20N20046US
> acpi0 at bios0: ACPI 6.1
> acpi0: sleep states S0 S3 S4 S5
> acpi0: tables DSDT FACP SSDT SSDT SSDT SSDT UEFI SSDT HPET APIC MCFG ECDT 
> SSDT SSDT BOOT SLIC SSDT LPIT WSMT SSDT DBGP DBG2 MSDM BATB DMAR NHLT ASF! 
> FPDT UEFI
> acpi0: wakeup devices GLAN(S4) XHC_(S3) XDCI(S4) HDAS(S4) RP01(S4) PXSX(S4) 
> RP02(S4) PXSX(S4) RP03(S4) PXSX(S4) RP04(S4) PXSX(S4) RP05(S4) PXSX(S4) 
> RP06(S4) PXSX(S4) [...]
> acpitimer0 at acpi0: 3579545 Hz, 24 bits
> acpihpet0 at acpi0: 2399 Hz
> acpimadt0 at acpi0 addr 0xfee00000: PC-AT compat
> cpu0 at mainbus0: apid 0 (boot processor)
> cpu0: Intel(R) Core(TM) i7-8665U CPU @ 1.90GHz, 1586.72 MHz, 06-8e-0c
> cpu0: 
> FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,CFLUSH,DS,ACPI,MMX,FXSR,SSE,SSE2,SS,HTT,TM,PBE,SSE3,PCLMUL,DTES64,MWAIT,DS-CPL,VMX,SMX,EST,TM2,SSSE3,SDBG,FMA3,CX16,xTPR,PDCM,PCID,SSE4.1,SSE4.2,x2APIC,MOVBE,POPCNT,DEADLINE,AES,XSAVE,AVX,F16C,RDRAND,NXE,PAGE1GB,RDTSCP,LONG,LAHF,ABM,3DNOWP,PERF,ITSC,FSGSBASE,TSC_ADJUST,SGX,BMI1,AVX2,SMEP,BMI2,ERMS,INVPCID,MPX,RDSEED,ADX,SMAP,CLFLUSHOPT,PT,MD_CLEAR,IBRS,IBPB,STIBP,L1DF,SSBD,SENSOR,ARAT,XSAVEOPT,XSAVEC,XGETBV1,XSAVES
> cpu0: 256KB 64b/line 8-way L2 cache
> cpu0: smt 0, core 0, package 0
> mtrr: Pentium Pro MTRR support, 10 var ranges, 88 fixed ranges
> cpu0: apic clock running at 24MHz
> cpu0: mwait min=64, max=64, C-substates=0.2.1.2.4.1.1.1, IBE
> cpu1 at mainbus0: apid 2 (application processor)
> cpu1: Intel(R) Core(TM) i7-8665U CPU @ 1.90GHz, 1333.05 MHz, 06-8e-0c
> cpu1: 
> FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,CFLUSH,

Re: 11n Tx aggregation for iwm(4)

2020-06-26 Thread Mike Larkin
On Fri, Jun 26, 2020 at 02:45:53PM +0200, Stefan Sperling wrote:
> This patch adds support for 11n Tx aggregation to iwm(4).
>
> Please help with testing if you can by running the patch and using wifi
> as usual. Nothing should change, except that Tx speed may potentially
> improve. If you have time to run before/after performance measurements with
> tcpbench or such, that would be nice. But it's not required for testing.
>
> If Tx aggregation is active then netstat will show a non-zero output block ack
> agreement counter:
>
> $ netstat -W iwm0 | grep 'output block'
> 3 new output block ack agreements
>   0 output block ack agreements timed out
>
> It would be great to get at least one test for all the chipsets the driver
> supports: 7260, 7265, 3160, 3165, 3168, 8260, 8265, 9260, 9560
> The behaviour of the access point also matters a great deal. It won't
> hurt to test the same chipset against several different access points.
>
> I have tested this version on 8265 only so far. I've run older revisions
> of this patch on 7265 so I'm confident that this chip will work, too.
> So far, the APs I have tested against are athn(4) in 11a mode and in 11n
> mode with the 'nomimo' nwflag, and a Sagemcom 11ac AP. All on 5Ghz channels.

I tested this on my T490 Thinkpad:

iwm0 at pci0 dev 20 function 3 "Intel Dual Band Wireless AC 9560" rev 0x30, msix
iwm0: hw rev 0x310, fw ver 34.3125811985.0

It ended up having a heck of a time connecting to anything, most/all
connections ended up timing out or just taking a really long time to complete.

I looked in dmesg, and found a stream of fatal firmware errors and other
errors (see end of this email).

My iwm-firmware was updated before I tried the new kernel:

-innsmouth- ~> pkg_info iwm-firmware
Information for inst:iwm-firmware-20191022p1

Comment:
firmware binary images for iwm(4) driver

Description:
Firmware binary images for use with the iwm(4) driver.

Maintainer: The OpenBSD ports mailing-list 

WWW: https://wireless.wiki.kernel.org/en/users/Drivers/iwlwifi



I still have the kernel around if you want me to test something else. There
is nothing in this tree except this Txagg diff. LMK if you need any more
info.

OpenBSD 6.7-current (GENERIC.MP) #1: Fri Jun 26 14:01:06 PDT 2020

mlar...@innsmouth.int.azathoth.net:/u/bin/src/OpenBSD/openbsd/sys/arch/amd64/compile/GENERIC.MP
real mem = 51260506112 (48885MB)
avail mem = 49691906048 (47389MB)
random: good seed from bootblocks
mpath0 at root
scsibus0 at mpath0: 256 targets
mainbus0 at root
bios0 at mainbus0: SMBIOS rev. 3.1 @ 0x604f5000 (67 entries)
bios0: vendor LENOVO version "N2IET61W (1.39 )" date 05/16/2019
bios0: LENOVO 20N20046US
acpi0 at bios0: ACPI 6.1
acpi0: sleep states S0 S3 S4 S5
acpi0: tables DSDT FACP SSDT SSDT SSDT SSDT UEFI SSDT HPET APIC MCFG ECDT SSDT 
SSDT BOOT SLIC SSDT LPIT WSMT SSDT DBGP DBG2 MSDM BATB DMAR NHLT ASF! FPDT UEFI
acpi0: wakeup devices GLAN(S4) XHC_(S3) XDCI(S4) HDAS(S4) RP01(S4) PXSX(S4) 
RP02(S4) PXSX(S4) RP03(S4) PXSX(S4) RP04(S4) PXSX(S4) RP05(S4) PXSX(S4) 
RP06(S4) PXSX(S4) [...]
acpitimer0 at acpi0: 3579545 Hz, 24 bits
acpihpet0 at acpi0: 2399 Hz
acpimadt0 at acpi0 addr 0xfee00000: PC-AT compat
cpu0 at mainbus0: apid 0 (boot processor)
cpu0: Intel(R) Core(TM) i7-8665U CPU @ 1.90GHz, 1586.72 MHz, 06-8e-0c
cpu0: 
FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,CFLUSH,DS,ACPI,MMX,FXSR,SSE,SSE2,SS,HTT,TM,PBE,SSE3,PCLMUL,DTES64,MWAIT,DS-CPL,VMX,SMX,EST,TM2,SSSE3,SDBG,FMA3,CX16,xTPR,PDCM,PCID,SSE4.1,SSE4.2,x2APIC,MOVBE,POPCNT,DEADLINE,AES,XSAVE,AVX,F16C,RDRAND,NXE,PAGE1GB,RDTSCP,LONG,LAHF,ABM,3DNOWP,PERF,ITSC,FSGSBASE,TSC_ADJUST,SGX,BMI1,AVX2,SMEP,BMI2,ERMS,INVPCID,MPX,RDSEED,ADX,SMAP,CLFLUSHOPT,PT,MD_CLEAR,IBRS,IBPB,STIBP,L1DF,SSBD,SENSOR,ARAT,XSAVEOPT,XSAVEC,XGETBV1,XSAVES
cpu0: 256KB 64b/line 8-way L2 cache
cpu0: smt 0, core 0, package 0
mtrr: Pentium Pro MTRR support, 10 var ranges, 88 fixed ranges
cpu0: apic clock running at 24MHz
cpu0: mwait min=64, max=64, C-substates=0.2.1.2.4.1.1.1, IBE
cpu1 at mainbus0: apid 2 (application processor)
cpu1: Intel(R) Core(TM) i7-8665U CPU @ 1.90GHz, 1333.05 MHz, 06-8e-0c
cpu1: 
FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,CFLUSH,DS,ACPI,MMX,FXSR,SSE,SSE2,SS,HTT,TM,PBE,SSE3,PCLMUL,DTES64,MWAIT,DS-CPL,VMX,SMX,EST,TM2,SSSE3,SDBG,FMA3,CX16,xTPR,PDCM,PCID,SSE4.1,SSE4.2,x2APIC,MOVBE,POPCNT,DEADLINE,AES,XSAVE,AVX,F16C,RDRAND,NXE,PAGE1GB,RDTSCP,LONG,LAHF,ABM,3DNOWP,PERF,ITSC,FSGSBASE,TSC_ADJUST,SGX,BMI1,AVX2,SMEP,BMI2,ERMS,INVPCID,MPX,RDSEED,ADX,SMAP,CLFLUSHOPT,PT,MD_CLEAR,IBRS,IBPB,STIBP,L1DF,SSBD,SENSOR,ARAT,XSAVEOPT,XSAVEC,XGETBV1,XSAVES
cpu1: 256KB 64b/line 8-way L2 cache
cpu1: smt 0, core 1, package 0
cpu2 at mainbus0: apid 4 (application processor)
cpu2: Intel(R) Core(TM) i7-8665U CPU @ 1.90GHz, 1125.81 MHz, 06-8e-0c
cpu2: 

Re: vmm(4): unterminated vm_name after strncpy

2020-03-15 Thread Mike Larkin
On Thu, Mar 12, 2020 at 10:31:13PM +0100, Tobias Heider wrote:
> vmm uses 'strncpy(vm->vm_name, vcp->vcp_name, VMM_MAX_NAME_LEN)' to copy
> to buffers of size VMM_MAX_NAME_LEN, which can leave the resulting string
> unterminated.
> From strncpy(3):
>   strncpy() only NUL terminates the destination string when the length of
>   the source string is less than the length parameter.
> 
> I propose replacing it with 'strlcpy' which does the right thing and
> only copies up to dstsize - 1 characters.
> 
> ok?
> 

good find. Thanks!

> CID 1453255
> 
> Index: sys/arch/amd64/amd64/vmm.c
> ===
> RCS file: /mount/openbsd/cvs/src/sys/arch/amd64/amd64/vmm.c,v
> retrieving revision 1.266
> diff -u -p -r1.266 vmm.c
> --- sys/arch/amd64/amd64/vmm.c11 Mar 2020 16:38:42 -  1.266
> +++ sys/arch/amd64/amd64/vmm.c12 Mar 2020 21:15:01 -
> @@ -1167,7 +1167,7 @@ vm_create(struct vm_create_params *vcp, 
>   memcpy(vm->vm_memranges, vcp->vcp_memranges,
>   vm->vm_nmemranges * sizeof(vm->vm_memranges[0]));
>   vm->vm_memory_size = memsize;
> - strncpy(vm->vm_name, vcp->vcp_name, VMM_MAX_NAME_LEN);
> + strlcpy(vm->vm_name, vcp->vcp_name, VMM_MAX_NAME_LEN);
>  
>   rw_enter_write(&vmm_softc->vm_lock);
>  
> @@ -3718,7 +3718,7 @@ vm_get_info(struct vm_info_params *vip)
>   out[i].vir_ncpus = vm->vm_vcpu_ct;
>   out[i].vir_id = vm->vm_id;
>   out[i].vir_creator_pid = vm->vm_creator_pid;
> - strncpy(out[i].vir_name, vm->vm_name, VMM_MAX_NAME_LEN);
> + strlcpy(out[i].vir_name, vm->vm_name, VMM_MAX_NAME_LEN);
>   rw_enter_read(&vm->vm_vcpu_lock);
>   for (j = 0; j < vm->vm_vcpu_ct; j++) {
>   out[i].vir_vcpu_state[j] = VCPU_STATE_UNKNOWN;
> 



  1   2   3   4   5   6   >