Re: Silence vmd rtc_update_rega non-32KHz timebase spam

2022-01-15 Thread Mike Larkin
On Wed, Dec 08, 2021 at 07:45:50PM -0600, Brian Conway wrote:
> Ping with complete diff. Thanks.
>
> Brian Conway
>

Catching up on old emails. Committed. Thanks.

-ml

> diff --git usr.sbin/vmd/mc146818.c usr.sbin/vmd/mc146818.c
> index e3599c685..001c1a055 100644
> --- usr.sbin/vmd/mc146818.c
> +++ usr.sbin/vmd/mc146818.c
> @@ -34,7 +34,6 @@
>  #include "vmd.h"
>  #include "vmm.h"
>
> -#define MC_DIVIDER_MASK 0xe0
>  #define MC_RATE_MASK 0xf
>
>  #define NVRAM_CENTURY 0x32
> @@ -236,10 +235,6 @@ rtc_reschedule_per(void)
>  static void
>  rtc_update_rega(uint32_t data)
>  {
> -if ((data & MC_DIVIDER_MASK) != MC_BASE_32_KHz)
> -log_warnx("%s: set non-32KHz timebase not supported",
> -__func__);
> -
>  rtc.regs[MC_REGA] = data;
>  if ((rtc.regs[MC_REGA] ^ data) & 0x0f)
>  vm_pipe_send(_pipe, MC146818_RESCHEDULE_PER);
>
>
> On Thu, Nov 18, 2021 at 8:02 AM Brian Conway  wrote:
> >
> > Per https://marc.info/?l=openbsd-misc&m=159113575425726 , mlarkin@
> > suggested someone can remove it. It's still pretty spammy at the
> > current time for me.
> >
> > Brian Conway
> > Software Engineer, Owner
> > RCE Software, LLC
> >
> > diff --git usr.sbin/vmd/mc146818.c usr.sbin/vmd/mc146818.c
> > index e3599c68504..17cf21221e5 100644
> > --- usr.sbin/vmd/mc146818.c
> > +++ usr.sbin/vmd/mc146818.c
> > @@ -236,10 +236,6 @@ rtc_reschedule_per(void)
> >  static void
> >  rtc_update_rega(uint32_t data)
> >  {
> > -if ((data & MC_DIVIDER_MASK) != MC_BASE_32_KHz)
> > -log_warnx("%s: set non-32KHz timebase not supported",
> > -__func__);
> > -
> >  rtc.regs[MC_REGA] = data;
> >  if ((rtc.regs[MC_REGA] ^ data) & 0x0f)
> >  vm_pipe_send(_pipe, MC146818_RESCHEDULE_PER);
>



Re: mpsafe dwxe(4)

2022-01-04 Thread Mike Larkin
On Mon, Jan 03, 2022 at 09:24:15PM +1000, Jonathan Matthew wrote:
> This is almost identical to the changes I made to dwge(4) recently, since
> these drivers are very closely related.  Unfortunately the only machine I
> have with dwxe(4) in it is armv7, so I can't test this properly, but it
> does still work there.
>
> Could someone with an arm64 allwinner board try this out more extensively?
>

Tested on my sopine with dwxe(4) on GENERIC.MP and it seems to work fine.

>
> Index: if_dwxe.c
> ===
> RCS file: /cvs/src/sys/dev/fdt/if_dwxe.c,v
> retrieving revision 1.19
> diff -u -p -r1.19 if_dwxe.c
> --- if_dwxe.c 24 Oct 2021 17:52:26 -  1.19
> +++ if_dwxe.c 3 Jan 2022 11:21:19 -
> @@ -275,6 +275,7 @@ struct dwxe_softc {
>   bus_space_tag_t sc_iot;
>   bus_space_handle_t  sc_ioh;
>   bus_dma_tag_t   sc_dmat;
> + void*sc_ih;
>
>   struct arpcom   sc_ac;
>  #define sc_lladdrsc_ac.ac_enaddr
> @@ -287,7 +288,6 @@ struct dwxe_softc {
>   struct dwxe_buf *sc_txbuf;
>   struct dwxe_desc*sc_txdesc;
>   int sc_tx_prod;
> - int sc_tx_cnt;
>   int sc_tx_cons;
>
>   struct dwxe_dmamem  *sc_rxring;
> @@ -322,7 +322,7 @@ uint32_t dwxe_read(struct dwxe_softc *,
>  void dwxe_write(struct dwxe_softc *, bus_addr_t, uint32_t);
>
>  int  dwxe_ioctl(struct ifnet *, u_long, caddr_t);
> -void dwxe_start(struct ifnet *);
> +void dwxe_start(struct ifqueue *);
>  void dwxe_watchdog(struct ifnet *);
>
>  int  dwxe_media_change(struct ifnet *);
> @@ -345,7 +345,7 @@ void  dwxe_rx_proc(struct dwxe_softc *);
>  void dwxe_up(struct dwxe_softc *);
>  void dwxe_down(struct dwxe_softc *);
>  void dwxe_iff(struct dwxe_softc *);
> -int  dwxe_encap(struct dwxe_softc *, struct mbuf *, int *);
> +int  dwxe_encap(struct dwxe_softc *, struct mbuf *, int *, int *);
>
>  void dwxe_reset(struct dwxe_softc *);
>  void dwxe_stop_dma(struct dwxe_softc *);
> @@ -431,8 +431,9 @@ dwxe_attach(struct device *parent, struc
>   ifp = >sc_ac.ac_if;
>   ifp->if_softc = sc;
>   ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
> + ifp->if_xflags = IFXF_MPSAFE;
>   ifp->if_ioctl = dwxe_ioctl;
> - ifp->if_start = dwxe_start;
> + ifp->if_qstart = dwxe_start;
>   ifp->if_watchdog = dwxe_watchdog;
>   ifq_set_maxlen(>if_snd, DWXE_NTXDESC - 1);
>   bcopy(sc->sc_dev.dv_xname, ifp->if_xname, IFNAMSIZ);
> @@ -460,8 +461,10 @@ dwxe_attach(struct device *parent, struc
>   if_attach(ifp);
>   ether_ifattach(ifp);
>
> - fdt_intr_establish(faa->fa_node, IPL_NET, dwxe_intr, sc,
> - sc->sc_dev.dv_xname);
> + sc->sc_ih = fdt_intr_establish(faa->fa_node, IPL_NET | IPL_MPSAFE,
> + dwxe_intr, sc, sc->sc_dev.dv_xname);
> + if (sc->sc_ih == NULL)
> + printf("%s: can't establish interrupt\n", sc->sc_dev.dv_xname);
>  }
>
>  void
> @@ -584,11 +587,12 @@ dwxe_lladdr_write(struct dwxe_softc *sc)
>  }
>
>  void
> -dwxe_start(struct ifnet *ifp)
> +dwxe_start(struct ifqueue *ifq)
>  {
> + struct ifnet *ifp = ifq->ifq_if;
>   struct dwxe_softc *sc = ifp->if_softc;
>   struct mbuf *m;
> - int error, idx;
> + int error, idx, left, used;
>
>   if (!(ifp->if_flags & IFF_RUNNING))
>   return;
> @@ -600,27 +604,29 @@ dwxe_start(struct ifnet *ifp)
>   return;
>
>   idx = sc->sc_tx_prod;
> - while ((sc->sc_txdesc[idx].sd_status & DWXE_TX_DESC_CTL) == 0) {
> - m = ifq_deq_begin(>if_snd);
> - if (m == NULL)
> + left = sc->sc_tx_cons;
> + if (left <= idx)
> + left += DWXE_NTXDESC;
> + left -= idx;
> + used = 0;
> +
> + for (;;) {
> + if (used + DWXE_NTXSEGS + 1 > left) {
> + ifq_set_oactive(ifq);
>   break;
> + }
>
> - error = dwxe_encap(sc, m, );
> - if (error == ENOBUFS) {
> - ifq_deq_rollback(>if_snd, m);
> - ifq_set_oactive(>if_snd);
> + m = ifq_dequeue(ifq);
> + if (m == NULL)
>   break;
> - }
> +
> + error = dwxe_encap(sc, m, , );
>   if (error == EFBIG) {
> - ifq_deq_commit(>if_snd, m);
>   m_freem(m); /* give up: drop it */
>   ifp->if_oerrors++;
>   continue;
>   }
>
> - /* Now we are committed to transmit the packet. */
> - ifq_deq_commit(>if_snd, m);
> -
>  #if NBPFILTER > 0
>   if (ifp->if_bpf)
>   bpf_mtap(ifp->if_bpf, m, BPF_DIRECTION_OUT);
> @@ -632,6 +638,9 @@ dwxe_start(struct ifnet *ifp)
>
>   /* Set a timeout in case the chip goes out to lunch. */
>   

Re: Remove gtp from amd64 GENERIC kernel

2022-01-03 Thread Mike Larkin
On Mon, Jan 03, 2022 at 06:43:29PM -0800, Greg Steuck wrote:
> Crystal Kolipe  writes:
>
> > The gtp driver was completely deleted from the tree in 2016, and removed
> > from the i386 GENERIC config in revision 1.819.
> >
> > It has, however, remained in the amd64 GENERIC config commented out,
> > which seems like an oversight.
>
> I agree, thanks!
>
> >
> > This patch removes it from amd64 GENERIC:
>
> OK gnezdo, if somebody wants to commit. Or tell me that I should.
>

ok mlarkin if not done already

> >
> > --- GENERIC.origMon Jan  3 08:42:52 2022
> > +++ GENERIC Mon Jan  3 08:45:14 2022
> > @@ -670,11 +670,7 @@
> >
> >  bktr0  at pci?
> >
> > -# FM-Radio devices
> > -#gtp*  at pci? # Gemtek/Guillemot Radio PCI Radio Card
> > -
> >  # FM-Radio support
> > -#radio*at gtp?
> >  radio* at bktr?
> >
> >  #wdt0  at pci? # Ind Computer Source PCI-WDT50x driver
>



Re: vmm(4): restore vmcs after sleep points [vmx 2/3]

2021-12-03 Thread Mike Larkin
On Mon, Nov 29, 2021 at 08:41:22PM -0500, Dave Voutila wrote:
>
> Dave Voutila  writes:
>
> > This diff removes instability from VMX-based hosts by either removing
> > the possibility of the process sleeping while the VMCS is active or
> > reloading it if we had no choice.
> >
> > A mutex is added to help guard the VMCS state so testing with witness
> > has helped verify the diff.
> >
>
> Removed the mutex as it has served its purpose in ferreting out some
> sleep points.
>
> > The rwlock on the cpu originally used in the remote vmclear routine is
> > changed to a mutex accordingly.
> >
>
> Reverted this. This update doesn't change the rwlock to a mutex...it's
> fine if we sleep while we wait for a remote clear as it doesn't matter
> which CPU we wake up on as we're about to reload the VMCS anyways.
>
> > This diff does not remove possible calls to printf(9) via the DPRINTF
> > macro as that's part of the next diff.
> >
>
> Moot at this point.
>
> > One area of note: in vmx_load_pdptes() there's a XXX to call out that
> > because of the printf(9) call on failure to km_alloc that the VMCS is
> > potentially no longer valid. The upcoming diff to swap out printf(9) for
> > log(9) will remove that.
> >
>
> Revisited the above now that we're holding off on this printf -> log
> changeover.
>
> It was in the previous diff as well, but just to point out this removes
> the KERNEL_LOCK dance around uvm_fault. We were only doing this on Intel
> hosts as it wasn't understood (at that time) what was causing the VMCS
> corruption. AMD hosts haven't done this during nested page fault exit
> handling since my work to unlock vmm(4) at k2k21.
>
> ok?
>

ok mlarkin, and thanks for tracking these down.

-ml

>
> blob - 8e588f7dcbd1cec2e61e7b7292ee32ff4eb9a2e1
> blob + ac91b74fd4d5da774808ad1c78d75469ff89b458
> --- sys/arch/amd64/amd64/vmm.c
> +++ sys/arch/amd64/amd64/vmm.c
> @@ -3028,12 +3028,22 @@ vcpu_reset_regs_vmx(struct vcpu *vcpu, struct vcpu_reg
>   IA32_VMX_ACTIVATE_SECONDARY_CONTROLS, 1)) {
>   if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS,
>   IA32_VMX_ENABLE_VPID, 1)) {
> - if (vmm_alloc_vpid()) {
> +
> + /* We may sleep during allocation, so reload VMCS. */
> + vcpu->vc_last_pcpu = curcpu();
> + ret = vmm_alloc_vpid();
> + if (vcpu_reload_vmcs_vmx(vcpu)) {
> + printf("%s: failed to reload vmcs\n", __func__);
> + ret = EINVAL;
> + goto exit;
> + }
> + if (ret) {
>   DPRINTF("%s: could not allocate VPID\n",
>   __func__);
>   ret = EINVAL;
>   goto exit;
>   }
> +
>   if (vmwrite(VMCS_GUEST_VPID, vpid)) {
>   DPRINTF("%s: error setting guest VPID\n",
>   __func__);
> @@ -5549,7 +5559,7 @@ svm_handle_np_fault(struct vcpu *vcpu)
>   *
>   * Return Values:
>   *  0: if successful
> - *  EINVAL: if fault type could not be determined
> + *  EINVAL: if fault type could not be determined or VMCS reload fails
>   *  EAGAIN: if a protection fault occurred, ie writing to a read-only page
>   *  errno: if uvm_fault(9) fails to wire in the page
>   */
> @@ -5569,10 +5579,14 @@ vmx_fault_page(struct vcpu *vcpu, paddr_t gpa)
>   return (EAGAIN);
>   }
>
> - KERNEL_LOCK();
> + /* We may sleep during uvm_fault(9), so reload VMCS. */
> + vcpu->vc_last_pcpu = curcpu();
>   ret = uvm_fault(vcpu->vc_parent->vm_map, gpa, VM_FAULT_WIRE,
>   PROT_READ | PROT_WRITE | PROT_EXEC);
> - KERNEL_UNLOCK();
> + if (vcpu_reload_vmcs_vmx(vcpu)) {
> + printf("%s: failed to reload vmcs\n", __func__);
> + return (EINVAL);
> + }
>
>   if (ret)
>   printf("%s: uvm_fault returns %d, GPA=0x%llx, rip=0x%llx\n",
> @@ -5962,7 +5976,16 @@ vmx_load_pdptes(struct vcpu *vcpu)
>
>   ret = 0;
>
> - cr3_host_virt = (vaddr_t)km_alloc(PAGE_SIZE, _any, _none, 
> _waitok);
> + /* We may sleep during km_alloc(9), so reload VMCS. */
> + vcpu->vc_last_pcpu = curcpu();
> + cr3_host_virt = (vaddr_t)km_alloc(PAGE_SIZE, _any, _none,
> + _waitok);
> + if (vcpu_reload_vmcs_vmx(vcpu)) {
> + printf("%s: failed to reload vmcs\n", __func__);
> + ret = EINVAL;
> + goto exit;
> + }
> +
>   if (!cr3_host_virt) {
>   printf("%s: can't allocate address for guest CR3 mapping\n",
>   __func__);
> @@ -5998,7 +6021,15 @@ vmx_load_pdptes(struct vcpu *vcpu)
>
>  exit:
>   pmap_kremove(cr3_host_virt, PAGE_SIZE);
> +
> + /* km_free(9) might sleep, so we need to reload VMCS. */
> + vcpu->vc_last_pcpu = curcpu();
>

Re: vmm(4): bump vmclear spinout [vmx 1/3]

2021-11-28 Thread Mike Larkin
On Sun, Nov 28, 2021 at 10:32:47PM -0500, Dave Voutila wrote:
> Smallest of the VMX/VMCS stability diffs. This bumps the spinout to be
> the same number of ticks used by the mplock debug. This is needed on
> older/slower hosts.
>
> ok?
>
> -dv
>
> diff e8c587551f20ba6fdaa0f483ea768aade9f66f7d 
> 981a8cfd4e1dfe412e9c72fb5b47e7e46813bfbb
> blob - a7b21ec75899c81f076143fbe59f14279334ea09
> blob + e335a1dc5e8a400b4bbf49cac2ec8853dffcdae3
> --- sys/arch/amd64/amd64/vmm.c
> +++ sys/arch/amd64/amd64/vmm.c
> @@ -1373,7 +1373,7 @@ vmclear_on_cpu(struct cpu_info *ci)
>  static int
>  vmx_remote_vmclear(struct cpu_info *ci, struct vcpu *vcpu)
>  {
> - int ret = 0, nticks = 10;
> + int ret = 0, nticks = 2;
>
>   mtx_enter(>ci_vmcs_mtx);
>   atomic_swap_ulong(>ci_vmcs_pa, vcpu->vc_control_pa);

ok mlarkin



Re: vmm(4): copyout guest regs, irqready on VM_EXIT_NONE

2021-11-21 Thread Mike Larkin
On Sat, Nov 20, 2021 at 09:14:31PM -0500, Dave Voutila wrote:
> The below diff fixes an issue reported by kn@ on bugs@ [1]. joshe@ also
> observed the issue and confirmed the below diff resolves it.
>
> The symptoms were quite odd: errors from fdc(4) during an OpenBSD guest
> booting under vmm(4)/vmd(8). We don't emulate a floppy disk drive!!!
>
> I introduced a bug in r1.287 [2] when simplifying parts of
> vcpu_run_{svm,vmx} by letting the functions return 0 instead of
> voluntarily yielding. The edge case I didn't account for is if after a
> vmexit for an IN instruction, the io port address isn't one emulated by
> vmd(8) in userland, vmm(4) will perform the emulation (not the bug) by
> writing the appropriate number of 0xff bytes to AL/AX/EAX. IF the
> scheduler would like us to yield, we return setting a vrp exit code of
> VM_EXIT_NONE (since we aren't asking userland/vmd to help with any
> emulation).
>
> vmd(8) correctly handles this exit, but vmm(4) never copies out the
> current vcpu registers and irqready state. When vmd(8) runs the vcpu
> again, the vcpu's guest state still has a vmexit related to the IO
> operation and presumes vmd(8) modified RAX and overwrites the vcpu's
> RAX before re-entering the guest.
>
> This behavior occurs on both Intel and AMD. To confirm, I added some
> printfs to fdc(4) and specifically checked when the dma reads returned
> something other than 0xff on instances of both types of host. (Since
> it's probabilistic, it's not uncommon to see it happen only 3-4 times
> out of the 100k bus reads out_fdc() attempts, but it seems more
> reproducible on older hardware.)
>
> ok?
>
> -dv
>

ok mlarkin if not already committed

> [1] https://marc.info/?l=openbsd-bugs&m=163682062027764&w=2
> [2] 
> https://cvsweb.openbsd.org/cgi-bin/cvsweb/src/sys/arch/amd64/amd64/vmm.c.diff?r1=1.286&r2=1.287
>
>
> Index: sys/arch/amd64/amd64/vmm.c
> ===
> RCS file: /opt/cvs/src/sys/arch/amd64/amd64/vmm.c,v
> retrieving revision 1.294
> diff -u -p -r1.294 vmm.c
> --- sys/arch/amd64/amd64/vmm.c26 Oct 2021 16:29:49 -  1.294
> +++ sys/arch/amd64/amd64/vmm.c20 Nov 2021 21:46:07 -
> @@ -4301,9 +4301,10 @@ vm_run(struct vm_run_params *vrp)
>   rw_exit_write(_softc->vm_lock);
>   }
>   ret = 0;
> - } else if (ret == EAGAIN) {
> + } else if (ret == 0 || ret == EAGAIN) {
>   /* If we are exiting, populate exit data so vmd can help. */
> - vrp->vrp_exit_reason = vcpu->vc_gueststate.vg_exit_reason;
> + vrp->vrp_exit_reason = (ret == 0) ? VM_EXIT_NONE
> + : vcpu->vc_gueststate.vg_exit_reason;
>   vrp->vrp_irqready = vcpu->vc_irqready;
>   vcpu->vc_state = VCPU_STATE_STOPPED;
>
> @@ -4312,9 +4313,6 @@ vm_run(struct vm_run_params *vrp)
>   ret = EFAULT;
>   } else
>   ret = 0;
> - } else if (ret == 0) {
> - vrp->vrp_exit_reason = VM_EXIT_NONE;
> - vcpu->vc_state = VCPU_STATE_STOPPED;
>   } else {
>   vrp->vrp_exit_reason = VM_EXIT_TERMINATED;
>   vcpu->vc_state = VCPU_STATE_TERMINATED;



Re: vmd(8): fix broken bootorder for cdrom

2021-11-04 Thread Mike Larkin
On Thu, Nov 04, 2021 at 08:09:16PM +0100, Jan Klemkow wrote:
> On Thu, Nov 04, 2021 at 10:43:46AM -0400, Dave Voutila wrote:
> > Jan Klemkow  writes:
> > > This fix [1] in seabios breaks our "boot device cdrom" feature.
> > >
> > > # vmctl start -Lc -d disk.img -r cd70.iso -B cdrom vm
> > > ...
> > > No bootable device.  Retrying in 60 seconds.
> > >
> > > # vmctl start -Lc -d disk.img -r cd70.iso vm
> > > doas vmctl start -c -r cd70.iso vm
> > > ...
> > > CD-ROM: E0
> > > Loading /7.0/AMD64/CDBOOT
> > > probing: pc0 com0 mem[638K 510M a20=on]
> > > disk: cd0
> > >>> OpenBSD/amd64 CDBOOT 3.53
> > > boot>
> > >
> > > The diff below, fixes the lun number of the bootorder string for cdrom.
> > >
> > > OK?
> >
> > This change definitely fixes -B cdrom, but -B disk seems broken as well.
> >
> > ok dv to fix the -B cdrom issue, but do you also have an idea how to fix
> > the -B disk option?
>
> The diff below, fixes the -B disk option. BUT...
>
> The bootorder for disk and cdrom are hard coded strings for a runtime
> dynamic PCI bus.  The current disk bootorder string just works, if there
> is no network device configured.  With the diff below, it will work, if
> there is just one network device.  The current cdrom bootorder string
> just works, with one network and one disk device, or with no network and
> two disk devices.
>
> One example of vmd(8)'s PCI bus:
>
> PCI: init bdf=00:00.0 id=0b5d:0666/* VMM Host */
> PCI: init bdf=00:01.0 id=1af4:1005/* Virtio RNG */
> PCI: init bdf=00:02.0 id=1af4:1000/* Virtio Network */
> PCI: init bdf=00:03.0 id=1af4:1001/* Virtio Storage (disk) */
> PCI: init bdf=00:04.0 id=1af4:1004/* Virtio SCSI (cdrom) */
> PCI: init bdf=00:05.0 id=0b5d:0777/* VMM Control */
>
> We should assemble dynamic bootorder strings, which fits to our dynamic
> assembled PCI bus.  This would be a general solution for this problem.
>
> For now, this diff will fix the -B disk option for the most common case
> of one NIC.  Which fits to the cdrom bootorder string for one NIC, one
> disk and a cdrom.
>
> OK?
>
> bye,
> Jan
>
> Index: fw_cfg.c
> ===
> RCS file: /cvs/src/usr.sbin/vmd/fw_cfg.c,v
> retrieving revision 1.4
> diff -u -p -r1.4 fw_cfg.c
> --- fw_cfg.c  4 Nov 2021 17:50:05 -   1.4
> +++ fw_cfg.c  4 Nov 2021 18:48:48 -
> @@ -77,7 +77,7 @@ fw_cfg_init(struct vmop_create_params *v
>
>   switch (vmc->vmc_bootdevice) {
>   case VMBOOTDEV_DISK:
> - bootorder = "/pci@i0cf8/*@2\nHALT";
> + bootorder = "/pci@i0cf8/*@3\nHALT";
>   break;
>   case VMBOOTDEV_CDROM:
>   bootorder = "/pci@i0cf8/*@4/*@0/*@0,4100\nHALT";
>

Thanks. ok mlarkin@

And I do agree that we probably need dynamic bootorder strings.

-ml



Re: More pchgpio(4)

2021-10-21 Thread Mike Larkin
On Tue, Oct 12, 2021 at 01:19:55PM -0700, Mike Larkin wrote:
> On Sun, Oct 10, 2021 at 11:42:31PM +0200, Mark Kettenis wrote:
> > > Date: Sat, 9 Oct 2021 22:27:52 +0200 (CEST)
> > > From: Mark Kettenis 
> > >
> > > > Date: Sat, 9 Oct 2021 20:55:10 +0200 (CEST)
> > > > From: Mark Kettenis 
> > > >
> > > > This time adding support for Sunrisepoint-H and Sunrisepoint-LP.
> > > > Because of all the failed attempts by Intel to get their 10nm process
> > > > under control, this may cover Intel Mobile CPUs marketed as 6th, 7th,
> > > > 8th, 9th and 10th generation.  So if you have a Laptop that isn't at
> > > > least 5 years old, give this a try if pchgpio(4) doesn't attach.  This
> > > > may fix all sorts of issues with keyboards, touchpads or
> > > > suspend/resume.
> > > >
> > > > ok?
> > >
> > > Updated diff that masks unhandled interrupts like we do in amdgpio(4).
> >
> > And another update to fix a typo in the pin groups for Sunrisepoint-LP.
> >
> >
>
> Thanks. I'll give this a try but it might take me a couple days since I'm
> traveling.
>
> -ml
>

Sorry for taking so long to get to this. The new diff below didn't seem to make
any difference, the Sgo3 is still very very slow with this, same interrupt 
storm.

-ml

> > Index: dev/acpi/pchgpio.c
> > ===
> > RCS file: /cvs/src/sys/dev/acpi/pchgpio.c,v
> > retrieving revision 1.8
> > diff -u -p -r1.8 pchgpio.c
> > --- dev/acpi/pchgpio.c  29 Sep 2021 22:03:33 -  1.8
> > +++ dev/acpi/pchgpio.c  10 Oct 2021 21:40:45 -
> > @@ -107,13 +107,76 @@ struct cfdriver pchgpio_cd = {
> >  };
> >
> >  const char *pchgpio_hids[] = {
> > +   "INT344B",
> > "INT3450",
> > +   "INT3451",
> > +   "INT345D",
> > "INT34BB",
> > "INT34C5",
> > "INT34C6",
> > NULL
> >  };
> >
> > +/* Sunrisepoint-LP */
> > +
> > +const struct pchgpio_group spt_lp_groups[] =
> > +{
> > +   /* Community 0 */
> > +   { 0, 0, 0, 23, 0 }, /* GPP_A */
> > +   { 0, 1, 24, 47, 24 },   /* GPP_B */
> > +
> > +   /* Community 1 */
> > +   { 1, 0, 48, 71, 48 },   /* GPP_C */
> > +   { 1, 1, 72, 95, 72 },   /* GPP_D */
> > +   { 1, 2, 96, 119, 96 },  /* GPP_E */
> > +
> > +   /* Community 3 */
> > +   { 2, 0, 120, 143, 120 },/* GPP_F */
> > +   { 2, 1, 144, 151, 144 },/* GPP_G */
> > +};
> > +
> > +const struct pchgpio_device spt_lp_device =
> > +{
> > +   .pad_size = 16,
> > +   .gpi_is = 0x100,
> > +   .gpi_ie = 0x120,
> > +   .groups = spt_lp_groups,
> > +   .ngroups = nitems(spt_lp_groups),
> > +   .npins = 176,
> > +};
> > +
> > +/* Sunrisepoint-H */
> > +
> > +const struct pchgpio_group spt_h_groups[] =
> > +{
> > +   /* Community 0 */
> > +   { 0, 0, 0, 23, 0 }, /* GPP_A */
> > +   { 0, 1, 24, 47, 24 },   /* GPP_B */
> > +
> > +   /* Community 1 */
> > +   { 1, 0, 48, 71, 48 },   /* GPP_C */
> > +   { 1, 1, 72, 95, 72 },   /* GPP_D */
> > +   { 1, 2, 96, 108, 96 },  /* GPP_E */
> > +   { 1, 3, 109, 132, 120 },/* GPP_F */
> > +   { 1, 4, 133, 156, 144 },/* GPP_G */
> > +   { 1, 5, 157, 180, 168 },/* GPP_H */
> > +
> > +   /* Community 3 */
> > +   { 2, 0, 181, 191, 192 },/* GPP_I */
> > +};
> > +
> > +const struct pchgpio_device spt_h_device =
> > +{
> > +   .pad_size = 16,
> > +   .gpi_is = 0x100,
> > +   .gpi_ie = 0x120,
> > +   .groups = spt_h_groups,
> > +   .ngroups = nitems(spt_h_groups),
> > +   .npins = 224,
> > +};
> > +
> > +/* Cannon Lake-H */
> > +
> >  const struct pchgpio_group cnl_h_groups[] =
> >  {
> > /* Community 0 */
> > @@ -146,6 +209,8 @@ const struct pchgpio_device cnl_h_device
> > .npins = 384,
> >  };
> >
> > +/* Cannon Lake-LP */
> > +
> >  const struct pchgpio_group cnl_lp_groups[] =
> >  {
> > /* Community 0 */
> > @@ -173,6 +238,8 @@ const struct pchgpio_device cnl_lp_devic
> > .npins = 320,
> >  };
> >
> > +/* Tiger Lake-LP */
> > +
> >  const struct pchgpio_group tgl_lp_groups[] =
> >  {
> > /* Community 0 */
> >

Re: More pchgpio(4)

2021-10-12 Thread Mike Larkin
On Sun, Oct 10, 2021 at 11:42:31PM +0200, Mark Kettenis wrote:
> > Date: Sat, 9 Oct 2021 22:27:52 +0200 (CEST)
> > From: Mark Kettenis 
> >
> > > Date: Sat, 9 Oct 2021 20:55:10 +0200 (CEST)
> > > From: Mark Kettenis 
> > >
> > > This time adding support for Sunrisepoint-H and Sunrisepoint-LP.
> > > Because of all the failed attempts by Intel to get their 10nm process
> > > under control, this may cover Intel Mobile CPUs marketed as 6th, 7th,
> > > 8th, 9th and 10th generation.  So if you have a Laptop that isn't at
> > > least 5 years old, give this a try if pchgpio(4) doesn't attach.  This
> > > may fix all sorts of issues with keyboards, touchpads or
> > > suspend/resume.
> > >
> > > ok?
> >
> > Updated diff that masks unhandled interrupts like we do in amdgpio(4).
>
> And another update to fix a typo in the pin groups for Sunrisepoint-LP.
>
>

Thanks. I'll give this a try but it might take me a couple days since I'm
traveling.

-ml

> Index: dev/acpi/pchgpio.c
> ===
> RCS file: /cvs/src/sys/dev/acpi/pchgpio.c,v
> retrieving revision 1.8
> diff -u -p -r1.8 pchgpio.c
> --- dev/acpi/pchgpio.c29 Sep 2021 22:03:33 -  1.8
> +++ dev/acpi/pchgpio.c10 Oct 2021 21:40:45 -
> @@ -107,13 +107,76 @@ struct cfdriver pchgpio_cd = {
>  };
>
>  const char *pchgpio_hids[] = {
> + "INT344B",
>   "INT3450",
> + "INT3451",
> + "INT345D",
>   "INT34BB",
>   "INT34C5",
>   "INT34C6",
>   NULL
>  };
>
> +/* Sunrisepoint-LP */
> +
> +const struct pchgpio_group spt_lp_groups[] =
> +{
> + /* Community 0 */
> + { 0, 0, 0, 23, 0 }, /* GPP_A */
> + { 0, 1, 24, 47, 24 },   /* GPP_B */
> +
> + /* Community 1 */
> + { 1, 0, 48, 71, 48 },   /* GPP_C */
> + { 1, 1, 72, 95, 72 },   /* GPP_D */
> + { 1, 2, 96, 119, 96 },  /* GPP_E */
> +
> + /* Community 3 */
> + { 2, 0, 120, 143, 120 },/* GPP_F */
> + { 2, 1, 144, 151, 144 },/* GPP_G */
> +};
> +
> +const struct pchgpio_device spt_lp_device =
> +{
> + .pad_size = 16,
> + .gpi_is = 0x100,
> + .gpi_ie = 0x120,
> + .groups = spt_lp_groups,
> + .ngroups = nitems(spt_lp_groups),
> + .npins = 176,
> +};
> +
> +/* Sunrisepoint-H */
> +
> +const struct pchgpio_group spt_h_groups[] =
> +{
> + /* Community 0 */
> + { 0, 0, 0, 23, 0 }, /* GPP_A */
> + { 0, 1, 24, 47, 24 },   /* GPP_B */
> +
> + /* Community 1 */
> + { 1, 0, 48, 71, 48 },   /* GPP_C */
> + { 1, 1, 72, 95, 72 },   /* GPP_D */
> + { 1, 2, 96, 108, 96 },  /* GPP_E */
> + { 1, 3, 109, 132, 120 },/* GPP_F */
> + { 1, 4, 133, 156, 144 },/* GPP_G */
> + { 1, 5, 157, 180, 168 },/* GPP_H */
> +
> + /* Community 3 */
> + { 2, 0, 181, 191, 192 },/* GPP_I */
> +};
> +
> +const struct pchgpio_device spt_h_device =
> +{
> + .pad_size = 16,
> + .gpi_is = 0x100,
> + .gpi_ie = 0x120,
> + .groups = spt_h_groups,
> + .ngroups = nitems(spt_h_groups),
> + .npins = 224,
> +};
> +
> +/* Cannon Lake-H */
> +
>  const struct pchgpio_group cnl_h_groups[] =
>  {
>   /* Community 0 */
> @@ -146,6 +209,8 @@ const struct pchgpio_device cnl_h_device
>   .npins = 384,
>  };
>
> +/* Cannon Lake-LP */
> +
>  const struct pchgpio_group cnl_lp_groups[] =
>  {
>   /* Community 0 */
> @@ -173,6 +238,8 @@ const struct pchgpio_device cnl_lp_devic
>   .npins = 320,
>  };
>
> +/* Tiger Lake-LP */
> +
>  const struct pchgpio_group tgl_lp_groups[] =
>  {
>   /* Community 0 */
> @@ -205,6 +272,8 @@ const struct pchgpio_device tgl_lp_devic
>   .npins = 360,
>  };
>
> +/* Tiger Lake-H */
> +
>  const struct pchgpio_group tgl_h_groups[] =
>  {
>   /* Community 0 */
> @@ -242,7 +311,10 @@ const struct pchgpio_device tgl_h_device
>  };
>
>  struct pchgpio_match pchgpio_devices[] = {
> + { "INT344B", _lp_device },
>   { "INT3450", _h_device },
> + { "INT3451", _h_device },
> + { "INT345D", _h_device },
>   { "INT34BB", _lp_device },
>   { "INT34C5", _lp_device },
>   { "INT34C6", _h_device },
> @@ -473,11 +545,38 @@ pchgpio_intr_establish(void *cookie, int
>  }
>
>  int
> +pchgpio_intr_handle(struct pchgpio_softc *sc, int group, int bit)
> +{
> + uint32_t enable;
> + int gpiobase, pin, handled = 0;
> + uint8_t bank, bar;
> +
> + bar = sc->sc_device->groups[group].bar;
> + bank = sc->sc_device->groups[group].bank;
> + gpiobase = sc->sc_device->groups[group].gpiobase;
> +
> + pin = gpiobase + bit;
> + if (sc->sc_pin_ih[pin].ih_func) {
> + sc->sc_pin_ih[pin].ih_func(sc->sc_pin_ih[pin].ih_arg);
> + handled = 1;
> + } else {
> + /* Mask unhandled interrupt. */
> + enable = bus_space_read_4(sc->sc_memt[bar], sc->sc_memh[bar],
> + 

Re: vmm(4): set global vcpu limit to 512

2021-09-11 Thread Mike Larkin
On Sat, Sep 11, 2021 at 01:44:33PM -0400, Dave Voutila wrote:
> Syzbot recently discovered that since we don't have any bounds in place
> for number of vms or vcpus it's possible to completely exhaust kernel
> memory or at least put the system in a state where malloc(9) or
> km_alloc(9) fail in systems (e.g. DRM, unveil, etc.) resulting in
> panics. Actually, it first discovered some lock ordering issues, but
> once those were fixed this issue surfaced via the reproducer [1].
>
> I chose 512 as a conservative bound based on the idea that vcpu's have a
> few wired pages of memory each for various VMX/SVM things like VMCS/VMCB
> structures.
>
> Given we also wire guest memory on a page fault and only support 1 vcpu
> per guest currently, it's highly unlikely someone is successfully
> running 512 guests. Once we finish fixing the tlb issues forcing us to
> wire or implement SMP, we can revisit this number.
>
> I checked with openbsd.amsterdam and this is well over their current
> densities. (If anyone *IS* somehow running > 512 guests as of this
> moment, please speak up.)
>
> ok?
>
> [1] https://syzkaller.appspot.com/text?tag=ReproC&x=11f507de30
>

ok mlarkin

> Index: sys/arch/amd64/amd64/vmm.c
> ===
> RCS file: /cvs/src/sys/arch/amd64/amd64/vmm.c,v
> retrieving revision 1.292
> diff -u -p -r1.292 vmm.c
> --- sys/arch/amd64/amd64/vmm.c5 Sep 2021 16:36:34 -   1.292
> +++ sys/arch/amd64/amd64/vmm.c11 Sep 2021 17:36:28 -
> @@ -99,6 +99,9 @@ struct vmm_softc {
>
>   int mode;
>
> + size_t  vcpu_ct;
> + size_t  vcpu_max;
> +
>   struct rwlock   vm_lock;
>   size_t  vm_ct;  /* number of in-memory VMs */
>   size_t  vm_idx; /* next unique VM index */
> @@ -368,6 +371,8 @@ vmm_attach(struct device *parent, struct
>   sc->nr_svm_cpus = 0;
>   sc->nr_rvi_cpus = 0;
>   sc->nr_ept_cpus = 0;
> + sc->vcpu_ct = 0;
> + sc->vcpu_max = VMM_MAX_VCPUS;
>   sc->vm_ct = 0;
>   sc->vm_idx = 0;
>
> @@ -1498,6 +1503,15 @@ vm_create(struct vm_create_params *vcp,
>   if (vcp->vcp_ncpus != 1)
>   return (EINVAL);
>
> + rw_enter_write(_softc->vm_lock);
> + if (vmm_softc->vcpu_ct + vcp->vcp_ncpus > vmm_softc->vcpu_max) {
> + printf("%s: maximum vcpus (%lu) reached\n", __func__,
> + vmm_softc->vcpu_max);
> + rw_exit_write(_softc->vm_lock);
> + return (ENOMEM);
> + }
> + vmm_softc->vcpu_ct += vcp->vcp_ncpus;
> +
>   vm = pool_get(_pool, PR_WAITOK | PR_ZERO);
>   SLIST_INIT(>vm_vcpu_list);
>   rw_init(>vm_vcpu_lock, "vcpu_list");
> @@ -1509,8 +1523,6 @@ vm_create(struct vm_create_params *vcp,
>   vm->vm_memory_size = memsize;
>   strncpy(vm->vm_name, vcp->vcp_name, VMM_MAX_NAME_LEN - 1);
>
> - rw_enter_write(_softc->vm_lock);
> -
>   if (vm_impl_init(vm, p)) {
>   printf("failed to init arch-specific features for vm %p\n", vm);
>   vm_teardown(vm);
> @@ -3784,6 +3796,7 @@ vm_teardown(struct vm *vm)
>   SLIST_REMOVE(>vm_vcpu_list, vcpu, vcpu, vc_vcpu_link);
>   vcpu_deinit(vcpu);
>   pool_put(_pool, vcpu);
> + vmm_softc->vcpu_ct--;
>   }
>
>   vm_impl_deinit(vm);
> Index: sys/arch/amd64/include/vmmvar.h
> ===
> RCS file: /cvs/src/sys/arch/amd64/include/vmmvar.h,v
> retrieving revision 1.73
> diff -u -p -r1.73 vmmvar.h
> --- sys/arch/amd64/include/vmmvar.h   31 Aug 2021 17:40:59 -  1.73
> +++ sys/arch/amd64/include/vmmvar.h   11 Sep 2021 17:36:28 -
> @@ -29,6 +29,7 @@
>  #define VMM_MAX_PATH_CDROM   128
>  #define VMM_MAX_NAME_LEN 64
>  #define VMM_MAX_KERNEL_PATH  128
> +#define VMM_MAX_VCPUS512
>  #define VMM_MAX_VCPUS_PER_VM 64
>  #define VMM_MAX_VM_MEM_SIZE  32768
>  #define VMM_MAX_NICS_PER_VM  4
>



Re: updated patch for iwx(4) Tx aggregation

2021-09-11 Thread Mike Larkin
On Sat, Sep 11, 2021 at 02:04:32PM +0200, Stefan Sperling wrote:
> On Fri, Sep 10, 2021 at 06:49:49PM +0200, Stefan Sperling wrote:
> > Here is another attempt at adding Tx aggregation to iwx(4).
> > This patch is based on the latest state in CVS (if_iwx.c r1.107, which
> > I have committed a minute ago). Sync your tree before applying this patch.
> >
> > Compared to previous iterations of this patch, I have fixed bugs which
> > caused fatal firmware errors and which made traffic stall after roaming.
> >
> > This patch could still make 7.0 release if it gets sufficient test coverage.
> > Please run with this and report any regressions. Thanks!
> >
> > So far, tested by me on AX200 and AX201 against a Pepwave 11ac AP.
> > I have so far not seen any fatal firmware errors, and roaming between 2GHz
> > and 5GHz channels offered by the same AP seems to work reliably.
> > Throughput goes up to 100 Mbit/s max.
>
> The previous version had a problem where it did not take frames
> off the Tx ring when they were done. It is possible that this
> could lead to memory corruption (seen by mlarkin).
>
> Please run this updated patch instead.
>
> And please enable 'ifconfig iwx0 debug' while testing this patch.
> Problem reports will be a lot more useful with debug enabled :)
>

This diff seems to resolve the NFS related corruption I was seeing.

Thanks!

-ml

> diff refs/heads/iwx-resume2 refs/heads/iwx-txagg
> blob - 4cfc91b7f4819a1a9f50fdaac339a78f67d1ab5a
> blob + 9e31a8d0bb5c9ba1fad3614fe6dcb5ebdcd33403
> --- sys/dev/pci/if_iwx.c
> +++ sys/dev/pci/if_iwx.c
> @@ -318,18 +318,16 @@ int iwx_ampdu_rx_start(struct ieee80211com *, 
> struct i
>   uint8_t);
>  void iwx_ampdu_rx_stop(struct ieee80211com *, struct ieee80211_node *,
>   uint8_t);
> +int  iwx_ampdu_tx_start(struct ieee80211com *, struct ieee80211_node *,
> + uint8_t);
>  void iwx_rx_ba_session_expired(void *);
>  void iwx_rx_bar_frame_release(struct iwx_softc *, struct iwx_rx_packet *,
>   struct iwx_rx_data *, struct mbuf_list *);
>  void iwx_reorder_timer_expired(void *);
>  void iwx_sta_rx_agg(struct iwx_softc *, struct ieee80211_node *, uint8_t,
>   uint16_t, uint16_t, int, int);
> -#ifdef notyet
> -int  iwx_ampdu_tx_start(struct ieee80211com *, struct ieee80211_node *,
> +void iwx_sta_tx_agg_start(struct iwx_softc *, struct ieee80211_node *,
>   uint8_t);
> -void iwx_ampdu_tx_stop(struct ieee80211com *, struct ieee80211_node *,
> - uint8_t);
> -#endif
>  void iwx_ba_task(void *);
>
>  int  iwx_set_mac_addr_from_csr(struct iwx_softc *, struct iwx_nvm_data *);
> @@ -355,10 +353,13 @@ int iwx_ccmp_decap(struct iwx_softc *, struct mbuf 
> *,
>   struct ieee80211_node *, struct ieee80211_rxinfo *);
>  void iwx_rx_frame(struct iwx_softc *, struct mbuf *, int, uint32_t, int, int,
>   uint32_t, struct ieee80211_rxinfo *, struct mbuf_list *);
> -void iwx_rx_tx_cmd_single(struct iwx_softc *, struct iwx_rx_packet *,
> - struct iwx_node *);
> +void iwx_clear_tx_desc(struct iwx_softc *, struct iwx_tx_ring *, int);
> +void iwx_txd_done(struct iwx_softc *, struct iwx_tx_data *);
> +void iwx_tx_ba_move_window(struct ieee80211com *, int, struct mbuf *);
> +void iwx_txq_advance(struct iwx_softc *, struct iwx_tx_ring *, int);
>  void iwx_rx_tx_cmd(struct iwx_softc *, struct iwx_rx_packet *,
>   struct iwx_rx_data *);
> +void iwx_clear_oactive(struct iwx_softc *, struct iwx_tx_ring *);
>  void iwx_rx_bmiss(struct iwx_softc *, struct iwx_rx_packet *,
>   struct iwx_rx_data *);
>  int  iwx_binding_cmd(struct iwx_softc *, struct iwx_node *, uint32_t);
> @@ -382,8 +383,11 @@ void iwx_cmd_done(struct iwx_softc *, int, int, int);
>  const struct iwx_rate *iwx_tx_fill_cmd(struct iwx_softc *, struct iwx_node *,
>   struct ieee80211_frame *, struct iwx_tx_cmd_gen2 *);
>  void iwx_tx_update_byte_tbl(struct iwx_tx_ring *, int, uint16_t, uint16_t);
> -int  iwx_tx(struct iwx_softc *, struct mbuf *, struct ieee80211_node *, int);
> -int  iwx_flush_tx_path(struct iwx_softc *);
> +int  iwx_tx(struct iwx_softc *, struct mbuf *, struct ieee80211_node *);
> +int  iwx_flush_sta_tids(struct iwx_softc *, int, uint16_t);
> +int  iwx_wait_tx_queues_empty(struct iwx_softc *);
> +int  iwx_drain_sta(struct iwx_softc *sc, struct iwx_node *, int);
> +int  iwx_flush_sta(struct iwx_softc *, struct iwx_node *);
>  int  iwx_beacon_filter_send_cmd(struct iwx_softc *,
>   struct iwx_beacon_filter_cmd *);
>  int  iwx_update_beacon_abort(struct iwx_softc *, struct iwx_node *, int);
> @@ -396,6 +400,7 @@ int   iwx_disable_beacon_filter(struct iwx_softc *);
>  int  iwx_add_sta_cmd(struct iwx_softc *, struct iwx_node *, int);
>  int  iwx_add_aux_sta(struct iwx_softc *);
>  int  iwx_rm_sta_cmd(struct iwx_softc *, struct iwx_node *);
> +int  iwx_rm_sta(struct iwx_softc *, struct iwx_node *);
>  int  iwx_fill_probe_req(struct iwx_softc *, struct iwx_scan_probe_req *);
>  int  

Re: let iwx(4) resume in the acpi thread

2021-09-10 Thread Mike Larkin
On Fri, Sep 10, 2021 at 11:12:45AM +0200, Stefan Sperling wrote:
> On Fri, Sep 10, 2021 at 10:59:25AM +0200, Stefan Sperling wrote:
> > On Fri, Sep 10, 2021 at 10:58:47AM +0200, Stefan Sperling wrote:
> > > All those changes are shown below. My device is still happy with this.
> > > I will reply with a new full diff against -current next.
> >
> > Full diff:
>
> Just realized that because iwx_resume() can no longer return an error
> we can make it a void function. And simplify the DVACT_RESUME/DVACT_WAKEUP
> case statements in iwx_activate() accordingly.
>
> Sorry about sending too many diffs :)
>

LGTM, ok mlarkin

> diff c9db663b670f8930f62c8f20c36e84d72697f036 refs/heads/iwx-resume2
> blob - 51063c862bfc0cf2dc9fbe3f41628bbdbdf3486e
> blob + 4cfc91b7f4819a1a9f50fdaac339a78f67d1ab5a
> --- sys/dev/pci/if_iwx.c
> +++ sys/dev/pci/if_iwx.c
> @@ -489,7 +489,8 @@ void  iwx_attach_hook(struct device *);
>  void iwx_attach(struct device *, struct device *, void *);
>  void iwx_init_task(void *);
>  int  iwx_activate(struct device *, int);
> -int  iwx_resume(struct iwx_softc *);
> +void iwx_resume(struct iwx_softc *);
> +int  iwx_wakeup(struct iwx_softc *);
>
>  #if NBPFILTER > 0
>  void iwx_radiotap_attach(struct iwx_softc *);
> @@ -1913,11 +1914,8 @@ int
>  iwx_check_rfkill(struct iwx_softc *sc)
>  {
>   uint32_t v;
> - int s;
>   int rv;
>
> - s = splnet();
> -
>   /*
>* "documentation" is not really helpful here:
>*  27: HW_RF_KILL_SW
> @@ -1933,7 +1931,6 @@ iwx_check_rfkill(struct iwx_softc *sc)
>   sc->sc_flags &= ~IWX_FLAG_RFKILL;
>   }
>
> - splx(s);
>   return rv;
>  }
>
> @@ -1986,8 +1983,6 @@ iwx_restore_interrupts(struct iwx_softc *sc)
>  void
>  iwx_disable_interrupts(struct iwx_softc *sc)
>  {
> - int s = splnet();
> -
>   if (!sc->sc_msix) {
>   IWX_WRITE(sc, IWX_CSR_INT_MASK, 0);
>
> @@ -2000,8 +1995,6 @@ iwx_disable_interrupts(struct iwx_softc *sc)
>   IWX_WRITE(sc, IWX_CSR_MSIX_HW_INT_MASK_AD,
>   sc->sc_hw_init_mask);
>   }
> -
> - splx(s);
>  }
>
>  void
> @@ -7822,16 +7815,6 @@ iwx_init_hw(struct iwx_softc *sc)
>   struct ieee80211com *ic = &sc->sc_ic;
>   int err, i;
>
> - err = iwx_preinit(sc);
> - if (err)
> - return err;
> -
> - err = iwx_start_hw(sc);
> - if (err) {
> - printf("%s: could not initialize hardware\n", DEVNAME(sc));
> - return err;
> - }
> -
>   err = iwx_run_init_mvm_ucode(sc, 0);
>   if (err)
>   return err;
> @@ -7984,6 +7967,16 @@ iwx_init(struct ifnet *ifp)
>   KASSERT(sc->task_refs.refs == 0);
>   refcnt_init(&sc->task_refs);
>
> + err = iwx_preinit(sc);
> + if (err)
> + return err;
> +
> + err = iwx_start_hw(sc);
> + if (err) {
> + printf("%s: could not initialize hardware\n", DEVNAME(sc));
> + return err;
> + }
> +
>   err = iwx_init_hw(sc);
>   if (err) {
>   if (generation == sc->sc_generation)
> @@ -9281,7 +9274,10 @@ iwx_attach(struct device *parent, struct device *self,
>   return;
>   }
>
> - /* Clear device-specific "PCI retry timeout" register (41h). */
> + /*
> +  * We disable the RETRY_TIMEOUT register (0x41) to keep
> +  * PCI Tx retries from interfering with C3 CPU state.
> +  */
>   reg = pci_conf_read(sc->sc_pct, sc->sc_pcitag, 0x40);
>   pci_conf_write(sc->sc_pct, sc->sc_pcitag, 0x40, reg & ~0xff00);
>
> @@ -9568,12 +9564,15 @@ iwx_init_task(void *arg1)
>   splx(s);
>  }
>
> -int
> +void
>  iwx_resume(struct iwx_softc *sc)
>  {
>   pcireg_t reg;
>
> - /* Clear device-specific "PCI retry timeout" register (41h). */
> + /*
> +  * We disable the RETRY_TIMEOUT register (0x41) to keep
> +  * PCI Tx retries from interfering with C3 CPU state.
> +  */
>   reg = pci_conf_read(sc->sc_pct, sc->sc_pcitag, 0x40);
>   pci_conf_write(sc->sc_pct, sc->sc_pcitag, 0x40, reg & ~0xff00);
>
> @@ -9588,8 +9587,34 @@ iwx_resume(struct iwx_softc *sc)
>   }
>
>   iwx_disable_interrupts(sc);
> +}
>
> - return iwx_start_hw(sc);
> +int
> +iwx_wakeup(struct iwx_softc *sc)
> +{
> + struct ieee80211com *ic = &sc->sc_ic;
> + struct ifnet *ifp = &sc->sc_ic.ic_if;
> + int err;
> +
> + refcnt_init(&sc->task_refs);
> +
> + err = iwx_start_hw(sc);
> + if (err)
> + return err;
> +
> + err = iwx_init_hw(sc);
> + if (err)
> + return err;
> +
> + ifq_clr_oactive(&ifp->if_snd);
> + ifp->if_flags |= IFF_RUNNING;
> +
> + if (ic->ic_opmode == IEEE80211_M_MONITOR)
> + ieee80211_new_state(ic, IEEE80211_S_RUN, -1);
> + else
> + ieee80211_begin_scan(ifp);
> +
> + return 0;
>  }
>
>  int
> @@ -9608,15 +9633,15 @@ iwx_activate(struct device *self, int act)
>   }
>   break;
>   case DVACT_RESUME:
> - err = 

Re: iwx(4) firmware memory fixes

2021-09-09 Thread Mike Larkin
On Wed, Sep 08, 2021 at 02:08:36PM +0200, Stefan Sperling wrote:
> Add a missing call to iwx_ctxt_info_free_fw_img() in an error path
> of iwx_ctxt_info_init() which should always free on error.
>
> Also, free firmware paging DMA memory in case loading firmware has failed.
> If we don't free paging on error we hit KASSERT(dram->paging == NULL)
> in iwx_init_fw_sec() once we try to load firmware again.  I have hit
> this while debugging firmware load failures during suspend/resume.
>
> (Ideally, we would re-allocate firmware image and paging memory only
> after re-loading a potentially different fw image, but this can be
> fixed later.)
>
> ok?
>

ok mlarkin

> diff 50816b19557cd9c29c50f92eebbe32098a494bd3 
> 055f053850bb0f3af81ea3aa7c4f705a85cfcb76
> blob - f7d69707ed0a98dfcd7717c9c82faac3af4f39d7
> blob + 51063c862bfc0cf2dc9fbe3f41628bbdbdf3486e
> --- sys/dev/pci/if_iwx.c
> +++ sys/dev/pci/if_iwx.c
> @@ -914,8 +914,10 @@ iwx_ctxt_info_init(struct iwx_softc *sc, const struct
>   IWX_WRITE(sc, IWX_CSR_CTXT_INFO_BA + 4, paddr >> 32);
>
>   /* kick FW self load */
> - if (!iwx_nic_lock(sc))
> + if (!iwx_nic_lock(sc)) {
> + iwx_ctxt_info_free_fw_img(sc);
>   return EBUSY;
> + }
>   iwx_write_prph(sc, IWX_UREG_CPU_INIT_RUN, 1);
>   iwx_nic_unlock(sc);
>
> @@ -3364,8 +3366,10 @@ iwx_load_firmware(struct iwx_softc *sc)
>
>   /* wait for the firmware to load */
>   err = tsleep_nsec(&sc->sc_uc, 0, "iwxuc", SEC_TO_NSEC(1));
> - if (err || !sc->sc_uc.uc_ok)
> + if (err || !sc->sc_uc.uc_ok) {
>   printf("%s: could not load firmware, %d\n", DEVNAME(sc), err);
> + iwx_ctxt_info_free_paging(sc);
> + }
>
>   iwx_ctxt_info_free_fw_img(sc);
>
>



Re: let iwx(4) resume in the acpi thread

2021-09-09 Thread Mike Larkin
On Wed, Sep 08, 2021 at 03:25:20PM +0200, Stefan Sperling wrote:
> On Wed, Sep 08, 2021 at 02:19:00PM +0200, Stefan Sperling wrote:
> > This patch applies on top of all the other iwx(4) diffs I've sent today.
> > It makes iwx(4) initialize the device completely in the acpi thread.
> >
> > We now prepare the device for loading firmware during DVACT_RESUME,
> > and load firmware from host memory into the device during DVACT_WAKEUP.
> >
> > Previously, DVACT_WAKEUP would schedule the init_task which resets the
> > device, undoing work done during DVACT_RESUME, and starts all over again.
> >
> > ok?
>
> The previous version had a bug: It resumed the device even while the
> interface was marked down. Fixed patch below.
>

It looks like DVACT_RESUME invokes iwx_resume which does a non-trivial amount
of chip repair/bringup. If you are satisfied this is safe, ok mlarkin@

-ml

> diff 055f053850bb0f3af81ea3aa7c4f705a85cfcb76 
> c734175f035f120197d6be7df1987cb81e535d3e
> blob - 51063c862bfc0cf2dc9fbe3f41628bbdbdf3486e
> blob + 26f8a7fa85aa48a054d79e7a175e35bfe96a447b
> --- sys/dev/pci/if_iwx.c
> +++ sys/dev/pci/if_iwx.c
> @@ -490,6 +490,7 @@ void  iwx_attach(struct device *, struct device *, 
> void
>  void iwx_init_task(void *);
>  int  iwx_activate(struct device *, int);
>  int  iwx_resume(struct iwx_softc *);
> +int  iwx_wakeup(struct iwx_softc *);
>
>  #if NBPFILTER > 0
>  void iwx_radiotap_attach(struct iwx_softc *);
> @@ -7822,16 +7823,6 @@ iwx_init_hw(struct iwx_softc *sc)
>   struct ieee80211com *ic = &sc->sc_ic;
>   int err, i;
>
> - err = iwx_preinit(sc);
> - if (err)
> - return err;
> -
> - err = iwx_start_hw(sc);
> - if (err) {
> - printf("%s: could not initialize hardware\n", DEVNAME(sc));
> - return err;
> - }
> -
>   err = iwx_run_init_mvm_ucode(sc, 0);
>   if (err)
>   return err;
> @@ -7984,6 +7975,16 @@ iwx_init(struct ifnet *ifp)
>   KASSERT(sc->task_refs.refs == 0);
>   refcnt_init(&sc->task_refs);
>
> + err = iwx_preinit(sc);
> + if (err)
> + return err;
> +
> + err = iwx_start_hw(sc);
> + if (err) {
> + printf("%s: could not initialize hardware\n", DEVNAME(sc));
> + return err;
> + }
> +
>   err = iwx_init_hw(sc);
>   if (err) {
>   if (generation == sc->sc_generation)
> @@ -9593,6 +9594,30 @@ iwx_resume(struct iwx_softc *sc)
>  }
>
>  int
> +iwx_wakeup(struct iwx_softc *sc)
> +{
> + struct ieee80211com *ic = &sc->sc_ic;
> + struct ifnet *ifp = &sc->sc_ic.ic_if;
> + int err;
> +
> + refcnt_init(&sc->task_refs);
> +
> + err = iwx_init_hw(sc);
> + if (err)
> + return err;
> +
> + ifq_clr_oactive(&ifp->if_snd);
> + ifp->if_flags |= IFF_RUNNING;
> +
> + if (ic->ic_opmode == IEEE80211_M_MONITOR)
> + ieee80211_new_state(ic, IEEE80211_S_RUN, -1);
> + else
> + ieee80211_begin_scan(ifp);
> +
> + return 0;
> +}
> +
> +int
>  iwx_activate(struct device *self, int act)
>  {
>   struct iwx_softc *sc = (struct iwx_softc *)self;
> @@ -9608,15 +9633,27 @@ iwx_activate(struct device *self, int act)
>   }
>   break;
>   case DVACT_RESUME:
> + if ((ifp->if_flags & (IFF_UP | IFF_RUNNING)) != IFF_UP)
> + break;
> + sc->sc_flags &= ~IWX_FLAG_SHUTDOWN;
>   err = iwx_resume(sc);
> - if (err)
> + if (err) {
>   printf("%s: could not initialize hardware\n",
>   DEVNAME(sc));
> + sc->sc_flags |= IWX_FLAG_SHUTDOWN;
> + }
>   break;
>   case DVACT_WAKEUP:
> - /* Hardware should be up at this point. */
> - if (iwx_set_hw_ready(sc))
> - task_add(systq, &sc->init_task);
> + if ((ifp->if_flags & (IFF_UP | IFF_RUNNING)) != IFF_UP)
> + break;
> + if (sc->sc_flags & IWX_FLAG_SHUTDOWN)
> + sc->sc_flags &= ~IWX_FLAG_SHUTDOWN;
> + else {
> + err = iwx_wakeup(sc);
> + if (err)
> + printf("%s: could not initialize hardware\n",
> + DEVNAME(sc));
> + }
>   break;
>   }
>
>
>
>



Re: riscv64/trap.c debug printfs

2021-09-03 Thread Mike Larkin
On Fri, Sep 03, 2021 at 04:38:55PM +0200, Jeremie Courreges-Anglas wrote:
>
> This one is a bit too chatty whenever you run a program under egdb.
> But the other printfs in this file seem ok, thus I'm not touching them.
>
> ok?
>
>
> Index: trap.c
> ===
> RCS file: /d/cvs/src/sys/arch/riscv64/riscv64/trap.c,v
> retrieving revision 1.16
> diff -u -p -p -u -r1.16 trap.c
> --- trap.c26 Jul 2021 22:13:19 -  1.16
> +++ trap.c3 Sep 2021 14:25:31 -
> @@ -159,7 +159,6 @@ do_trap_user(struct trapframe *frame)
>   trapsignal(p, SIGILL, 0, ILL_ILLTRP, sv);
>   break;
>   case EXCP_BREAKPOINT:
> - printf("BREAKPOINT\n");
>   sv.sival_ptr = (void *)frame->tf_stval;
>   trapsignal(p, SIGTRAP, 0, TRAP_BRKPT, sv);
>   break;
>
> --
> jca | PGP : 0x1524E7EE / 5135 92C1 AD36 5293 2BDF  DDCC 0DFA 74AE 1524 E7EE
>

ok mlarkin



Re: Incorrect IPL when pool_get(9) is called under rwlock

2021-09-01 Thread Mike Larkin
On Wed, Sep 01, 2021 at 08:53:35AM +0200, Martin Pieuchot wrote:
> syzkaller reported [0] the following lock ordering issue:
>
> db{0}> trace
> db_enter() at db_enter+0x18 sys/arch/amd64/amd64/db_interface.c:440
> panic(82464b8f) at panic+0x177 sys/kern/subr_prf.c:202
> witness_checkorder(82838c20,9,0) at witness_checkorder+0x11eb 
> sys/kern/subr_witness.c:833
> __mp_lock(82838a18) at __mp_lock+0xa1 read_rflags 
> machine/cpufunc.h:195 [inline]
> __mp_lock(82838a18) at __mp_lock+0xa1 intr_disable 
> machine/cpufunc.h:216 [inline]
> __mp_lock(82838a18) at __mp_lock+0xa1 sys/kern/kern_lock.c:142
> intr_handler(80002123ad80,80255d80) at intr_handler+0x5e 
> sys/arch/amd64/amd64/intr.c:532
> Xintr_ioapic_edge20_untramp() at Xintr_ioapic_edge20_untramp+0x18f
> Xspllower() at Xspllower+0x19
> mtx_enter_try(829b8d10) at mtx_enter_try+0x100
> mtx_enter(829b8d10) at mtx_enter+0x4b sys/kern/kern_lock.c:266
> pool_get(829b8d10,9) at pool_get+0xbf sys/kern/subr_pool.c:581
> vm_create(80b29000,8000211922a8) at vm_create+0x261 
> sys/arch/amd64/amd64/vmm.c:1526
> vmmioctl(a00,c5005601,80b29000,1,8000211922a8) at vmmioctl+0x1f2
> VOP_IOCTL(fd806e213830,c5005601,80b29000,1,fd807f7d8840,8000211922a8)
>  at VOP_IOCTL+0x9a sys/kern/vfs_vops.c:295
> vn_ioctl(fd806e4aca28,c5005601,80b29000,8000211922a8) at 
> vn_ioctl+0xba sys/kern/vfs_vnops.c:531
> sys_ioctl(8000211922a8,80002123b398,80002123b3e0) at 
> sys_ioctl+0x4a2
>
>
> The issue is that pool_get(9) at line 1526 is done after grabbing the
> `vm_lock'.  If an interrupt needing the KERNEL_LOCK() occurs at that
> moment the above mentioned lock ordering problem could cause a
> deadlock.
>
> To prevent such issue we generally mark the pool with IPL_MPFLOOR.
>
> [0] 
> https://syzkaller.appspot.com/bug?id=c73756cc996a58a625da35fbaa90ba6b9e0c60dc
>

ok mlarkin@

> Index: arch/amd64/amd64/vmm.c
> ===
> RCS file: /cvs/src/sys/arch/amd64/amd64/vmm.c,v
> retrieving revision 1.287
> diff -u -p -r1.287 vmm.c
> --- arch/amd64/amd64/vmm.c31 Aug 2021 17:40:59 -  1.287
> +++ arch/amd64/amd64/vmm.c1 Sep 2021 06:45:38 -
> @@ -430,7 +430,7 @@ vmm_attach(struct device *parent, struct
>
>   pool_init(&vm_pool, sizeof(struct vm), 0, IPL_NONE, PR_WAITOK,
>   "vmpool", NULL);
> - pool_init(&vcpu_pool, sizeof(struct vcpu), 64, IPL_NONE, PR_WAITOK,
> + pool_init(&vcpu_pool, sizeof(struct vcpu), 64, IPL_MPFLOOR, PR_WAITOK,
>   "vcpupl", NULL);
>
>   vmm_softc = sc;



Re: Incorrect IPL when pool_get(9) is called under rwlock

2021-09-01 Thread Mike Larkin
On Wed, Sep 01, 2021 at 08:53:35AM +0200, Martin Pieuchot wrote:
> syzkaller reported [0] the following lock ordering issue:
>
> db{0}> trace
> db_enter() at db_enter+0x18 sys/arch/amd64/amd64/db_interface.c:440
> panic(82464b8f) at panic+0x177 sys/kern/subr_prf.c:202
> witness_checkorder(82838c20,9,0) at witness_checkorder+0x11eb 
> sys/kern/subr_witness.c:833
> __mp_lock(82838a18) at __mp_lock+0xa1 read_rflags 
> machine/cpufunc.h:195 [inline]
> __mp_lock(82838a18) at __mp_lock+0xa1 intr_disable 
> machine/cpufunc.h:216 [inline]
> __mp_lock(82838a18) at __mp_lock+0xa1 sys/kern/kern_lock.c:142
> intr_handler(80002123ad80,80255d80) at intr_handler+0x5e 
> sys/arch/amd64/amd64/intr.c:532
> Xintr_ioapic_edge20_untramp() at Xintr_ioapic_edge20_untramp+0x18f
> Xspllower() at Xspllower+0x19
> mtx_enter_try(829b8d10) at mtx_enter_try+0x100
> mtx_enter(829b8d10) at mtx_enter+0x4b sys/kern/kern_lock.c:266
> pool_get(829b8d10,9) at pool_get+0xbf sys/kern/subr_pool.c:581
> vm_create(80b29000,8000211922a8) at vm_create+0x261 
> sys/arch/amd64/amd64/vmm.c:1526
> vmmioctl(a00,c5005601,80b29000,1,8000211922a8) at vmmioctl+0x1f2
> VOP_IOCTL(fd806e213830,c5005601,80b29000,1,fd807f7d8840,8000211922a8)
>  at VOP_IOCTL+0x9a sys/kern/vfs_vops.c:295
> vn_ioctl(fd806e4aca28,c5005601,80b29000,8000211922a8) at 
> vn_ioctl+0xba sys/kern/vfs_vnops.c:531
> sys_ioctl(8000211922a8,80002123b398,80002123b3e0) at 
> sys_ioctl+0x4a2
>
>
> The issue is that pool_get(9) at line 1526 is done after grabbing the
> `vm_lock'.  If an interrupt needing the KERNEL_LOCK() occurs at that
> moment the above mentioned lock ordering problem could cause a
> deadlock.
>
> To prevent such issue we generally mark the pool with IPL_MPFLOOR.
>
> [0] 
> https://syzkaller.appspot.com/bug?id=c73756cc996a58a625da35fbaa90ba6b9e0c60dc
>

Thanks, will take a look. This was introduced yesterday with the new vcpu
locking diff.

-ml

> Index: arch/amd64/amd64/vmm.c
> ===
> RCS file: /cvs/src/sys/arch/amd64/amd64/vmm.c,v
> retrieving revision 1.287
> diff -u -p -r1.287 vmm.c
> --- arch/amd64/amd64/vmm.c31 Aug 2021 17:40:59 -  1.287
> +++ arch/amd64/amd64/vmm.c1 Sep 2021 06:45:38 -
> @@ -430,7 +430,7 @@ vmm_attach(struct device *parent, struct
>
>   pool_init(&vm_pool, sizeof(struct vm), 0, IPL_NONE, PR_WAITOK,
>   "vmpool", NULL);
> - pool_init(&vcpu_pool, sizeof(struct vcpu), 64, IPL_NONE, PR_WAITOK,
> + pool_init(&vcpu_pool, sizeof(struct vcpu), 64, IPL_MPFLOOR, PR_WAITOK,
>   "vcpupl", NULL);
>
>   vmm_softc = sc;
>



Re: ddb: machine sysregs for amd64

2021-08-31 Thread Mike Larkin
On Tue, Aug 31, 2021 at 06:30:40PM +1000, Alex Wilson wrote:
> Hi,
>
> This is a short diff to add "machine sysregs" to ddb on amd64 (plus it also
> prints out gsbase/kgsbase). This command is available on i386 but not amd64.
> I swear I remember discussing this with mlarkin at some point but I couldn't
> find a previous patch for it on tech@. If I missed it somehow, I am super
> sorry, and please hit me with the search stick.
>
> This command is mostly useful if you're futzing with page tables or GDT/IDT
> setup etc, but it's also useful for sanity-checking state generally
> sometimes, and quite useful for teaching demos showing how it all works
> (which is the main reason I want it right now).
>

Thanks, I'll commit this.

-ml

>
>
> Index: sys/arch/amd64//amd64/db_interface.c
> ===
> RCS file: /cvs/./src/sys/arch/amd64/amd64/db_interface.c,v
> retrieving revision 1.35
> diff -u -p -r1.35 db_interface.c
> --- sys/arch/amd64//amd64/db_interface.c  6 Nov 2019 07:34:35 -   
> 1.35
> +++ sys/arch/amd64//amd64/db_interface.c  31 Aug 2021 08:12:06 -
> @@ -46,6 +46,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>
>  #include 
>  #include 
> @@ -160,6 +161,45 @@ db_ktrap(int type, int code, db_regs_t *
>   return (1);
>  }
>
> +void
> +db_sysregs_cmd(db_expr_t addr, int have_addr, db_expr_t count, char *modif)
> +{
> + int64_t idtr, gdtr;
> + uint64_t cr;
> + uint16_t ldtr, tr;
> + uint64_t gsb;
> +
> + __asm__ volatile("sidt %0" : "=m" (idtr));
> + db_printf("idtr:   0x%08llx/%04llx\n", idtr >> 16, idtr & 0xffff);
> +
> + __asm__ volatile("sgdt %0" : "=m" (gdtr));
> + db_printf("gdtr:   0x%08llx/%04llx\n", gdtr >> 16, gdtr & 0xffff);
> +
> + __asm__ volatile("sldt %0" : "=g" (ldtr));
> + db_printf("ldtr:   0x%04x\n", ldtr);
> +
> + __asm__ volatile("str %0" : "=g" (tr));
> + db_printf("tr: 0x%04x\n", tr);
> +
> + __asm__ volatile("movq %%cr0,%0" : "=r" (cr));
> + db_printf("cr0:0x%016llx\n", cr);
> +
> + __asm__ volatile("movq %%cr2,%0" : "=r" (cr));
> + db_printf("cr2:0x%016llx\n", cr);
> +
> + __asm__ volatile("movq %%cr3,%0" : "=r" (cr));
> + db_printf("cr3:0x%016llx\n", cr);
> +
> + __asm__ volatile("movq %%cr4,%0" : "=r" (cr));
> + db_printf("cr4:0x%016llx\n", cr);
> +
> + gsb = rdmsr(MSR_GSBASE);
> + db_printf("gsb:0x%016llx\n", gsb);
> +
> + gsb = rdmsr(MSR_KERNELGSBASE);
> + db_printf("kgsb:   0x%016llx\n", gsb);
> +}
> +
>
>  #ifdef MULTIPROCESSOR
>  void
> @@ -368,6 +408,7 @@ struct db_command db_machine_command_tab
>   { "startcpu",   db_startproc_cmd,   0,  0 },
>   { "stopcpu",db_stopproc_cmd,0,  0 },
>   { "ddbcpu", db_ddbproc_cmd, 0,  0 },
> + { "sysregs",db_sysregs_cmd, 0,  0 },
>  #endif
>  #if NACPI > 0
>   { "acpi",   NULL,   0,  db_acpi_cmds },
>



Re: vmd(8): simplify vcpu logic, removing uart & net reads

2021-07-15 Thread Mike Larkin
On Sun, Jul 11, 2021 at 08:10:42AM -0400, Dave Voutila wrote:
>
> Ping...looking for OK. Would like to get this committed this week.
>

Sorry this took so long. ok mlarkin.

Thanks to the numerous testers who ran with this for the past few
weeks.

> Dave Voutila writes:
>
> > Looking for an OK for this one now. Anyone?
> >
> > Dave Voutila  writes:
> >
> >> Dave Voutila writes:
> >>
> >>> Looking for some broader testing of the following diff. It cleans up
> >>> some complicated logic predominantly left over from the early days of
> >>> vmd prior to its having a dedicated device thread.
> >>
> >> Still looking for tester feedback. I've been running this diff while
> >> hosting multiple guests continuously (OpenBSD-current, Alpine 3.14,
> >> Debian 10.10, Ubuntu 20.04) with no issues.
> >>
> >> I know a few folks have told me they've applied the diff and have not
> >> seen issues.
> >
> > I've had positive reports from 4 people. Thanks everyone that tested and
> > provided feedback!
> >
> >>
> >> I'll prod for OK next week, so if you've tested the diff please let me
> >> know!
> >
> > OK to commit?
> >
> >>
> >>>
> >>> In summary, this diff:
> >>>
> >>> - Removes vionet "rx pending" state handling and removes the code path
> >>>   for the vcpu thread to possibly take control of the virtio net device
> >>>   and attempt a read of the underlying tap(4). (virtio.{c,h}, vm.c)
> >>>
> >>> - Removes ns8250 "rcv pending" state handling and removes the code path
> >>>   for the vcpu thread to read the pty via com_rcv(). (ns8250.{c,h})
> >>>
> >>> In both of the above cases, the event handling thread will be notified
> >>> of readable data and deal with it.
> >>>
> >>> Why remove them? The logic is overly complicated and hard to reason
> >>> about for zero gain. (This diff results in no intended functional
> >>> change.) Plus, some of the above logic I helped add to deal with the
> >>> race conditions and state corruption over a year ago. The logic was
> >>> needed once upon a time, but shouldn't be needed at present.
> >>>
> >>> I've had positive testing feedback from abieber@ so far with at least
> >>> the ns8250/uart diff, but want to cast a broader net here with both
> >>> before either part is committed. I debated splitting these up, but
> >>> they're thematically related.
> >>>
> >>> -dv
> >>>
> >>> Index: virtio.c
> >>> ===
> >>> RCS file: /cvs/src/usr.sbin/vmd/virtio.c,v
> >>> retrieving revision 1.91
> >>> diff -u -p -r1.91 virtio.c
> >>> --- virtio.c  21 Jun 2021 02:38:18 -  1.91
> >>> +++ virtio.c  23 Jun 2021 11:28:03 -
> >>> @@ -1254,12 +1254,12 @@ static int
> >>>  vionet_rx(struct vionet_dev *dev)
> >>>  {
> >>>   char buf[PAGE_SIZE];
> >>> - int hasdata, num_enq = 0, spc = 0;
> >>> + int num_enq = 0, spc = 0;
> >>>   struct ether_header *eh;
> >>>   ssize_t sz;
> >>>
> >>>   do {
> >>> - sz = read(dev->fd, buf, sizeof buf);
> >>> + sz = read(dev->fd, buf, sizeof(buf));
> >>>   if (sz == -1) {
> >>>   /*
> >>>* If we get EAGAIN, No data is currently available.
> >>> @@ -1270,21 +1270,17 @@ vionet_rx(struct vionet_dev *dev)
> >>>   "device");
> >>>   } else if (sz > 0) {
> >>>   eh = (struct ether_header *)buf;
> >>> - if (!dev->lockedmac || sz < ETHER_HDR_LEN ||
> >>> + if (!dev->lockedmac ||
> >>>   ETHER_IS_MULTICAST(eh->ether_dhost) ||
> >>>   memcmp(eh->ether_dhost, dev->mac,
> >>>   sizeof(eh->ether_dhost)) == 0)
> >>>   num_enq += vionet_enq_rx(dev, buf, sz, &spc);
> >>>   } else if (sz == 0) {
> >>>   log_debug("process_rx: no data");
> >>> - hasdata = 0;
> >>>   break;
> >>>   }
> >>> + } while (spc > 0 && sz > 0);
> >>>
> >>> - hasdata = fd_hasdata(dev->fd);
> >>> - } while (spc && hasdata);
> >>> -
> >>> - dev->rx_pending = hasdata;
> >>>   return (num_enq);
> >>>  }
> >>>
> >>> @@ -1301,16 +1297,6 @@ vionet_rx_event(int fd, short kind, void
> >>>
> >>>   mutex_lock(&dev->mutex);
> >>>
> >>> - /*
> >>> -  * We already have other data pending to be received. The data that
> >>> -  * has become available now will be enqueued to the vionet_dev
> >>> -  * later.
> >>> -  */
> >>> - if (dev->rx_pending) {
> >>> - mutex_unlock(&dev->mutex);
> >>> - return;
> >>> - }
> >>> -
> >>>   if (vionet_rx(dev) > 0) {
> >>>   /* XXX: vcpu_id */
> >>>   vcpu_assert_pic_irq(dev->vm_id, 0, dev->irq);
> >>> @@ -1320,40 +1306,6 @@ vionet_rx_event(int fd, short kind, void
> >>>  }
> >>>
> >>>  /*
> >>> - * vionet_process_rx
> >>> - *
> >>> - * Processes any remaining pending receivable data for a vionet device.
> >>> - * Called on VCPU exit. Although we poll on the tap file descriptor of
> >>> - * a vionet_dev in a 

Re: vmd: spurious VM restarts

2021-06-26 Thread Mike Larkin
On Sat, Jun 26, 2021 at 03:26:55PM +0200, Thomas L. wrote:
> On Wed, 7 Apr 2021 17:00:00 -0700
> Mike Larkin  wrote:
> > Depends on the exact content that got swapped out (as we didn't handle
> > TLB flushes correctly), so a crash was certainly a possibility.
> > That's why I wanted to see the VMM_DEBUG output.
> >
> > In any case, Thomas should try -current and see if this problem is
> > even reproducible.
> >
> > -ml
>
> I've been running -current with VMM_DEBUG since Apr 14 and the problem
> has not reproduced, instead I see spurious stops now. Output in
> /var/log/messages on the occasion is:
>
> Jun 19 03:31:16 golem vmd[95337]: vcpu_run_loop: vm 8 / vcpu 0 run ioctl 
> failed: Invalid argument
> Jun 19 03:31:16 golem /bsd: vcpu_run_vmx: can't read procbased ctls on exit
> Jun 19 03:31:17 golem /bsd: vmm_free_vpid: freed VPID/ASID 8
>
> There's also a lot of probably unrelated messages for all the VMs:
>
> Jun 19 01:31:10 golem vmd[66318]: vionet_enq_rx: descriptor too small for 
> packet data
>
> I realize that this is an old version, so this might be an already
> fixed bug. I can upgrade to a newer snapshot, but the bug shows about
> once per month, so by the time it shows it will be an old version
> again.
>
> Kind regards,
>
> Thomas
>

you probably want a newer snap, dv@ fixed some things in this area recently.



Re: vmd(8): add barebones vioblk GET_ID support

2021-06-17 Thread Mike Larkin
On Thu, Jun 17, 2021 at 12:07:10PM -0400, Dave Voutila wrote:
>
> Dave Voutila writes:
>
> > The virtio spec has had a de facto command for drivers to read the
> > serial number off a virtual block device. QEMU introduced this feature
> > years ago. Last November, the virtio governing group voted in favor of
> > adopting it officially into v1.2 (the next virtio spec) [1].
> >
> > The below diff adds the basics of handling the request returning an
> > empty serial number. (Serial numbers are limited to 20 bytes.) This
> > stops vmd from complaining about "unsupported command 0x8" when guests
> > send this command type.
>
> Got some feedback off-list from claudio@ that I think is sound. Instead
> of providing an "empty" serial id/number, simply return an UNSUPP status
> to indicate we don't support the value.
>
> I think this approach is better than the approach I was suggesting that was
> based off QEMU's design of defaulting to "". (FreeBSD's Bhyve generates
> a serial like "BHYVE-1122-3344-5566" where the suffix is some truncated
> md5 of the backing filename. I'm not a fan of this approach.)
>
> >
> > secdata_desc{,idx} variables are renamed to just data_desc{,idx} to
> > semantically match the change since they're used for more than sector
> > data.
>
> I undid this renaming for now to reduce noise.
>
> > This is primarily part of my work to clean up and bring vmd's virtio
> > implementation more up to date and to align to our own
> > v{io,ioblk,ioscsi,etc.}(4) current capabilities. (vioblk(4) doesn't
> > support this yet, but Linux guests use it frequently.)
>
> While adding the BLK_ID support, I also switched the FLUSH/FLUSH_OUT
> response to be VIRTIO_BLK_S_UNSUPP as well since the device does not
> negotiate that feature. Any request from the guest to "flush" currently
> doesn't do anything (some hypervisors will fsync(2) the underlying fd)
> but for now I'm correcting the response code.
>
> I also noticed and added read/write checks prior to calls to
> {read,write}_mem. The virtio spec says a device MUST not write to a
> read-only descriptor and SHOULD NOT read a write-only descriptor with an
> exception being made for debugging. (See 2.6.5.1 Device requirements:
> The Virtqueue Descriptor Table.)
>
> Next steps in this area of code will be to properly implement the missing
> VIRTIO_BLK_S_IOERR results for failed i/o. Right now the device bails
> processing the command and doesn't reply to the driver, which is not
> conforming with virtio spec.
>
> > OK?
>
> Any other feedback? OK?
>

ok mlarkin

> >
> > -dv
> >
> > [1] https://www.oasis-open.org/committees/ballot.php?id=3536
>
>
> Index: virtio.c
> ===
> RCS file: /cvs/src/usr.sbin/vmd/virtio.c,v
> retrieving revision 1.89
> diff -u -p -r1.89 virtio.c
> --- virtio.c  16 Jun 2021 16:55:02 -  1.89
> +++ virtio.c  17 Jun 2021 15:57:56 -
> @@ -517,6 +517,11 @@ vioblk_notifyq(struct vioblk_dev *dev)
>   }
>
>   /* Read command from descriptor ring */
> + if (cmd_desc->flags & VRING_DESC_F_WRITE) {
> + log_warnx("vioblk: unexpected writable cmd descriptor "
> + "%d", cmd_desc_idx);
> + goto out;
> + }
>   if (read_mem(cmd_desc->addr, &cmd, sizeof(cmd))) {
>   log_warnx("vioblk: command read_mem error @ 0x%llx",
>   cmd_desc->addr);
> @@ -541,6 +546,13 @@ vioblk_notifyq(struct vioblk_dev *dev)
>   struct ioinfo *info;
>   const uint8_t *secdata;
>
> + if ((secdata_desc->flags & VRING_DESC_F_WRITE)
> + == 0) {
> + log_warnx("vioblk: unwritable data "
> + "descriptor %d", secdata_desc_idx);
> + goto out;
> + }
> +
>   info = vioblk_start_read(dev,
>   cmd.sector + secbias, secdata_desc->len);
>
> @@ -607,6 +619,13 @@ vioblk_notifyq(struct vioblk_dev *dev)
>   do {
>   struct ioinfo *info;
>
> + if (secdata_desc->flags & VRING_DESC_F_WRITE) {
> + log_warnx("wr vioblk: unexpected "
> + "writable data descriptor %d",
> + secdata_desc_idx);
> + goto out;
> + }
> +
>   info = vioblk_start_write(dev,
>   cmd.sector + secbias,
>   secdata_desc->addr, secdata_desc->len);
> @@ -654,7 +673,35 @@ vioblk_notifyq(struct vioblk_dev *dev)
>   ds_desc_idx = 

Re: Document missing pledge promises

2021-06-11 Thread Mike Larkin
On Fri, Jun 11, 2021 at 09:16:46AM -0600, Theo de Raadt wrote:
> Dave Voutila  wrote:
>
> > Theo de Raadt writes:
> >
> > > Regarding the vmm chunk -- as I said in my other reply, these
> > > explanations are too precise.  They risk becoming outdated as things
> > > change.  Furthermore, some of those ioctl may work in one way, but not
> > > another way.  Which would be too complicated to describe also.  I urge
> > > simple messaging:
> > >
> > > .It Va vmm
> > > Operations required by
> > > .Xr vmd 8 .
> > >
> > > It is accurate.  If someone later wanted to use those operations, they
> > > would figure it out by reading kernel and vmd source.
> >
> > I agree simpler is better. The actual ioctls are documented in vmm.4 and
> > this is currently an all-or-nothing thing. You either get to perform all
> > operations on the vmm(4) device or none.
>
> What you just said is the truth.  But once you put it in a manual page,
> in the future the code may change, and some ioctl might be exposed
> without "vmm". It is better to be vague.
>

Agreed, simpler is better in the pledge docs.



Re: hvn(4): don't input mbufs if interface is not running

2021-06-11 Thread Mike Belopuhov
On 12/05/2021 15:15, Patrick Wildt wrote:
> Hi,
> 
> when hvn(4) attaches it sends commands and waits for replies to come
> back in, hence the interrupt function is being polled.  Unfortunately
> it seems that the 'receive pipe' has both command completion and data
> packets.  As it turns out, while hvn(4) is just setting up the pipes,
> it can already receive packets, which I have seen happening on Hyper-V.
> 
> This essentially means that if_input() is being called *before* the
> card is set up (or UP).  This seems wrong.  Apparently on drivers like
> em(4) we only read packets if IFF_RUNNING is set.  I think in the case
> of hvn(4), we should drop packets unless IFF_RUNNING is set.
> 
> Opinions?
> 

Hi Patrick,

You're right that hvn needs to have the receiving path set up to exchange
commands with the hypervisor. This diff LGTM and should be committed if
it hasn't been already.

Cheers,
Mike

> Patrick
> 
> diff --git a/sys/dev/pv/if_hvn.c b/sys/dev/pv/if_hvn.c
> index f12e2f935ca..4306f717baf 100644
> --- a/sys/dev/pv/if_hvn.c
> +++ b/sys/dev/pv/if_hvn.c
> @@ -1470,7 +1470,10 @@ hvn_rndis_input(struct hvn_softc *sc, uint64_t tid, 
> void *arg)
>   }
>   hvn_nvs_ack(sc, tid);
>  
> - if_input(ifp, &ml);
> + if (ifp->if_flags & IFF_RUNNING)
> + if_input(ifp, &ml);
> + else
> + ml_purge(&ml);
>  }
>  
>  static inline struct mbuf *
> 



Re: Update vmctl(8) to use TERMINATE_VM_EVENTs

2021-06-10 Thread Mike Larkin
On Thu, Jun 10, 2021 at 09:19:45AM -0400, Dave Voutila wrote:
>
> Still looking for an OK or feedback on the below. This is finishing work
> to fixes made previously to vmd(8)/vmctl(8) regarding vm
> stopping/running state corruption when using vmctl(8) to wait for a vm
> to stop.
>

Sorry for the delay. ok mlarkin@ with one comment below.

-ml

> Dave Voutila writes:
>
> > ping
> >
> > Dave Voutila writes:
> >
> >> Dave Voutila writes:
> >>
> >>> The conclusion of my previous fixes to vmd(8) [1] changes the event
> >>> handling in vmctl(8) to support receiving IMSG_VMDOP_TERMINATE_VM_EVENTs
> >>> from the control process. (This removes a XXX comment from vmd.)
> >>>
> >>> For clarity, the messaging logic was changed previously:
> >>>
> >>> - ...TERMINATE_VM_RESPONSE conveying success/failure of the request to
> >>>   terminate a guest regardless of waiting for termination
> >>> - ...TERMINATE_VM_EVENT conveying the actual termination of a guest
> >>>
> >>> This diff finishes bringing that logic from vmd(8) to vmctl(8).
> >>>
> >>> OK?
> >>
> >> Ping. Looking to close this gap.
> >>
> >> Note: this diff does preserve some errno abuse in vmd & vmctl that I'm
> >> working on separately.
> >>
> >>>
> >>> -dv
> >>>
> >>>
> >>> Index: usr.sbin/vmd/control.c
> >>> ===
> >>> RCS file: /cvs/src/usr.sbin/vmd/control.c,v
> >>> retrieving revision 1.35
> >>> diff -u -p -r1.35 control.c
> >>> --- usr.sbin/vmd/control.c26 Apr 2021 22:58:27 -  1.35
> >>> +++ usr.sbin/vmd/control.c30 Apr 2021 12:31:22 -
> >>> @@ -154,9 +154,8 @@ control_dispatch_vmd(int fd, struct priv
> >>>   if (notify->ctl_vmid != vmr.vmr_id)
> >>>   continue;
> >>>   if ((c = control_connbyfd(notify->ctl_fd)) != NULL) {
> >>> - /* XXX vmctl expects *_RESPONSE, not *_EVENT */
> >>> - imsg_compose_event(>iev,
> >>> - IMSG_VMDOP_TERMINATE_VM_RESPONSE,
> >>> + /* Forward to the vmctl(8) client */
> >>> + imsg_compose_event(>iev, imsg->hdr.type,
> >>>   0, 0, -1, imsg->data, IMSG_DATA_SIZE(imsg));
> >>>   TAILQ_REMOVE(_notify_q, notify, entry);
> >>>   free(notify);
> >>> Index: usr.sbin/vmctl/vmctl.c
> >>> ===
> >>> RCS file: /cvs/src/usr.sbin/vmctl/vmctl.c,v
> >>> retrieving revision 1.77
> >>> diff -u -p -r1.77 vmctl.c
> >>> --- usr.sbin/vmctl/vmctl.c22 Mar 2021 18:50:11 -  1.77
> >>> +++ usr.sbin/vmctl/vmctl.c30 Apr 2021 12:31:22 -
> >>> @@ -461,7 +461,7 @@ terminate_vm(uint32_t terminate_id, cons
> >>>   * terminate_vm_complete
> >>>   *
> >>>   * Callback function invoked when we are expecting an
> >>> - * IMSG_VMDOP_TERMINATE_VM_RESPONSE message indicating the completion of
> >>> + * IMSG_VMDOP_TERMINATE_VM_EVENT message indicating the completion of

It looks like this function has cases for both IMSG_VMDOP_TERMINATE_VM_RESPONSE
*and* _EVENT. Should the comment be phrased accordingly? If I read this
correctly, the comment would only state this function handles _EVENT messages.

> >>>   * a terminate vm operation.
> >>>   *
> >>>   * Parameters:
> >>> @@ -484,41 +484,50 @@ terminate_vm_complete(struct imsg *imsg,
> >>>   struct vmop_result *vmr;
> >>>   int res;
> >>>
> >>> - if (imsg->hdr.type == IMSG_VMDOP_TERMINATE_VM_RESPONSE) {
> >>> + switch (imsg->hdr.type) {
> >>> + case IMSG_VMDOP_TERMINATE_VM_RESPONSE:
> >>> + IMSG_SIZE_CHECK(imsg, );
> >>>   vmr = (struct vmop_result *)imsg->data;
> >>>   res = vmr->vmr_result;
> >>> - if (res) {
> >>> - switch (res) {
> >>> - case VMD_VM_STOP_INVALID:
> >>> - fprintf(stderr,
> >>> - "cannot stop vm that is not running\n");
> >>> - *ret = EINVAL;
> >>> - break;
> >>> - case ENOENT:
> >>> - fprintf(stderr, "vm not found\n");
> >>> - *ret = EIO;
> >>> - break;
> >>> - case EINTR:
> >>> - fprintf(stderr, "interrupted call\n");
> >>> - *ret = EIO;
> >>> - break;
> >>> - default:
> >>> - errno = res;
> >>> - fprintf(stderr, "failed: %s\n",
> >>> - strerror(res));
> >>> - *ret = EIO;
> >>> - }
> >>> - } else if (flags & VMOP_WAIT) {
> >>> +
> >>> + switch (res) {
> >>> + case 0:
> >>> + fprintf(stderr, "requested to shutdown vm %d\n",
> >>> + vmr->vmr_id);

Re: limit MSR_INT_PEN_MSG use to < family 16h

2021-06-10 Thread Mike Larkin
On Wed, Jun 09, 2021 at 10:35:48PM -0700, Mike Larkin wrote:
> On Thu, Jun 10, 2021 at 03:19:43PM +1000, Jonathan Gray wrote:
> > Ilya Voronin sent a diff to misc to limit MSR_INT_PEN_MSG use to
> > < AMD family 17h prompted by a problem with an AWS t3a instance.
> >
> > https://marc.info/?l=openbsd-misc=162120066715633=2
> >
> > Digging some more the 16h bkdgs have it as RAZ/non-functional as well.
> > Bits are documented in 15h.
> >
> > BKDG for AMD Family 16h Models 00h-0Fh Processors
> > MSRC001_0055 Interrupt Pending
> > 63:0 RAZ.
> >
> > BKDG for AMD Family 16h Models 30h-3Fh Processors
> > MSRC001_0055 Interrupt Pending
> > 63:0 RAZ
> >
> > PPR for AMD Family 17h Model 71h B0
> > MSRC001_0055 [Reserved.] (Core::X86::Msr::IntPend)
> > > Read-only. Reset: Fixed,0000_0000_0000_0000h.
> >
> > Change the test to use extended family id while here.
> >
>
> I'd be ok with this if someone reported that it works on a bare metal EPYC,
> since the fix here is for a virtualized environment (and we don't know what
> AWS is doing here).
>
> -ml
>

Seeing that people have tested this on a few machines of the right era,
ok mlarkin@.

>
> > Index: sys/arch/amd64/amd64/lapic.c
> > ===
> > RCS file: /cvs/src/sys/arch/amd64/amd64/lapic.c,v
> > retrieving revision 1.57
> > diff -u -p -r1.57 lapic.c
> > --- sys/arch/amd64/amd64/lapic.c6 Sep 2020 20:50:00 -   1.57
> > +++ sys/arch/amd64/amd64/lapic.c19 May 2021 09:16:37 -
> > @@ -299,8 +299,7 @@ lapic_set_lvt(void)
> >  *Family 0Fh Processors"
> >  *   #32559 revision 3.00
> >  */
> > > -   if ((cpu_id & 0x00000f00) == 0x00000f00 &&
> > > -   (cpu_id & 0x0fff0000) >= 0x00040000) {
> > +   if (ci->ci_family >= 0xf && ci->ci_family < 0x16) {
> > uint64_t msr;
> >
> > msr = rdmsr(MSR_INT_PEN_MSG);
> > Index: sys/arch/i386/i386/lapic.c
> > ===
> > RCS file: /cvs/src/sys/arch/i386/i386/lapic.c,v
> > retrieving revision 1.47
> > diff -u -p -r1.47 lapic.c
> > --- sys/arch/i386/i386/lapic.c  30 Jul 2018 14:19:12 -  1.47
> > +++ sys/arch/i386/i386/lapic.c  19 May 2021 09:19:41 -
> > @@ -160,8 +160,7 @@ lapic_set_lvt(void)
> >  *Family 0Fh Processors"
> >  *   #32559 revision 3.00
> >  */
> > > -   if ((cpu_id & 0x00000f00) == 0x00000f00 &&
> > > -   (cpu_id & 0x0fff0000) >= 0x00040000) {
> > +   if (ci->ci_family >= 0xf && ci->ci_family < 0x16) {
> > uint64_t msr;
> >
> > msr = rdmsr(MSR_INT_PEN_MSG);
> >
> >
>



Re: limit MSR_INT_PEN_MSG use to < family 16h

2021-06-09 Thread Mike Larkin
On Thu, Jun 10, 2021 at 03:19:43PM +1000, Jonathan Gray wrote:
> Ilya Voronin sent a diff to misc to limit MSR_INT_PEN_MSG use to
> < AMD family 17h prompted by a problem with an AWS t3a instance.
>
> https://marc.info/?l=openbsd-misc=162120066715633=2
>
> Digging some more the 16h bkdgs have it as RAZ/non-functional as well.
> Bits are documented in 15h.
>
> BKDG for AMD Family 16h Models 00h-0Fh Processors
> MSRC001_0055 Interrupt Pending
> 63:0 RAZ.
>
> BKDG for AMD Family 16h Models 30h-3Fh Processors
> MSRC001_0055 Interrupt Pending
> 63:0 RAZ
>
> PPR for AMD Family 17h Model 71h B0
> MSRC001_0055 [Reserved.] (Core::X86::Msr::IntPend)
> Read-only. Reset: Fixed,0000_0000_0000_0000h.
>
> Change the test to use extended family id while here.
>

I'd be ok with this if someone reported that it works on a bare metal EPYC,
since the fix here is for a virtualized environment (and we don't know what
AWS is doing here).

-ml


> Index: sys/arch/amd64/amd64/lapic.c
> ===
> RCS file: /cvs/src/sys/arch/amd64/amd64/lapic.c,v
> retrieving revision 1.57
> diff -u -p -r1.57 lapic.c
> --- sys/arch/amd64/amd64/lapic.c  6 Sep 2020 20:50:00 -   1.57
> +++ sys/arch/amd64/amd64/lapic.c  19 May 2021 09:16:37 -
> @@ -299,8 +299,7 @@ lapic_set_lvt(void)
>*Family 0Fh Processors"
>*   #32559 revision 3.00
>*/
> - if ((cpu_id & 0x00000f00) == 0x00000f00 &&
> - (cpu_id & 0x0fff0000) >= 0x00040000) {
> + if (ci->ci_family >= 0xf && ci->ci_family < 0x16) {
>   uint64_t msr;
>
>   msr = rdmsr(MSR_INT_PEN_MSG);
> Index: sys/arch/i386/i386/lapic.c
> ===
> RCS file: /cvs/src/sys/arch/i386/i386/lapic.c,v
> retrieving revision 1.47
> diff -u -p -r1.47 lapic.c
> --- sys/arch/i386/i386/lapic.c30 Jul 2018 14:19:12 -  1.47
> +++ sys/arch/i386/i386/lapic.c19 May 2021 09:19:41 -
> @@ -160,8 +160,7 @@ lapic_set_lvt(void)
>*Family 0Fh Processors"
>*   #32559 revision 3.00
>*/
> - if ((cpu_id & 0x00000f00) == 0x00000f00 &&
> - (cpu_id & 0x0fff0000) >= 0x00040000) {
> + if (ci->ci_family >= 0xf && ci->ci_family < 0x16) {
>   uint64_t msr;
>
>   msr = rdmsr(MSR_INT_PEN_MSG);
>
>



Re: vmm(4): use monotonic base for pvclock

2021-06-01 Thread Mike Larkin
On Tue, Jun 01, 2021 at 08:03:43PM -0500, Scott Cheloha wrote:
> The documentation for the Linux pvclock is pretty sparse but I am
> pretty sure we want to use a monotonic base for ti_system_time.  We
> also have a function for converting a timespec into a 64-bit count of
> nanoseconds we can use.
>
> We may as well also use rdtsc_lfence() to ensure consistent behavior.
>
> ... this is still not quite right because the VM expects the pvclock
> to have a fixed frequency, but we have no interface for reading a raw
> timestamp.  Something to add in the future, maybe.
>
> Index: vmm.c
> ===
> RCS file: /cvs/src/sys/arch/amd64/amd64/vmm.c,v
> retrieving revision 1.284
> diff -u -p -r1.284 vmm.c
> --- vmm.c 18 May 2021 00:05:20 -  1.284
> +++ vmm.c 2 Jun 2021 00:57:31 -
> @@ -7294,8 +7294,8 @@ vmm_init_pvclock(struct vcpu *vcpu, padd
>  int
>  vmm_update_pvclock(struct vcpu *vcpu)
>  {
> + struct timespec now;
>   struct pvclock_time_info *pvclock_ti;
> - struct timespec tv;
>   struct vm *vm = vcpu->vc_parent;
>   paddr_t pvclock_hpa, pvclock_gpa;
>
> @@ -7309,10 +7309,9 @@ vmm_update_pvclock(struct vcpu *vcpu)
>   pvclock_ti->ti_version =
>   (++vcpu->vc_pvclock_version << 1) | 0x1;
>
> - pvclock_ti->ti_tsc_timestamp = rdtsc();
> - nanotime(&tv);
> - pvclock_ti->ti_system_time =
> - tv.tv_sec * 1000000000L + tv.tv_nsec;
> + pvclock_ti->ti_tsc_timestamp = rdtsc_lfence();
> + nanouptime(&now);
> + pvclock_ti->ti_system_time = TIMESPEC_TO_NSEC(&now);
>   pvclock_ti->ti_tsc_shift = 12;
>   pvclock_ti->ti_tsc_to_system_mul =
>   vcpu->vc_pvclock_system_tsc_mul;
>

This probably needs to be tested on a wide variety (and versions) of Linux
guests. I've found in the past that different kernel versions do different
things and behave differently.

Did you test a few Linux guest VMs? Did this work across all of them?

-ml



Re: vio.4: mention support provided by vmd(8)

2021-05-24 Thread Mike Larkin
On Sun, May 23, 2021 at 09:50:46PM -0400, Dave Voutila wrote:
> Seems only right that vio.4 mention it's the driver used for the virtio
> networking device provided by vmd(8).
>
> OK?
>

ok mlarkin

>
> Index: vio.4
> ===
> RCS file: /cvs/src/share/man/man4/vio.4,v
> retrieving revision 1.15
> diff -u -p -r1.15 vio.4
> --- vio.4 24 Sep 2015 13:11:48 -  1.15
> +++ vio.4 24 May 2021 01:48:44 -
> @@ -27,7 +27,8 @@ The
>  .Nm
>  driver provides support for the
>  .Xr virtio 4
> -network interface provided by bhyve, KVM, QEMU, and VirtualBox.
> +network interface provided by bhyve, KVM, QEMU, VirtualBox, and
> +.Xr vmd 8 .
>  .Pp
>  Setting the bit 0x2 in the flags disables the RingEventIndex feature.
>  This can be tried as a workaround for possible bugs in host implementations 
> of
>



Re: vmd(8): add MTU feature support to vionet device

2021-05-24 Thread Mike Larkin
On Mon, May 24, 2021 at 08:25:04AM +0200, Claudio Jeker wrote:
> On Sun, May 23, 2021 at 10:25:38PM -0400, Dave Voutila wrote:
> > The following diff adds in virtio 1.1's VIRTIO_NET_F_MTU feature support
> > to vmd(8)'s virtio networking device. This allows for communicating an MTU
> > to the guest driver and then enforcing it in the emulated device.
> >
> > When the feature is offered, per Virtio v1.1, 5.1.4.1 [1]:
> >
> > "The device MUST NOT pass received packets that exceed mtu (plus low
> > level ethernet header length) size with gso_type NONE or ECN after
> > VIRTIO_NET_F_MTU has been successfully negotiated."
> >
> > (GSO is not supported or negotiated, so it's always NONE. This is
> > > primarily because the vmd vionet device also doesn't support or negotiate
> > checksum offloading.)
> >
> > > The prior logic in place simply checked the packet was of an allowable
> > size, which meant the largest IP packet (65535) plus an ethernet header.
> >
> > If testing the diff, you can change the VIONET_MTU definition to
> > something other than 1500 and check that a non-OpenBSD guest defaults to
> > using the value and forbids setting it higher. This is easy in an Alpine
> > or Debian Linux guest using:
> >
> > a) to view the mtu: ip link
> > > b) to set the mtu: sudo ip link set dev <interface> mtu <value>
> >
> > For example:
> >
> >   dave@debian:~$ sudo ip link set dev enp0s2 mtu 1501
> >   Error: mtu greater than device maximum.
> >
> > > Since the diff lacks context of the goto, it jumps to the section that
> > > advances to the next ring.
> >
> > Currently, vio(4) does not negotiate this feature and won't obey it. I'm
> > working on that separately.
> >
> > OK? Feedback?
> >
> > [1] 
> > https://docs.oasis-open.org/virtio/virtio/v1.1/cs01/virtio-v1.1-cs01.html#x1-204
> >
> > Index: virtio.c
> > ===
> > RCS file: /cvs/src/usr.sbin/vmd/virtio.c,v
> > retrieving revision 1.87
> > diff -u -p -r1.87 virtio.c
> > --- virtio.c18 May 2021 11:06:43 -  1.87
> > +++ virtio.c24 May 2021 01:31:22 -
> > @@ -60,6 +60,7 @@ int nr_vioblk;
> >
> >  #define MAXPHYS(64 * 1024) /* max raw I/O transfer size */
> >
> > +#define VIRTIO_NET_F_MTU   (1<<3)
> >  #define VIRTIO_NET_F_MAC   (1<<5)
> >
> >  #define VMMCI_F_TIMESYNC   (1<<0)
> > @@ -1046,6 +1047,26 @@ virtio_net_io(int dir, uint16_t reg, uin
> > *data = dev->mac[reg -
> > VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI];
> > break;
> > +   case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 10:
> > +   if (sz == 2) {
> > +   *data = VIONET_MTU;
> > +   } else if (sz == 1) {
> > +   *data &= 0xFF00;
> > +   *data |= (uint32_t)(VIONET_MTU) & 0xFF;
> > +   } else {
> > +   log_warnx("%s: illegal read of vionet_mtu",
> > +   __progname);
> > +   }
> > +   break;
> > +   case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 11:
> > +   if (sz == 1) {
> > +   *data &= 0xFF00;
> > +   *data = (uint32_t)(VIONET_MTU >> 8) & 0xFF;
> > +   } else {
> > +   log_warnx("%s: illegal read of vionet_mtu",
> > +   __progname);
> > +   }
> > +   break;
>
> Is it possible to get proper defines for these two options?
> This VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 11 is ugly.
>

We could fix the + 11 part, but about the best we could do would be something
like the following:

VIRTIO_CONFIG_NET_MTU
VIRTIO_CONFIG_NET_MTU + 1
VIRTIO_CONFIG_NET_MTU + 2
VIRTIO_CONFIG_NET_MTU + 3

Since this is a pci config space access and I've seen Linux use 1, 2, and 4 byte
accesses. But, yes, we could improve the actual name.

Once dv@ gets this in I'll go back and redo the other devices (since we do a
similar thing for those as well).

-ml

> > case VIRTIO_CONFIG_DEVICE_FEATURES:
> > *data = dev->cfg.device_feature;
> > break;
> > @@ -1437,7 +1458,7 @@ vionet_notify_tx(struct vionet_dev *dev)
> > size_t pktsz, chunk_size = 0;
> > ssize_t dhcpsz;
> > int ret, num_enq, ofs, spc;
> > -   char *vr, *pkt, *dhcppkt;
> > +   char *vr, *pkt = NULL, *dhcppkt;
> > struct vring_desc *desc, *pkt_desc, *hdr_desc;
> > struct vring_avail *avail;
> > struct vring_used *used;
> > @@ -1505,12 +1526,13 @@ vionet_notify_tx(struct vionet_dev *dev)
> > /* Remove virtio header descriptor len */
> > pktsz -= hdr_desc->len;
> >
> > -   /* Only allow buffer len < max IP packet + Ethernet header */
> > -   if (pktsz > IP_MAXPACKET + ETHER_HDR_LEN) {
> > +   /* Drop frames larger than our MTU + ethernet header */
> > +   if 

Re: vmd(8): skip inspecting non-udp packets on local ifs

2021-05-23 Thread Mike Larkin
On Sat, May 22, 2021 at 10:20:37AM -0400, Dave Voutila wrote:
> tech@ & krw (since your code in question was imported to vmd),
>
> I found strange behavior running tcpbench(1) to measure the connection
> between a vmd guest and my host, as well as guest-to-guest. In short,
> it's some bogus logic in how vmd tries to intercept dhcp/bootp on local
> interfaces. Diff at the bottom addresses the issue, some background:
>
> Running tcpbench(1) for ~20-30s on my machine, vmd (with -v debug
> logging) barfs a bunch of lines like:
>
>   5 udp packets in 5 too long - dropped
>
> The tcpbench(1) throughput stalls out at that point and reports 0 Mbps
> avg bandwidth measurements.
>
> If anyone wants to reproduce, use an OpenBSD guest and just run:
>
>[host]$ tcpbench -s
>   [guest]$ tcpbench -t 180 100.64.x.2
>
> Where 'x' is the appropriate value for your guest's local interface.
>
> reyk@ imported packet.c from dhclient(8), but there's no validation that
> the packet being inspected is an IP/UDP packet vs. IP/TCP, leading to
> bogus logic related to inspecting UDP header attributes. In dhclient(8),
> the decode_udp_ip_header function is used in a place where a bpf capture
> buffer has already made sure it's a UDP packet (see sbin/dhclient/bpf.c).
>
> In addition, there was a lot of stateful counting and checking we just
> don't need in vmd(8), so I've ripped that out as well. It makes no sense
> in this context.
>
> OK?
>

ok mlarkin

>
> Index: packet.c
> ===
> RCS file: /cvs/src/usr.sbin/vmd/packet.c,v
> retrieving revision 1.1
> diff -u -p -r1.1 packet.c
> --- packet.c  19 Apr 2017 15:38:32 -  1.1
> +++ packet.c  22 May 2021 14:15:09 -
> @@ -220,12 +220,6 @@ decode_udp_ip_header(unsigned char *buf,
>   unsigned char *data;
>   u_int32_t ip_len;
>   u_int32_t sum, usum;
> - static unsigned int ip_packets_seen;
> - static unsigned int ip_packets_bad_checksum;
> - static unsigned int udp_packets_seen;
> - static unsigned int udp_packets_bad_checksum;
> - static unsigned int udp_packets_length_checked;
> - static unsigned int udp_packets_length_overflow;
>   int len;
>
>   /* Assure that an entire IP header is within the buffer. */
> @@ -236,17 +230,11 @@ decode_udp_ip_header(unsigned char *buf,
>   return (-1);
>
>   ip = (struct ip *)(buf + offset);
> - ip_packets_seen++;
> + if (ip->ip_p != IPPROTO_UDP)
> + return (-1);
>
>   /* Check the IP header checksum - it should be zero. */
>   if (wrapsum(checksum(buf + offset, ip_len, 0)) != 0) {
> - ip_packets_bad_checksum++;
> - if (ip_packets_seen > 4 && ip_packets_bad_checksum != 0 &&
> - (ip_packets_seen / ip_packets_bad_checksum) < 2) {
> - log_info("%u bad IP checksums seen in %u packets",
> - ip_packets_bad_checksum, ip_packets_seen);
> - ip_packets_seen = ip_packets_bad_checksum = 0;
> - }
>   return (-1);
>   }
>
> @@ -274,7 +262,6 @@ decode_udp_ip_header(unsigned char *buf,
>   if (buflen < offset + ip_len + sizeof(*udp))
>   return (-1);
>   udp = (struct udphdr *)(buf + offset + ip_len);
> - udp_packets_seen++;
>
>   /* Assure that the entire UDP packet is within the buffer. */
>   if (buflen < offset + ip_len + ntohs(udp->uh_ulen))
> @@ -286,20 +273,8 @@ decode_udp_ip_header(unsigned char *buf,
>* UDP header and the data. If the UDP checksum field is zero,
>* we're not supposed to do a checksum.
>*/
> - udp_packets_length_checked++;
>   len = ntohs(udp->uh_ulen) - sizeof(*udp);
>   if ((len < 0) || (len + data > buf + buflen)) {
> - udp_packets_length_overflow++;
> - if (udp_packets_length_checked > 4 &&
> - udp_packets_length_overflow != 0 &&
> - (udp_packets_length_checked /
> - udp_packets_length_overflow) < 2) {
> - log_info("%u udp packets in %u too long - dropped",
> - udp_packets_length_overflow,
> - udp_packets_length_checked);
> - udp_packets_length_overflow =
> - udp_packets_length_checked = 0;
> - }
>   return (-1);
>   }
>   if (len + data != buf + buflen)
> @@ -313,15 +288,7 @@ decode_udp_ip_header(unsigned char *buf,
>   2 * sizeof(ip->ip_src),
>   IPPROTO_UDP + (u_int32_t)ntohs(udp->uh_ulen);
>
> - udp_packets_seen++;
>   if (usum && usum != sum) {
> - udp_packets_bad_checksum++;
> - if (udp_packets_seen > 4 && udp_packets_bad_checksum != 0 &&
> - (udp_packets_seen / udp_packets_bad_checksum) < 2) {
> - log_info("%u bad udp checksums in %u packets",
> - 

Re: vmm(4): Mask TSC_ADJUST cpu feature

2021-05-20 Thread Mike Larkin
On Thu, May 20, 2021 at 07:36:23AM -0400, Dave Voutila wrote:
> We don't currently emulate all TSC related features yet. While hacking
> on other issues, I've found some more obnoxious guests (*cough* debian
> *cough*) constantly try to read the IA32_TSC_ADJUST msr every second,
> not getting the hint when we inject #GP. This floods the kernel message
> buffer with things like:
>
>   vmx_handle_rdmsr: unsupported rdmsr (msr=0x3b), injecting #GP
>
> (The above debug logging exists to help find msr's we're not supporting
> that guests are poking, so I guess you can say it's working as intended
> [1].)
>
> If and when we add more TSC capabilities to vmm we can always unmask.
>
> Ok?
>
> [1] https://marc.info/?l=openbsd-tech=161739346822128=2
>
> Index: sys/arch/amd64/include/vmmvar.h
> ===
> RCS file: /cvs/src/sys/arch/amd64/include/vmmvar.h,v
> retrieving revision 1.71
> diff -u -p -r1.71 vmmvar.h
> --- sys/arch/amd64/include/vmmvar.h   5 Apr 2021 18:26:46 -   1.71
> +++ sys/arch/amd64/include/vmmvar.h   16 May 2021 16:55:06 -
> @@ -637,6 +637,7 @@ struct vm_mprotect_ept_params {
>
>  /*
>   * SEFF flags - copy from host minus:
> + *  TSC_ADJUST (SEFF0EBX_TSC_ADJUST)
>   *  SGX (SEFF0EBX_SGX)
>   *  HLE (SEFF0EBX_HLE)
>   *  INVPCID (SEFF0EBX_INVPCID)
> @@ -655,7 +656,8 @@ struct vm_mprotect_ept_params {
>   *  PT (SEFF0EBX_PT)
>   *  AVX512VBMI (SEFF0ECX_AVX512VBMI)
>   */
> -#define VMM_SEFF0EBX_MASK ~(SEFF0EBX_SGX | SEFF0EBX_HLE | SEFF0EBX_INVPCID | 
> \
> +#define VMM_SEFF0EBX_MASK ~(SEFF0EBX_TSC_ADJUST | SEFF0EBX_SGX | \
> +SEFF0EBX_HLE | SEFF0EBX_INVPCID | \
>  SEFF0EBX_RTM | SEFF0EBX_PQM | SEFF0EBX_MPX | \
>  SEFF0EBX_PCOMMIT | SEFF0EBX_PT | \
>  SEFF0EBX_AVX512F | SEFF0EBX_AVX512DQ | \

Yep, if we don't implement it we should not be advertising support for it.

ok mlarkin.



Re: patch: new fix for vmctl create

2021-05-12 Thread Mike Larkin
On Mon, Mar 15, 2021 at 08:21:56AM +, James Cook wrote:
> Hi tech@,
>
> The below patch removes calls to realpath(3) when looking up a qcow2
> base image. Previous thread:
> https://marc.info/?t=16156249642=1=2
>
> In short, the calls were failing inside vmctl, because of unveil. The
> other thread has alternative solutions but I think this is simplest.
>
> I included a regression test demonstrating the vmctl bug, in case
> there's interest. I tested vmd manually as described in the other
> thread.
>
> I also added a check in case dirname(3) fails --- I don't think it
> currently can, but better safe than sorry, I figure. (Noticed by Dave
> in the other thread.)
>
> - James
>

After looking at this a bit, we decided to remove the unveil parts around
the base images, since the realpath removal below would also affect vmd.

dv@ just committed that. Thanks for the diff and research!

>
> diff --git a/regress/usr.sbin/Makefile b/regress/usr.sbin/Makefile
> index 60e2178d3c7..146f9c9f322 100644
> --- a/regress/usr.sbin/Makefile
> +++ b/regress/usr.sbin/Makefile
> @@ -15,6 +15,7 @@ SUBDIR += rpki-client
>  SUBDIR += snmpd
>  SUBDIR += switchd
>  SUBDIR += syslogd
> +SUBDIR += vmctl
>
>  .if ${MACHINE} == "amd64" || ${MACHINE} == "i386"
>  SUBDIR += vmd
> diff --git a/regress/usr.sbin/vmctl/Makefile b/regress/usr.sbin/vmctl/Makefile
> new file mode 100644
> index 00000000000..8fa87f0f6f0
> --- /dev/null
> +++ b/regress/usr.sbin/vmctl/Makefile
> @@ -0,0 +1,34 @@
> +# $OpenBSD$
> +
> +REGRESS_TARGETS = run-regress-convert-with-base-path
> +
> +run-regress-convert-with-base-path:
> + # non-relative base path
> + rm -f *.qcow2
> + vmctl create -s 1m base.qcow2
> + vmctl create -b ${PWD}/base.qcow2 source.qcow2
> + vmctl create -i source.qcow2 dest.qcow2
> +
> + # relative base path; two base images
> + rm -f *.qcow2
> + vmctl create -s 1m base0.qcow2
> + vmctl create -b base0.qcow2 base1.qcow2
> + vmctl create -b base1.qcow2 source.qcow2
> + vmctl create -i source.qcow2 dest.qcow2
> +
> + # copy from a different directory
> + rm -rf dir *.qcow2
> + vmctl create -s 1m base.qcow2
> + vmctl create -b base.qcow2 source.qcow2
> + mkdir dir
> + cd dir; vmctl create -i ../source.qcow2 dest.qcow2
> +
> + # base accessed through symlink
> + rm -rf dir sym *.qcow2
> + mkdir dir
> + cd dir; vmctl create -s 1m base.qcow2
> + cd dir; vmctl create -b base.qcow2 source.qcow2
> + ln -s dir sym
> + vmctl create -i sym/source.qcow2 dest.qcow2
> +
> +.include <bsd.regress.mk>
> diff --git a/usr.sbin/vmd/vioqcow2.c b/usr.sbin/vmd/vioqcow2.c
> index 34d0f116cc4..be8609f1644 100644
> --- a/usr.sbin/vmd/vioqcow2.c
> +++ b/usr.sbin/vmd/vioqcow2.c
> @@ -145,8 +145,8 @@ virtio_qcow2_init(struct virtio_backing *file, off_t 
> *szp, int *fd, size_t nfd)
>  ssize_t
>  virtio_qcow2_get_base(int fd, char *path, size_t npath, const char *dpath)
>  {
> + char pathbuf[PATH_MAX];
>   char dpathbuf[PATH_MAX];
> - char expanded[PATH_MAX];
>   struct qcheader header;
>   uint64_t backingoff;
>   uint32_t backingsz;
> @@ -180,27 +180,23 @@ virtio_qcow2_get_base(int fd, char *path, size_t npath, 
> const char *dpath)
>* rather than relative to the directory vmd happens to be running in,
>* since this is the only userful interpretation.
>*/
> - if (path[0] == '/') {
> - if (realpath(path, expanded) == NULL ||
> - strlcpy(path, expanded, npath) >= npath) {
> - log_warnx("unable to resolve %s", path);
> + if (path[0] != '/') {
> + if (strlcpy(pathbuf, path, sizeof(pathbuf)) >=
> + sizeof(pathbuf)) {
> + log_warnx("path too long: %s", path);
>   return -1;
>   }
> - } else {
>   if (strlcpy(dpathbuf, dpath, sizeof(dpathbuf)) >=
>   sizeof(dpathbuf)) {
>   log_warnx("path too long: %s", dpath);
>   return -1;
>   }
> - s = dirname(dpathbuf);
> - if (snprintf(expanded, sizeof(expanded),
> - "%s/%s", s, path) >= (int)sizeof(expanded)) {
> - log_warnx("path too long: %s/%s", s, path);
> + if ((s = dirname(dpathbuf)) == NULL) {
> + log_warn("dirname");
>   return -1;
>   }
> - if (npath < PATH_MAX ||
> - realpath(expanded, path) == NULL) {
> - log_warnx("unable to resolve %s", path);
> + if (snprintf(path, npath, "%s/%s", s, pathbuf) >= (int)npath) {
> + log_warnx("path too long: %s/%s", s, path);
>   return -1;
>   }
>   }
>



Re: potentially uninitialized string printed by vmd

2021-05-11 Thread Mike Larkin
On Mon, Mar 15, 2021 at 09:29:29AM +, James Cook wrote:
> > The array "base" which is passed to log_warnx might be uninitialized:
> > virtio_get_base doesn't necessarily touch it if it returns -1. Maybe it
> > would be better just omit base from the output, e.g.
> >
> > log_warnx("vm \"%s\" unable to read "
> > "base for disk %s", vcp->vcp_name,
> > vcp->vcp_disks[i]);
>
> Here it is as a patch.
>
> - James
>
> diff --git a/usr.sbin/vmd/config.c b/usr.sbin/vmd/config.c
> index 9ef5dca626e..3ce82052e4a 100644
> --- a/usr.sbin/vmd/config.c
> +++ b/usr.sbin/vmd/config.c
> @@ -393,8 +393,8 @@ config_setvm(struct privsep *ps, struct vmd_vm *vm, 
> uint32_t peerid, uid_t uid)
>   break;
>   if (n == -1) {
>   log_warnx("vm \"%s\" unable to read "
> - "base %s for disk %s", vcp->vcp_name,
> - base, vcp->vcp_disks[i]);
> + "base for disk %s", vcp->vcp_name,
> + vcp->vcp_disks[i]);
>   goto fail;
>   }
>   (void)strlcpy(path, base, sizeof(path));
>

Committed. I was going through old emails and found this. Sorry this took so
long.

Thanks!



Re: vmd(8): init debug logging before using logging

2021-05-03 Thread Mike Larkin
On Mon, May 03, 2021 at 08:50:36PM -0400, Dave Voutila wrote:
> If debug state in the logging routines isn't set, messages from
> fatal{,x} and warn{,x} don't get flushed to stderr, so running vmd
> un-daemonized can result in the process exiting at startup with no
> discernable message reason other than the ambiguous exit code (1).
>
> OK?
>
> Index: vmd.c
> ===
> RCS file: /cvs/src/usr.sbin/vmd/vmd.c,v
> retrieving revision 1.123
> diff -u -p -r1.123 vmd.c
> --- vmd.c 26 Apr 2021 22:58:27 -  1.123
> +++ vmd.c 4 May 2021 00:44:56 -
> @@ -802,6 +802,9 @@ main(int argc, char **argv)
>   if (env->vmd_noaction && !env->vmd_debug)
>   env->vmd_debug = 1;
>
> + log_init(env->vmd_debug, LOG_DAEMON);
> + log_setverbose(env->vmd_verbose);
> +
>   /* check for root privileges */
>   if (env->vmd_noaction == 0) {
>   if (geteuid())
> @@ -835,9 +838,6 @@ main(int argc, char **argv)
>
>   /* Configuration will be parsed after forking the children */
>   env->vmd_conffile = conffile;
> -
> - log_init(env->vmd_debug, LOG_DAEMON);
> - log_setverbose(env->vmd_verbose);
>
>   if (env->vmd_noaction)
>   ps->ps_noaction = 1;
>

ok mlarkin



Re: added support for precompressed static files on httpd(so sorry about my other email that was unreadable)

2021-05-01 Thread Mike Larkin
On Sat, May 01, 2021 at 09:26:39PM +, alloca wrote:
> This patch adds a serve_gzip option. When enabled, if the client requests 
> path, then serve path.gz if it exists and the client accepts 
> Content-Encoding: gzip.
>
>


man style


> diff -up httpd.orig/config.c httpd/config.c
> --- httpd.orig/config.c Sat May  1 15:03:11 2021
> +++ httpd/config.c Sat May  1 15:45:43 2021
> @@ -568,12 +568,12 @@ config_getserver_config(struct httpd *env, struct serv
> >default_type, sizeof(struct media_type));
> }
>
> - f = SRVFLAG_PATH_REWRITE|SRVFLAG_NO_PATH_REWRITE;
> +/* f = SRVFLAG_PATH_REWRITE|SRVFLAG_NO_PATH_REWRITE;
> if ((srv_conf->flags & f) == 0) {
> srv_conf->flags |= parent->flags & f;
> (void)strlcpy(srv_conf->path, parent->path,
> sizeof(srv_conf->path));
> - }
> + } */
>
> f = SRVFLAG_SERVER_HSTS;
> srv_conf->flags |= parent->flags & f;
> diff -up httpd.orig/httpd.conf.5 httpd/httpd.conf.5
> --- httpd.orig/httpd.conf.5 Sat May  1 15:03:11 2021
> +++ httpd/httpd.conf.5 Sat May  1 16:02:44 2021
> @@ -397,6 +397,13 @@ a browser's preload list.
> Signal to the receiving user agent that this host and all sub domains
> of the host's domain should be considered HSTS hosts.
> .El
> +.It Ic serve_gzip
> +If the client requests
> +.Nm path ,
> +then serve
> +.Nm path.gz
> +if it exists and the client accepts
> +.Nm Content-Encoding: gzip .
> .It Ic listen on Ar address Oo Ic tls Oc Ic port Ar number
> Set the listen address and port.
> This statement can be specified multiple times.
> diff -up httpd.orig/httpd.h httpd/httpd.h
> --- httpd.orig/httpd.h Sat May  1 15:03:11 2021
> +++ httpd/httpd.h Sat May  1 15:41:58 2021
> @@ -390,17 +390,17 @@ SPLAY_HEAD(client_tree, client);
> #define SRVFLAG_SERVER_MATCH 0x0020
> #define SRVFLAG_SERVER_HSTS 0x0040
> #define SRVFLAG_DEFAULT_TYPE 0x0080
> -#define SRVFLAG_PATH_REWRITE 0x0100
> -#define SRVFLAG_NO_PATH_REWRITE 0x0200
> +/* #define SRVFLAG_PATH_REWRITE 0x0100
> +#define SRVFLAG_NO_PATH_REWRITE 0x0200 */
> #define SRVFLAG_LOCATION_FOUND 0x4000
> #define SRVFLAG_LOCATION_NOT_FOUND 0x8000
> -
> +#define SRVFLAG_SERVER_GZIP 0x0100
> #define SRVFLAG_BITS \
> "\10\01INDEX\02NO_INDEX\03AUTO_INDEX\04NO_AUTO_INDEX" \
> "\05ROOT\06LOCATION\07FCGI\10NO_FCGI\11LOG\12NO_LOG" \
> "\14SYSLOG\15NO_SYSLOG\16TLS\17ACCESS_LOG\20ERROR_LOG" \
> "\21AUTH\22NO_AUTH\23BLOCK\24NO_BLOCK\25LOCATION_MATCH" \
> - "\26SERVER_MATCH\27SERVER_HSTS\30DEFAULT_TYPE\31PATH\32NO_PATH" \
> + "\26SERVER_MATCH\27SERVER_HSTS\30DEFAULT_TYPE\31SERVER_GZIP" \
> "\37LOCATION_FOUND\40LOCATION_NOT_FOUND"
>
> #define TCPFLAG_NODELAY 0x01
> @@ -684,7 +684,7 @@ int server_headers(struct client *, void *,
> int (*)(struct client *, struct kv *, void *), void *);
> int server_writeresponse_http(struct client *);
> int server_response_http(struct client *, unsigned int,
> - struct media_type *, off_t, time_t);
> + struct media_type *, off_t, time_t, int);
> void server_reset_http(struct client *);
> void server_close_http(struct client *);
> int server_response(struct httpd *, struct client *);
> diff -up httpd.orig/parse.y httpd/parse.y
> --- httpd.orig/parse.y Sat May  1 15:03:11 2021
> +++ httpd/parse.y Sat May  1 15:48:31 2021
> @@ -138,7 +138,7 @@ typedef struct {
> %token COMBINED CONNECTION DHE DIRECTORY ECDHE ERR FCGI INDEX IP KEY LIFETIME
> %token LISTEN LOCATION LOG LOGDIR MATCH MAXIMUM NO NODELAY OCSP ON PORT 
> PREFORK
> %token PROTOCOLS REQUESTS ROOT SACK SERVER SOCKET STRIP STYLE SYSLOG TCP 
> TICKET
> -%token TIMEOUT TLS TYPE TYPES HSTS MAXAGE SUBDOMAINS DEFAULT PRELOAD REQUEST
> +%token TIMEOUT TLS TYPE TYPES HSTS SERVE_GZIP MAXAGE SUBDOMAINS DEFAULT 
> PRELOAD REQUEST
> %token ERROR INCLUDE AUTHENTICATE WITH BLOCK DROP RETURN PASS REWRITE
> %token CA CLIENT CRL OPTIONAL PARAM FORWARDED FOUND NOT
> %token  STRING
> @@ -644,6 +644,9 @@ serveroptsl : LISTEN ON STRING opttls port {
> }
> srv->srv_conf.flags |= SRVFLAG_SERVER_HSTS;
> }
> + | SERVE_GZIP {
> + srv->srv_conf.flags |= SRVFLAG_SERVER_GZIP;
> + }
> ;
>
> optfound : /* empty */ { $$ = 0; }
> @@ -925,23 +928,7 @@ requestflags_l : requestflags optcommanl requestflags_
> | requestflags optnl
> ;
>
> -requestflags : REWRITE STRING {
> - if (strlcpy(srv->srv_conf.path, $2,
> - sizeof(srv->srv_conf.path)) >=
> - sizeof(srv->srv_conf.path)) {
> - yyerror("request path too long");
> - free($2);
> - YYERROR;
> - }
> - free($2);
> - srv->srv_conf.flags |= SRVFLAG_PATH_REWRITE;
> - srv->srv_conf.flags &= ~SRVFLAG_NO_PATH_REWRITE;
> - }
> - | NO REWRITE {
> - srv->srv_conf.flags |= SRVFLAG_NO_PATH_REWRITE;
> - srv->srv_conf.flags &= ~SRVFLAG_PATH_REWRITE;
> - }
> - | STRIP NUMBER {
> +requestflags :  STRIP NUMBER {
> if ($2 < 0 || $2 > INT_MAX) {
> yyerror("invalid strip number");
> YYERROR;
> @@ -1431,6 +1418,7 @@ lookup(char *s)
> { "rewrite", REWRITE },
> { "root", ROOT },
> { "sack", SACK },
> + { "serve_gzip", SERVE_GZIP },
> { "server", SERVER },
> { "socket", SOCKET },
> { 

Re: vmd(8): remove duplicate struct definition

2021-04-29 Thread Mike Larkin
On Thu, Apr 29, 2021 at 03:24:42PM -0400, Dave Voutila wrote:
> Found this while running ctags(1)... vioqcow2.c has struct qcheader
> already defined at L53 (which stylistically is where it should be).
>
> This diff just removes the duplicate definition inside
> virtio_qcow2_create().
>
> OK?
>
>
> Index: vioqcow2.c
> ===
> RCS file: /cvs/src/usr.sbin/vmd/vioqcow2.c,v
> retrieving revision 1.14
> diff -u -p -r1.14 vioqcow2.c
> --- vioqcow2.c19 Oct 2020 19:06:49 -  1.14
> +++ vioqcow2.c29 Apr 2021 19:17:11 -
> @@ -634,27 +634,7 @@ int
>  virtio_qcow2_create(const char *imgfile_path,
>  const char *base_path, long imgsize)
>  {
> - struct qcheader {
> - char magic[4];
> - uint32_t version;
> - uint64_t backingoff;
> - uint32_t backingsz;
> - uint32_t clustershift;
> - uint64_t disksz;
> - uint32_t cryptmethod;
> - uint32_t l1sz;
> - uint64_t l1off;
> - uint64_t refoff;
> - uint32_t refsz;
> - uint32_t snapcount;
> - uint64_t snapsz;
> - /* v3 additions */
> - uint64_t incompatfeatures;
> - uint64_t compatfeatures;
> - uint64_t autoclearfeatures;
> - uint32_t reforder;
> - uint32_t headersz;
> - } __packed hdr, basehdr;
> + struct qcheader hdr, basehdr;
>   int fd, ret;
>   ssize_t base_len;
>   uint64_t l1sz, refsz, disksz, initsz, clustersz;
>

sure



Re: km_alloc(9) for i386 pmap

2021-04-23 Thread Mike Larkin
On Fri, Apr 23, 2021 at 08:07:43PM +0200, Martin Pieuchot wrote:
> Diff below converts the last uses of uvm_km_alloc(9) and uvm_km_zalloc(9)
> to km_alloc(9).
>
> One of the allocations below uses `kp_pageable' instead of `kp_zero'
> because the mapping for `pm_pdir_intel' is lost when PAE is enabled
> and needs to be re-established when a fault happens.  This is consistent
> with what currently happens with uvm_km_zalloc().  Thanks to hshoexer@
> for the analysis.
>
> Fixing this is left as an exercise for the reader.  I'm currently
> concerned with getting rid of the old allocators.
>
> ok?
>

Reads ok. ok mlarkin

> Index: arch/i386/i386/pmap.c
> ===
> RCS file: /cvs/src/sys/arch/i386/i386/pmap.c,v
> retrieving revision 1.211
> diff -u -p -r1.211 pmap.c
> --- arch/i386/i386/pmap.c 11 Mar 2021 11:16:57 -  1.211
> +++ arch/i386/i386/pmap.c 23 Apr 2021 17:36:57 -
> @@ -1365,7 +1365,7 @@ void
>  pmap_pinit_pd_86(struct pmap *pmap)
>  {
>   /* allocate PDP */
> - pmap->pm_pdir = uvm_km_alloc(kernel_map, NBPG);
> + pmap->pm_pdir = (vaddr_t)km_alloc(NBPG, _any, _dirty, _waitok);
>   if (pmap->pm_pdir == 0)
>   panic("pmap_pinit_pd_86: kernel_map out of virtual space!");
>   pmap_extract(pmap_kernel(), (vaddr_t)pmap->pm_pdir,
> @@ -1397,7 +1397,8 @@ pmap_pinit_pd_86(struct pmap *pmap)
>* execution, one that lacks all kernel mappings.
>*/
>   if (cpu_meltdown) {
> - pmap->pm_pdir_intel = uvm_km_zalloc(kernel_map, NBPG);
> + pmap->pm_pdir_intel = (vaddr_t)km_alloc(NBPG, _any, _zero,
> + _waitok);
>   if (pmap->pm_pdir_intel == 0)
>   panic("%s: kernel_map out of virtual space!", __func__);
>
> @@ -1449,11 +1450,12 @@ pmap_destroy(struct pmap *pmap)
>   uvm_pagefree(pg);
>   }
>
> - uvm_km_free(kernel_map, pmap->pm_pdir, pmap->pm_pdirsize);
> + km_free((void *)pmap->pm_pdir, pmap->pm_pdirsize, _any, _dirty);
>   pmap->pm_pdir = 0;
>
>   if (pmap->pm_pdir_intel) {
> - uvm_km_free(kernel_map, pmap->pm_pdir_intel, pmap->pm_pdirsize);
> + km_free((void *)pmap->pm_pdir_intel, pmap->pm_pdirsize,
> + _any, _dirty);
>   pmap->pm_pdir_intel = 0;
>   }
>
> @@ -2522,8 +2524,9 @@ pmap_enter_special_86(vaddr_t va, paddr_
>   __func__, va);
>
>   if (!pmap->pm_pdir_intel) {
> - if ((pmap->pm_pdir_intel = uvm_km_zalloc(kernel_map, NBPG))
> - == 0)
> + pmap->pm_pdir_intel = (vaddr_t)km_alloc(NBPG, _any, _zero,
> + _waitok);
> + if (pmap->pm_pdir_intel == 0)
>   panic("%s: kernel_map out of virtual space!", __func__);
>   if (!pmap_extract(pmap, pmap->pm_pdir_intel,
>   >pm_pdirpa_intel))
> Index: arch/i386/i386/pmapae.c
> ===
> RCS file: /cvs/src/sys/arch/i386/i386/pmapae.c,v
> retrieving revision 1.60
> diff -u -p -r1.60 pmapae.c
> --- arch/i386/i386/pmapae.c   23 Sep 2020 15:13:26 -  1.60
> +++ arch/i386/i386/pmapae.c   23 Apr 2021 17:59:05 -
> @@ -738,7 +738,7 @@ pmap_bootstrap_pae(void)
>   (uint32_t)VM_PAGE_TO_PHYS(ptppg));
>   }
>   }
> - uvm_km_free(kernel_map, (vaddr_t)pd, NBPG);
> + km_free(pd, NBPG, _any, _dirty);
>   DPRINTF("%s: freeing PDP 0x%x\n", __func__, (uint32_t)pd);
>   }
>
> @@ -944,7 +944,8 @@ pmap_pinit_pd_pae(struct pmap *pmap)
>   paddr_t pdidx[4];
>
>   /* allocate PDP */
> - pmap->pm_pdir = uvm_km_alloc(kernel_map, 4 * NBPG);
> + pmap->pm_pdir = (vaddr_t)km_alloc(4 * NBPG, _any, _dirty,
> + _waitok);
>   if (pmap->pm_pdir == 0)
>   panic("pmap_pinit_pd_pae: kernel_map out of virtual space!");
>   /* page index is in the pmap! */
> @@ -997,7 +998,8 @@ pmap_pinit_pd_pae(struct pmap *pmap)
>   if (cpu_meltdown) {
>   int i;
>
> - if ((va = uvm_km_zalloc(kernel_map, 4 * NBPG)) == 0)
> + va = (vaddr_t)km_alloc(4 * NBPG, _any, _zero, _nowait);
> + if (va == 0)
>   panic("%s: kernel_map out of virtual space!", __func__);
>   if (!pmap_extract(pmap_kernel(),
>   (vaddr_t)>pm_pdidx_intel, >pm_pdirpa_intel))
> @@ -1936,7 +1938,20 @@ pmap_enter_special_pae(vaddr_t va, paddr
>   __func__, va);
>
>   if (!pmap->pm_pdir_intel) {
> - if ((vapd = uvm_km_zalloc(kernel_map, 4 * NBPG)) == 0)
> +#if notyet
> + /*
> +  * XXX mapping is established via pmap_kenter() and lost
> +  * after enabling PAE.
> +  */
> + vapd = (vaddr_t)km_alloc(4 * NBPG, _any, _zero,
> + _waitok);
> 

Re: uvm_map returns unaligned address?

2021-04-23 Thread Mike Larkin
On Fri, Apr 23, 2021 at 01:55:14PM +0200, Alessandro Pistocchi wrote:
> Hi all,
>
> I am fairly new to openbsd so if this is something obvious that I missed
> please be understanding.
>
> I am adding a syscall to openbsd 6.8. I am working on a raspberry pi.
>
> During the syscall I allocate some memory that I want to share between the
> kernel
> and the calling process.
>
> When it's time to wrap up and unmap the memory, I unmap it both from the
> kernel
> map and from the process map.
>
> The unmapping from the process map goes fine, the unmapping from the kernel
> map
> fails by saying that the virtual address in kernel map is not aligned to
> the page size
> ( it's actually 4 bytes off ).
>
> What have I missed? I assumed that uvm_map would return a page aligned
> virtual
> address for the kernel mapping as well.
>
> Here is my code for creating the shared memory chunk:
>
> 
> // memory_size is a multiple of page size
> uvm_object = uao_create(memory_size, 0);
> if(!uvm_object) return;
>
> // TODO(ale): make sure that this memory cannot be swapped out
>
> uao_reference(uvm_object)
> if(uvm_map(kernel_map, (vaddr_t *), round_page(memory_size),
> uvm_object,
>0, 0, UVM_MAPFLAG(PROT_READ | PROT_WRITE, PROT_READ | PROT_WRITE,
>MAP_INHERIT_SHARED, MADV_NORMAL, 0))) {
> uao_detach(uvm_object);
> uvm_object = 0;
> return;
> }
>
> uao_reference(uvm_object);
> if(uvm_map(>p_vmspace->vm_map, _in_proc_space,
> round_page(memory_size), uvm_object,
>0, 0, UVM_MAPFLAG(PROT_READ | PROT_WRITE, PROT_READ | PROT_WRITE,
>MAP_INHERIT_NONE, MADV_NORMAL, 0))) {
> memory = 0;
> uao_detach(uvm_object);
> uao_detach(uvm_object);
> uvm_object = 0;
> return;
> }
> 
>
> Thanks,
> A

Please share the whole diff, this snippet above lacks context.



Re: vmd: spurious VM restarts

2021-04-07 Thread Mike Larkin
On Wed, Apr 07, 2021 at 07:47:28PM -0400, Dave Voutila wrote:
>
> Thomas L. writes:
>
> >> > Thomas: I looked at your host dmesg and your provided vm.conf. It
> >> > looks like 11 vm's with the default 512M memory and one (minecraft)
> >> > with 8G. Your host seems to have only 16GB of memory, some of which
> >> > is probably unavailable as it's used by the integrated gpu. I'm
> >> > wondering if you are effectively oversubscribing your memory here.
> >> >
> >> > I know we currently don't support swapping guest memory out, but not
> >> > sure what happens if we don't have the physical memory to fault a
> >> > page in and wire it.
> >> >
> >>
> >> Something else gets swapped out.
> >
> > Wire == Can't swap out?
>
> Yes.
>
> > top shows 15G real memory available. That should be enough (8G + 11 *
> > 0.5G = 13.5G), or is this inherently risky with 6.8?
>
> With 6.8, the guests might have memory swapped out and worst case you'll
> see some performance issues. That shouldn't cause unexpected
> termination.
>

Depends on the exact content that got swapped out (as we didn't handle
TLB flushes correctly), so a crash was certainly a possibility. That's why
I wanted to see the VMM_DEBUG output.

In any case, Thomas should try -current and see if this problem is even
reproducible.

-ml

> > I can try -current as suggested in the other mail. Is this a likely
> > cause or should I run with VMM_DEBUG for further investigation? Is
> > "somewhat slower" from VMM_DEBUG still usable? I don't need full
> > performance, but ~month downtime until the problem shows again would be
> > too much.
>
> A fix is more likely to land in -current if an issue can be
> identified. Since the issue doesn't sound like it's easily reproducible
> yet, VMM_DEBUG is the best bet for having the information you'd need to
> share when the issue occurs.
>
> >> > Even without a custom kernel with VMM_DEBUG, if it's a uvm_fault
> >> > issue you should see a message in the kernel buffer. Something like:
> >> >
> >> >   vmx_fault_page: uvm_fault returns N, GPA=0x, rip=0x
> >> >
> >> > mlarkin: thoughts on my hypothesis? Am I wildly off course?
> >> >
> >> > -dv
> >> >
> >>
> >> Yeah I was trying to catch the big dump when a VM resets. That would
> >> tell us if the vm caused the reset or if vmd(8) crashed for some
> >> reason.
> >
> > But if vmd crashed it wouldn't restart automatically or does it?
> > All VMs down from vmd crashing would have been noticed.
> > That kernel message would have shown in the dmesg too, wouldn't it?
> >
>
> There are multiple factors. First is vmd(8) is multi-process and a vm's
> process can die without impacting others. Second is the vcpu could be
> reset making the guest "reboot." There are numerous reasons these things
> could happen, hence needing debug logging.
>
> -dv
>



Re: vmd: spurious VM restarts

2021-04-07 Thread Mike Larkin
On Wed, Apr 07, 2021 at 09:23:14AM -0400, Dave Voutila wrote:
>
> Dave Voutila writes:
>
> > Mike Larkin writes:
> >
> >> On Wed, Apr 07, 2021 at 12:22:23AM +0200, Thomas L. wrote:
> >>> On Tue, 6 Apr 2021 14:28:09 -0700
> >>> Mike Larkin  wrote:
> >>>
> >>> > On Tue, Apr 06, 2021 at 09:15:10PM +0200, Thomas L. wrote:
> >>> > > On Tue, 6 Apr 2021 11:11:01 -0700
> >>> > > Mike Larkin  wrote:
> >>> > > > Anything in the host's dmesg?
> >>> > >
> >>> >
> >>> > *host* dmesg. I think you misread what I was after...
> >>>
> >>> The dmesg of the host was already attached to the first mail below the
> >>> vm.conf (I mistakenly called the host hypervisor, which I realize now is
> >>> not accurate). I figured since it was already attached, that
> >>> you must mean the VM, compounding the confusion ...
> >>>
> >>> Kind regards,
> >>>
> >>> Thomas
> >>>
> >>
> >> I see.
> >>
> >> You'll probably need to build a kernel with VMM_DEBUG and save that output 
> >> and
> >> send it to me once a VM crashes. Note: it will generate a lot of output and
> >> probably make things somewhat slower.
> >>
> >> -ml
> >
> > Thomas: I looked at your host dmesg and your provided vm.conf. It looks
> > like 11 vm's with the default 512M memory and one (minecraft) with
> > 8G. Your host seems to have only 16GB of memory, some of which is
> > probably unavailable as it's used by the integrated gpu. I'm wondering
> > if you are effectively oversubscribing your memory here.
> >
> > I know we currently don't support swapping guest memory out, but not
> > sure what happens if we don't have the physical memory to fault a page
> > in and wire it.
>
> Looked a bit further and since your host is running 6.8 it doesn't have
> wiring memory logic, but I'd still be cautious about oversubscribing
> memory.
>

Yep. Try -current and see if this can be reproduced.

> >
> > Even without a custom kernel with VMM_DEBUG, if it's a uvm_fault issue
> > you should see a message in the kernel buffer. Something like:
> >
> >   vmx_fault_page: uvm_fault returns N, GPA=0x, rip=0x
> >
>
> You can also run vmd(8) with debug logging (-v or -vv) and maybe capture
> these events. Like with vmm(4) logging, it can be excessively verbose.
>
> > mlarkin: thoughts on my hypothesis? Am I wildly off course?
> >
> > -dv
>



Re: vmd: spurious VM restarts

2021-04-07 Thread Mike Larkin
On Wed, Apr 07, 2021 at 07:26:41AM -0400, Dave Voutila wrote:
>
> Mike Larkin writes:
>
> > On Wed, Apr 07, 2021 at 12:22:23AM +0200, Thomas L. wrote:
> >> On Tue, 6 Apr 2021 14:28:09 -0700
> >> Mike Larkin  wrote:
> >>
> >> > On Tue, Apr 06, 2021 at 09:15:10PM +0200, Thomas L. wrote:
> >> > > On Tue, 6 Apr 2021 11:11:01 -0700
> >> > > Mike Larkin  wrote:
> >> > > > Anything in the host's dmesg?
> >> > >
> >> >
> >> > *host* dmesg. I think you misread what I was after...
> >>
> >> The dmesg of the host was already attached to the first mail below the
> >> vm.conf (I mistakenly called the host hypervisor, which I realize now is
> >> not accurate). I figured since it was already attached, that
> >> you must mean the VM, compounding the confusion ...
> >>
> >> Kind regards,
> >>
> >> Thomas
> >>
> >
> > I see.
> >
> > You'll probably need to build a kernel with VMM_DEBUG and save that output 
> > and
> > send it to me once a VM crashes. Note: it will generate a lot of output and
> > probably make things somewhat slower.
> >
> > -ml
>
> Thomas: I looked at your host dmesg and your provided vm.conf. It looks
> like 11 vm's with the default 512M memory and one (minecraft) with
> 8G. Your host seems to have only 16GB of memory, some of which is
> probably unavailable as it's used by the integrated gpu. I'm wondering
> if you are effectively oversubscribing your memory here.
>
> I know we currently don't support swapping guest memory out, but not
> sure what happens if we don't have the physical memory to fault a page
> in and wire it.
>

Something else gets swapped out.

> Even without a custom kernel with VMM_DEBUG, if it's a uvm_fault issue
> you should see a message in the kernel buffer. Something like:
>
>   vmx_fault_page: uvm_fault returns N, GPA=0x, rip=0x
>
> mlarkin: thoughts on my hypothesis? Am I wildly off course?
>
> -dv
>

Yeah I was trying to catch the big dump when a VM resets. That would tell
us if the vm caused the reset or if vmd(8) crashed for some reason.



Re: vmd: spurious VM restarts

2021-04-06 Thread Mike Larkin
On Wed, Apr 07, 2021 at 12:22:23AM +0200, Thomas L. wrote:
> On Tue, 6 Apr 2021 14:28:09 -0700
> Mike Larkin  wrote:
>
> > On Tue, Apr 06, 2021 at 09:15:10PM +0200, Thomas L. wrote:
> > > On Tue, 6 Apr 2021 11:11:01 -0700
> > > Mike Larkin  wrote:
> > > > Anything in the host's dmesg?
> > >
> >
> > *host* dmesg. I think you misread what I was after...
>
> The dmesg of the host was already attached to the first mail below the
> vm.conf (I mistakenly called the host hypervisor, which I realize now is
> not accurate). I figured since it was already attached, that
> you must mean the VM, compounding the confusion ...
>
> Kind regards,
>
> Thomas
>

I see.

You'll probably need to build a kernel with VMM_DEBUG and save that output and
send it to me once a VM crashes. Note: it will generate a lot of output and
probably make things somewhat slower.

-ml



Re: vmd: spurious VM restarts

2021-04-06 Thread Mike Larkin
On Tue, Apr 06, 2021 at 09:15:10PM +0200, Thomas L. wrote:
> On Tue, 6 Apr 2021 11:11:01 -0700
> Mike Larkin  wrote:
> > Anything in the host's dmesg?
>

*host* dmesg. I think you misread what I was after...

> Below is the dmesg and latest syslog from one of the VMs.
>
> OpenBSD 6.8 (GENERIC) #1: Tue Nov  3 09:04:47 MST 2020
> r...@syspatch-68-amd64.openbsd.org:/usr/src/sys/arch/amd64/compile/GENERIC
> real mem = 520085504 (495MB)
> avail mem = 489435136 (466MB)
> random: good seed from bootblocks
> mpath0 at root
> scsibus0 at mpath0: 256 targets
> mainbus0 at root
> bios0 at mainbus0: SMBIOS rev. 2.4 @ 0xf3f40 (10 entries)
> bios0: vendor SeaBIOS version "1.11.0p3-OpenBSD-vmm" date 01/01/2011
> bios0: OpenBSD VMM
> acpi at bios0 not configured
> cpu0 at mainbus0: (uniprocessor)
> cpu0: Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz, 3403.18 MHz, 06-3a-09
> cpu0: 
> FPU,VME,DE,PSE,TSC,MSR,PAE,CX8,SEP,PGE,CMOV,PAT,PSE36,CFLUSH,MMX,FXSR,SSE,SSE2,SSE3,PCLMUL,SSSE3,CX16,SSE4.1,SSE4.2,POPCNT,AES,XSAVE,AVX,F16C,RDRAND,HV,NXE,LONG,LAHF,ITSC,FSGSBASE,SMEP,ERMS,MD_CLEAR,MELTDOWN
> cpu0: 256KB 64b/line 8-way L2 cache
> cpu0: smt 0, core 0, package 0
> cpu0: using VERW MDS workaround
> pvbus0 at mainbus0: OpenBSD
> pvclock0 at pvbus0
> pci0 at mainbus0 bus 0
> pchb0 at pci0 dev 0 function 0 "OpenBSD VMM Host" rev 0x00
> virtio0 at pci0 dev 1 function 0 "Qumranet Virtio RNG" rev 0x00
> viornd0 at virtio0
> virtio0: irq 3
> virtio1 at pci0 dev 2 function 0 "Qumranet Virtio Network" rev 0x00
> vio0 at virtio1: address fe:e1:ba:d0:00:04
> virtio1: irq 5
> virtio2 at pci0 dev 3 function 0 "Qumranet Virtio Storage" rev 0x00
> vioblk0 at virtio2
> scsibus1 at vioblk0: 1 targets
> sd0 at scsibus1 targ 0 lun 0: 
> sd0: 307200MB, 512 bytes/sector, 629145600 sectors
> virtio2: irq 6
> virtio3 at pci0 dev 4 function 0 "OpenBSD VMM Control" rev 0x00
> vmmci0 at virtio3
> virtio3: irq 7
> isa0 at mainbus0
> isadma0 at isa0
> com0 at isa0 port 0x3f8/8 irq 4: ns8250, no fifo
> com0: console
> vscsi0 at root
> scsibus2 at vscsi0: 256 targets
> softraid0 at root
> scsibus3 at softraid0: 256 targets
> root on sd0a (c14ce37920a910f7.a) swap on sd0b dump on sd0b
> WARNING: / was not properly unmounted
>
> Apr  6 14:39:33 schleuder /bsd: OpenBSD 6.8 (GENERIC) #1: Tue Nov  3 09:04:47 
> MST 2020
> Apr  6 14:39:33 schleuder /bsd: 
> r...@syspatch-68-amd64.openbsd.org:/usr/src/sys/arch/amd64/compile/GENERIC
> Apr  6 14:39:33 schleuder /bsd: real mem = 520085504 (495MB)
> Apr  6 14:39:33 schleuder /bsd: avail mem = 489435136 (466MB)
> Apr  6 14:39:33 schleuder /bsd: random: good seed from bootblocks
> Apr  6 14:39:33 schleuder /bsd: mpath0 at root
> Apr  6 14:39:33 schleuder /bsd: scsibus0 at mpath0: 256 targets
> Apr  6 14:39:33 schleuder /bsd: mainbus0 at root
> Apr  6 14:39:33 schleuder /bsd: bios0 at mainbus0: SMBIOS rev. 2.4 @ 0xf3f40 
> (10 entries)
> Apr  6 14:39:33 schleuder /bsd: bios0: vendor SeaBIOS version 
> "1.11.0p3-OpenBSD-vmm" date 01/01/2011
> Apr  6 14:39:33 schleuder /bsd: bios0: OpenBSD VMM
> Apr  6 14:39:33 schleuder /bsd: acpi at bios0 not configured
> Apr  6 14:39:33 schleuder /bsd: cpu0 at mainbus0: (uniprocessor)
> Apr  6 14:39:33 schleuder /bsd: cpu0: Intel(R) Core(TM) i7-3770 CPU @ 
> 3.40GHz, 3403.18 MHz, 06-3a-09
> Apr  6 14:39:33 schleuder /bsd: cpu0: 
> FPU,VME,DE,PSE,TSC,MSR,PAE,CX8,SEP,PGE,CMOV,PAT,PSE36,CFLUSH,MMX,FXSR,SSE,SSE2,SSE3,PCLMUL,SSSE3,CX16,SSE4.1,SSE4.2,POPCNT,AES,XSAVE,AVX,F16C,RDRAND,HV,NXE,LONG,LAHF,ITSC,FSGSBASE,SMEP,ERMS,MD_CLEAR,MELTDOWN
> Apr  6 14:39:33 schleuder /bsd: cpu0: 256KB 64b/line 8-way L2 cache
> Apr  6 14:39:33 schleuder /bsd: cpu0: smt 0, core 0, package 0
> Apr  6 14:39:33 schleuder /bsd: cpu0: using VERW MDS workaround
> Apr  6 14:39:33 schleuder /bsd: pvbus0 at mainbus0: OpenBSD
> Apr  6 14:39:33 schleuder /bsd: pvclock0 at pvbus0
> Apr  6 14:39:33 schleuder /bsd: pci0 at mainbus0 bus 0
> Apr  6 14:39:33 schleuder /bsd: pchb0 at pci0 dev 0 function 0 "OpenBSD VMM 
> Host" rev 0x00
> Apr  6 14:39:33 schleuder /bsd: virtio0 at pci0 dev 1 function 0 "Qumranet 
> Virtio RNG" rev 0x00
> Apr  6 14:39:33 schleuder /bsd: viornd0 at virtio0
> Apr  6 14:39:33 schleuder /bsd: virtio0: irq 3
> Apr  6 14:39:33 schleuder /bsd: virtio1 at pci0 dev 2 function 0 "Qumranet 
> Virtio Network" rev 0x00
> Apr  6 14:39:33 schleuder /bsd: vio0 at virtio1: address fe:e1:ba:d0:00:04
> Apr  6 14:39:33 schleuder /bsd: virtio1: irq 5
> Apr  6 14:39:33 schleuder /bsd: virtio2 at pci0 dev 3 function 0 "Qumranet 
> Virtio Storage" rev 0x00
> Apr  6 14:39:33 schleuder /bsd: vioblk0 at virtio2
> Apr  6 

Re: vmd(8): send correct response on unpause error

2021-04-06 Thread Mike Larkin
On Fri, Apr 02, 2021 at 07:14:34PM -0400, Dave Voutila wrote:
> If vmctl(8) sends an unpause request for a vm that doesn't exist, vmd(8)
> should be responding with the IMSG_VMDOP_UNPAUSE_VM_RESPONSE imsg_type
> with an ENOENT error code. (Similarly if the request comes from a user
> without permissions to unpause, the error is EPERM but the imsg_type is
> wrong.)
>
> Since the handling for pause/unpause are the same code path, vmd(8) is
> sending an IMSG_VMDOP_PAUSE_VM_RESPONSE in these situations (i.e. on an
> error unpausing).
>
> The below diff sets the cmd correctly based on the imsg being
> processed.
>
> For context, case statement in this switch block looks like:
>
>   case IMSG_VMDOP_PAUSE_VM:
>   case IMSG_VMDOP_UNPAUSE_VM:
>   IMSG_SIZE_CHECK(imsg, &vid);
>   memcpy(&vid, imsg->data, sizeof(vid));
> ..
>
> OK?
>
> -dv
>

This is ok mlarkin@ if it wasn't already committed.

-ml

>
> Index: usr.sbin/vmd/vmd.c
> ===
> RCS file: /cvs/src/usr.sbin/vmd/vmd.c,v
> retrieving revision 1.121
> diff -u -p -r1.121 vmd.c
> --- usr.sbin/vmd/vmd.c29 Mar 2021 23:37:01 -  1.121
> +++ usr.sbin/vmd/vmd.c2 Apr 2021 23:06:47 -
> @@ -203,20 +203,26 @@ vmd_dispatch_control(int fd, struct priv
>   if (vid.vid_id == 0) {
>   if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
>   res = ENOENT;
> - cmd = IMSG_VMDOP_PAUSE_VM_RESPONSE;
> + cmd = imsg->hdr.type == IMSG_VMDOP_PAUSE_VM
> + ? IMSG_VMDOP_PAUSE_VM_RESPONSE
> + : IMSG_VMDOP_UNPAUSE_VM_RESPONSE;
>   break;
>   } else {
>   vid.vid_id = vm->vm_vmid;
>   }
>   } else if ((vm = vm_getbyid(vid.vid_id)) == NULL) {
>   res = ENOENT;
> - cmd = IMSG_VMDOP_PAUSE_VM_RESPONSE;
> + cmd = imsg->hdr.type == IMSG_VMDOP_PAUSE_VM
> + ? IMSG_VMDOP_PAUSE_VM_RESPONSE
> + : IMSG_VMDOP_UNPAUSE_VM_RESPONSE;
>   break;
>   }
>   if (vm_checkperm(vm, >vm_params.vmc_owner,
>   vid.vid_uid) != 0) {
>   res = EPERM;
> - cmd = IMSG_VMDOP_PAUSE_VM_RESPONSE;
> + cmd = imsg->hdr.type == IMSG_VMDOP_PAUSE_VM
> + ? IMSG_VMDOP_PAUSE_VM_RESPONSE
> + : IMSG_VMDOP_UNPAUSE_VM_RESPONSE;
>   break;
>   }
>   proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
>



Re: vmd: spurious VM restarts

2021-04-06 Thread Mike Larkin
On Tue, Apr 06, 2021 at 07:47:52PM +0200, Thomas L. wrote:
> Hi,
>
> I'm running OpenBSD 6.8 as hypervisor with multiple OpenBSD VMs.
> Regularly, it happens that all VMs are restarted, not at the same time
> but clustered. The indication that this happened is reduced uptime on the
> VMs, some services that fail to come up again and the following logs:
>
> # grep vmd /var/log/daemon
> Apr  1 18:10:35 golem vmd[31367]: wiki: started vm 12 successfully, tty 
> /dev/ttyp0
> Apr  6 13:24:52 golem vmd[31367]: matrix: started vm 13 successfully, tty 
> /dev/ttypb
> Apr  6 13:25:55 golem vmd[31367]: matrix: started vm 13 successfully, tty 
> /dev/ttypb
> Apr  6 13:26:45 golem vmd[18933]: vmd: LSR UART write 0x8203d260 unsupported
> Apr  6 13:26:45 golem vmd[31367]: ticketfrei: started vm 5 successfully, tty 
> /dev/ttyp5
> Apr  6 14:22:34 golem vmd[31367]: www: started vm 4 successfully, tty 
> /dev/ttyp4
> Apr  6 14:33:54 golem vmd[31367]: kibicara: started vm 8 successfully, tty 
> /dev/ttyp8
> Apr  6 14:35:02 golem vmd[31367]: vpn: started vm 3 successfully, tty 
> /dev/ttyp3
> Apr  6 14:36:38 golem vmd[31367]: relay: started vm 1 successfully, tty 
> /dev/ttyp1
> Apr  6 14:37:51 golem vmd[31367]: schleuder: started vm 2 successfully, tty 
> /dev/ttyp2
> Apr  6 14:40:34 golem vmd[31367]: mumble: started vm 6 successfully, tty 
> /dev/ttyp6
> Apr  6 14:41:58 golem vmd[31367]: minecraft: started vm 9 successfully, tty 
> /dev/ttyp9
>
> The restarts seem to be non-graceful, since the matrix vm needed manual
> fsck on /var. Going back over the logs this seems to happen about every
> month (not all restarts are this phenomenon, but Mar 8/10 and Feb
> 17/20/22 seem like it):
>
> # zgrep vmd /var/log/daemon.0.gz
> Mar  8 19:43:07 golem vmd[31367]: wiki: started vm 12 successfully, tty 
> /dev/ttyp0
> Mar  8 19:43:37 golem vmd[31367]: ticketfrei: started vm 5 successfully, tty 
> /dev/ttyp5
> Mar 10 09:21:20 golem vmd[31367]: www: started vm 4 successfully, tty 
> /dev/ttyp4
> Mar 10 09:24:13 golem vmd[31367]: kibicara: started vm 8 successfully, tty 
> /dev/ttyp8
> Mar 10 09:26:13 golem vmd[31367]: vpn: started vm 3 successfully, tty 
> /dev/ttyp3
> Mar 10 09:28:40 golem vmd[31367]: gitea: started vm 7 successfully, tty 
> /dev/ttyp7
> Mar 10 09:29:01 golem vmd[31367]: relay: started vm 1 successfully, tty 
> /dev/ttyp1
> Mar 10 09:31:29 golem vmd[31367]: schleuder: started vm 2 successfully, tty 
> /dev/ttyp2
> Mar 10 09:34:02 golem vmd[31367]: mumble: started vm 6 successfully, tty 
> /dev/ttyp6
> Mar 10 09:35:44 golem vmd[31367]: minecraft: started vm 9 successfully, tty 
> /dev/ttyp9
> Mar 13 01:46:37 golem vmd[31367]: gitea: started vm 7 successfully, tty 
> /dev/ttyp7
> golem# zgrep vmd /var/log/daemon.1.gz
> Feb 17 21:18:45 golem vmd[31367]: matrix: started vm 13 successfully, tty 
> /dev/ttypc
> Feb 20 08:32:28 golem vmd[31367]: wiki: started vm 12 successfully, tty 
> /dev/ttyp0
> Feb 20 08:33:14 golem vmd[31367]: ticketfrei: started vm 5 successfully, tty 
> /dev/ttyp5
> Feb 20 08:35:20 golem vmd[31367]: www: started vm 4 successfully, tty 
> /dev/ttyp4
> Feb 20 11:09:01 golem vmd[31367]: kibicara: started vm 8 successfully, tty 
> /dev/ttyp8
> Feb 20 11:10:18 golem vmd[31367]: vpn: started vm 3 successfully, tty 
> /dev/ttyp3
> Feb 20 11:11:52 golem vmd[31367]: gitea: started vm 7 successfully, tty 
> /dev/ttyp7
> Feb 22 00:51:03 golem vmd[31367]: relay: started vm 1 successfully, tty 
> /dev/ttyp1
> Feb 22 00:52:44 golem vmd[31367]: schleuder: started vm 2 successfully, tty 
> /dev/ttyp2
> Feb 22 00:53:59 golem vmd[31367]: mumble: started vm 6 successfully, tty 
> /dev/ttyp6
> Feb 22 00:54:45 golem vmd[31367]: minecraft: started vm 9 successfully, tty 
> /dev/ttyp9
> Feb 24 23:01:50 golem vmd[31367]: vmd_sighdlr: reload requested with SIGHUP
> Feb 24 23:01:51 golem vmd[31367]: test: started vm 10 successfully, tty 
> /dev/ttypa
> Feb 24 23:01:51 golem vmd[52735]: test: unsupported refcount size
> Feb 24 23:06:27 golem vmd[31367]: vmd_sighdlr: reload requested with SIGHUP
> Feb 24 23:06:27 golem vmd[1230]: test: unsupported refcount size
> Feb 24 23:06:27 golem vmd[31367]: matrix: started vm 13 successfully, tty 
> /dev/ttypb
> Feb 24 23:06:27 golem vmd[31367]: test: started vm 10 successfully, tty 
> /dev/ttypc
> Feb 24 23:10:20 golem vmd[31367]: matrix: started vm 13 successfully, tty 
> /dev/ttypb
>
> vm.conf and dmesg of the hypervisor are below. How would I go
> about debugging this?
>
> Kind regards,
>
> Thomas
>

Anything in the host's dmesg?

>
> switch internal {
>   interface bridge0
>   locked lladdr
>   group internal
> }
>
>
> vm relay {
>   disk /data/vmd/relay.qcow2
>   interface {
>   switch internal
>   lladdr fe:e1:ba:d0:00:03
>   }
> }
>
> vm schleuder {
>   disk /data/vmd/schleuder.qcow2
>   interface {
>   switch internal
>   lladdr fe:e1:ba:d0:00:04
>   }
> }
>
> vm vpn {
>   disk 

Re: amd64: add MSR_TSC_ADJUST

2021-04-06 Thread Mike Larkin
On Mon, Apr 05, 2021 at 07:37:51PM -0500, Scott Cheloha wrote:
> Intel calls it "IA32_TSC_ADJUST".  Is "MSR_TSC_ADJUST" fine or should
> it be "MSR_IA32_TSC_ADJUST"?
>
> We have a feature flag for this one already, SEFF0EBX_TSC_ADJUST.
>
> Index: specialreg.h
> ===
> RCS file: /cvs/src/sys/arch/amd64/include/specialreg.h,v
> retrieving revision 1.89
> diff -u -p -r1.89 specialreg.h
> --- specialreg.h  29 Mar 2021 12:39:02 -  1.89
> +++ specialreg.h  6 Apr 2021 00:31:58 -
> @@ -352,6 +352,7 @@
>  #define MSR_EBC_FREQUENCY_ID0x02c   /* Pentium 4 only */
>  #define  MSR_TEST_CTL0x033
>  #define MSR_IA32_FEATURE_CONTROL 0x03a
> +#define MSR_TSC_ADJUST   0x03b
>  #define MSR_SPEC_CTRL0x048   /* Speculation Control IBRS / 
> STIBP */
>  #define SPEC_CTRL_IBRS   (1ULL << 0)
>  #define SPEC_CTRL_STIBP  (1ULL << 1)
>

This seems fine to me. ok mlarkin



Re: monotonic time going back by wrong skews

2021-04-05 Thread Mike Larkin
On Sat, Apr 03, 2021 at 10:21:02PM -0500, Scott Cheloha wrote:
> On Fri, Apr 02, 2021 at 10:37:36AM -0700, Mike Larkin wrote:
> > On Thu, Apr 01, 2021 at 06:43:30PM -0500, Scott Cheloha wrote:
> > >
> > > [...]
> > >
> > > Hmmm.  Being able to work around this would be nice.
> > >
> > > FreeBSD has code that uses WRMSR to synchronize the TSC:
> > >
> > > https://cgit.freebsd.org/src/commit/sys/x86/x86/tsc.c?id=b2c63698d4b81576e0c8842263ee86e86cd34e76
> > >
> > > My guess is that support for writing the TSC is not implemented by
> > > every hypervisor, so we would need to be very careful in deciding when
> > > to try it.  Otherwise we end up with protection faults and other crap
> > > we don't want.
> > >
> >
> > We implemented rdmsr_safe for things like this. We could probably do the 
> > same
> > for wrmsr.
>
> Like this?
>
> Sorry if this is not idiomatic.  I don't write much assembly.
>
> I tested this a bit on my laptop.  Stuff like:
>
>   wrmsr_safe(MSR_TSC, rdtsc() + 100);
>
> Which seems to desync the normally synchronized TSCs here.
>
> Unclear what the rules are for RETGUARD.  I just copied what was in
> rdmsr_safe().  We're not using R10 so we can use R10?
>
> -Scott
>
> Index: include/cpufunc.h
> ===
> RCS file: /cvs/src/sys/arch/amd64/include/cpufunc.h,v
> retrieving revision 1.36
> diff -u -p -r1.36 cpufunc.h
> --- include/cpufunc.h 13 Sep 2020 11:53:16 -  1.36
> +++ include/cpufunc.h 4 Apr 2021 03:16:48 -
> @@ -398,6 +398,7 @@ struct cpu_info_full;
>  void cpu_enter_pages(struct cpu_info_full *);
>
>  int rdmsr_safe(u_int msr, uint64_t *);
> +int wrmsr_safe(uint32_t msr, uint64_t);
>
>  #endif /* _KERNEL */
>
> Index: amd64/locore.S
> ===
> RCS file: /cvs/src/sys/arch/amd64/amd64/locore.S,v
> retrieving revision 1.122
> diff -u -p -r1.122 locore.S
> --- amd64/locore.S3 Nov 2020 18:19:31 -   1.122
> +++ amd64/locore.S4 Apr 2021 03:16:48 -
> @@ -1154,6 +1154,30 @@ NENTRY(rdmsr_resume)
>   ret
>  END(rdmsr_safe)
>
> +/* int wrmsr_safe(uint32_t msr, uint64_t val) */
> +ENTRY(wrmsr_safe)
> + RETGUARD_SETUP(wrmsr_safe, r10)
> +
> + movl%edi,   %ecx/* uint32_t msr */
> +
> + movl%esi,   %eax/* uint64_t val */
> + sarq$32,%rsi
> + movl%esi,   %edx
> +
> + .globl  wrmsr_safe_fault
> +wrmsr_safe_fault:
> + wrmsr
> +
> + xorq%rax,   %rax
> + RETGUARD_CHECK(rdmsr_safe, r10)
> + ret
> +
> +NENTRY(wrmsr_resume)
> + movq$0x1,   %rax
> + RETGUARD_CHECK(wrmsr_safe, r10)
> + ret
> +END(wrmsr_safe)
> +
>  #if NXEN > 0
>   /* Hypercall page needs to be page aligned */
>   .text
>

You will need the handler case in vector.S also (like we did for rdmsr_safe).

(Sorry if this reply hits the list twice; mailer error on previous attempt).

-ml



Re: vmm.4: document supported ioctls

2021-04-02 Thread Mike Larkin
On Fri, Apr 02, 2021 at 03:24:55AM +0200, Klemens Nanni wrote:
> On Thu, Apr 01, 2021 at 08:34:37PM -0400, Dave Voutila wrote:
> > I've updated the diff using your feedback. See below.
> Thanks, OK kn
>

ok mlarkin@ also



Re: monotonic time going back by wrong skews

2021-04-02 Thread Mike Larkin
On Thu, Apr 01, 2021 at 06:43:30PM -0500, Scott Cheloha wrote:
> On Thu, Apr 01, 2021 at 03:41:24PM -0400, Josh Rickmar wrote:
> > On Thu, Apr 01, 2021 at 03:22:00PM -0400, Josh Rickmar wrote:
> > > On Thu, Apr 01, 2021 at 02:15:48PM -0500, Scott Cheloha wrote:
> > > > On Sat, Mar 27, 2021 at 02:20:21AM +, Stefmorino wrote:
> > > > > > Feel free to share your raw data.
> > > > >
> > > > > Also includes some standard sendbug dumps: https://0x0.st/-qng.tgz
> > > >
> > > > Thanks!
> > > >
> > > > TL;DR:
> > > >
> > > > Two things:
> > > >
> > > > 1. Could you check whether Linux will use the TSC as a clocksource on
> > > >this machine?  The dmesg output on any given distribution should
> > > >contain lines about the TSC.
> > > >
> > > >[...]
> > > >
> > > Hey, thanks for the reminder to try this out with Linux.  Will give it
> > > a shot shortly.
> > >
> > > As for the BIOS, 1.58 is the current version (found here):
> > >
> > > https://support.lenovo.com/us/en/downloads/ds503790
> > >
> > > This same issue was happening with all older BIOS versions that I have
> > > used as well.
>
> Okay, not great news, but at least the behavior is consistent.
>
> > Seems Linux doesn't like it either:
> >
> > localhost:~# dmesg | egrep -i 'tsc|clocksource'
> > [0.00] tsc: Fast TSC calibration using PIT
> > [0.00] tsc: Detected 1996.173 MHz processor
> > [0.043227] clocksource: refined-jiffies: mask: 0x max_cycles: 
> > 0x, max_idle_ms: 6370452778343963 ns
> > [0.114728] clocksource: hpet: mask: 0x max_cycles: 0x, 
> > max_idle_ns: 133484873504 ns
> > [0.131435] clocksource: tsc-early: mask: 0x max_cycles: 
> > 0x398c1ebcd00, max_idle_ns: 881590807727 ns
> > [0.244772] TSC synchronization [CPU#0 -> CPU#1]:
> > [0.244772] Measured 7296391160 warp between CPUs, turning off TSC clock.
> > [0.244772] tsc: Marking TSC unstable due to check_tsc_sync_source_failed
> > [0.252185] clocksource: jiffies: mask: 0x max_cycles: 
> > 0x, max_idle_ns: 6370867519511994 ns
> > [0.316884] clocksource: Switched to clocksource hpet
> > [0.335046] clocksource: acpi_pm: mask: 0xff max_cycles: 0xff, 
> > max_idle_ns: 2085701024 ns
>
> Hmmm.  Being able to work around this would be nice.
>
> FreeBSD has code that uses WRMSR to synchronize the TSC:
>
> https://cgit.freebsd.org/src/commit/sys/x86/x86/tsc.c?id=b2c63698d4b81576e0c8842263ee86e86cd34e76
>
> My guess is that support for writing the TSC is not implemented by
> every hypervisor, so we would need to be very careful in deciding when
> to try it.  Otherwise we end up with protection faults and other crap
> we don't want.
>

We implemented rdmsr_safe for things like this. We could probably do the same
for wrmsr.

-ml

> Doing this via TSC_ADJUST (instead of writing the TSC directly) is
> nicer because you just check for the CPUID level and bit.  No
> guesswork.  But we can't in your case because, as I said, no
> TSC_ADJUST support on your CPU.
>



Re: vmctl: off-by-one error handling mixing -a with a VM id

2021-03-29 Thread Mike Larkin
On Fri, Mar 26, 2021 at 07:24:32AM -0400, Dave Voutila wrote:
>
> Theo Buehler writes:
>
> > On Thu, Mar 25, 2021 at 08:07:53PM +0100, Preben Guldberg wrote:
> >> Dave Voutila wrote:
> >> > Preben Guldberg writes:
> >> > > The patch below addresses an off-by-one error reading argv when
> >> > > generating the error message.
> >>
> >> > > I personally find it clearer if the condition of mixing -a with an id
> >> > > is highlighted. I included a suggestion in the patch below.
> >>
> >> > Since -a and providing an id are mutually exclusive, I think it's more
> >> > helpful to print usage information via ctl_usage(res->ctl). From the
> >> > usage details, it's self explanatory what's wrong.
> >>
> >> >   usage:  vmctl [-v] stop [-fw] [id | -a]
> >>
> >> The updated diff below would do just that:
> >>
> >> % vmctl stop -a testvm
> >> usage:  vmctl [-v] stop [-fw] [id | -a]
> >
> > Yes, your diff would do that.
> >
> > However, I think the current logic is both wrong and the wrong way
> > around.  I believe the following is much clearer. It doesn't have a dead
> > else branch and it deletes 'ret', so it doesn't use it uninitialized when
> > checking 'res->action == CMD_STOPALL && ret != -1' (e.g. 'vmctl stop -a').
> > Since the diff is slightly messy, this is the result:
> >
> > if (res->action == CMD_STOPALL) {
> > if (argc != 0)
> > ctl_usage(res->ctl);
> > } else {
> > if (argc != 1)
> > ctl_usage(res->ctl);
> > if (parse_vmid(res, argv[0], 0) == -1)
> > errx(1, "invalid id: %s", argv[0]);
> > }
> >
> > return (vmmaction(res));
>
> I like this a lot better. The only thing to note is the only code path I
> can identify that will result in "invalid id" is using '-' as the
> id...parse_vmid prints warnings itself for other use cases. Having the
> errx here though is a nice guard if someone changes parse_vmid in the future.
>
> OK dv@
>

also ok mlarkin@

> >
> > Index: main.c
> > ===
> > RCS file: /cvs/src/usr.sbin/vmctl/main.c,v
> > retrieving revision 1.62
> > diff -u -p -r1.62 main.c
> > --- main.c  3 Jan 2020 05:32:00 -   1.62
> > +++ main.c  25 Mar 2021 19:23:16 -
> > @@ -927,7 +927,7 @@ ctl_start(struct parse_result *res, int
> >  int
> >  ctl_stop(struct parse_result *res, int argc, char *argv[])
> >  {
> > -   int  ch, ret;
> > +   int  ch;
> >
> > while ((ch = getopt(argc, argv, "afw")) != -1) {
> > switch (ch) {
> > @@ -948,20 +948,15 @@ ctl_stop(struct parse_result *res, int a
> > argc -= optind;
> > argv += optind;
> >
> > -   if (argc == 0) {
> > -   if (res->action != CMD_STOPALL)
> > +   if (res->action == CMD_STOPALL) {
> > +   if (argc != 0)
> > ctl_usage(res->ctl);
> > -   } else if (argc > 1)
> > -   ctl_usage(res->ctl);
> > -   else if (argc == 1)
> > -   ret = parse_vmid(res, argv[0], 0);
> > -   else
> > -   ret = -1;
> > -
> > -   /* VM id is only expected without the -a flag */
> > -   if ((res->action != CMD_STOPALL && ret == -1) ||
> > -   (res->action == CMD_STOPALL && ret != -1))
> > -   errx(1, "invalid id: %s", argv[1]);
> > +   } else {
> > +   if (argc != 1)
> > +   ctl_usage(res->ctl);
> > +   if (parse_vmid(res, argv[0], 0) == -1)
> > +   errx(1, "invalid id: %s", argv[0]);
> > +   }
> >
> > return (vmmaction(res));
> >  }
>
>
> --
> -Dave Voutila
>



Re: patch: vmm(4) IA32_EPT_VPID_CAP_XO_TRANSLATIONS specified incorrectly.

2021-03-29 Thread Mike Larkin
On Sat, Mar 27, 2021 at 10:15:27AM -0400, Dave Voutila wrote:
>
> Adam Steen writes:
>
> > Hi
> >
> > IA32_EPT_VPID_CAP_XO_TRANSLATIONS is specified incorrectly, see the
> > patch below.
>
> Adam's diff looks correct to me based on reading Intel SDM Vol 3D,
> Appendix A.10 (VPID and EPT Capabilities) [1]:
>
>   The IA32_VMX_EPT_VPID_CAP MSR (index 48CH) reports information about
>   the capabilities of the logical processor with regard to
>   virtual-processor identifiers (VPIDs, Section 28.1) and extended page
>   tables (EPT, Section 28.2):
>
> * If bit 0 is read as 1, the processor supports execute-only
> translations by EPT. This support allows software to configure EPT
> paging-structure entries in which bits 1:0 are clear (indicating
> that data accesses are not allowed) and bit 2 is set (indicating
> that instruction fetches are allowed).
>
> ...
>
> IA32_EPT_VPID_CAP_XO_TRANSLATIONS is only referenced in vmm.c.
>
> I've updated the diff so it applies cleanly, but didn't change the name
> of the capability as it's more accurate with "TRANSLATIONS" included
> imo.
>
> OK?
>
> >
> > Cheers
> > Adam
> >
> > On Fri, Feb 26, 2021 at 01:08:17PM +0800, Adam Steen wrote:
> >> Hi
> >>
> >> IA32_EPT_VPID_CAP_XO_TRANSLATIONS is specified as 0x0 and not (1ULL << 0)
> >> ie 0 and not bit 0 as on.
> >>
> >> Please see the attach diff to correct this and rename
> >> IA32_EPT_VPID_CAP_XO_TRANSLATIONS to IA32_EPT_VPID_CAP_XO to reduce
> >> wordiness.
> >>
> >> Cheers
> >> Adam
> >>
>
> [1] 
> https://software.intel.com/content/www/us/en/develop/download/intel-64-and-ia-32-architectures-sdm-volume-3d-system-programming-guide-part-4.html
>
> -Dave
>
>
> Index: sys/arch/amd64/include/specialreg.h
> ===
> RCS file: /cvs/src/sys/arch/amd64/include/specialreg.h,v
> retrieving revision 1.88
> diff -u -p -r1.88 specialreg.h
> --- sys/arch/amd64/include/specialreg.h   13 Sep 2020 05:57:28 -  
> 1.88
> +++ sys/arch/amd64/include/specialreg.h   27 Mar 2021 14:14:13 -
> @@ -957,7 +957,7 @@
>  #define IA32_VMX_TRUE_ENTRY_CTLS 0x490
>  #define IA32_VMX_VMFUNC  0x491
>
> -#define IA32_EPT_VPID_CAP_XO_TRANSLATIONS0x0
> +#define IA32_EPT_VPID_CAP_XO_TRANSLATIONS(1ULL << 0)
>  #define IA32_EPT_VPID_CAP_PAGE_WALK_4(1ULL << 6)
>  #define IA32_EPT_VPID_CAP_WB (1ULL << 14)
>  #define IA32_EPT_VPID_CAP_AD_BITS(1ULL << 21)
>

ok mlarkin@ if you want to commit this.



Re: vmm(4): fix boot issue for 9front guests

2021-03-29 Thread Mike Larkin
On Sun, Mar 28, 2021 at 09:28:11AM -0400, Bryan Steele wrote:
> On Sun, Mar 28, 2021 at 08:38:13AM -0400, Dave Voutila wrote:
> > abieber@ found the latest 9front release ends up in a boot loop if
> > hosted on an AMD system. I tracked it down to 9front (oddly) trying to
> > read the PAT msr prior to writing it. [1] The problem is vmm(4)'s msr
> > handling for svm injects #GP exceptions into the guest for most msr
> > reads (since we don't emulate more than a few).
> >
> > For those (two? few? dozen?) 9front users of AMD hardware and -current,
> > can you try the below diff?
> >
> > > vmm(4)'s vmx msr handler ignores this instruction and only logs the
> > rdmsr information if the kernel is built with VMM_DEBUG. vmm(4) will
> > advance the instruction pointer regardless and it's up to the guest to
> > deal with any resulting issues.
> >
> > The diff syncs the logic between the svm and vmx msr vm-exit handlers by
> > injecting #GP *ONLY* on attempts to read the SMBASE msr.
> >
> > For context, this is the vmx rdmsr handler's (vmx_handle_rdmsr) logic:
> >
> > switch (*rcx) {
> > case MSR_SMBASE:
> > /*
> >  * 34.15.6.3 - Saving Guest State (SMM)
> >  *
> >  * Unsupported, so inject #GP and return without
> >  * advancing %rip.
> >  */
> > ret = vmm_inject_gp(vcpu);
> > return (ret);
> > }
> >
> > It is *not* a design for emulating PAT access and manipulation by a
> > guest.
> >
> > (As an aside, OpenBSD doesn't bother reading the msr [2] before writing
> > to it, neither does Linux. Why is 9front special? ¯\_(ツ)_/¯)
> >
> > -Dave
> >
> > [1] https://code.9front.org/hg/plan9front/rev/10cd3e23a8c1
> > [2] 
> > https://github.com/openbsd/src/blob/36fd90dcf1acf2ddb4ef5dbabe5313b3a8d46ee2/sys/arch/amd64/amd64/cpu.c#L1145-L1168
> >

IIRC I had to advertise PAT support or some guest OS didn't work. I can't recall
what OS that was though (this was years ago).

I'd say just allow reading of the host PAT but discard all writes. Do the same 
on
both SVM and VMX, for now. See if this helps 9front.

Bonus points: there are rules for accessing and manipulating the PAT in a guest,
we could probably emulate that if desired.

-ml

> >
> > Index: sys/arch/amd64/amd64/vmm.c
> > ===
> > RCS file: /cvs/src/sys/arch/amd64/amd64/vmm.c,v
> > retrieving revision 1.278
> > diff -u -p -r1.278 vmm.c
> > --- sys/arch/amd64/amd64/vmm.c  11 Mar 2021 11:16:55 -  1.278
> > +++ sys/arch/amd64/amd64/vmm.c  28 Mar 2021 00:45:08 -
> > @@ -6545,10 +6545,16 @@ svm_handle_msr(struct vcpu *vcpu)
> > *rax = 0;
> > *rdx = 0;
> > break;
> > -   default:
> > -   DPRINTF("%s: guest read msr 0x%llx, injecting "
> > -   "#GP\n", __func__, *rcx);
> > +   case MSR_SMBASE:
> > +   /* Unsupported, inject #GP w/o advancing %rip */
> > ret = vmm_inject_gp(vcpu);
> > return (ret);
> > +#ifdef VMM_DEBUG
> > +   default:
> > +   /* Log the access to identify unknown MSRs */
> > +   DPRINTF("%s: rdmsr exit, msr=0x%llx, data "
> > +   "returned to guest=0x%llx:0x%llx\n",
> > +   __func__, *rcx, *rdx, *rax);
> > +#endif /* VMM_DEBUG */
> > }
> > }
>
> I'm not sure this is correct. Doesn't this mean that registers will
> contain whatever garbage was in them beforehand? Without injecting
> #GP, how does the guest kernel know the MSR read failed?
>
> I was initially concerned as this touches the codepath pd@ fixed last
> Feb where MSR reads were being passed through to the host, but still
> I think that injecting the #GP for unsupported MSR reads is right.
>
> -Bryan.
>



Re: UVM return(val)

2021-03-23 Thread Mike Larkin
On Tue, Mar 23, 2021 at 01:52:20PM +0100, Martin Pieuchot wrote:
> Diff below convert multiple "return(val)" and "return (val)" to
> "return val".  I only changed those that help decrease the size
> of the diff with NetBSD or didn't change anything.
>
> ok?
>

I read through these and agree this should not change any behaviour.

ok mlarkin if this helps you move forward by improving diffability.

> Index: uvm/uvm_amap.c
> ===
> RCS file: /cvs/src/sys/uvm/uvm_amap.c,v
> retrieving revision 1.88
> diff -u -p -r1.88 uvm_amap.c
> --- uvm/uvm_amap.c20 Mar 2021 10:24:21 -  1.88
> +++ uvm/uvm_amap.c23 Mar 2021 12:14:26 -
> @@ -342,7 +342,7 @@ amap_alloc1(int slots, int waitf, int la
>   amap = pool_get(_small_amap_pool[slots - 1],
>   pwaitf | PR_ZERO);
>   if (amap == NULL)
> - return(NULL);
> + return NULL;
>
>   amap->am_lock = NULL;
>   amap->am_ref = 1;
> @@ -355,7 +355,7 @@ amap_alloc1(int slots, int waitf, int la
>
>   if (UVM_AMAP_SMALL(amap)) {
>   amap->am_small.ac_nslot = slots;
> - return (amap);
> + return amap;
>   }
>
>   amap->am_ncused = 0;
> @@ -392,14 +392,14 @@ amap_alloc1(int slots, int waitf, int la
>   }
>   }
>
> - return(amap);
> + return amap;
>
>  fail1:
>   free(amap->am_buckets, M_UVMAMAP, buckets * sizeof(*amap->am_buckets));
>   TAILQ_FOREACH_SAFE(chunk, >am_chunks, ac_list, tmp)
>   pool_put(_amap_chunk_pool, chunk);
>   pool_put(_amap_pool, amap);
> - return (NULL);
> + return NULL;
>  }
>
>  static void
> @@ -423,7 +423,7 @@ amap_alloc(vaddr_t sz, int waitf, int la
>
>   AMAP_B2SLOT(slots, sz); /* load slots */
>   if (slots > INT_MAX)
> - return (NULL);
> + return NULL;
>
>   amap = amap_alloc1(slots, waitf, lazyalloc);
>   if (amap != NULL) {
> @@ -431,7 +431,7 @@ amap_alloc(vaddr_t sz, int waitf, int la
>   amap_list_insert(amap);
>   }
>
> - return(amap);
> + return amap;
>  }
>
>
> Index: uvm/uvm_anon.c
> ===
> RCS file: /cvs/src/sys/uvm/uvm_anon.c,v
> retrieving revision 1.53
> diff -u -p -r1.53 uvm_anon.c
> --- uvm/uvm_anon.c20 Mar 2021 10:24:21 -  1.53
> +++ uvm/uvm_anon.c23 Mar 2021 12:01:03 -
> @@ -67,7 +67,7 @@ uvm_analloc(void)
>   anon->an_page = NULL;
>   anon->an_swslot = 0;
>   }
> - return(anon);
> + return anon;
>  }
>
>  /*
> Index: uvm/uvm_aobj.c
> ===
> RCS file: /cvs/src/sys/uvm/uvm_aobj.c,v
> retrieving revision 1.92
> diff -u -p -r1.92 uvm_aobj.c
> --- uvm/uvm_aobj.c20 Mar 2021 10:24:21 -  1.92
> +++ uvm/uvm_aobj.c23 Mar 2021 12:17:00 -
> @@ -211,7 +211,7 @@ uao_find_swhash_elt(struct uvm_aobj *aob
>*/
>   LIST_FOREACH(elt, swhash, list) {
>   if (elt->tag == page_tag)
> - return(elt);
> + return elt;
>   }
>
>   if (!create)
> @@ -234,7 +234,7 @@ uao_find_swhash_elt(struct uvm_aobj *aob
>   LIST_INSERT_HEAD(swhash, elt, list);
>   elt->tag = page_tag;
>
> - return(elt);
> + return elt;
>  }
>
>  /*
> @@ -248,7 +248,7 @@ uao_find_swslot(struct uvm_aobj *aobj, i
>* if noswap flag is set, then we never return a slot
>*/
>   if (aobj->u_flags & UAO_FLAG_NOSWAP)
> - return(0);
> + return 0;
>
>   /*
>* if hashing, look in hash table.
> @@ -258,15 +258,15 @@ uao_find_swslot(struct uvm_aobj *aobj, i
>   uao_find_swhash_elt(aobj, pageidx, FALSE);
>
>   if (elt)
> - return(UAO_SWHASH_ELT_PAGESLOT(elt, pageidx));
> + return UAO_SWHASH_ELT_PAGESLOT(elt, pageidx);
>   else
> - return(0);
> + return 0;
>   }
>
>   /*
>* otherwise, look in the array
>*/
> - return(aobj->u_swslots[pageidx]);
> + return aobj->u_swslots[pageidx];
>  }
>
>  /*
> @@ -289,7 +289,7 @@ uao_set_swslot(struct uvm_object *uobj,
>*/
>   if (aobj->u_flags & UAO_FLAG_NOSWAP) {
>   if (slot == 0)
> - return(0);  /* a clear is ok */
> + return 0;   /* a clear is ok */
>
>   /* but a set is not */
>   printf("uao_set_swslot: uobj = %p\n", uobj);
> @@ -309,7 +309,7 @@ uao_set_swslot(struct uvm_object *uobj,
>   uao_find_swhash_elt(aobj, pageidx, slot ? TRUE : FALSE);
>   if (elt == NULL) {
>   KASSERT(slot == 0);
> - return (0);
> + return 0;
>   }
>
>   oldslot = 

Re: Remove booting from kernels in raw/qcow2 images in vmd(8)

2021-03-17 Thread Mike Larkin
On Wed, Mar 17, 2021 at 10:29:32PM +0100, Klemens Nanni wrote:
> On Sun, Mar 14, 2021 at 11:00:22AM -0400, Dave Voutila wrote:
> > Any takers?
> Yes, I plan to commit the updated diff (at the end of this mail) by Friday
> unless someone objects.
>

no objection, thanks everyone.

ok mlarkin

> > Here's an updated diff also removes some logic in config.c related to
> > checking the value sent by vmctl(8)'s -b flag to see if it's the same as
> > the root disk image (-d).
> Both your first and this diff fail to apply, see inline.
>
> I've fixed both and tested the following diff (incl. mail to myself and
> apply from there).
>
> > Index: Makefile
> > ===
> > RCS file: /cvs/src/usr.sbin/vmd/Makefile,v
> > retrieving revision 1.24
> > diff -u -p -u -p -r1.24 Makefile
> > --- Makefile23 Sep 2020 19:18:18 -  1.24
> > +++ Makefile14 Mar 2021 14:56:06 -
> > @@ -5,7 +5,7 @@
> >  PROG=  vmd
> >  SRCS=  vmd.c control.c log.c priv.c proc.c config.c vmm.c
> >  SRCS+= vm.c loadfile_elf.c pci.c virtio.c i8259.c mc146818.c
> > -SRCS+= ns8250.c i8253.c vmboot.c ufs.c disklabel.c dhcp.c 
> > packet.c
> > +SRCS+= ns8250.c i8253.c dhcp.c packet.c
> You remove disklabel.c here but not with `cvs rm';
> fixed in the diff below.
>
> >  SRCS+= parse.y atomicio.c vioscsi.c vioraw.c vioqcow2.c 
> > fw_cfg.c
> >
> >  CFLAGS+=   -Wall -I${.CURDIR}
>
> > Index: loadfile_elf.c
> > ===
> > RCS file: /cvs/src/usr.sbin/vmd/loadfile_elf.c,v
> > retrieving revision 1.36
> > diff -u -p -u -p -r1.36 loadfile_elf.c
> > --- loadfile_elf.c  26 Oct 2020 04:04:31 -  1.36
> > +++ loadfile_elf.c  14 Mar 2021 14:56:06 -
>
> > @@ -414,15 +407,6 @@ push_bootargs(bios_memmap_t *memmap, siz
> > memcpy([i + 3], , sizeof(bios_consdev_t));
> > i += consdev_sz / sizeof(int);
> >
> > -   if (bootmac) {
> > -   bootmac_sz = 3 * sizeof(int) + (sizeof(bios_bootmac_t) + 3) & 
> > ~3;
> > -   ba[i] = 0x7;   /* bootmac */
> > -   ba[i + 1] = bootmac_sz;
> > -   ba[i + 2] = bootmac_sz;
> > -   memcpy([i + 3], bootmac, sizeof(bios_bootmac_t));
> > -   i += bootmac_sz / sizeof(int);
> > -   }
> This line in the file ends with a single whitespace, but your diff does
> not have it;
> fixed in the diff below.
>
> > -
> > ba[i++] = 0x; /* BOOTARG_END */
> >
> > write_mem(BOOTARGS_PAGE, ba, PAGE_SIZE);
>
>
>
> Index: Makefile
> ===
> RCS file: /cvs/src/usr.sbin/vmd/Makefile,v
> retrieving revision 1.24
> diff -u -p -r1.24 Makefile
> --- Makefile  23 Sep 2020 19:18:18 -  1.24
> +++ Makefile  17 Mar 2021 21:04:06 -
> @@ -5,7 +5,7 @@
>  PROG=vmd
>  SRCS=vmd.c control.c log.c priv.c proc.c config.c vmm.c
>  SRCS+=   vm.c loadfile_elf.c pci.c virtio.c i8259.c mc146818.c
> -SRCS+=   ns8250.c i8253.c vmboot.c ufs.c disklabel.c dhcp.c 
> packet.c
> +SRCS+=   ns8250.c i8253.c dhcp.c packet.c
>  SRCS+=   parse.y atomicio.c vioscsi.c vioraw.c vioqcow2.c 
> fw_cfg.c
>
>  CFLAGS+= -Wall -I${.CURDIR}
> Index: config.c
> ===
> RCS file: /cvs/src/usr.sbin/vmd/config.c,v
> retrieving revision 1.59
> diff -u -p -r1.59 config.c
> --- config.c  28 Feb 2021 22:56:09 -  1.59
> +++ config.c  17 Mar 2021 21:04:06 -
> @@ -216,7 +216,7 @@ config_setvm(struct privsep *ps, struct
>   struct vmop_create_params *vmc = >vm_params;
>   struct vm_create_params *vcp = >vmc_params;
>   unsigned int i, j;
> - int  fd = -1, vmboot = 0;
> + int  fd = -1;
>   int  kernfd = -1;
>   int *tapfds = NULL;
>   int  cdromfd = -1;
> @@ -295,16 +295,8 @@ config_setvm(struct privsep *ps, struct
>
>   if (!(vm->vm_state & VM_STATE_RECEIVED)) {
>   if (strlen(vcp->vcp_kernel)) {
> - /*
> -  * Boot kernel from disk image if path matches the
> -  * root disk.
> -  */
> - if (vcp->vcp_ndisks &&
> - strcmp(vcp->vcp_kernel, vcp->vcp_disks[0]) == 0)
> - vmboot = 1;
>   /* Open external kernel for child */
> - else if ((kernfd =
> - open(vcp->vcp_kernel, O_RDONLY)) == -1) {
> + if ((kernfd = open(vcp->vcp_kernel, O_RDONLY)) == -1) {
>   log_warn("%s: can't open kernel or BIOS "
>   "boot image %s", __func__, vcp->vcp_kernel);
>   

Re: vmm crash on 6.9-beta

2021-03-13 Thread Mike Larkin
On Wed, Mar 10, 2021 at 08:30:32PM +0100, Mischa wrote:
> On 10 Mar at 18:59, Mike Larkin  wrote:
> > On Wed, Mar 10, 2021 at 03:08:21PM +0100, Mischa wrote:
> > > Hi All,
> > >
> > > Currently I am running 6.9-beta on one of my hosts to test 
> > > veb(4)/vport(4).
> > >
> > > root@server14:~ # sysctl kern.version
> > > kern.version=OpenBSD 6.9-beta (GENERIC.MP) #385: Mon Mar  8 12:57:12 MST 
> > > 2021
> > > dera...@amd64.openbsd.org:/usr/src/sys/arch/amd64/compile/GENERIC.MP
> > >
> > > > In order to add some load to the system I created 41 additional VMs based 
> > > on a single qcow2 base image.
> > > A couple of those VMs crashed with the following ddb output.
> > >
> > > ddb> show panic
> > > ffs_valloc: dup alloc
> > > ddb> trace
> > > db_enter() at db_enter+0x10
> > > panic(81dc0709) at panic+0x12a
> > > ffs_inode_alloc(fd80269831e0,8180,fd803f7bb540,800014e1e3e8) 
> > > at ffs
> > > _inode_alloc+0x442
> > > ufs_makeinode(8180,fd8026a386a0,800014e1e6e0,800014e1e730) at 
> > > ufs_m
> > > akeinode+0x7f
> > > ufs_create(800014e1e490) at ufs_create+0x3c
> > > VOP_CREATE(fd8026a386a0,800014e1e6e0,800014e1e730,800014e1e4f0)
> > >  at VOP_CREATE+0x4a
> > > vn_open(800014e1e6b0,10602,180) at vn_open+0x182
> > > doopenat(800014e8a518,ff9c,70e0e92a500,10601,1b6,800014e1e8b0)
> > >  at d
> > > oopenat+0x1d0
> > > syscall(800014e1e920) at syscall+0x315
> > > Xsyscall() at Xsyscall+0x128
> > > end of kernel
> > > end trace frame: 0x7f7e5000, count: -10
> > >
> > > Mischa
> > >
> >
> > Probably not vmm(4) related but thanks for reporting!
>
> Could it be qcow2 related? or is this general disk? At least that is what I 
> think ffs_ is. :)
>
> Mischa
>

likely completely unrelated to anything vmd(8) is doing.



Re: Remove booting from kernels in raw/qcow2 images in vmd(8)

2021-03-11 Thread Mike Larkin
On Thu, Mar 11, 2021 at 06:11:03PM -0500, Dave Voutila wrote:
> tl;dr: tedu vmboot.{c,h}, ufs.c from vmd(8) to remove broken ability to
> extract and boot a kernel image from a raw or qcow2 disk image
>
> The following diff removes the ability to boot directly from a disk
> image containing a FFS filesystem. No new functionality is added. It's
> still possible to boot via a kernel image or with either disk or iso
> images via seabios. (PXE booting should still work via a kernel image,
> but I haven't tested it personally.)
>
> Why remove this?
>
> - since 6.7 switched to FFS2 as the default filesystem for new installs,
>   the ability for vmd(8) to load a kernel and boot.conf from a disk
>   image directly (without seabios) has been broken. tb@ apparently sent
>   a diff to update support for FFS2 awhile back, but it never made it
>   into the tree.
>
> - on 5th Jan 2021, new ramdisks for amd64 have started shipping gzip'd,
>   breaking the ability to load the bsd.rd directly as a kernel image for
>   a vmd(8) guest without first uncompressing the image
>
> Why not fix it?
>
> - using bios (via seabios) works
>
> - the FFS2 change happened ten months ago and afaict few if any have
>   complained about the breakage, so I'm not sure the value in fixing
>   it. vmctl(8) is still vague about supporting it per its man page and
>   you still have to pass the disk image twice as a -b and -d arg if
>   you're trying to avoid using seabios to boot an OpenBSD guest.
>
> - Josh Rickmar reported the gzip issue on bugs@ and provided patches to
>   add in support for compressed ramdisks and kernel images. In doing so,
>   we found the easiest way to add gzip kernel image support was to drop
>   support for FFS images since they require a call to fmemopen(3) while
>   all the other logic uses fopen(3)/fdopen(3) calls and a file
>   descriptor. I think it would be easier to get his patches into vmd(8)
>   if they don't have to account for extracting kernels from disk
>   images.
>
> I can understand an argument to shy away from relying on seabios for
> booting, but given it's readily available via fw_update(1) and is part
> of the default behavior, I'd imagine most won't miss this feature.
>
> If people ARE using direct booting of raw/qcow2 images (without using
> seabios) please speak up and instead I can look into dusting off tb@'s
> old diff.
>

reyk@ wrote that ffs module for vmd but since he has not stepped up to
maintain it after the ffs2 switch, I vote to remove it. If someone wants
to come back and fixup ffs2 support with the tb@ diff we can look at that
when said person steps up.

ok mlarkin

> --
> -Dave Voutila
>
>
> Index: Makefile
> ===
> RCS file: /cvs/src/usr.sbin/vmd/Makefile,v
> retrieving revision 1.24
> diff -u -p -u -p -r1.24 Makefile
> --- Makefile  23 Sep 2020 19:18:18 -  1.24
> +++ Makefile  11 Mar 2021 22:10:08 -
> @@ -5,7 +5,7 @@
>  PROG=vmd
>  SRCS=vmd.c control.c log.c priv.c proc.c config.c vmm.c
>  SRCS+=   vm.c loadfile_elf.c pci.c virtio.c i8259.c mc146818.c
> -SRCS+=   ns8250.c i8253.c vmboot.c ufs.c disklabel.c dhcp.c 
> packet.c
> +SRCS+=   ns8250.c i8253.c dhcp.c packet.c
>  SRCS+=   parse.y atomicio.c vioscsi.c vioraw.c vioqcow2.c 
> fw_cfg.c
>
>  CFLAGS+= -Wall -I${.CURDIR}
> Index: loadfile.h
> ===
> RCS file: /cvs/src/usr.sbin/vmd/loadfile.h,v
> retrieving revision 1.12
> diff -u -p -u -p -r1.12 loadfile.h
> --- loadfile.h16 May 2019 21:16:04 -  1.12
> +++ loadfile.h11 Mar 2021 22:10:08 -
> @@ -73,8 +73,6 @@
>  #define PML2_PAGE 0x13000
>  #define NPTE_PG (PAGE_SIZE / sizeof(uint64_t))
>
> -int loadfile_elf(FILE *, struct vm_create_params *,
> -struct vcpu_reg_state *, uint32_t, uint32_t, unsigned int);
> +int loadfile_elf(FILE *, struct vm_create_params *, struct vcpu_reg_state *);
>
>  size_t mread(FILE *, paddr_t, size_t);
> -
> Index: loadfile_elf.c
> ===
> RCS file: /cvs/src/usr.sbin/vmd/loadfile_elf.c,v
> retrieving revision 1.36
> diff -u -p -u -p -r1.36 loadfile_elf.c
> --- loadfile_elf.c26 Oct 2020 04:04:31 -  1.36
> +++ loadfile_elf.c11 Mar 2021 22:10:10 -
> @@ -118,8 +118,8 @@ static void setsegment(struct mem_segmen
>  static int elf32_exec(FILE *, Elf32_Ehdr *, u_long *, int);
>  static int elf64_exec(FILE *, Elf64_Ehdr *, u_long *, int);
>  static size_t create_bios_memmap(struct vm_create_params *, bios_memmap_t *);
> -static uint32_t push_bootargs(bios_memmap_t *, size_t, bios_bootmac_t *);
> -static size_t push_stack(uint32_t, uint32_t, uint32_t, uint32_t);
> +static uint32_t push_bootargs(bios_memmap_t *, size_t);
> +static size_t push_stack(uint32_t, uint32_t);
>  static void push_gdt(void);
>  static void push_pt_32(void);
>  

Re: vmm crash on 6.9-beta

2021-03-10 Thread Mike Larkin
On Wed, Mar 10, 2021 at 03:08:21PM +0100, Mischa wrote:
> Hi All,
>
> Currently I am running 6.9-beta on one of my hosts to test veb(4)/vport(4).
>
> root@server14:~ # sysctl kern.version
> kern.version=OpenBSD 6.9-beta (GENERIC.MP) #385: Mon Mar  8 12:57:12 MST 2021
> dera...@amd64.openbsd.org:/usr/src/sys/arch/amd64/compile/GENERIC.MP
>
> In order to add some load to the system I created 41 additional VMs based on 
> a single qcow2 base image.
> A couple of those VMs crashed with the following ddb output.
>
> ddb> show panic
> ffs_valloc: dup alloc
> ddb> trace
> db_enter() at db_enter+0x10
> panic(81dc0709) at panic+0x12a
> ffs_inode_alloc(fd80269831e0,8180,fd803f7bb540,800014e1e3e8) at 
> ffs
> _inode_alloc+0x442
> ufs_makeinode(8180,fd8026a386a0,800014e1e6e0,800014e1e730) at 
> ufs_m
> akeinode+0x7f
> ufs_create(800014e1e490) at ufs_create+0x3c
> VOP_CREATE(fd8026a386a0,800014e1e6e0,800014e1e730,800014e1e4f0)
>  at VOP_CREATE+0x4a
> vn_open(800014e1e6b0,10602,180) at vn_open+0x182
> doopenat(800014e8a518,ff9c,70e0e92a500,10601,1b6,800014e1e8b0) at 
> d
> oopenat+0x1d0
> syscall(800014e1e920) at syscall+0x315
> Xsyscall() at Xsyscall+0x128
> end of kernel
> end trace frame: 0x7f7e5000, count: -10
>
> Mischa
>

Probably not vmm(4) related but thanks for reporting!

-ml



Re: veb(4) support for vmd(8)?

2021-02-26 Thread Mike Larkin
On Sat, Feb 27, 2021 at 09:44:03AM +1000, David Gwynne wrote:
>
>
> > On 27 Feb 2021, at 7:50 am, Klemens Nanni  wrote:
> >
> > On Sat, Feb 27, 2021 at 07:30:56AM +1000, David Gwynne wrote:
> >> i think this is enough to let vmd wire guests up to veb interfaces.
> > > But please update vm.conf(5) to mention veb(4) and vport(4) as well in
> > > SWITCH CONFIGURATION.
>
> How would you fit wording about vport(4) in?
>
> >
> > OK kn
>

Do we want to talk only about veb/vport and remove all the old discussion
around bridge/vether?



Re: uvm_fault: Comments & style cleanup

2021-02-15 Thread Mike Larkin
On Mon, Feb 15, 2021 at 01:15:33PM +0100, Martin Pieuchot wrote:
> On 15/02/21(Mon) 11:47, Martin Pieuchot wrote:
> > Diff below includes non-functional changes:
> >
> > - Sync comments with NetBSD including locking details.
> > - Remove superfluous parenthesis and spaces.
> > - Add brackets, even if questionable, to reduce diff with NetBSD
> > - Use for (;;) instead of while(1)
> > - Rename a variable from 'result' into 'error'.
> > - Move uvm_fault() and uvm_fault_upper_lookup()
> > - Add a locking assert in uvm_fault_upper_lookup()
>
> Updated diff on top of recent fix, still ok?
>

I reviewed the diff and agree it introduces no functional changes. If you are
still looking for oks and it helps you with the locking work, ok mlarkin.

-ml

> Index: uvm/uvm_fault.c
> ===
> RCS file: /cvs/src/sys/uvm/uvm_fault.c,v
> retrieving revision 1.114
> diff -u -p -r1.114 uvm_fault.c
> --- uvm/uvm_fault.c   15 Feb 2021 12:12:54 -  1.114
> +++ uvm/uvm_fault.c   15 Feb 2021 12:14:08 -
> @@ -55,11 +55,11 @@
>   *read/write1 write>1  read/write   +-cow_write/zero
>   * | | ||
>   *  +--|--+   +--|--+ +-+   +  |  + | +-+
> - * amap |  V  |   |  --->new|  || |  ^  |
> + * amap |  V  |   |  -> new |  || |  ^  |
>   *  +-+   +-+ +-+   +  |  + | +--|--+
>   * |||
>   *  +-+   +-+   +--|--+ | +--|--+
> - * uobj | d/c |   | d/c |   |  V  | +|  |
> + * uobj | d/c |   | d/c |   |  V  | ++  |
>   *  +-+   +-+   +-+   +-+
>   *
>   * d/c = don't care
> @@ -69,7 +69,7 @@
>   *
>   *   case [1]: upper layer fault [anon active]
>   * 1A: [read] or [write with anon->an_ref == 1]
> - *   I/O takes place in top level anon and uobj is not touched.
> + *   I/O takes place in upper level anon and uobj is not touched.
>   * 1B: [write with anon->an_ref > 1]
>   *   new anon is alloc'd and data is copied off ["COW"]
>   *
> @@ -89,7 +89,7 @@
>   * the code is structured as follows:
>   *
>   * - init the "IN" params in the ufi structure
> - *   ReFault:
> + *   ReFault: (ERESTART returned to the loop in uvm_fault)
>   * - do lookups [locks maps], check protection, handle needs_copy
>   * - check for case 0 fault (error)
>   * - establish "range" of fault
> @@ -136,8 +136,8 @@
>   *by multiple map entries, and figuring out what should wait could be
>   *complex as well...).
>   *
> - * we use alternative 2 currently.   maybe alternative 3 would be useful
> - * in the future.XXX keep in mind for future consideration//rechecking.
> + * we use alternative 2.  given that we are multi-threaded now we may want
> + * to reconsider the choice.
>   */
>
>  /*
> @@ -177,7 +177,7 @@ uvmfault_anonflush(struct vm_anon **anon
>   int lcv;
>   struct vm_page *pg;
>
> - for (lcv = 0 ; lcv < n ; lcv++) {
> + for (lcv = 0; lcv < n; lcv++) {
>   if (anons[lcv] == NULL)
>   continue;
>   KASSERT(rw_lock_held(anons[lcv]->an_lock));
> @@ -222,14 +222,14 @@ uvmfault_init(void)
>  /*
>   * uvmfault_amapcopy: clear "needs_copy" in a map.
>   *
> + * => called with VM data structures unlocked (usually, see below)
> + * => we get a write lock on the maps and clear needs_copy for a VA
>   * => if we are out of RAM we sleep (waiting for more)
>   */
>  static void
>  uvmfault_amapcopy(struct uvm_faultinfo *ufi)
>  {
> -
> - /* while we haven't done the job */
> - while (1) {
> + for (;;) {
>   /* no mapping?  give up. */
>   if (uvmfault_lookup(ufi, TRUE) == FALSE)
>   return;
> @@ -258,36 +258,46 @@ uvmfault_amapcopy(struct uvm_faultinfo *
>   * uvmfault_anonget: get data in an anon into a non-busy, non-released
>   * page in that anon.
>   *
> - * => we don't move the page on the queues [gets moved later]
> - * => if we allocate a new page [we_own], it gets put on the queues.
> - *either way, the result is that the page is on the queues at return time
> + * => Map, amap and thus anon should be locked by caller.
> + * => If we fail, we unlock everything and error is returned.
> + * => If we are successful, return with everything still locked.
> + * => We do not move the page on the queues [gets moved later].  If we
> + *allocate a new page [we_own], it gets put on the queues.  Either way,
> + *the result is that the page is on the queues at return time
>   */
>  int
>  uvmfault_anonget(struct uvm_faultinfo *ufi, struct vm_amap *amap,
>  struct vm_anon *anon)
>  {
> - boolean_t we_own;   /* we own anon's page? */
> -   

Re: XCP-ng, OpenBSD and network interface changes

2021-02-01 Thread Mike Belopuhov
On Sun, Jan 31, 2021 at 2:59 PM Denis Fondras  wrote:

> I am using XCP-ng with the latest OpenBSD snapshot.
>
> Whenever I make a hardware change in networking on the VM (connect or
> disconnect an interface, change associated network), the VM panics :
>
> openbsd# panic: grant table reference 5912 is held by domain 0: frame
> 0x1f1a4 flags 0x19
> Stopped at   db_enter+0x10: popq %rbp
> TID   PID  UIDPRFLAGS   PFLAGS CPU COMMAND
> *349758 6557900x14000   0x200   0 xenwatch
> db_enter() at db_enter+0x10
> panic(81da7541) at panic+0x12a
> xen_bus_dmamap_unload(820ede50,800e9380) at
> xen_bus_dmamap_unload+0x138
> xnf_tx_ring_destroy(80162000) at xnf_tx_ring_destroy+0x104
> xnf_detach(80162000,0) at xnf_detach+0x55
> config_detach(80162000,0) at config_detach+0x140
> xen_hotplug(8012e200) at xen_hotplug+0x181
> taskq_thread(800dde00) at taskq_thread+0x66
> end trace frame: 0x0, count: 7
> https://www.openbsd.org/ddb.html describes the minimum info required in
> bug reports. Insufficient info makes it difficult to find and fix bugs.
> ddb>
>
> If I apply the following patch, it obviously does not panic and seems to
> work
> correctly :
>
>
Hi Denis,

This is not a real fix unfortunately, you're just ignoring the issue.
Somehow the grant table reference is not released when we perform the
detach.
You can try increasing amount of iterations to 1 (or more) for example
and see
if this is a timing issue.

Cheers,
Mike


> Index: xen.c
> ===
> RCS file: /cvs/src/sys/dev/pv/xen.c,v
> retrieving revision 1.97
> diff -u -p -r1.97 xen.c
> --- xen.c   29 Jun 2020 06:50:52 -  1.97
> +++ xen.c   31 Jan 2021 13:13:07 -
> @@ -1204,7 +1204,7 @@ xen_grant_table_remove(struct xen_softc
> loop = 0;
> while (atomic_cas_uint(ptr, flags, GTF_invalid) != flags) {
> if (loop++ > 10) {
> -   panic("grant table reference %u is held "
> +   printf("grant table reference %u is held "
> "by domain %d: frame %#x flags %#x",
> ref + ge->ge_start, ge->ge_table[ref].domid,
> ge->ge_table[ref].frame,
> ge->ge_table[ref].flags);
>
> Can someone give me a clue on what _atomic_cas_uint() is ?
>
> Thank you in advance.
>
> Denis
>
> OpenBSD 6.8-current (GENERIC) #9: Sun Jan 31 14:08:42 CET 2021
> r...@openbsd.lab.ledeuns.net:/sys/arch/amd64/compile/GENERIC
> real mem = 1052770304 (1004MB)
> avail mem = 1005694976 (959MB)
> random: good seed from bootblocks
> mpath0 at root
> scsibus0 at mpath0: 256 targets
> mainbus0 at root
> bios0 at mainbus0: SMBIOS rev. 2.4 @ 0xeb01f (11 entries)
> bios0: vendor Xen version "4.13" date 01/21/2021
> bios0: Xen HVM domU
> acpi0 at bios0: ACPI 4.0
> acpi0: sleep states S5
> acpi0: tables DSDT FACP APIC HPET WAET
> acpi0: wakeup devices
> acpitimer0 at acpi0: 3579545 Hz, 32 bits
> acpimadt0 at acpi0 addr 0xfee0: PC-AT compat
> ioapic0 at mainbus0: apid 1 pa 0xfec0, version 11, 48 pins, remapped
> cpu0 at mainbus0: apid 0 (boot processor)
> cpu0: Intel(R) Xeon(R) CPU E5-2407 v2 @ 2.40GHz, 2394.83 MHz, 06-3e-04
> cpu0:
> FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,CFLUSH,ACPI,MMX,FXSR,SSE,SSE2,SS,SSE3,PCLMUL,SSSE3,CX16,PCID,SSE4.1,SSE4.2,x2APIC,POPCNT,DEADLINE,AES,XSAVE,AVX,F16C,RDRAND,HV,NXE,PAGE1GB,RDTSCP,LONG,LAHF,FSGSBASE,SMEP,ERMS,MD_CLEAR,IBRS,IBPB,STIBP,L1DF,SSBD,XSAVEOPT,MELTDOWN
> cpu0: 256KB 64b/line 8-way L2 cache
> cpu0: smt 0, core 0, package 0
> mtrr: Pentium Pro MTRR support, 8 var ranges, 88 fixed ranges
> cpu0: apic clock running at 100MHz
> acpihpet0 at acpi0: 6250 Hz
> acpiprt0 at acpi0: bus 0 (PCI0)
> acpipci0 at acpi0 PCI0
> acpicmos0 at acpi0
> "ACPI0007" at acpi0 not configured
> acpicpu0 at acpi0: C1(@1 halt!)
> cpu0: using VERW MDS workaround (except on vmm entry)
> pvbus0 at mainbus0: Hyper-V 0.0, Xen 4.13
> xen0 at pvbus0: features 0x2705, 64 grant table frames, event channel 2
> xbf0 at xen0 backend 0 channel 6: disk
> scsibus1 at xbf0: 1 targets
> sd0 at scsibus1 targ 0 lun 0: 
> sd0: 10240MB, 512 bytes/sector, 20971520 sectors
> xbf1 at xen0 backend 0 channel 7: cdrom
> xbf1: timed out waiting for backend to connect
> xnf0 at xen0 backend 0 channel 7: address 76:88:23:28:25:f4
> xnf1 at xen0 backend 0 channel 8: address 62:36:ed:68:46:3c
> xnf2 at xen0 backend 0 channel 9: address be:04:e2:f3:7d:75
> pci0 at mainbus0 bus 0
> pchb0 at pci0 dev 0 function 0 "Int

Re: Increase timeout length for VMs trying to fully shutdown

2021-01-05 Thread Mike Larkin
On Tue, Jan 05, 2021 at 12:49:29PM -0700, Tracey Emery wrote:
> Hello tech@,
>
> Some of us have been having shutdown issues with our VMs on OpenBSDAms.
> I tracked down the problem to too short of a timeout for the shutdown
> event.
>
> If there are an additional 1 or 2 package daemons running on the instance,
> the timeout triggers before the VM has shutdown the package daemons and
> properly synced the disks, resulting in a dirty startup.
>
> I've increased the timeout to 2 minutes instead of 30 seconds. My test
> VM on my laptop with 7 additional package daemons succeeded in 60
> seconds, but that might not be fast enough for slower disks.
>
> Am I being conservative enough with this number? Should it be another
> minute or two?
>
> Thoughts? Ok?
>
> --
>
> Tracey Emery
>
> diff 7a6bb14936050379800deb10d4a137c4d2d4a3c4 /usr/src
> blob - 9a64973ab998accb810d56c386c1bb92c204ab20
> file + usr.sbin/vmd/virtio.h
> --- usr.sbin/vmd/virtio.h
> +++ usr.sbin/vmd/virtio.h
> @@ -38,7 +38,7 @@
>
>  /* VMM Control Interface shutdown timeout (in seconds) */
>  #define VMMCI_TIMEOUT3
> -#define VMMCI_SHUTDOWN_TIMEOUT   30
> +#define VMMCI_SHUTDOWN_TIMEOUT   120
>
>  /* All the devices we support have either 1, 2 or 3 queues */
>  /* viornd - 1 queue
>

I took a look through the code. I'd say this bump is fine, there is no
side effect aside from just waiting for the VM to shutdown, *except*
possibly when waiting for the host to shutdown (/etc/rc in the shutdown
path), that might take longer if some VMs get stuck in their shutdown
code. But if you got impatient, you could always ^C at that point...

ok mlarkin

-ml



Re: PATCH: Fix PCI Config Space union size on VMM

2020-09-09 Thread Mike Larkin
On Mon, Sep 07, 2020 at 06:03:00PM -0500, Jordan Hargrave wrote:
> This code fixes the pci device union for accessing PCI config space >= 0x40
>
> Running pcidump -xxx in a virtual machine would return garbage data due to
> union overlap
>

Thanks, looks good from my perspective.

-ml

> On Mon, Sep 07, 2020 at 05:52:55PM -0500, Jordan Hargrave wrote:
> > Index: pci.h
> > ===
> > RCS file: /cvs/src/usr.sbin/vmd/pci.h,v
> > retrieving revision 1.7
> > diff -u -p -u -r1.7 pci.h
> > --- pci.h   17 Sep 2017 23:07:56 -  1.7
> > +++ pci.h   7 Sep 2020 22:48:09 -
> > @@ -32,43 +32,44 @@ typedef int (*pci_iobar_fn_t)(int dir, u
> >  void *, uint8_t);
> >  typedef int (*pci_mmiobar_fn_t)(int dir, uint32_t ofs, uint32_t *data);
> >
> > -union pci_dev {
> > -   uint32_t pd_cfg_space[PCI_CONFIG_SPACE_SIZE / 4];
> >
> > -   struct {
> > -   uint16_t pd_vid;
> > -   uint16_t pd_did;
> > -   uint16_t pd_cmd;
> > -   uint16_t pd_status;
> > -   uint8_t pd_rev;
> > -   uint8_t pd_prog_if;
> > -   uint8_t pd_subclass;
> > -   uint8_t pd_class;
> > -   uint8_t pd_cache_size;
> > -   uint8_t pd_lat_timer;
> > -   uint8_t pd_header_type;
> > -   uint8_t pd_bist;
> > -   uint32_t pd_bar[PCI_MAX_BARS];
> > -   uint32_t pd_cardbus_cis;
> > -   uint16_t pd_subsys_vid;
> > -   uint16_t pd_subsys_id;
> > -   uint32_t pd_exp_rom_addr;
> > -   uint8_t pd_cap;
> > -   uint32_t pd_reserved0 : 24;
> > -   uint32_t pd_reserved1;
> > -   uint8_t pd_irq;
> > -   uint8_t pd_int;
> > -   uint8_t pd_min_grant;
> > -   uint8_t pd_max_grant;
> > +struct pci_dev {
> > +   union {
> > +   uint32_t pd_cfg_space[PCI_CONFIG_SPACE_SIZE / 4];
> > +   struct {
> > +   uint16_t pd_vid;
> > +   uint16_t pd_did;
> > +   uint16_t pd_cmd;
> > +   uint16_t pd_status;
> > +   uint8_t pd_rev;
> > +   uint8_t pd_prog_if;
> > +   uint8_t pd_subclass;
> > +   uint8_t pd_class;
> > +   uint8_t pd_cache_size;
> > +   uint8_t pd_lat_timer;
> > +   uint8_t pd_header_type;
> > +   uint8_t pd_bist;
> > +   uint32_t pd_bar[PCI_MAX_BARS];
> > +   uint32_t pd_cardbus_cis;
> > +   uint16_t pd_subsys_vid;
> > +   uint16_t pd_subsys_id;
> > +   uint32_t pd_exp_rom_addr;
> > +   uint8_t pd_cap;
> > +   uint32_t pd_reserved0 : 24;
> > +   uint32_t pd_reserved1;
> > +   uint8_t pd_irq;
> > +   uint8_t pd_int;
> > +   uint8_t pd_min_grant;
> > +   uint8_t pd_max_grant;
> > +   } __packed;
> > +   };
> > +   uint8_t pd_bar_ct;
> > +   pci_cs_fn_t pd_csfunc;
> >
> > -   uint8_t pd_bar_ct;
> > -   pci_cs_fn_t pd_csfunc;
> > -
> > -   uint8_t pd_bartype[PCI_MAX_BARS];
> > -   uint32_t pd_barsize[PCI_MAX_BARS];
> > -   void *pd_barfunc[PCI_MAX_BARS];
> > -   void *pd_bar_cookie[PCI_MAX_BARS];
> > -   } __packed;
> > +   uint8_t pd_bartype[PCI_MAX_BARS];
> > +   uint32_t pd_barsize[PCI_MAX_BARS];
> > +   void *pd_barfunc[PCI_MAX_BARS];
> > +   void *pd_bar_cookie[PCI_MAX_BARS];
> >  };
> >
> >  struct pci {
> > @@ -79,7 +80,7 @@ struct pci {
> > uint32_t pci_addr_reg;
> > uint32_t pci_data_reg;
> >
> > -   union pci_dev pci_devices[PCI_CONFIG_MAX_DEV];
> > +   struct pci_dev pci_devices[PCI_CONFIG_MAX_DEV];
> >  };
> >
> >  void pci_handle_address_reg(struct vm_run_params *);
> >
>



Re: amd64: add tsc_delay(), a TSC-based delay(9) implementation

2020-08-25 Thread Mike Larkin
On Tue, Aug 25, 2020 at 12:12:36PM -0700, Mike Larkin wrote:
> On Mon, Aug 24, 2020 at 01:55:45AM +0200, Mark Kettenis wrote:
> > > Date: Sun, 23 Aug 2020 18:11:12 -0500
> > > From: Scott Cheloha 
> > >
> > > Hi,
> > >
> > > Other BSDs use the TSC to implement delay(9) if the TSC is constant
> > > and invariant.  Here's a patch to add something similar to our kernel.
> >
> > If the TSC is fine as a timecounter it should be absolutely fine for
> > > use as delay().  And we could even use it if the TSC isn't synchronized
> > between CPUs.
> >
> > >
> > > This patch (or something equivalent) is a prerequisite to running the
> > > lapic timer in oneshot or TSC deadline mode.  Using the lapic timer to
> > > implement delay(9) when it isn't running in periodic mode is too
> > > complicated.  However, using the i8254 for delay(9) is too slow.  We
> > > need an alternative.
> >
> > Hmm, but what are we going to use on machines where the TSC isn't
> > constant/invariant?
> >
> > In what respect is the i8254 too slow?  Does it take more than a
> > microsecond to read it?
> >
>
> It's 3 outb/inb pairs to ensure you get the reading correct. So that could
> be quite a long time (as cheloha@ points out). Also, that's 6 VM exits if
> running virtually (I realize that's not the main use case here but just
> saying...)
>
> IIRC the 3 in/out pairs are the latch command followed by reading the LSB/MSB
> of the counter. It's not MMIO like the HPET or ACPI timer.
>
> And as cheloha@ also points out, it is highly likely that none of us have a
> real i8254 anymore, much of this is probably implemented in some EC somewhere
> and it's unlikely the developer of said EC put a lot of effort into optimizing
> the implementation of a legacy device like this.
>
> On the topic of virtualization:
>
> while (rdtsc() - start < want)
>  rdtsc();
>

I just realized the original diff didn't do two rdtscs. It did a pause inside
the loop. So the effect is not *as* bad as I described but it's still
*somewhat* bad.

PS - pause loop exiting can be enabled to improve performance in this situation.

> ..produces two VM exits (generally, on most hypervisors) since the TSC is
> usually time corrected. That's a lot of exits, and it gets worse on faster
> machines. I don't have a better idea, however. There may be a PV clock option
> that is more optimized in some scenarios.
>
> -ml
>
>
> > We could use the HPET I suppose, which may be a bit better.
> >
> > > As for the patch, it works for me here, though I'd appreciate a few
> > > tests.  I admit that comparing function pointers is ugly, but I think
> > > this is as simple as it can be without implementing some sort of
> > > framework for "registering" delay(9) implementations and comparing
> > > them and selecting the "best" implementation.
> >
> > What about:
> >
> > if (delay_func == NULL)
> > delay_func = lapic_delay;
> >
> > > I'm not sure I put the prototypes in the right headers.  We don't have
> > > a tsc.h but cpuvar.h looks sorta-correct for tsc_delay().
> >
> > I think cpuvar.h is fine since it has other TSC-related stuff.
> > However, with my suggestion above you can drop that.
> >
> > > FreeBSD's x86/delay.c may be of note:
> > >
> > > https://github.com/freebsd/freebsd/blob/ed96335a07b688c39e16db8856232e5840bc22ac/sys/x86/x86/delay.c
> > >
> > > Thoughts?
> > >
> > > Index: amd64/tsc.c
> > > ===
> > > RCS file: /cvs/src/sys/arch/amd64/amd64/tsc.c,v
> > > retrieving revision 1.20
> > > diff -u -p -r1.20 tsc.c
> > > --- amd64/tsc.c   23 Aug 2020 21:38:47 -  1.20
> > > +++ amd64/tsc.c   23 Aug 2020 22:59:25 -
> > > @@ -26,6 +26,7 @@
> > >
> > >  #include 
> > >  #include 
> > > +#include 
> > >
> > >  #define RECALIBRATE_MAX_RETRIES  5
> > >  #define RECALIBRATE_SMI_THRESHOLD5
> > > @@ -252,7 +253,8 @@ tsc_timecounter_init(struct cpu_info *ci
> > >   tsc_timecounter.tc_quality = -1000;
> > >   tsc_timecounter.tc_user = 0;
> > >   tsc_is_invariant = 0;
> > > - }
> > > + } else
> > > + delay_func = tsc_delay;
> > >
> > >   tc_init(_timecounter);
> > >  }
> > > @@ -342,4 +344,15 @@ tsc_sync_ap(struct cpu_info *ci)
> &

Re: amd64: add tsc_delay(), a TSC-based delay(9) implementation

2020-08-25 Thread Mike Larkin
On Mon, Aug 24, 2020 at 12:29:15AM -0500, Scott Cheloha wrote:
> On Sun, Aug 23, 2020 at 11:45:22PM -0500, Scott Cheloha wrote:
> >
> > [...]
> >
> > > > This patch (or something equivalent) is a prerequisite to running the
> > > > lapic timer in oneshot or TSC deadline mode.  Using the lapic timer to
> > > > implement delay(9) when it isn't running in periodic mode is too
> > > > complicated.  However, using the i8254 for delay(9) is too slow.  We
> > > > need an alternative.
> > >
> > > Hmm, but what are we going to use on machines where the TSC isn't
> > > constant/invariant?
> >
> > Probably fall back on the i8254?  Unless someone wants to add yet
> > another delay(9) implementation to amd64...
> >
> > > In what respect is the i8254 too slow?  Does it take more than a
> > > microsecond to read it?
> >
> > On my machine, the portion of gettick() *within* the mutex runs in ~19
> > microseconds.
> >
> > That's before any overhead from mtx_enter(9).  I think having multiple
> > threads in delay(9) should be relatively rare, but you have to keep
> > that in mind.
> >
> > No idea what the overhead would look like on real hardware.  I'm
> > pretty sure my i8254 is emulated.
> >
> > > We could use the HPET I suppose, which may be a bit better.
> >
> > It's better.  No mutex.  On my machine it takes ~11 microseconds.
> > It's a start.
>
> Hmmm, now I'm worried I have screwed something up or misconfigured
> something.
>
> It doesn't seem right that it would take 20K cycles to read the HPET
> on this machine.
>
> Am I way off?  Or is 20K actually a reasonable number?
>

There have been reports of the HPET being really slow on some machines.
IIRC this is why we ended up getting a tsc timecounter a number of years
ago. Someone (reyk@?) found his skylake had a super slow HPET and that
ended up being part of the impetus to add a tsc timecounter.

Also, 20k cycles is totally expected if you are on a VM (not sure if
this is the case).


> For comparison, lapic_gettick() completes in... 80 nanoseconds (?) on
> the same machine.  Relevant sysctls:
>

LAPIC memory page accesses go to the CPU. It's not always the case that
the HPET does the same (they may be accessed via PCI). Also, in a VM,
on new CPUs, LAPIC virtualization can be enabled which means no exits
for LAPIC accesses. So, yeah, these numbers you are seeing aren't surprising.

> $ sysctl hw.{model,setperf,perfpolicy} machdep.{tscfreq,invarianttsc}
> hw.model=Intel(R) Core(TM) i7-8650U CPU @ 1.90GHz
> hw.setperf=100
> hw.perfpolicy=high
> machdep.tscfreq=211200
> machdep.invarianttsc=1
>
> ... if it really takes that long, then "high precision" is a bit of a
> misnomer.
>



Re: amd64: add tsc_delay(), a TSC-based delay(9) implementation

2020-08-25 Thread Mike Larkin
On Mon, Aug 24, 2020 at 01:55:45AM +0200, Mark Kettenis wrote:
> > Date: Sun, 23 Aug 2020 18:11:12 -0500
> > From: Scott Cheloha 
> >
> > Hi,
> >
> > Other BSDs use the TSC to implement delay(9) if the TSC is constant
> > and invariant.  Here's a patch to add something similar to our kernel.
>
> If the TSC is fine as a timecounter it should be absolutely fine for
> use as delay().  And we could even use it if the TSC isn't synchronized
> between CPUs.
>
> >
> > This patch (or something equivalent) is a prerequisite to running the
> > lapic timer in oneshot or TSC deadline mode.  Using the lapic timer to
> > implement delay(9) when it isn't running in periodic mode is too
> > complicated.  However, using the i8254 for delay(9) is too slow.  We
> > need an alternative.
>
> Hmm, but what are we going to use on machines where the TSC isn't
> constant/invariant?
>
> In what respect is the i8254 too slow?  Does it take more than a
> microsecond to read it?
>

It's 3 outb/inb pairs to ensure you get the reading correct. So that could
be quite a long time (as cheloha@ points out). Also, that's 6 VM exits if
running virtually (I realize that's not the main use case here but just
saying...)

IIRC the 3 in/out pairs are the latch command followed by reading the LSB/MSB
of the counter. It's not MMIO like the HPET or ACPI timer.

And as cheloha@ also points out, it is highly likely that none of us have a
real i8254 anymore, much of this is probably implemented in some EC somewhere
and it's unlikely the developer of said EC put a lot of effort into optimizing
the implementation of a legacy device like this.

On the topic of virtualization:

while (rdtsc() - start < want)
 rdtsc();

..produces two VM exits (generally, on most hypervisors) since the TSC is
usually time corrected. That's a lot of exits, and it gets worse on faster
machines. I don't have a better idea, however. There may be a PV clock option
that is more optimized in some scenarios.

-ml


> We could use the HPET I suppose, which may be a bit better.
>
> > As for the patch, it works for me here, though I'd appreciate a few
> > tests.  I admit that comparing function pointers is ugly, but I think
> > this is as simple as it can be without implementing some sort of
> > framework for "registering" delay(9) implementations and comparing
> > them and selecting the "best" implementation.
>
> What about:
>
>   if (delay_func == NULL)
>   delay_func = lapic_delay;
>
> > I'm not sure I put the prototypes in the right headers.  We don't have
> > a tsc.h but cpuvar.h looks sorta-correct for tsc_delay().
>
> I think cpuvar.h is fine since it has other TSC-related stuff.
> However, with my suggestion above you can drop that.
>
> > FreeBSD's x86/delay.c may be of note:
> >
> > https://github.com/freebsd/freebsd/blob/ed96335a07b688c39e16db8856232e5840bc22ac/sys/x86/x86/delay.c
> >
> > Thoughts?
> >
> > Index: amd64/tsc.c
> > ===
> > RCS file: /cvs/src/sys/arch/amd64/amd64/tsc.c,v
> > retrieving revision 1.20
> > diff -u -p -r1.20 tsc.c
> > --- amd64/tsc.c 23 Aug 2020 21:38:47 -  1.20
> > +++ amd64/tsc.c 23 Aug 2020 22:59:25 -
> > @@ -26,6 +26,7 @@
> >
> >  #include 
> >  #include 
> > +#include 
> >
> >  #define RECALIBRATE_MAX_RETRIES5
> >  #define RECALIBRATE_SMI_THRESHOLD  5
> > @@ -252,7 +253,8 @@ tsc_timecounter_init(struct cpu_info *ci
> > tsc_timecounter.tc_quality = -1000;
> > tsc_timecounter.tc_user = 0;
> > tsc_is_invariant = 0;
> > -   }
> > +   } else
> > +   delay_func = tsc_delay;
> >
> > tc_init(_timecounter);
> >  }
> > @@ -342,4 +344,15 @@ tsc_sync_ap(struct cpu_info *ci)
> >  {
> > tsc_post_ap(ci);
> > tsc_post_ap(ci);
> > +}
> > +
> > +void
> > +tsc_delay(int usecs)
> > +{
> > +   uint64_t interval, start;
> > +
> > +   interval = (uint64_t)usecs * tsc_frequency / 100;
> > +   start = rdtsc_lfence();
> > +   while (rdtsc_lfence() - start < interval)
> > +   CPU_BUSY_CYCLE();
> >  }
> > Index: amd64/lapic.c
> > ===
> > RCS file: /cvs/src/sys/arch/amd64/amd64/lapic.c,v
> > retrieving revision 1.55
> > diff -u -p -r1.55 lapic.c
> > --- amd64/lapic.c   3 Aug 2019 14:57:51 -   1.55
> > +++ amd64/lapic.c   23 Aug 2020 22:59:25 -
> > @@ -41,6 +41,7 @@
> >  #include 
> >  #include 
> >  #include 
> > +#include 
> >  #include 
> >  #include 
> >  #include 
> > @@ -569,7 +570,8 @@ skip_calibration:
> >  * Now that the timer's calibrated, use the apic timer routines
> >  * for all our timing needs..
> >  */
> > -   delay_func = lapic_delay;
> > +   if (delay_func != tsc_delay)
> > +   delay_func = lapic_delay;
> > initclock_func = lapic_initclocks;
> > }
> >  }
> > Index: include/cpuvar.h
> > 

Re: kernel crash in setrunqueue

2020-07-29 Thread Mike Larkin
On Wed, Jul 29, 2020 at 10:14:11PM +0200, Mark Kettenis wrote:
> > Date: Wed, 29 Jul 2020 13:03:43 -0700
> > From: Mike Larkin 
> >
> > Hi,
> >
> >  I'm seeing crashes on amd64 GENERIC.MP on a few VMs recently. This happens
> > on GENERIC.MP regardless of whether or not the VM has one cpu or more than
> > one. It does not happen on GENERIC kernels.
> >
> >  The crash will happen fairly quickly after the kernel starts executing
> > processes. Sometimes it crashes instantly, sometimes it lasts for a minute
> > or two. It rarely makes it to the login prompt. The problem is 100%
> > reproducible on two different VMs I have, running on two different
> > hypervisors (Hyper-V and ESXi6.7U2).
> >
> >  I first started noticing the problem on the 24th July snap, but TBH these
> > machines were not frequently updated, so the previous snap I had installed
> > might have been a couple months old. Whatever older snap was on them before
> > worked fine.
> >
> >  Since this is happening on two different machines with two different VMs,
> > I'm gonna rule out hardware issues.
> >
> >  Crash:
> >
> > kernel: protection fault trap, code=0
> > Stopped at  setrunqueue+0xa2:   addl    $0x1,0x288(%r13)
> >
> >  Trace:
> > ddb{2}> trace
> > setrunqueue(27b3d6c24c3fab80, 800015e874e0,32) at setrunqueue+0xa2
> > sched_barrier_task(800015f1a168) at sched_barrier_task+0x6c
> > taskq_thread(82121548) at taskq_thread+0x8d
> > end trace frame: 0x0, count: -3
> >
> >  Registers:
> > ddb{2}> sh r
> > rdi 0x821ee728  sched_lock
> > rsi 0x800014cc6ff0
> > rbp 0x800015ea0e40
> > rbx  0
> > rdx   0x23ca94  acpi_pdirpa_0x2288fc
> > rcx0xc
> > rax0xc
> > r8   0x202
> > r9 0x2
> > r10  0
> > r11 0x57f79bf6968709d8
> > r12 0x800015e874e0
> > r13 0x27b3d6c24c3fab80
> > r14   0x32
> > r15 0x27b3d6c24c3fab80
> > rip 0x81b9df22  setrunqueue+0xa2
> > cs 0x8
> > rflags 0x10207  __ALIGN_SIZE+0xf207
> > rsp 0x800015ea0df0
> > ss0x10
> >
> >
> > The offending instruction is in kern_sched.c:260:
> >
> > spc->spc_nrun++;
> >
> > ... which indicates 'spc' is trash (and it is, based on %r13 above). In my
> > tests, %r13 always is this same trash value. That comes from 'ci', which is
> > either passed in or chosen by sched_choosecpu. Neither of these functions
> > have changed recently, so I'm guessing this corruption is coming from
> > something else.
> >
> >  Anyone have ideas where to start looking? I suppose I could start bisecting,
> > but does anyone know of any changes that would affect this area?
> >
> >  I can send dmesgs if needed, but these are pretty standard VMs,
> > nothing fancy configured in them. 4 CPUs, 8GB RAM, etc.
>
> They're VMs and it turns out that many of the "PV" drivers are/were
> using the intr_barrier() interface the wrong way.
>
> For Hyper-V, see my reply in the "Panic on boot with Hyper-V since Jun
> 17 snapshot" thread on bugs@ from earlier today.
>
> Cheers,
>
> Mark
>

Thanks. I don't subscribe to bugs@ anymore, so that's why I likely missed it.

-ml



Re: kernel crash in setrunqueue

2020-07-29 Thread Mike Larkin
On Wed, Jul 29, 2020 at 01:03:43PM -0700, Mike Larkin wrote:
> Hi,
>
>  I'm seeing crashes on amd64 GENERIC.MP on a few VMs recently. This happens
> on GENERIC.MP regardless of whether or not the VM has one cpu or more than
> one. It does not happen on GENERIC kernels.
>
>  The crash will happen fairly quickly after the kernel starts executing
> processes. Sometimes it crashes instantly, sometimes it lasts for a minute
> or two. It rarely makes it to the login prompt. The problem is 100%
> reproducible on two different VMs I have, running on two different
> hypervisors (Hyper-V and ESXi6.7U2).
>
>  I first started noticing the problem on the 24th July snap, but TBH these
> machines were not frequently updated, so the previous snap I had installed
> might have been a couple months old. Whatever older snap was on them before
> worked fine.
>
>  Since this is happening on two different machines with two different VMs,
> I'm gonna rule out hardware issues.
>
>  Crash:
>
> kernel: protection fault trap, code=0
> Stopped at  setrunqueue+0xa2:   addl    $0x1,0x288(%r13)
>
>  Trace:
> ddb{2}> trace
> setrunqueue(27b3d6c24c3fab80, 800015e874e0,32) at setrunqueue+0xa2
> sched_barrier_task(800015f1a168) at sched_barrier_task+0x6c
> taskq_thread(82121548) at taskq_thread+0x8d
> end trace frame: 0x0, count: -3
>
>  Registers:
> ddb{2}> sh r
> rdi   0x821ee728  sched_lock
> rsi   0x800014cc6ff0
> rbp   0x800015ea0e40
> rbx0
> rdx 0x23ca94  acpi_pdirpa_0x2288fc
> rcx  0xc
> rax  0xc
> r8 0x202
> r9   0x2
> r100
> r11   0x57f79bf6968709d8
> r12   0x800015e874e0
> r13   0x27b3d6c24c3fab80
> r14 0x32
> r15   0x27b3d6c24c3fab80
> rip   0x81b9df22  setrunqueue+0xa2
> cs   0x8
> rflags   0x10207  __ALIGN_SIZE+0xf207
> rsp   0x800015ea0df0
> ss  0x10
>
>
> The offending instruction is in kern_sched.c:260:
>
>   spc->spc_nrun++;
>
> ... which indicates 'spc' is trash (and it is, based on %r13 above). In my
> tests, %r13 always is this same trash value. That comes from 'ci', which is
> either passed in or chosen by sched_choosecpu. Neither of these functions
> have changed recently, so I'm guessing this corruption is coming from
> something else.
>
>  Anyone have ideas where to start looking? I suppose I could start bisecting,
> but does anyone know of any changes that would affect this area?
>
>  I can send dmesgs if needed, but these are pretty standard VMs, nothing fancy
> configured in them. 4 CPUs, 8GB RAM, etc.
>
> -ml
>

Also I should note that the problem happens with snaps as well as kernels built
from source (-current), so this isn't likely something that is in snaps but not
yet in tree.

-ml



kernel crash in setrunqueue

2020-07-29 Thread Mike Larkin
Hi,

 I'm seeing crashes on amd64 GENERIC.MP on a few VMs recently. This happens
on GENERIC.MP regardless of whether or not the VM has one cpu or more than
one. It does not happen on GENERIC kernels.

 The crash will happen fairly quickly after the kernel starts executing
processes. Sometimes it crashes instantly, sometimes it lasts for a minute
or two. It rarely makes it to the login prompt. The problem is 100%
reproducible on two different VMs I have, running on two different
hypervisors (Hyper-V and ESXi6.7U2).

 I first started noticing the problem on the 24th July snap, but TBH these
machines were not frequently updated, so the previous snap I had installed
might have been a couple months old. Whatever older snap was on them before
worked fine.

 Since this is happening on two different machines with two different VMs,
I'm gonna rule out hardware issues.

 Crash:

kernel: protection fault trap, code=0
Stopped at  setrunqueue+0xa2:   addl    $0x1,0x288(%r13)

 Trace:
ddb{2}> trace
setrunqueue(27b3d6c24c3fab80, 800015e874e0,32) at setrunqueue+0xa2
sched_barrier_task(800015f1a168) at sched_barrier_task+0x6c
taskq_thread(82121548) at taskq_thread+0x8d
end trace frame: 0x0, count: -3

 Registers:
ddb{2}> sh r
rdi 0x821ee728  sched_lock
rsi 0x800014cc6ff0
rbp 0x800015ea0e40
rbx  0
rdx   0x23ca94  acpi_pdirpa_0x2288fc
rcx0xc
rax0xc
r8   0x202
r9 0x2
r10  0
r11 0x57f79bf6968709d8
r12 0x800015e874e0
r13 0x27b3d6c24c3fab80
r14   0x32
r15 0x27b3d6c24c3fab80
rip 0x81b9df22  setrunqueue+0xa2
cs 0x8
rflags 0x10207  __ALIGN_SIZE+0xf207
rsp 0x800015ea0df0
ss0x10


The offending instruction is in kern_sched.c:260:

spc->spc_nrun++;

... which indicates 'spc' is trash (and it is, based on %r13 above). In my
tests, %r13 always is this same trash value. That comes from 'ci', which is
either passed in or chosen by sched_choosecpu. Neither of these functions
have changed recently, so I'm guessing this corruption is coming from something
else.

 Anyone have ideas where to start looking? I suppose I could start bisecting,
but does anyone know of any changes that would affect this area?

 I can send dmesgs if needed, but these are pretty standard VMs, nothing fancy
configured in them. 4 CPUs, 8GB RAM, etc.

-ml



Re: Edgerouter 4 available for any OpenBSD dev that needs an octeon

2020-07-29 Thread Mike Larkin
On Tue, Jul 28, 2020 at 06:16:01PM -0700, Mike Larkin wrote:
> Someone (can't recall who) gave me an ER4. I found it while cleaning
> out my closet. Since I'm not active anymore, if any openbsd developer
> wants it, reach out to me privately and I'll see about sending it
> to you.
>
> Thanks.
>
> -ml
>

Thanks everyone, this is heading to an OpenBSD developer.

-ml



Edgerouter 4 available for any OpenBSD dev that needs an octeon

2020-07-28 Thread Mike Larkin
Someone (can't recall who) gave me an ER4. I found it while cleaning
out my closet. Since I'm not active anymore, if any openbsd developer
wants it, reach out to me privately and I'll see about sending it
to you.

Thanks.

-ml



Re: amd64: lapic: refactor lapic timer programming

2020-07-06 Thread Mike Larkin
On Fri, Jul 03, 2020 at 07:41:45PM -0500, Scott Cheloha wrote:
> Hi,
>
> I want to run the lapic timer in one-shot mode on amd64 as we do with
> other interrupt clocks on other platforms.  I aim to make the clock
> interrupt code MD where possible.
>
> However, nobody is going to test my MD clock interrupt work unless
> amd64 is ready to use it.  amd64 doesn't run in oneshot mode so there
> is preliminary work to do first.
>
> --
>
> Before we can run the lapic timer in one-shot mode we need to simplify
> the process of actually programming it.
>
> This patch refactors all lapic timer programming into a single
> routine.  We don't use any divisor other than 1 so I don't see a need
> to make it a parameter to lapic_timer_arm().  We can add TSC deadline
> support later if someone wants it.
>
> The way we program the timer differs from how e.g. Darwin and FreeBSD
> and Linux do it.  They write:
>
>  - lvtt (mode + vector + (maybe) mask)
>  - dcr
>  - icr
>
> while we do:
>
>  - lvtt (mode + mask)
>  - dcr
>  - icr
>  - (maybe) lvtt (mode + vector)
>
> I don't see a reason to arm the timer with four writes instead of
> three, so in this patch I use the three-write ordering.
>
> Am I missing something?  Do I need to disable interrupts before I
> reprogram the timer?
>

This reads ok to me. I am not aware of any requirements to disable
interrupts while reprogramming the timer.

-ml

> -Scott
>
> Index: lapic.c
> ===
> RCS file: /cvs/src/sys/arch/amd64/amd64/lapic.c,v
> retrieving revision 1.55
> diff -u -p -r1.55 lapic.c
> --- lapic.c   3 Aug 2019 14:57:51 -   1.55
> +++ lapic.c   4 Jul 2020 00:40:26 -
> @@ -413,6 +413,42 @@ u_int32_t lapic_frac_usec_per_cycle;
>  u_int64_t lapic_frac_cycle_per_usec;
>  u_int32_t lapic_delaytab[26];
>
> +void lapic_timer_arm(uint32_t, int, uint32_t);
> +void lapic_timer_arm_once(int, uint32_t);
> +void lapic_timer_arm_period(int, uint32_t);
> +
> +/*
> + * Start the local apic countdown timer.
> + *
> + * First set the mode, vector, and (maybe) the mask.
> + * then set the divisor,
> + * and finally set the cycle count.
> + */
> +void
> +lapic_timer_arm(uint32_t mode, int masked, uint32_t cycles)
> +{
> + uint32_t lvtt;
> +
> + lvtt = mode | LAPIC_TIMER_VECTOR;
> + lvtt |= (masked) ? LAPIC_LVTT_M : 0;
> +
> + lapic_writereg(LAPIC_LVTT, lvtt);
> + lapic_writereg(LAPIC_DCR_TIMER, LAPIC_DCRT_DIV1);
> + lapic_writereg(LAPIC_ICR_TIMER, cycles);
> +}
> +
> +void
> +lapic_timer_arm_once(int masked, uint32_t cycles)
> +{
> + lapic_timer_arm(LAPIC_LVTT_TM_ONESHOT, masked, cycles);
> +}
> +
> +void
> +lapic_timer_arm_period(int masked, uint32_t cycles)
> +{
> + lapic_timer_arm(LAPIC_LVTT_TM_PERIODIC, masked, cycles);
> +}
> +
>  void
>  lapic_clockintr(void *arg, struct intrframe frame)
>  {
> @@ -430,17 +466,7 @@ lapic_clockintr(void *arg, struct intrfr
>  void
>  lapic_startclock(void)
>  {
> - /*
> -  * Start local apic countdown timer running, in repeated mode.
> -  *
> -  * Mask the clock interrupt and set mode,
> -  * then set divisor,
> -  * then unmask and set the vector.
> -  */
> - lapic_writereg(LAPIC_LVTT, LAPIC_LVTT_TM|LAPIC_LVTT_M);
> - lapic_writereg(LAPIC_DCR_TIMER, LAPIC_DCRT_DIV1);
> - lapic_writereg(LAPIC_ICR_TIMER, lapic_tval);
> - lapic_writereg(LAPIC_LVTT, LAPIC_LVTT_TM|LAPIC_TIMER_VECTOR);
> + lapic_timer_arm_period(0, lapic_tval);
>  }
>
>  void
> @@ -498,9 +524,7 @@ lapic_calibrate_timer(struct cpu_info *c
>* Configure timer to one-shot, interrupt masked,
>* large positive number.
>*/
> - lapic_writereg(LAPIC_LVTT, LAPIC_LVTT_M);
> - lapic_writereg(LAPIC_DCR_TIMER, LAPIC_DCRT_DIV1);
> - lapic_writereg(LAPIC_ICR_TIMER, 0x8000);
> + lapic_timer_arm_once(1, 0x8000);
>
>   s = intr_disable();
>
> @@ -540,10 +564,7 @@ skip_calibration:
>   lapic_tval = (lapic_per_second * 2) / hz;
>   lapic_tval = (lapic_tval / 2) + (lapic_tval & 0x1);
>
> - lapic_writereg(LAPIC_LVTT, LAPIC_LVTT_TM | LAPIC_LVTT_M |
> - LAPIC_TIMER_VECTOR);
> - lapic_writereg(LAPIC_DCR_TIMER, LAPIC_DCRT_DIV1);
> - lapic_writereg(LAPIC_ICR_TIMER, lapic_tval);
> + lapic_timer_arm_period(0, lapic_tval);
>
>   /*
>* Compute fixed-point ratios between cycles and
>



Re: 11n Tx aggregation for iwm(4)

2020-06-26 Thread Mike Larkin
On Fri, Jun 26, 2020 at 09:01:03PM -0700, Mike Larkin wrote:
> On Fri, Jun 26, 2020 at 02:45:53PM +0200, Stefan Sperling wrote:
> > This patch adds support for 11n Tx aggregation to iwm(4).
> >
> > Please help with testing if you can by running the patch and using wifi
> > as usual. Nothing should change, except that Tx speed may potentially
> > improve. If you have time to run before/after performance measurements with
> > tcpbench or such, that would be nice. But it's not required for testing.
> >
> > If Tx aggregation is active then netstat will show a non-zero output block ack
> > agreement counter:
> >
> > $ netstat -W iwm0 | grep 'output block'
> > 3 new output block ack agreements
> > 0 output block ack agreements timed out
> >
> > It would be great to get at least one test for all the chipsets the driver
> > supports: 7260, 7265, 3160, 3165, 3168, 8260, 8265, 9260, 9560
> > The behaviour of the access point also matters a great deal. It won't
> > hurt to test the same chipset against several different access points.
> >
> > I have tested this version on 8265 only so far. I've run older revisions
> > of this patch on 7265 so I'm confident that this chip will work, too.
> > So far, the APs I have tested against are athn(4) in 11a mode and in 11n
> > mode with the 'nomimo' nwflag, and a Sagemcom 11ac AP. All on 5Ghz channels.
>
> I tested this on my T490 Thinkpad:
>
> iwm0 at pci0 dev 20 function 3 "Intel Dual Band Wireless AC 9560" rev 0x30, msix
> iwm0: hw rev 0x310, fw ver 34.3125811985.0
>
> It ended up having a heck of a time connecting to anything, most/all
> connections ended up timing out or just taking a really long time to complete.
>
> I looked in dmesg, and found a stream of fatal firmware errors and other
> errors (see end of this email).
>
> My iwm-firmware was updated before I tried the new kernel:
>
> -innsmouth- ~> pkg_info iwm-firmware
> Information for inst:iwm-firmware-20191022p1
>
> Comment:
> firmware binary images for iwm(4) driver
>
> Description:
> Firmware binary images for use with the iwm(4) driver.
>
> Maintainer: The OpenBSD ports mailing-list 
>
> WWW: https://wireless.wiki.kernel.org/en/users/Drivers/iwlwifi
>

PS, I did see 5 new output block ack agreements when I was running the diff,
so apparently at least it is doing ... something?

-ml

>
>
> I still have the kernel around if you want me to test something else. There
> is nothing in this tree except this Txagg diff. LMK if you need any more
> info.
>
> OpenBSD 6.7-current (GENERIC.MP) #1: Fri Jun 26 14:01:06 PDT 2020
> 
> mlar...@innsmouth.int.azathoth.net:/u/bin/src/OpenBSD/openbsd/sys/arch/amd64/compile/GENERIC.MP
> real mem = 51260506112 (48885MB)
> avail mem = 49691906048 (47389MB)
> random: good seed from bootblocks
> mpath0 at root
> scsibus0 at mpath0: 256 targets
> mainbus0 at root
> bios0 at mainbus0: SMBIOS rev. 3.1 @ 0x604f5000 (67 entries)
> bios0: vendor LENOVO version "N2IET61W (1.39 )" date 05/16/2019
> bios0: LENOVO 20N20046US
> acpi0 at bios0: ACPI 6.1
> acpi0: sleep states S0 S3 S4 S5
> acpi0: tables DSDT FACP SSDT SSDT SSDT SSDT UEFI SSDT HPET APIC MCFG ECDT 
> SSDT SSDT BOOT SLIC SSDT LPIT WSMT SSDT DBGP DBG2 MSDM BATB DMAR NHLT ASF! 
> FPDT UEFI
> acpi0: wakeup devices GLAN(S4) XHC_(S3) XDCI(S4) HDAS(S4) RP01(S4) PXSX(S4) 
> RP02(S4) PXSX(S4) RP03(S4) PXSX(S4) RP04(S4) PXSX(S4) RP05(S4) PXSX(S4) 
> RP06(S4) PXSX(S4) [...]
> acpitimer0 at acpi0: 3579545 Hz, 24 bits
> acpihpet0 at acpi0: 2399 Hz
> acpimadt0 at acpi0 addr 0xfee0: PC-AT compat
> cpu0 at mainbus0: apid 0 (boot processor)
> cpu0: Intel(R) Core(TM) i7-8665U CPU @ 1.90GHz, 1586.72 MHz, 06-8e-0c
> cpu0: 
> FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,CFLUSH,DS,ACPI,MMX,FXSR,SSE,SSE2,SS,HTT,TM,PBE,SSE3,PCLMUL,DTES64,MWAIT,DS-CPL,VMX,SMX,EST,TM2,SSSE3,SDBG,FMA3,CX16,xTPR,PDCM,PCID,SSE4.1,SSE4.2,x2APIC,MOVBE,POPCNT,DEADLINE,AES,XSAVE,AVX,F16C,RDRAND,NXE,PAGE1GB,RDTSCP,LONG,LAHF,ABM,3DNOWP,PERF,ITSC,FSGSBASE,TSC_ADJUST,SGX,BMI1,AVX2,SMEP,BMI2,ERMS,INVPCID,MPX,RDSEED,ADX,SMAP,CLFLUSHOPT,PT,MD_CLEAR,IBRS,IBPB,STIBP,L1DF,SSBD,SENSOR,ARAT,XSAVEOPT,XSAVEC,XGETBV1,XSAVES
> cpu0: 256KB 64b/line 8-way L2 cache
> cpu0: smt 0, core 0, package 0
> mtrr: Pentium Pro MTRR support, 10 var ranges, 88 fixed ranges
> cpu0: apic clock running at 24MHz
> cpu0: mwait min=64, max=64, C-substates=0.2.1.2.4.1.1.1, IBE
> cpu1 at mainbus0: apid 2 (application processor)
> cpu1: Intel(R) Core(TM) i7-8665U CPU @ 1.90GHz, 1333.05 MHz, 06-8e-0c
> cpu1: 
> FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,CFLUSH,

Re: 11n Tx aggregation for iwm(4)

2020-06-26 Thread Mike Larkin
On Fri, Jun 26, 2020 at 02:45:53PM +0200, Stefan Sperling wrote:
> This patch adds support for 11n Tx aggregation to iwm(4).
>
> Please help with testing if you can by running the patch and using wifi
> as usual. Nothing should change, except that Tx speed may potentially
> improve. If you have time to run before/after performance measurements with
> tcpbench or such, that would be nice. But it's not required for testing.
>
> If Tx aggregation is active then netstat will show a non-zero output block ack
> agreement counter:
>
> $ netstat -W iwm0 | grep 'output block'
> 3 new output block ack agreements
>   0 output block ack agreements timed out
>
> It would be great to get at least one test for all the chipsets the driver
> supports: 7260, 7265, 3160, 3165, 3168, 8260, 8265, 9260, 9560
> The behaviour of the access point also matters a great deal. It won't
> hurt to test the same chipset against several different access points.
>
> I have tested this version on 8265 only so far. I've run older revisions
> of this patch on 7265 so I'm confident that this chip will work, too.
> So far, the APs I have tested against are athn(4) in 11a mode and in 11n
> mode with the 'nomimo' nwflag, and a Sagemcom 11ac AP. All on 5Ghz channels.

I tested this on my T490 Thinkpad:

iwm0 at pci0 dev 20 function 3 "Intel Dual Band Wireless AC 9560" rev 0x30, msix
iwm0: hw rev 0x310, fw ver 34.3125811985.0

It ended up having a heck of a time connecting to anything, most/all
connections ended up timing out or just taking a really long time to complete.

I looked in dmesg, and found a stream of fatal firmware errors and other
errors (see end of this email).

My iwm-firmware was updated before I tried the new kernel:

-innsmouth- ~> pkg_info iwm-firmware
Information for inst:iwm-firmware-20191022p1

Comment:
firmware binary images for iwm(4) driver

Description:
Firmware binary images for use with the iwm(4) driver.

Maintainer: The OpenBSD ports mailing-list 

WWW: https://wireless.wiki.kernel.org/en/users/Drivers/iwlwifi



I still have the kernel around if you want me to test something else. There
is nothing in this tree except this Txagg diff. LMK if you need any more
info.

OpenBSD 6.7-current (GENERIC.MP) #1: Fri Jun 26 14:01:06 PDT 2020

mlar...@innsmouth.int.azathoth.net:/u/bin/src/OpenBSD/openbsd/sys/arch/amd64/compile/GENERIC.MP
real mem = 51260506112 (48885MB)
avail mem = 49691906048 (47389MB)
random: good seed from bootblocks
mpath0 at root
scsibus0 at mpath0: 256 targets
mainbus0 at root
bios0 at mainbus0: SMBIOS rev. 3.1 @ 0x604f5000 (67 entries)
bios0: vendor LENOVO version "N2IET61W (1.39 )" date 05/16/2019
bios0: LENOVO 20N20046US
acpi0 at bios0: ACPI 6.1
acpi0: sleep states S0 S3 S4 S5
acpi0: tables DSDT FACP SSDT SSDT SSDT SSDT UEFI SSDT HPET APIC MCFG ECDT SSDT 
SSDT BOOT SLIC SSDT LPIT WSMT SSDT DBGP DBG2 MSDM BATB DMAR NHLT ASF! FPDT UEFI
acpi0: wakeup devices GLAN(S4) XHC_(S3) XDCI(S4) HDAS(S4) RP01(S4) PXSX(S4) 
RP02(S4) PXSX(S4) RP03(S4) PXSX(S4) RP04(S4) PXSX(S4) RP05(S4) PXSX(S4) 
RP06(S4) PXSX(S4) [...]
acpitimer0 at acpi0: 3579545 Hz, 24 bits
acpihpet0 at acpi0: 2399 Hz
acpimadt0 at acpi0 addr 0xfee0: PC-AT compat
cpu0 at mainbus0: apid 0 (boot processor)
cpu0: Intel(R) Core(TM) i7-8665U CPU @ 1.90GHz, 1586.72 MHz, 06-8e-0c
cpu0: 
FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,CFLUSH,DS,ACPI,MMX,FXSR,SSE,SSE2,SS,HTT,TM,PBE,SSE3,PCLMUL,DTES64,MWAIT,DS-CPL,VMX,SMX,EST,TM2,SSSE3,SDBG,FMA3,CX16,xTPR,PDCM,PCID,SSE4.1,SSE4.2,x2APIC,MOVBE,POPCNT,DEADLINE,AES,XSAVE,AVX,F16C,RDRAND,NXE,PAGE1GB,RDTSCP,LONG,LAHF,ABM,3DNOWP,PERF,ITSC,FSGSBASE,TSC_ADJUST,SGX,BMI1,AVX2,SMEP,BMI2,ERMS,INVPCID,MPX,RDSEED,ADX,SMAP,CLFLUSHOPT,PT,MD_CLEAR,IBRS,IBPB,STIBP,L1DF,SSBD,SENSOR,ARAT,XSAVEOPT,XSAVEC,XGETBV1,XSAVES
cpu0: 256KB 64b/line 8-way L2 cache
cpu0: smt 0, core 0, package 0
mtrr: Pentium Pro MTRR support, 10 var ranges, 88 fixed ranges
cpu0: apic clock running at 24MHz
cpu0: mwait min=64, max=64, C-substates=0.2.1.2.4.1.1.1, IBE
cpu1 at mainbus0: apid 2 (application processor)
cpu1: Intel(R) Core(TM) i7-8665U CPU @ 1.90GHz, 1333.05 MHz, 06-8e-0c
cpu1: 
FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,CFLUSH,DS,ACPI,MMX,FXSR,SSE,SSE2,SS,HTT,TM,PBE,SSE3,PCLMUL,DTES64,MWAIT,DS-CPL,VMX,SMX,EST,TM2,SSSE3,SDBG,FMA3,CX16,xTPR,PDCM,PCID,SSE4.1,SSE4.2,x2APIC,MOVBE,POPCNT,DEADLINE,AES,XSAVE,AVX,F16C,RDRAND,NXE,PAGE1GB,RDTSCP,LONG,LAHF,ABM,3DNOWP,PERF,ITSC,FSGSBASE,TSC_ADJUST,SGX,BMI1,AVX2,SMEP,BMI2,ERMS,INVPCID,MPX,RDSEED,ADX,SMAP,CLFLUSHOPT,PT,MD_CLEAR,IBRS,IBPB,STIBP,L1DF,SSBD,SENSOR,ARAT,XSAVEOPT,XSAVEC,XGETBV1,XSAVES
cpu1: 256KB 64b/line 8-way L2 cache
cpu1: smt 0, core 1, package 0
cpu2 at mainbus0: apid 4 (application processor)
cpu2: Intel(R) Core(TM) i7-8665U CPU @ 1.90GHz, 1125.81 MHz, 06-8e-0c
cpu2: 

Re: vmm(4): unterminated vm_name after strncpy

2020-03-15 Thread Mike Larkin
On Thu, Mar 12, 2020 at 10:31:13PM +0100, Tobias Heider wrote:
> vmm uses 'strncpy(vm->vm_name, vcp->vcp_name, VMM_MAX_NAME_LEN)' to copy
> to buffers of size VMM_MAX_NAME_LEN, which can leave the resulting string
> unterminated.
> From strncpy(3):
>   strncpy() only NUL terminates the destination string when the length of
>   the source string is less than the length parameter.
> 
> I propose replacing it with 'strlcpy' which does the right thing and
> only copies up to dstsize - 1 characters.
> 
> ok?
> 

good find. Thanks!

> CID 1453255
> 
> Index: sys/arch/amd64/amd64/vmm.c
> ===
> RCS file: /mount/openbsd/cvs/src/sys/arch/amd64/amd64/vmm.c,v
> retrieving revision 1.266
> diff -u -p -r1.266 vmm.c
> --- sys/arch/amd64/amd64/vmm.c11 Mar 2020 16:38:42 -  1.266
> +++ sys/arch/amd64/amd64/vmm.c12 Mar 2020 21:15:01 -
> @@ -1167,7 +1167,7 @@ vm_create(struct vm_create_params *vcp, 
>   memcpy(vm->vm_memranges, vcp->vcp_memranges,
>   vm->vm_nmemranges * sizeof(vm->vm_memranges[0]));
>   vm->vm_memory_size = memsize;
> - strncpy(vm->vm_name, vcp->vcp_name, VMM_MAX_NAME_LEN);
> + strlcpy(vm->vm_name, vcp->vcp_name, VMM_MAX_NAME_LEN);
>  
>   rw_enter_write(&vmm_softc->vm_lock);
>  
> @@ -3718,7 +3718,7 @@ vm_get_info(struct vm_info_params *vip)
>   out[i].vir_ncpus = vm->vm_vcpu_ct;
>   out[i].vir_id = vm->vm_id;
>   out[i].vir_creator_pid = vm->vm_creator_pid;
> - strncpy(out[i].vir_name, vm->vm_name, VMM_MAX_NAME_LEN);
> + strlcpy(out[i].vir_name, vm->vm_name, VMM_MAX_NAME_LEN);
>   rw_enter_read(&vm->vm_vcpu_lock);
>   for (j = 0; j < vm->vm_vcpu_ct; j++) {
>   out[i].vir_vcpu_state[j] = VCPU_STATE_UNKNOWN;
> 



Re: [PATCH] Fixing an uninitialized variable that can lead to #GP.

2020-02-09 Thread Mike Larkin
On Sun, Feb 09, 2020 at 06:17:47PM -0800, Anthony Steinhauser wrote:
> In the current implementation of the TAA mitigation if the cpuid_level
> is 6 and it's an Intel CPU, the sefflags_edx variable is used without
> being initialized. If the SEFF0EDX_ARCH_CAP bit is accidentally flipped
> in it, the rdmsr on the unimplemented MSR_ARCH_CAPABILITIES index leads
> to a #GP fault.
> 
> This change initializes the sefflags_edx variable to 0 which is
> consistent with the MSR_ARCH_CAPABILITIES being unavailable.
> ---
>  sys/arch/amd64/amd64/cpu.c | 2 +-
>  sys/arch/i386/i386/cpu.c   | 2 +-
>  2 files changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/sys/arch/amd64/amd64/cpu.c b/sys/arch/amd64/amd64/cpu.c
> index 48ab6b5e7f3..f9beff0d5e3 100644
> --- a/sys/arch/amd64/amd64/cpu.c
> +++ b/sys/arch/amd64/amd64/cpu.c
> @@ -1164,7 +1164,7 @@ void
>  cpu_tsx_disable(struct cpu_info *ci)
>  {
>   uint64_t msr;
> - uint32_t dummy, sefflags_edx;
> + uint32_t dummy, sefflags_edx = 0;
>  
>   /* this runs before identifycpu() populates ci_feature_sefflags_edx */
>   if (cpuid_level >= 0x07)
> diff --git a/sys/arch/i386/i386/cpu.c b/sys/arch/i386/i386/cpu.c
> index b31a431c594..76f1b65bede 100644
> --- a/sys/arch/i386/i386/cpu.c
> +++ b/sys/arch/i386/i386/cpu.c
> @@ -473,7 +473,7 @@ void
>  cpu_tsx_disable(struct cpu_info *ci)
>  {
>   uint64_t msr;
> - uint32_t dummy, sefflags_edx;
> + uint32_t dummy, sefflags_edx = 0;
>  
>   /* this runs before identifycpu() populates ci_feature_sefflags_edx */
>   if (cpuid_level >= 0x07)
> -- 
> 2.25.0.341.g760bfbb309-goog
> 

Probably safer to use rdmsr_safe for this sort of thing also.

-ml



Re: Add mprotect_ept ioctl to vmm(4)

2020-02-07 Thread Mike Larkin
On Fri, Feb 07, 2020 at 01:25:38PM -0800, Mike Larkin wrote:
> On Fri, Feb 07, 2020 at 04:20:16AM +, Adam Steen wrote:
> > Hi
> > 
> > Please see the attached patch to add an 'IOCTL handler to sets the access
> > protections of the ept'
> > 
> > vmd(8) does not make use of this change, but solo5, which uses vmm(4) as
> > a backend hypervisor, does. The code calling 'VMM_IOC_MPROTECT_EPT' is
> > available here https://github.com/Solo5/solo5/compare/master...adamsteen:wnox
> > 
> > there are changes to vmd too, but this is just to ensure completeness,
> > if mprotect ept is called in the future, we would want the vm to be
> > stopped if we get a protection fault.
> > 
> > I was unsure what to do if called with execute only permissions on a cpu that
> > does not support it. I went with adding read permissions and logging the
> > fact, instead of returning EINVAL.
> > 
> > Cheers
> > Adam
> > 
> 
> I have been giving Adam feedback on this diff for a while. There are a few
> minor comments below, but I think this is ok if someone wants to commit it after
> the fixes below are incorporated.
> 
> -ml
> 

See updated comment below.

-ml

> > ? div
> > Index: sys/arch/amd64/amd64/vmm.c
> > ===
> > RCS file: /cvs/src/sys/arch/amd64/amd64/vmm.c,v
> > retrieving revision 1.258
> > diff -u -p -u -p -r1.258 vmm.c
> > --- sys/arch/amd64/amd64/vmm.c  31 Jan 2020 01:51:27 -  1.258
> > +++ sys/arch/amd64/amd64/vmm.c  7 Feb 2020 03:15:16 -
> > @@ -124,6 +124,7 @@ int vm_get_info(struct vm_info_params *)
> >  int vm_resetcpu(struct vm_resetcpu_params *);
> >  int vm_intr_pending(struct vm_intr_params *);
> >  int vm_rwregs(struct vm_rwregs_params *, int);
> > +int vm_mprotect_ept(struct vm_mprotect_ept_params *);
> >  int vm_rwvmparams(struct vm_rwvmparams_params *, int);
> >  int vm_find(uint32_t, struct vm **);
> >  int vcpu_readregs_vmx(struct vcpu *, uint64_t, struct vcpu_reg_state *);
> > @@ -186,6 +187,8 @@ int svm_fault_page(struct vcpu *, paddr_
> >  int vmx_fault_page(struct vcpu *, paddr_t);
> >  int vmx_handle_np_fault(struct vcpu *);
> >  int svm_handle_np_fault(struct vcpu *);
> > +int vmx_mprotect_ept(vm_map_t, paddr_t, paddr_t, int);
> > +pt_entry_t *vmx_pmap_find_pte_ept(pmap_t, paddr_t);
> >  int vmm_alloc_vpid(uint16_t *);
> >  void vmm_free_vpid(uint16_t);
> >  const char *vcpu_state_decode(u_int);
> > @@ -493,6 +496,9 @@ vmmioctl(dev_t dev, u_long cmd, caddr_t 
> > case VMM_IOC_WRITEREGS:
> > ret = vm_rwregs((struct vm_rwregs_params *)data, 1);
> > break;
> > +   case VMM_IOC_MPROTECT_EPT:
> > +   ret = vm_mprotect_ept((struct vm_mprotect_ept_params *)data);
> > +   break;
> > case VMM_IOC_READVMPARAMS:
> > ret = vm_rwvmparams((struct vm_rwvmparams_params *)data, 0);
> > break;
> > @@ -531,6 +537,7 @@ pledge_ioctl_vmm(struct proc *p, long co
> > case VMM_IOC_INTR:
> > case VMM_IOC_READREGS:
> > case VMM_IOC_WRITEREGS:
> > +   case VMM_IOC_MPROTECT_EPT:
> > case VMM_IOC_READVMPARAMS:
> > case VMM_IOC_WRITEVMPARAMS:
> > return (0);
> > @@ -806,6 +813,288 @@ vm_rwregs(struct vm_rwregs_params *vrwp,
> >  }
> >  
> >  /*
> > + * vm_mprotect_ept
> > + *
> > + * IOCTL handler to sets the access protections of the ept
> > + *
> > + * Parameters:
> > + *   vmep: decribes the memory for which the protect will be applied..
> > + *
> > + * Return values:
> > + *  0: if successful
> > + *  ENOENT: if the VM defined by 'vmep' cannot be found
> > + *  EINVAL: if the sgpa or size is not page aligned, the prot is invalid,
> > + *  size is too large (512GB), there is wraparound
> > + *  (like start = 512GB-1 and end = 512GB-2),
> > + *  the address specified is not within the vm's mem range
> > + *  or the address lies inside reserved (MMIO) memory
> > + */
> > +int
> > +vm_mprotect_ept(struct vm_mprotect_ept_params *vmep)
> > +{
> > +   struct vm *vm;
> > +   struct vcpu *vcpu;
> > +   vaddr_t sgpa;
> > +   size_t size;
> > +   vm_prot_t prot;
> > +   uint64_t msr;
> > +   int ret, memtype;
> > +
> > +   /* If not EPT or RVI, nothing to do here */
> > +   if (!(vmm_softc->mode == VMM_MODE_EPT
> > +   || vmm_softc->mode == VMM_MODE_RVI))
> > +   return (0);
>

vmm(4): wrong comment

2020-02-07 Thread Mike Larkin
Free commit for someone. Noticed last night by my student team that is working
on vmm(4) virtio memory ballooning support as we were adding the viomb(4)
stats queue.

-ml


Index: vmm.c
===
RCS file: /cvs/src/sys/arch/amd64/amd64/vmm.c,v
retrieving revision 1.257
diff -u -p -a -u -r1.257 vmm.c
--- vmm.c   13 Dec 2019 03:38:15 -  1.257
+++ vmm.c   7 Feb 2020 21:27:46 -
@@ -3666,7 +3666,6 @@ vcpu_vmx_compute_ctrl(uint64_t ctrlval, 
 /*
  * vm_get_info
  *
- * Returns information about the VM indicated by 'vip'.
  * Returns information about the VM indicated by 'vip'. The 'vip_size' field
  * in the 'vip' parameter is used to indicate the size of the caller's buffer.
  * If insufficient space exists in that buffer, the required size needed is



Re: Add mprotect_ept ioctl to vmm(4)

2020-02-07 Thread Mike Larkin
On Fri, Feb 07, 2020 at 04:20:16AM +, Adam Steen wrote:
> Hi
> 
> Please see the attached patch to add an 'IOCTL handler to sets the access
> protections of the ept'
> 
> vmd(8) does not make use of this change, but solo5, which uses vmm(4) as
> a backend hypervisor, does. The code calling 'VMM_IOC_MPROTECT_EPT' is
> available here https://github.com/Solo5/solo5/compare/master...adamsteen:wnox
> 
> there are changes to vmd too, but this is just to ensure completeness,
> if mprotect ept is called in the future, we would want the vm to be
> stopped if we get a protection fault.
> 
> I was unsure what to do if called with execute only permissions on a cpu that
> does not support it. I went with adding read permissions and logging the
> fact, instead of returning EINVAL.
> 
> Cheers
> Adam
> 

I have been giving Adam feedback on this diff for a while. There are a few
minor comments below, but I think this is ok if someone wants to commit it after
the fixes below are incorporated.

-ml

> ? div
> Index: sys/arch/amd64/amd64/vmm.c
> ===
> RCS file: /cvs/src/sys/arch/amd64/amd64/vmm.c,v
> retrieving revision 1.258
> diff -u -p -u -p -r1.258 vmm.c
> --- sys/arch/amd64/amd64/vmm.c31 Jan 2020 01:51:27 -  1.258
> +++ sys/arch/amd64/amd64/vmm.c7 Feb 2020 03:15:16 -
> @@ -124,6 +124,7 @@ int vm_get_info(struct vm_info_params *)
>  int vm_resetcpu(struct vm_resetcpu_params *);
>  int vm_intr_pending(struct vm_intr_params *);
>  int vm_rwregs(struct vm_rwregs_params *, int);
> +int vm_mprotect_ept(struct vm_mprotect_ept_params *);
>  int vm_rwvmparams(struct vm_rwvmparams_params *, int);
>  int vm_find(uint32_t, struct vm **);
>  int vcpu_readregs_vmx(struct vcpu *, uint64_t, struct vcpu_reg_state *);
> @@ -186,6 +187,8 @@ int svm_fault_page(struct vcpu *, paddr_
>  int vmx_fault_page(struct vcpu *, paddr_t);
>  int vmx_handle_np_fault(struct vcpu *);
>  int svm_handle_np_fault(struct vcpu *);
> +int vmx_mprotect_ept(vm_map_t, paddr_t, paddr_t, int);
> +pt_entry_t *vmx_pmap_find_pte_ept(pmap_t, paddr_t);
>  int vmm_alloc_vpid(uint16_t *);
>  void vmm_free_vpid(uint16_t);
>  const char *vcpu_state_decode(u_int);
> @@ -493,6 +496,9 @@ vmmioctl(dev_t dev, u_long cmd, caddr_t 
>   case VMM_IOC_WRITEREGS:
>   ret = vm_rwregs((struct vm_rwregs_params *)data, 1);
>   break;
> + case VMM_IOC_MPROTECT_EPT:
> + ret = vm_mprotect_ept((struct vm_mprotect_ept_params *)data);
> + break;
>   case VMM_IOC_READVMPARAMS:
>   ret = vm_rwvmparams((struct vm_rwvmparams_params *)data, 0);
>   break;
> @@ -531,6 +537,7 @@ pledge_ioctl_vmm(struct proc *p, long co
>   case VMM_IOC_INTR:
>   case VMM_IOC_READREGS:
>   case VMM_IOC_WRITEREGS:
> + case VMM_IOC_MPROTECT_EPT:
>   case VMM_IOC_READVMPARAMS:
>   case VMM_IOC_WRITEVMPARAMS:
>   return (0);
> @@ -806,6 +813,288 @@ vm_rwregs(struct vm_rwregs_params *vrwp,
>  }
>  
>  /*
> + * vm_mprotect_ept
> + *
> + * IOCTL handler to sets the access protections of the ept
> + *
> + * Parameters:
> + *   vmep: decribes the memory for which the protect will be applied..
> + *
> + * Return values:
> + *  0: if successful
> + *  ENOENT: if the VM defined by 'vmep' cannot be found
> + *  EINVAL: if the sgpa or size is not page aligned, the prot is invalid,
> + *  size is too large (512GB), there is wraparound
> + *  (like start = 512GB-1 and end = 512GB-2),
> + *  the address specified is not within the vm's mem range
> + *  or the address lies inside reserved (MMIO) memory
> + */
> +int
> +vm_mprotect_ept(struct vm_mprotect_ept_params *vmep)
> +{
> + struct vm *vm;
> + struct vcpu *vcpu;
> + vaddr_t sgpa;
> + size_t size;
> + vm_prot_t prot;
> + uint64_t msr;
> + int ret, memtype;
> +
> + /* If not EPT or RVI, nothing to do here */
> + if (!(vmm_softc->mode == VMM_MODE_EPT
> + || vmm_softc->mode == VMM_MODE_RVI))
> + return (0);
> +
> + /* Find the desired VM */
> + rw_enter_read(&vmm_softc->vm_lock);
> + ret = vm_find(vmep->vmep_vm_id, &vm);
> + rw_exit_read(&vmm_softc->vm_lock);
> +
> + /* Not found? exit. */
> + if (ret != 0) {
> + DPRINTF("%s: vm id %u not found\n", __func__,
> + vmep->vmep_vm_id);
> + return (ret);
> + }
> +
> + rw_enter_read(&vm->vm_vcpu_lock);
> + SLIST_FOREACH(vcpu, &vm->vm_vcpu_list, vc_vcpu_link) {
> + if (vcpu->vc_id == vmep->vmep_vcpu_id)
> + break;
> + }
> + rw_exit_read(&vm->vm_vcpu_lock);
> +
> + if (vcpu == NULL) {
> + DPRINTF("%s: vcpu id %u of vm %u not found\n", __func__,
> + vmep->vmep_vcpu_id, vmep->vmep_vm_id);
> + return (ENOENT);
> + }
> +
> + if (vcpu->vc_state != VCPU_STATE_STOPPED) {
> + DPRINTF("%s: mprotect_ept %u on 

Re: vmm(4) patch - iniatialise eptp to zero for vmx like svm

2020-02-06 Thread Mike Larkin
On Thu, Feb 06, 2020 at 01:05:01AM -0800, Mike Larkin wrote:
> On Thu, Feb 06, 2020 at 02:34:47AM +, Adam Steen wrote:
> > Hi
> > 
> > Again while working on a larger patch i noticed that the eptp for vmx
> > was not getting initialised to zero like the svm code path, as part of
> > a VMM_IOC_RESETCPU ioctl call.
> > 
> > please see the attached patch to initialise eptp to zero
> > 
> > cheers
> > Adam
> > 
> > ? div
> > Index: sys/arch/amd64/amd64/vmm.c
> > ===
> > RCS file: /cvs/src/sys/arch/amd64/amd64/vmm.c,v
> > retrieving revision 1.258
> > diff -u -p -u -p -r1.258 vmm.c
> > --- sys/arch/amd64/amd64/vmm.c  31 Jan 2020 01:51:27 -  1.258
> > +++ sys/arch/amd64/amd64/vmm.c  6 Feb 2020 02:18:30 -
> > @@ -2895,6 +2895,8 @@ vcpu_reset_regs_vmx(struct vcpu *vcpu, s
> > /* xcr0 power on default sets bit 0 (x87 state) */
> > vcpu->vc_gueststate.vg_xcr0 = XCR0_X87 & xsave_mask;
> >  
> > +   vcpu->vc_parent->vm_map->pmap->eptp = 0;
> > +
> >  exit:
> > /* Flush the VMCS */
> > if (vmclear(&vcpu->vc_control_pa)) {
> > 
> > 
> 
> I do not believe this is what you want to do.
> 
> The SVM path *should* reset the eptp to 0, since there is no EPTP in RVI based
> scenarios.
> 
> If you reset the eptp to 0 in an EPT environment, you'll lose the VPID and
> the PA of the EPT itself (which, if you look earlier in that function, is 
> properly initialized; you're going to be whacking it back to 0 here):
> 
> Around line 2620:
> 
> ...
> DPRINTF("Guest EPTP = 0x%llx\n", eptp);
> if (vmwrite(VMCS_GUEST_IA32_EPTP, eptp)) {
> DPRINTF("%s: error setting guest EPTP\n", __func__);
> ret = EINVAL;
> goto exit;
> }
> 
> vcpu->vc_parent->vm_map->pmap->eptp = eptp;
> ...
> 
> 
> -ml
> 

PS -

Note that although the EPTP is written to the VMCS in the previous code, we do
use the cached eptp value in the VM's pmap to do EPT TLB flushes. If you clear
it like this, you won't get the proper behaviour (note, the behaviour today
isn't actually 100% correct to begin with, I have a diff for that but it is
growing out of control it seems...)



Re: vmm(4) patch - iniatialise eptp to zero for vmx like svm

2020-02-06 Thread Mike Larkin
On Thu, Feb 06, 2020 at 02:34:47AM +0000, Adam Steen wrote:
> Hi
> 
> Again while working on a larger patch i noticed that the eptp for vmx
> was not getting initialised to zero like the svm code path, as part of
> a VMM_IOC_RESETCPU ioctl call.
> 
> please see the attached patch to initialise eptp to zero
> 
> cheers
> Adam
> 
> ? div
> Index: sys/arch/amd64/amd64/vmm.c
> ===
> RCS file: /cvs/src/sys/arch/amd64/amd64/vmm.c,v
> retrieving revision 1.258
> diff -u -p -u -p -r1.258 vmm.c
> --- sys/arch/amd64/amd64/vmm.c31 Jan 2020 01:51:27 -  1.258
> +++ sys/arch/amd64/amd64/vmm.c6 Feb 2020 02:18:30 -
> @@ -2895,6 +2895,8 @@ vcpu_reset_regs_vmx(struct vcpu *vcpu, s
>   /* xcr0 power on default sets bit 0 (x87 state) */
>   vcpu->vc_gueststate.vg_xcr0 = XCR0_X87 & xsave_mask;
>  
> + vcpu->vc_parent->vm_map->pmap->eptp = 0;
> +
>  exit:
>   /* Flush the VMCS */
>   if (vmclear(&vcpu->vc_control_pa)) {
> 
> 

I do not believe this is what you want to do.

The SVM path *should* reset the eptp to 0, since there is no EPTP in RVI based
scenarios.

If you reset the eptp to 0 in an EPT environment, you'll lose the VPID and
the PA of the EPT itself (which, if you look earlier in that function, is 
properly initialized; you're going to be whacking it back to 0 here):

Around line 2620:

...
DPRINTF("Guest EPTP = 0x%llx\n", eptp);
if (vmwrite(VMCS_GUEST_IA32_EPTP, eptp)) {
DPRINTF("%s: error setting guest EPTP\n", __func__);
ret = EINVAL;
goto exit;
}

vcpu->vc_parent->vm_map->pmap->eptp = eptp;
...


-ml



Re: em(4) diff to test

2020-01-31 Thread Mike Larkin
On Thu, Jan 30, 2020 at 09:15:35AM +0100, Martin Pieuchot wrote:
> On 21/01/20(Tue) 12:31, Martin Pieuchot wrote:
> > On 20/01/20(Mon) 16:42, Martin Pieuchot wrote:
> > > Diff below is a refactoring of the actual em(4) code and defines that
> > > > will allow me to present a shorter diff to interrupt multiple CPUs and
> > > make use of multiple queues.
> > > 
> > > It contains the following items:
> > > 
> > >   - Abstract the allocation/freeing of TX/RX ring into em_dma_malloc().
> > > This will ease the introduction of multiple rings.
> > > 
> > >   - Split the 82576 variant out of 82575.  The distinction is necessary
> > > when it comes to setting multiple queues.
> > > 

I see in the diff where this is being done, but is the multi-queue part not
in this diff? Just looks like you separated things, and left it at that?

> > >   - Change multiple TX/RX related macro to take an index argument
> > > corresponding to a ring.  Currently only the index 0 and 1 are used.
> > > 
> > >   - Gather and print more stats counters
> > > 
> > >   - Switch to using a function, like FreeBSD, to translate 82542
> > > registers and get rid of a set of defines.
> > > 
> > > > It has been tested on the models below, I'd like to be sure there isn't
> > > any fallout with this part before continuing the effort.
> > 
> > New diff that works with 82576, previous breakage reported by Hrvoje
> > Popovski.  So far the following models have been tested, I'm looking for
> > more tests :o)
> 
> So far this has been tested on the following, I'm confident enough and
> would like to move forward, ok?
> 
>   em2 at pci0 dev 4 function 0 "Intel 82540EM" rev 0x02: apic 9 int 14
>   em1 at pci3 dev 1 function 0 "Intel 82545GM" rev 0x04: apic 4 int 0
>   em0 at pci3 dev 1 function 0 "Intel 82546GB" rev 0x03: apic 3 int 0
>   em3 at pci2 dev 0 function 0 "Intel 82571EB" rev 0x06: apic 0 int 16
>   em0 at pci1 dev 0 function 0 "Intel 82572EI" rev 0x06: apic 0 int 16
>   em2 at pci5 dev 0 function 0 "Intel 82573E" rev 0x03: msi
>   em3 at pci6 dev 0 function 0 "Intel 82573L" rev 0x00: msi
>   em0 at pci3 dev 0 function 0 "Intel 82574L" rev 0x00: msi
>   em0 at pci0 dev 2 function 0 "Intel 82575GB" rev 0x02: msi
>   em0 at pci1 dev 0 function 0 "Intel 82576" rev 0x01: msi
>   em0 at pci0 dev 25 function 0 "Intel 82577LM" rev 0x06: msi
>   em0 at pci0 dev 25 function 0 "Intel 82579LM" rev 0x04: msi
>   em0 at pci1 dev 0 function 0 "Intel 82583V" rev 0x00: msi
>   em0 at pci1 dev 0 function 0 "Intel I210" rev 0x03: msi
>   em0 at pci1 dev 0 function 0 "Intel I211" rev 0x03: msi
>   em0 at pci0 dev 25 function 0 "Intel I217-LM" rev 0x04: msi
>   em0 at pci0 dev 25 function 0 "Intel I218-V" rev 0x03: msi
>   em0 at pci0 dev 25 function 0 Intel I218-LM rev 0x04: msi

You can add this to the "works as intended" list (t490):

em0 at pci0 dev 31 function 6 "Intel I219-LM" rev 0x30: msi

Diff reads ok to me. So ok mlarkin@ if you are still looking for oks and didn't
commit it already.

-ml

>   em0 at pci0 dev 31 function 6 "Intel I219-V" rev 0x21: msi
>   em0 at pci7 dev 0 function 0 "Intel I350" rev 0x01: msi
> 
> 
> Index: pci/if_em.c
> ===
> RCS file: /cvs/src/sys/dev/pci/if_em.c,v
> retrieving revision 1.343
> diff -u -p -r1.343 if_em.c
> --- pci/if_em.c   20 Jan 2020 23:45:02 -  1.343
> +++ pci/if_em.c   21 Jan 2020 11:27:14 -
> @@ -260,6 +260,7 @@ void em_disable_aspm(struct em_softc *);
>  void em_txeof(struct em_softc *);
>  int  em_allocate_receive_structures(struct em_softc *);
>  int  em_allocate_transmit_structures(struct em_softc *);
> +int  em_allocate_desc_rings(struct em_softc *);
>  int  em_rxfill(struct em_softc *);
>  void em_rxrefill(void *);
>  int  em_rxeof(struct em_softc *);
> @@ -344,11 +345,6 @@ em_defer_attach(struct device *self)
>  
>   em_free_pci_resources(sc);
>  
> - sc->sc_rx_desc_ring = NULL;
> - em_dma_free(sc, &sc->sc_rx_dma);
> - sc->sc_tx_desc_ring = NULL;
> - em_dma_free(sc, &sc->sc_tx_dma);
> -
>   return;
>   }
>   
> @@ -464,6 +460,7 @@ em_attach(struct device *parent, struct 
>   case em_82572:
>   case em_82574:
>   case em_82575:
> + case em_82576:
>   case em_82580:
>   case em_i210:
>   case em_i350:
> @@ -494,23 +491,11 @@ em_attach(struct device *parent, struct 
>   sc->hw.min_frame_size = 
>   ETHER_MIN_LEN + ETHER_CRC_LEN;
>  
> - /* Allocate Transmit Descriptor ring */
> - if (em_dma_malloc(sc, sc->sc_tx_slots * sizeof(struct em_tx_desc),
> - &sc->sc_tx_dma) != 0) {
> - printf("%s: Unable to allocate tx_desc memory\n", 
> -DEVNAME(sc));
> - goto err_tx_desc;
> - }
> - sc->sc_tx_desc_ring = (struct 

Re: Remove unused code from vmm

2020-01-30 Thread Mike Larkin
On Fri, Jan 31, 2020 at 01:40:14AM +, Adam Steen wrote:
>  Hi
> 
>  While working on a patch, I noticed that vmm_get_guest_faulttype was
>  incorrect for amd (VMM_MODE_RVI) cpus; upon further inspection I realised
>  it was unused. Please see the patch below to remove it.
> 
> cheers
> Adam
> 

Thanks, will remove.

-ml

> ? div
> Index: sys/arch/amd64/amd64/vmm.c
> ===
> RCS file: /cvs/src/sys/arch/amd64/amd64/vmm.c,v
> retrieving revision 1.257
> diff -u -p -u -p -r1.257 vmm.c
> --- sys/arch/amd64/amd64/vmm.c13 Dec 2019 03:38:15 -  1.257
> +++ sys/arch/amd64/amd64/vmm.c30 Jan 2020 06:47:41 -
> @@ -177,7 +177,6 @@ void vmx_handle_intr(struct vcpu *);
>  void vmx_handle_intwin(struct vcpu *);
>  void vmx_handle_misc_enable_msr(struct vcpu *);
>  int vmm_get_guest_memtype(struct vm *, paddr_t);
> -int vmm_get_guest_faulttype(void);
>  int vmx_get_guest_faulttype(void);
>  int svm_get_guest_faulttype(struct vmcb *);
>  int vmx_get_exit_qualification(uint64_t *);
> @@ -5073,23 +5072,6 @@ vmm_get_guest_memtype(struct vm *vm, pad
>  
>   DPRINTF("guest memtype @ 0x%llx unknown\n", (uint64_t)gpa);
>   return (VMM_MEM_TYPE_UNKNOWN);
> -}
> -
> -/*
> - * vmm_get_guest_faulttype
> - *
> - * Determines the type (R/W/X) of the last fault on the VCPU last run on
> - * this PCPU. Calls the appropriate architecture-specific subroutine.
> - */
> -int
> -vmm_get_guest_faulttype(void)
> -{
> - if (vmm_softc->mode == VMM_MODE_EPT)
> - return vmx_get_guest_faulttype();
> - else if (vmm_softc->mode == VMM_MODE_RVI)
> - return vmx_get_guest_faulttype();
> - else
> - panic("%s: unknown vmm mode: %d", __func__, vmm_softc->mode);
>  }
>  
>  /*
> 



Re: xbf(4): tsleep(9) -> tsleep_nsec(9)

2020-01-21 Thread Mike Belopuhov


Scott Cheloha writes:

> Given the SCSI_NOSLEEP split here I think the simplest thing we can do
> is ask to sleep as much as we delay(9).
>
> The question is: if you *could* poll in 10us intervals here with
> tsleep_nsec(9), would you want to?  If so, then this works.  If
> not, what is a more appropriate interval?
>

Hi,

I believe it would be fine to use the same value as in the delay,
"1" was just the smallest available for the tsleep.

OK mikeb for the change.

Cheers,
Mike

> Index: pv/xbf.c
> ===
> RCS file: /cvs/src/sys/dev/pv/xbf.c,v
> retrieving revision 1.32
> diff -u -p -r1.32 xbf.c
> --- pv/xbf.c  17 Jul 2017 10:30:03 -  1.32
> +++ pv/xbf.c  15 Jan 2020 06:20:25 -
> @@ -738,7 +738,7 @@ xbf_poll_cmd(struct scsi_xfer *xs)
>   if (ISSET(xs->flags, SCSI_NOSLEEP))
>   delay(10);
>   else
> - tsleep(xs, PRIBIO, "xbfpoll", 1);
> + tsleep_nsec(xs, PRIBIO, "xbfpoll", USEC_TO_NSEC(10));
>   xbf_intr(xs->sc_link->adapter_softc);
>   } while(--timo > 0);
>  



Re: ldomctl: download: select new configuration

2020-01-06 Thread Mike Larkin
On Fri, Jan 03, 2020 at 08:27:21PM +0100, Mark Kettenis wrote:
> > Date: Mon, 30 Dec 2019 21:07:59 +0100
> > From: Klemens Nanni 
> > 
> > The example in the manual implies that the download command also selects
> > it:
> > 
> > # ldomctl init-system ldom.conf
> > # cd ..
> > # ldomctl delete openbsd
> > # ldomctl download openbsd
> > # ldomctl list
> > factory-default [current]
> > openbsd [next]
> > 
> > But `ldomctl select openbsd' is required between downloading and listing
> > to get this result - at least on my T4 machine `download' never selected
> > any configuration, however I vaguely remember that this was the case
> > with older machines.
> > 
> > kettenis: Has this really been missing all the time or could there be
> > differences in the mdstore protocol and/or firmware that cause this?
> > 
> > Diff below explicitly selects configuration in code.
> > 
> > Feedback? OK?
> 
> I can't remember.  Maybe someone with a t1k/t2k or t5120/t5140 can
> test this?
> 

Do you still need testing here? I could reinstall my t5240 if nobody has
tested yet...

-ml

> It makes sense for the download command not to immediately select a
> configuration.  So maybe just the documentation needs changing?
> 
> > Index: ldomctl.c
> > ===
> > RCS file: /cvs/src/usr.sbin/ldomctl/ldomctl.c,v
> > retrieving revision 1.31
> > diff -u -p -r1.31 ldomctl.c
> > --- ldomctl.c   28 Dec 2019 18:36:02 -  1.31
> > +++ ldomctl.c   30 Dec 2019 19:51:59 -
> > @@ -415,6 +415,7 @@ download(int argc, char **argv)
> > ds_conn_handle(dc);
> >  
> > mdstore_download(dc, argv[1]);
> > +   mdstore_select(dc, argv[1]);
> >  }
> >  
> >  void
> > 
> > 
> 



Re: sparc64: find root device on hardware RAID

2019-12-27 Thread Mike Belopuhov


Klemens Nanni writes:

> On Thu, Dec 26, 2019 at 07:49:06PM +0100, Mark Kettenis wrote:
>> Well, there's your problem.  The mpii(4) doesn't fill in the WWNs for
>> the logical volume so there is nothing that can be matched to the WWN
>> from the bootpath.
> Obvious now that you mention it.
>
>> > See below a diff for debug printf() I use to look at those values.
>> > Complete console log from OBP prompt to multiuser follows to show the
>> > boot process and debug output for all devices.
>> > 
>> > What I find odd is how 0aa32290d5dcd16c is the WWID of the RAID volume,
>> > and yet all devices attaching to scsibus* including those not being part
>> > of the RAID show the very same bp->val[0] of 3aa32290d5dcd16c.
>> 
>> bp->val[0] comes from the boot path; there is only one.
> Ha, sure that.  I confused myself with printing it for every device
> passing that code path where it is used as target, hence debug printfs
> showing the same value for multiple devices.
>
>> As you can see, the WWNs are filled in for the other disks (sd1, cd0)
>> that attach to the controller.  So you probably need some additional
>> code in mpii(4) to fill in the WWNs for logical volumes.  I recommend
>> talking to dlg@ and jmatthew@ directly about that.
> That makes sense, I didn't look toward mpii(4) yet.
>
> Thank you for pointing things out and asking such questions, this is
> very very helpful guidance.  I'm looking further into the controller
> driver now.


Looks like WWID for the RAID volume can be read from the RAID Volume
Page 1 (mpii_cfg_raid_vol_pg1).

Cheers,
Mike



Re: vmctl: print root user in status owner field

2019-12-15 Thread Mike Larkin
On Sat, Dec 14, 2019 at 02:16:20AM +0100, Klemens Nanni wrote:
> With "owner root:wheel" (any group) the `vmctl status' output
> will omit the "root" part in the OWNER column:
> 
>   vm "generic" {
>   owner "root:vms"
>   ...
>   }
> 
>   $ vmctl status
>    ID   PID VCPUS  MAXMEM  CURMEM     TTY        OWNER    STATE NAME
>     1     -     1    512M       -       -         :vms  stopped generic
> 
> It only omits it if the user is root, presumably to say "only the group
> matters".
> 
> I find this special case confusing as it looks incomplete, instead just
> print whatever is configured: 
> 
>   $ ./obj/vmctl status
>    ID   PID VCPUS  MAXMEM  CURMEM     TTY        OWNER    STATE NAME
>     1     -     1    512M       -       -     root:vms  stopped generic
> 
> Feedback? OK?
> 
> 
> Index: vmctl.c
> ===
> RCS file: /cvs/src/usr.sbin/vmctl/vmctl.c,v
> retrieving revision 1.72
> diff -u -p -r1.72 vmctl.c
> --- vmctl.c   12 Dec 2019 03:53:38 -  1.72
> +++ vmctl.c   14 Dec 2019 00:54:23 -
> @@ -768,8 +768,6 @@ print_vm_info(struct vmop_info_result *l
>   (void)strlcpy(user, name, sizeof(user));
>   /* get group name */
>   if (vmi->vir_gid != -1) {
> - if (vmi->vir_uid == 0)
> - *user = '\0';
>   name = group_from_gid(vmi->vir_gid, 1);
>   if (name == NULL)
>   (void)snprintf(group, sizeof(group),
> 

sure



Re: vmm(4) question: unneeded vmclear() in vcpu_readregs_vmx()?

2019-12-12 Thread Mike Larkin
On Tue, Oct 22, 2019 at 06:57:32PM +0900, Iori YONEJI wrote:
> On Tue, Oct 22, 2019 at 11:17 AM Mike Larkin  wrote:
> >
> > On Mon, Oct 21, 2019 at 03:52:52AM +0900, Iori YONEJI wrote:
> > > Hello tech@,
> > >
> > > I have a question (or maybe a suggestion) about vmm(4).
> > >
> > > I'm writing a small additional feature to sys/arch/amd64/amd64/vmm.c
> > > and found a seemingly unneeded vmclear() at the end of
> > > vcpu_readregs_vmx(). This function didn't seem to affect VM state at
> > > first glance because, obviously, its name is _read_. Against this
> > > intuition, this function clears VM state, which makes the vmexit
> > > handling stop (eg. "advance rip" failure) and the VM die if it is used
> > > in there. Possibly this was added as a counterpart of
> > > vcpu_reload_vmcs_vmx() at the very beginning of the read function, but
> > > I don't think it does undo the reload.
> > >
> > > This vmclear can be removed in my understanding and also I confirmed
> > > that Alpine Linux and OpenBSD guest VMs can run on the vmm(4) without
> > > this vmclear call.
> > >
> > > Or I am just wrong and it may be actually needed in some corner cases.
> > > If so, I will restore (vmptrld) vcpu->vc_control_pa every time the VM
> > > context continues after calling readregs function or add another flag
> > > to indicate whether vmclear is needed here if all of the corner cases
> > > are known.
> > >
> > > > Would you mind telling me whether I am correct, or how it has to be
> > > > dealt with?
> > >
> > >
> > > Index: sys/arch/amd64/amd64/vmm.c
> > > ===
> > > RCS file: /cvs/src/sys/arch/amd64/amd64/vmm.c,v
> > > retrieving revision 1.254
> > > diff -u -p -r1.254 vmm.c
> > > --- sys/arch/amd64/amd64/vmm.c22 Sep 2019 08:47:54 -1.254
> > > +++ sys/arch/amd64/amd64/vmm.c20 Oct 2019 18:04:17 -
> > > @@ -1602,8 +1602,10 @@ vcpu_readregs_vmx(struct vcpu *vcpu, uin
> > >  errout:
> > >  ret = EINVAL;
> > >  out:
> > > +/* XXX Why do we need vmclear here?
> > > >  if (vmclear(&vcpu->vc_control_pa))
> > >  ret = EINVAL;
> > > +*/
> > >  return (ret);
> > >  }
> > >
> >
> > The assumption in vcpu_run_vmx is that we will always come in with a cleared
> > VMCS state, unless we are staying in the run loop (the "resume" flag in that
> > function tracks this). So any ioctl (including things like read/write regs)
> > that might possibly manipulate the VMCS, need to clear the VMCS before they
> > return to user space. That was the original reason for that clear at the end
> > of the function.
> >
> > > If you are adding new functionality that uses read/write regs internally, then
> > > you're right; on return, you'll have a cleared VMCS and subsequent vmreads and
> > > vmwrites will fail.
> >
> > > It looks like there are a couple of other users of readregs/writeregs
> > > internally - the presently unused GVA translator function as well as the in/out
> > > exit handler structure preparation code. Both of those look like they got lucky
> > > in how they were handling things; they aren't touching the VMCS after read/write
> > > regs or we would have seen them explode like you're saying.
> >
> > > It looks like the start of vcpu_run_vmx *almost* properly handles coming
> > > in with an arbitrary VMCS. The "almost" part is the bit under #ifdef VMM_DEBUG
> > > that handles triple faults. It seems to assume that the proper VMCS is currently
> > > loaded, which is certainly not true all the time. I'll fix that part.
> >
> > > Based on the fact that the other code paths look clean, I think we can remove
> > > the vmclear from the end of read/write regs, as you suggest. I'll do that as
> > > well.
> >
> > > Might I ask what you're working on? Just in case I know of someone else working
> > > on the same thing? (There are about a dozen vmm side-projects going on that I'm
> > > aware of and it would be a waste for two people to be working on the same thing
> > > without knowing of the other). Feel free to mail me off-list if you don't want
> > > to share in public.
> >
> > -ml
> >
> Thank you. I didn't notice that the VMCS must be cleared before
> returning to user space.
> 
> Ta

Re: pfctl: Do not optimize empty rulesets

2019-12-12 Thread Mike Belopuhov


Klemens Nanni writes:

> On Wed, Nov 27, 2019 at 08:04:47PM +0100, Klemens Nanni wrote:
>> If an anchor/ruleset contains no rules, there is no point in creating
>> a temporary copy, optimizing and replacing it.
>> 
>> Regress passes on amd64.
>> 
>> Feedback? OK?
> Anyone?
>

FWIW, it looks good to me. Ok mikeb

> All optimizations work on actual rules;  if there are none, we don't
> need to look further, especially not in "profile" mode where existing
> rules are read from the kernel as feedback: an empty ruleset will stay
> empty after optimization is done.
>
> This also does not affect `set' or `table' lines in any way, e.g.
>
>   # echo 'table ' | pfctl -o basic -d -nf-
>
> still is an empty ruleset.
>
>
> I came across this when debugging anchors, but with -DOPT_DEBUG as well this
> time where `-d' output for multiple anchors wouldn't really be helpful:
>
>   $ pfctl -dnf test.pf
>   pfctl_optimize_ruleset: optimizing ruleset
>   pfctl_optimize_ruleset: optimizing ruleset
>   pfctl_optimize_ruleset: optimizing ruleset
>
> So below is an updated diff that also prints the anchor path, letting
> developers know which anchor is being optimized in what order:
>
>   pfctl_optimize_ruleset: optimizing ruleset ""
>   pfctl_optimize_ruleset: optimizing ruleset "a1"
>   pfctl_optimize_ruleset: optimizing ruleset "_1/a2"
>
> Yes, the main anchor prints as "" but all that is behind compile time
> -DOPT_DEBUG so regular users won't deal with it anyway, so keep the code
> simple instead of adding logging around `rs->anchor->path'.
>
> OK?
>
>
> Index: pfctl_optimize.c
> ===
> RCS file: /cvs/src/sbin/pfctl/pfctl_optimize.c,v
> retrieving revision 1.42
> diff -u -p -r1.42 pfctl_optimize.c
> --- pfctl_optimize.c  28 Jun 2019 13:32:45 -  1.42
> +++ pfctl_optimize.c  12 Dec 2019 20:06:15 -
> @@ -270,7 +270,10 @@ pfctl_optimize_ruleset(struct pfctl *pf,
>   struct pf_rule *r;
>   struct pf_rulequeue *old_rules;
>  
> - DEBUG("optimizing ruleset");
> + if (TAILQ_EMPTY(rs->rules.active.ptr))
> + return (0);
> +
> + DEBUG("optimizing ruleset \"%s\"", rs->anchor->path);
>   memset(&table_buffer, 0, sizeof(table_buffer));
>   skip_init();
>   TAILQ_INIT(&opt_queue);



Re: [PATCH] staggered start of vms in vm.conf

2019-12-08 Thread Mike Larkin
On Sun, Dec 08, 2019 at 02:07:46AM -0800, Pratik Vyas wrote:
> Hi!
> 
> This is an attempt to address 'thundering herd' problem when a lot of
> vms are configured in vm.conf.  A lot of vms booting in parallel can
> overload the host and also mess up tsc calibration in openbsd guests as
> it uses PIT which doesn't fire reliably if the host is overloaded.
> 
> 
> This diff makes vmd start vms in a staggered fashion with default parallelism of
> number of cpus on the host and a delay of 30s.  Default can be overridden with
> a line like the following in vm.conf
> 
> staggered start parallel 4 delay 30
> 
> 
> Every non-disabled vm starts in waiting state.  If you are eager to
> start a vm that is way further in the list, you can vmctl start it.
> 
> Discussed the idea with ori@, mlarkin@ and phessler@.
> 
> Comments / ok?
> 
> --
> Pratik
> 

See below. Other than the nits below, ok mlarkin when you are ready.

-ml

> Index: usr.sbin/vmctl/vmctl.c
> ===
> RCS file: /home/cvs/src/usr.sbin/vmctl/vmctl.c,v
> retrieving revision 1.71
> diff -u -p -a -u -r1.71 vmctl.c
> --- usr.sbin/vmctl/vmctl.c7 Sep 2019 09:11:14 -   1.71
> +++ usr.sbin/vmctl/vmctl.c8 Dec 2019 09:29:39 -
> @@ -716,6 +716,8 @@ vm_state(unsigned int mask)
> {
>   if (mask & VM_STATE_PAUSED)
>   return "paused";
> + else if (mask & VM_STATE_WAITING)
> + return "waiting";
>   else if (mask & VM_STATE_RUNNING)
>   return "running";
>   else if (mask & VM_STATE_SHUTDOWN)
> Index: usr.sbin/vmd/parse.y
> ===
> RCS file: /home/cvs/src/usr.sbin/vmd/parse.y,v
> retrieving revision 1.52
> diff -u -p -a -u -r1.52 parse.y
> --- usr.sbin/vmd/parse.y  14 May 2019 06:05:45 -  1.52
> +++ usr.sbin/vmd/parse.y  8 Dec 2019 09:29:39 -
> @@ -122,7 +122,8 @@ typedef struct {
> %tokenINCLUDE ERROR
> %tokenADD ALLOW BOOT CDROM DEVICE DISABLE DISK DOWN ENABLE FORMAT 
> GROUP
> %tokenINET6 INSTANCE INTERFACE LLADDR LOCAL LOCKED MEMORY NET NIFS 
> OWNER
> -%token   PATH PREFIX RDOMAIN SIZE SOCKET SWITCH UP VM VMID
> +%token   PATH PREFIX RDOMAIN SIZE SOCKET SWITCH UP VM VMID STAGGERED 
> START
> +%token  PARALLEL DELAY
> %token  NUMBER
> %token  STRING
> %type   lladdr
> @@ -217,6 +218,11 @@ main : LOCAL INET6 {
>   env->vmd_ps.ps_csock.cs_uid = $3.uid;
>   env->vmd_ps.ps_csock.cs_gid = $3.gid == -1 ? 0 : $3.gid;
>   }
> + | STAGGERED START PARALLEL NUMBER DELAY NUMBER {
> + env->vmd_cfg.cfg_flags |= VMD_CFG_STAGGERED_START;
> + env->vmd_cfg.delay.tv_sec = $6;
> + env->vmd_cfg.parallelism = $4;
> + }
>   ;
> 
> switch: SWITCH string {
> @@ -368,6 +374,8 @@ vm: VM string vm_instance {
>   } else {
>   if (vcp_disable)
>   vm->vm_state |= 
> VM_STATE_DISABLED;
> + else
> + vm->vm_state |= 
> VM_STATE_WAITING;
>   log_debug("%s:%d: vm \"%s\" "
>   "registered (%s)",
>   file->name, yylval.lineno,
> @@ -766,6 +774,7 @@ lookup(char *s)
>   { "allow",  ALLOW },
>   { "boot",   BOOT },
>   { "cdrom",  CDROM },
> + { "delay",  DELAY },
>   { "device", DEVICE },
>   { "disable",DISABLE },
>   { "disk",   DISK },
> @@ -785,10 +794,13 @@ lookup(char *s)
>   { "memory", MEMORY },
>   { "net",NET },
>   { "owner",  OWNER },
> + { "parallel",   PARALLEL },
>   { "prefix", PREFIX },
>   { "rdomain",RDOMAIN },
>   { "size",   SIZE },
>   { "socket", SOCKET },
> + { "staggered",  STAGGERED },
> + { "start",  START  },
>   { "switch", SWITCH },
>   { "up", UP },
>   { "vm", VM }
> Index: usr.sbin/vmd/vm.conf.5
> ===
> RCS file: /home/cvs/src/usr.sbin/vmd/vm.conf.5,v
> retrieving revision 1.44
> diff -u -p -a -u -r1.44 vm.conf.5
> --- usr.sbin/vmd/vm.conf.514 May 2019 12:47:17 -  1.44
> +++ usr.sbin/vmd/vm.conf.58 Dec 2019 09:29:39 -
> @@ -91,6 +91,16 

Re: [PATCH] attach pvclock with lower priority if tsc is unstable

2019-12-06 Thread Mike Larkin
On Fri, Dec 06, 2019 at 02:16:43PM -0800, Pratik Vyas wrote:
> * Pratik Vyas  [2019-11-24 23:07:26 -0800]:
> 
> > Hello tech@,
> > 
> > This diff attaches pvclock with lower priority (500) in case of unstable
> > tsc (PVCLOCK_FLAG_TSC_STABLE) instead of not attaching at all.
> > 
> > For reference current priorities,
> > tsc (variant)  : -2000
> > i8254  : 0
> > acpitimer  : 1000
> > acpihpet0  : 1000
> > pvclock (stable tsc)   : 1500
> > tsc (invariant, stable): 2000
> > 
> > --
> > Pratik
> > 
> 
> Does this look ok? (or any comments?)
> 
> --
> Pratik
> 

That seems like a reasonable compromise. People who still want to force
set their timecounter can do so.

-ml



Re: uvm/uvm_map.h cleanup

2019-12-06 Thread Mike Larkin
On Thu, Dec 05, 2019 at 07:25:51PM +0100, Martin Pieuchot wrote:
> Following cleanup diff:
> 
> - reduces gratuitous differences with NetBSD,
> - merges multiple '#ifdef _KERNEL' blocks,
> - kills unused 'struct vm_map_intrsafe'
> - turns 'union vm_map_object' into an anonymous union (following NetBSD)
> - move questionable vm_map_modflags() into uvm/uvm_map.c
> - remove guards around MAX_KMAPENT, it is defined only once
> - document lock differences
> - fix tab vs space
> 
> Ok?
> 

ok mlarkin

> Index: uvm/uvm_extern.h
> ===
> RCS file: /cvs/src/sys/uvm/uvm_extern.h,v
> retrieving revision 1.151
> diff -u -p -r1.151 uvm_extern.h
> --- uvm/uvm_extern.h  29 Nov 2019 06:34:45 -  1.151
> +++ uvm/uvm_extern.h  5 Dec 2019 16:06:33 -
> @@ -65,9 +65,6 @@ typedef int vm_fault_t;
>  typedef int vm_inherit_t;/* XXX: inheritance codes */
>  typedef off_t voff_t;/* XXX: offset within a uvm_object */
>  
> -union vm_map_object;
> -typedef union vm_map_object vm_map_object_t;
> -
>  struct vm_map_entry;
>  typedef struct vm_map_entry *vm_map_entry_t;
>  
> Index: uvm/uvm_map.c
> ===
> RCS file: /cvs/src/sys/uvm/uvm_map.c,v
> retrieving revision 1.256
> diff -u -p -r1.256 uvm_map.c
> --- uvm/uvm_map.c 4 Dec 2019 08:28:29 -   1.256
> +++ uvm/uvm_map.c 5 Dec 2019 16:27:22 -
> @@ -230,7 +230,6 @@ void   vmspace_validate(struct 
> vm_map*)
>  #define PMAP_PREFER(addr, off)   (addr)
>  #endif
>  
> -
>  /*
>   * The kernel map will initially be VM_MAP_KSIZE_INIT bytes.
>   * Every time that gets cramped, we grow by at least VM_MAP_KSIZE_DELTA bytes.
> @@ -334,6 +333,14 @@ vaddr_t uvm_maxkaddr;
>   MUTEX_ASSERT_LOCKED(&(_map)->mtx);  \
>   }   \
>   } while (0)
> +
> +#define  vm_map_modflags(map, set, clear)
> \
> + do {\
> + mtx_enter(&(map)->flags_lock);  \
> + (map)->flags = ((map)->flags | (set)) & ~(clear);   \
> + mtx_leave(&(map)->flags_lock);  \
> + } while (0)
> +
>  
>  /*
>   * Tree describing entries by address.
> Index: uvm/uvm_map.h
> ===
> RCS file: /cvs/src/sys/uvm/uvm_map.h,v
> retrieving revision 1.65
> diff -u -p -r1.65 uvm_map.h
> --- uvm/uvm_map.h 29 Nov 2019 06:34:46 -  1.65
> +++ uvm/uvm_map.h 5 Dec 2019 16:26:09 -
> @@ -86,16 +86,6 @@
>  #ifdef _KERNEL
>  
>  /*
> - * Internal functions.
> - *
> - * Required by clipping macros.
> - */
> -void  uvm_map_clip_end(struct vm_map*, struct vm_map_entry*,
> - vaddr_t);
> -void  uvm_map_clip_start(struct vm_map*,
> - struct vm_map_entry*, vaddr_t);
> -
> -/*
>   * UVM_MAP_CLIP_START: ensure that the entry begins at or after
>   * the starting address, if it doesn't we split the entry.
>   * 
> @@ -133,26 +123,6 @@ void  uvm_map_clip_start(struct 
> vm_map
>  #include 
>  
>  /*
> - * types defined:
> - *
> - *   vm_map_tthe high-level address map data structure.
> - *   vm_map_entry_t  an entry in an address map.
> - *   vm_map_version_ta timestamp of a map, for use with vm_map_lookup
> - */
> -
> -/*
> - * Objects which live in maps may be either VM objects, or another map
> - * (called a "sharing map") which denotes read-write sharing with other maps.
> - *
> - * XXXCDC: private pager data goes here now
> - */
> -
> -union vm_map_object {
> - struct uvm_object   *uvm_obj;   /* UVM OBJECT */
> - struct vm_map   *sub_map;   /* belongs to another map */
> -};
> -
> -/*
>   * Address map entries consist of start and end addresses,
>   * a VM object (or sharing map) and offset into that object,
>   * and user-exported inheritance and protection information.
> @@ -177,23 +147,23 @@ struct vm_map_entry {
>   vsize_t guard;  /* bytes in guard */
>   vsize_t fspace; /* free space */
>  
> - union vm_map_object object; /* object I point to */
> + union {
> + struct uvm_object *uvm_obj; /* uvm object */
> + struct vm_map   *sub_map;   /* belongs to another map */
> + } object;   /* object I point to */
>   voff_t  offset; /* offset into object */
>   struct vm_aref  aref;   /* anonymous overlay */
> -
>   int etype;  /* entry type */
> -
>   vm_prot_t   protection; /* protection code */
>   

Re: Kill uvm/uvm_stat.c

2019-12-04 Thread Mike Larkin
On Wed, Dec 04, 2019 at 03:19:41PM +0100, Martin Pieuchot wrote:
> Less is more.  The fewer files to look at, the simpler it becomes to understand
> UVM.  uvm/uvm_stat.c contains just a ddb(4) function.  Let's move it to
> uvm/uvm_meter.c which also deals with counters. ok?
> 

Also reads ok to me.

-ml

> Index: conf/files
> ===
> RCS file: /cvs/src/sys/conf/files,v
> retrieving revision 1.677
> diff -u -p -r1.677 files
> --- conf/files5 Nov 2019 08:18:47 -   1.677
> +++ conf/files4 Dec 2019 14:15:03 -
> @@ -964,7 +964,6 @@ file uvm/uvm_page.c
>  file uvm/uvm_pager.c
>  file uvm/uvm_pdaemon.c
>  file uvm/uvm_pmemrange.c
> -file uvm/uvm_stat.c
>  file uvm/uvm_swap.c
>  file uvm/uvm_swap_encrypt.c  uvm_swap_encrypt
>  file uvm/uvm_unix.c
> Index: uvm/uvm_meter.c
> ===
> RCS file: /cvs/src/sys/uvm/uvm_meter.c,v
> retrieving revision 1.38
> diff -u -p -r1.38 uvm_meter.c
> --- uvm/uvm_meter.c   6 Nov 2018 07:49:38 -   1.38
> +++ uvm/uvm_meter.c   4 Dec 2019 14:16:01 -
> @@ -43,6 +43,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #ifdef UVM_SWAP_ENCRYPT
>  #include 
> @@ -312,3 +313,62 @@ uvm_total(struct vmtotal *totalp)
>   totalp->t_rmshr = 0;/* XXX */
>   totalp->t_armshr = 0;   /* XXX */
>  }
> +
> +#ifdef DDB
> +
> +/*
> + * uvmexp_print: ddb hook to print interesting uvm counters
> + */
> +void
> +uvmexp_print(int (*pr)(const char *, ...))
> +{
> +
> + (*pr)("Current UVM status:\n");
> + (*pr)("  pagesize=%d (0x%x), pagemask=0x%x, pageshift=%d\n",
> + uvmexp.pagesize, uvmexp.pagesize, uvmexp.pagemask,
> + uvmexp.pageshift);
> + (*pr)("  %d VM pages: %d active, %d inactive, %d wired, %d free (%d zero)\n",
> + uvmexp.npages, uvmexp.active, uvmexp.inactive, uvmexp.wired,
> + uvmexp.free, uvmexp.zeropages);
> + (*pr)("  min  %d%% (%d) anon, %d%% (%d) vnode, %d%% (%d) vtext\n",
> + uvmexp.anonminpct, uvmexp.anonmin, uvmexp.vnodeminpct,
> + uvmexp.vnodemin, uvmexp.vtextminpct, uvmexp.vtextmin);
> + (*pr)("  freemin=%d, free-target=%d, inactive-target=%d, "
> + "wired-max=%d\n", uvmexp.freemin, uvmexp.freetarg, uvmexp.inactarg,
> + uvmexp.wiredmax);
> + (*pr)("  faults=%d, traps=%d, intrs=%d, ctxswitch=%d fpuswitch=%d\n",
> + uvmexp.faults, uvmexp.traps, uvmexp.intrs, uvmexp.swtch,
> + uvmexp.fpswtch);
> + (*pr)("  softint=%d, syscalls=%d, kmapent=%d\n",
> + uvmexp.softs, uvmexp.syscalls, uvmexp.kmapent);
> +
> + (*pr)("  fault counts:\n");
> + (*pr)("noram=%d, noanon=%d, noamap=%d, pgwait=%d, pgrele=%d\n",
> + uvmexp.fltnoram, uvmexp.fltnoanon, uvmexp.fltnoamap,
> + uvmexp.fltpgwait, uvmexp.fltpgrele);
> + (*pr)("ok relocks(total)=%d(%d), anget(retries)=%d(%d), "
> + "amapcopy=%d\n", uvmexp.fltrelckok, uvmexp.fltrelck,
> + uvmexp.fltanget, uvmexp.fltanretry, uvmexp.fltamcopy);
> + (*pr)("neighbor anon/obj pg=%d/%d, gets(lock/unlock)=%d/%d\n",
> + uvmexp.fltnamap, uvmexp.fltnomap, uvmexp.fltlget, uvmexp.fltget);
> + (*pr)("cases: anon=%d, anoncow=%d, obj=%d, prcopy=%d, przero=%d\n",
> + uvmexp.flt_anon, uvmexp.flt_acow, uvmexp.flt_obj, uvmexp.flt_prcopy,
> + uvmexp.flt_przero);
> +
> + (*pr)("  daemon and swap counts:\n");
> + (*pr)("woke=%d, revs=%d, scans=%d, obscans=%d, anscans=%d\n",
> + uvmexp.pdwoke, uvmexp.pdrevs, uvmexp.pdscans, uvmexp.pdobscan,
> + uvmexp.pdanscan);
> + (*pr)("busy=%d, freed=%d, reactivate=%d, deactivate=%d\n",
> + uvmexp.pdbusy, uvmexp.pdfreed, uvmexp.pdreact, uvmexp.pddeact);
> + (*pr)("pageouts=%d, pending=%d, nswget=%d\n", uvmexp.pdpageouts,
> + uvmexp.pdpending, uvmexp.nswget);
> + (*pr)("nswapdev=%d\n",
> + uvmexp.nswapdev);
> + (*pr)("swpages=%d, swpginuse=%d, swpgonly=%d paging=%d\n",
> + uvmexp.swpages, uvmexp.swpginuse, uvmexp.swpgonly, uvmexp.paging);
> +
> + (*pr)("  kernel pointers:\n");
> + (*pr)("objs(kern)=%p\n", uvm.kernel_object);
> +}
> +#endif
> Index: uvm/uvm_stat.c
> ===
> RCS file: uvm/uvm_stat.c
> diff -N uvm/uvm_stat.c
> --- uvm/uvm_stat.c19 Jun 2018 22:35:07 -  1.30
> +++ /dev/null 1 Jan 1970 00:00:00 -
> @@ -1,98 +0,0 @@
> -/*   $OpenBSD: uvm_stat.c,v 1.30 2018/06/19 22:35:07 krw Exp $*/
> -/*   $NetBSD: uvm_stat.c,v 1.18 2001/03/09 01:02:13 chs Exp $ */
> -
> -/*
> - * Copyright (c) 1997 Charles D. Cranor and Washington University.
> - * All rights reserved.
> - *
> - * Redistribution and use in source and binary forms, with or without
> - * modification, are permitted provided that the following conditions
> - * are met:
> - * 1. 

Re: un-boolean_t amd64's pmap

2019-12-04 Thread Mike Larkin
On Wed, Dec 04, 2019 at 03:31:07PM +0100, Martin Pieuchot wrote:
> Similar to recent ddb(4) changes, replace boolean_t/TRUE/FALSE by
> int/1/0.
> 
> ok?
> 

No objection here, unsure if anyone else has commented either way.

-ml

> Index: arch/amd64/amd64/pmap.c
> ===
> RCS file: /cvs/src/sys/arch/amd64/amd64/pmap.c,v
> retrieving revision 1.136
> diff -u -p -r1.136 pmap.c
> --- arch/amd64/amd64/pmap.c   3 Nov 2019 09:44:23 -   1.136
> +++ arch/amd64/amd64/pmap.c   4 Dec 2019 14:26:15 -
> @@ -254,7 +254,7 @@ paddr_t cr3_pcid_proc_intel;
>   */
>  
>  pt_entry_t protection_codes[8]; /* maps MI prot to i386 prot code */
> -boolean_t pmap_initialized = FALSE; /* pmap_init done yet? */
> +int pmap_initialized = 0;/* pmap_init done yet? */
>  
>  /*
>   * pv management structures.
> @@ -312,7 +312,7 @@ void pmap_free_ptp(struct pmap *, struct
>  vaddr_t, struct pg_to_free *);
>  void pmap_freepage(struct pmap *, struct vm_page *, int, struct pg_to_free 
> *);
>  #ifdef MULTIPROCESSOR
> -static boolean_t pmap_is_active(struct pmap *, int);
> +static int pmap_is_active(struct pmap *, int);
>  #endif
>  paddr_t pmap_map_ptes(struct pmap *);
>  struct pv_entry *pmap_remove_pv(struct vm_page *, struct pmap *, vaddr_t);
> @@ -320,7 +320,7 @@ void pmap_do_remove(struct pmap *, vaddr
>  void pmap_remove_ept(struct pmap *, vaddr_t, vaddr_t);
>  void pmap_do_remove_ept(struct pmap *, vaddr_t);
>  int pmap_enter_ept(struct pmap *, vaddr_t, paddr_t, vm_prot_t);
> -boolean_t pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *,
> +int pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *,
>  vaddr_t, int, struct pv_entry **);
>  void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t,
>  vaddr_t, vaddr_t, int, struct pv_entry **);
> @@ -328,8 +328,8 @@ void pmap_remove_ptes(struct pmap *, str
>  #define PMAP_REMOVE_SKIPWIRED1   /* skip wired mappings */
>  
>  void pmap_unmap_ptes(struct pmap *, paddr_t);
> -boolean_t pmap_get_physpage(vaddr_t, int, paddr_t *);
> -boolean_t pmap_pdes_valid(vaddr_t, pd_entry_t *);
> +int pmap_get_physpage(vaddr_t, int, paddr_t *);
> +int pmap_pdes_valid(vaddr_t, pd_entry_t *);
>  void pmap_alloc_level(vaddr_t, int, long *);
>  
>  static inline
> @@ -353,7 +353,7 @@ void pmap_tlb_shootwait(void);
>   *   of course the kernel is always loaded
>   */
>  
> -static __inline boolean_t
> +static __inline int
>  pmap_is_curpmap(struct pmap *pmap)
>  {
>   return((pmap == pmap_kernel()) ||
> @@ -365,7 +365,7 @@ pmap_is_curpmap(struct pmap *pmap)
>   */
>  
>  #ifdef MULTIPROCESSOR
> -static __inline boolean_t
> +static __inline int
>  pmap_is_active(struct pmap *pmap, int cpu_id)
>  {
>   return (pmap == pmap_kernel() ||
> @@ -1402,7 +1402,7 @@ pmap_deactivate(struct proc *p)
>   * some misc. functions
>   */
>  
> -boolean_t
> +int
>  pmap_pdes_valid(vaddr_t va, pd_entry_t *lastpde)
>  {
>   int i;
> @@ -1424,7 +1424,7 @@ pmap_pdes_valid(vaddr_t va, pd_entry_t *
>   * pmap_extract: extract a PA for the given VA
>   */
>  
> -boolean_t
> +int
>  pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
>  {
>   pt_entry_t *ptes;
> @@ -1433,7 +1433,7 @@ pmap_extract(struct pmap *pmap, vaddr_t 
>   if (pmap == pmap_kernel() && va >= PMAP_DIRECT_BASE &&
>   va < PMAP_DIRECT_END) {
>   *pap = va - PMAP_DIRECT_BASE;
> - return (TRUE);
> + return 1;
>   }
>  
>   level = pmap_find_pte_direct(pmap, va, , );
> @@ -1441,12 +1441,12 @@ pmap_extract(struct pmap *pmap, vaddr_t 
>   if (__predict_true(level == 0 && pmap_valid_entry(ptes[offs]))) {
>   if (pap != NULL)
>   *pap = (ptes[offs] & PG_FRAME) | (va & PAGE_MASK);
> - return (TRUE);
> + return 1;
>   }
>   if (level == 1 && (ptes[offs] & (PG_PS|PG_V)) == (PG_PS|PG_V)) {
>   if (pap != NULL)
>   *pap = (ptes[offs] & PG_LGFRAME) | (va & PAGE_MASK_L2);
> - return (TRUE);
> + return 1;
>   }
>  
>   return FALSE;
> @@ -1588,7 +1588,7 @@ pmap_remove_ptes(struct pmap *pmap, stru
>   * => returns true if we removed a mapping
>   */
>  
> -boolean_t
> +int
>  pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
>  vaddr_t va, int flags, struct pv_entry **free_pvs)
>  {
> @@ -1597,9 +1597,9 @@ pmap_remove_pte(struct pmap *pmap, struc
>   pt_entry_t opte;
>  
>   if (!pmap_valid_entry(*pte))
> - return(FALSE);  /* VA not mapped */
> + return 0;   /* VA not mapped */
>   if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W)) {
> - return(FALSE);
> + return 0;
>   }
>  
>   /* atomically save the old PTE and zap! it */
> @@ -1623,7 +1623,7 @@ pmap_remove_pte(struct pmap *pmap, struc
>   

Re: [PATCH] fix vmm pvclock accuracy

2019-11-25 Thread Mike Larkin
On Mon, Nov 25, 2019 at 07:06:19PM -0800, Pratik Vyas wrote:
> Hi tech@,
> 
> This patch fixes vmm pvclock accuracy issues.  Shift math error
> discovered by George Koehler.  This diff also fixes the error in tsc
> multiplier which was correct only if the host timecounter is tsc.
> 
> --
> Pratik
> 

Provided there is no reported fallout for just this piece, ok mlarkin@.

-ml

> 
> Index: sys/arch/amd64/amd64/vmm.c
> ===
> RCS file: /home/cvs/src/sys/arch/amd64/amd64/vmm.c,v
> retrieving revision 1.254
> diff -u -p -a -u -r1.254 vmm.c
> --- sys/arch/amd64/amd64/vmm.c22 Sep 2019 08:47:54 -  1.254
> +++ sys/arch/amd64/amd64/vmm.c26 Nov 2019 00:08:10 -
> @@ -28,7 +28,6 @@
> #include 
> #include 
> #include 
> -#include 
> 
> #include 
> 
> @@ -6879,8 +6878,11 @@ void
> vmm_init_pvclock(struct vcpu *vcpu, paddr_t gpa)
> {
>   vcpu->vc_pvclock_system_gpa = gpa;
> - vcpu->vc_pvclock_system_tsc_mul =
> - (int) ((10L << 20) / tc_getfrequency());
> + if (tsc_frequency > 0)
> + vcpu->vc_pvclock_system_tsc_mul =
> + (int) ((10L << 20) / tsc_frequency);
> + else
> + vcpu->vc_pvclock_system_tsc_mul = 0;
>   vmm_update_pvclock(vcpu);
> }
> 
> @@ -6906,7 +6908,7 @@ vmm_update_pvclock(struct vcpu *vcpu)
>   nanotime();
>   pvclock_ti->ti_system_time =
>   tv.tv_sec * 10L + tv.tv_nsec;
> - pvclock_ti->ti_tsc_shift = -20;
> + pvclock_ti->ti_tsc_shift = 12;
>   pvclock_ti->ti_tsc_to_system_mul =
>   vcpu->vc_pvclock_system_tsc_mul;
>   pvclock_ti->ti_flags = PVCLOCK_FLAG_TSC_STABLE;
> 



Re: sdhc(4): no 0V one some Intel

2019-11-19 Thread Mike Larkin
On Tue, Nov 19, 2019 at 10:44:54AM +0100, Patrick Wildt wrote:
> Hi,
> 
> on some GPD Pocket that mlarkin@ has, the eMMC doesn't come up.  One issue
> is that we shouldn't go to 0V on some/most(?) Intel controllers.  This
> only adds it for his machine, but I know that the Apollo Lake versions
> might also work if added to this check.  But unless verified I'll not
> add it.  This makes mlarkin@'s machine work, even though it's utterly
> slow.  That would be the next thing to fix... probably low clocks or
> doesn't use 8-bit.
> 
> ok?
> 

Sure. And you get the handoff to fix the rest :)

-ml

> Patrick
> 
> diff --git a/sys/dev/pci/sdhc_pci.c b/sys/dev/pci/sdhc_pci.c
> index d1b6688f573..dd6bc79c29c 100644
> --- a/sys/dev/pci/sdhc_pci.c
> +++ b/sys/dev/pci/sdhc_pci.c
> @@ -127,6 +127,11 @@ sdhc_pci_attach(struct device *parent, struct device 
> *self, void *aux)
>   PCI_PRODUCT(pa->pa_id) == PCI_PRODUCT_ENE_SDCARD)
>   sc->sc.sc_flags |= SDHC_F_NOPWR0;
>  
> + /* Some Intel controllers break if set to 0V bus power. */
> + if (PCI_VENDOR(pa->pa_id) == PCI_VENDOR_INTEL &&
> + PCI_PRODUCT(pa->pa_id) == PCI_PRODUCT_INTEL_100SERIES_LP_EMMC)
> + sc->sc.sc_flags |= SDHC_F_NOPWR0;
> +
>   /* Some RICOH controllers need to be bumped into the right mode. */
>   if (PCI_VENDOR(pa->pa_id) == PCI_VENDOR_RICOH &&
>   (PCI_PRODUCT(pa->pa_id) == PCI_PRODUCT_RICOH_R5U822 ||
> 



Re: iwm: support 9260 devices

2019-11-16 Thread Mike Larkin
On Sat, Nov 16, 2019 at 05:09:40PM +0100, Stefan Sperling wrote:
> On Sat, Nov 16, 2019 at 04:51:44PM +0100, Stefan Sperling wrote:
> > This diff adds support for iwm(4) 9260 devices and hopefully 9560
> > devices as well but I have not yet had time to test those.
> > 
> > Joint work with patrick@. Some parts were lifted from FreeBSD.
> > 
> > If you have the following device in pcidump it should at least get
> > an IP address from DHCP and be able to ping:
> >  4:0:0: Intel Dual Band Wireless-AC 9260
> >  0x: Vendor ID: 8086, Product ID: 2526
> > 
> > The firmware is not in fw_update yet.
> > In the meantime firmware can be fetched from here:
> > https://git.kernel.org/pub/scm/linux/kernel/git/firmware/linux-firmware.git/tree/
> > 
> > Copy these files to /etc/firmware as indicated:
> > for 9260: iwlwifi-9260-th-b0-jf-b0-34.ucode -> /etc/firmware/iwm-9260-34
> > for 9560: iwlwifi-9000-pu-b0-jf-b0-34.ucode -> /etc/firmware/iwm-9000-34
> > 
> > Checks for regressions on already supported devices are also welcome,
> > in which case the firmware isn't needed.
> 
> Better diff which fixes an Rx throughput issue which was present in
> the previous diff.
> 

Cool. Seems like it works here, I'm using it now, but it seems to have had at
least one firmware error in the past few minutes:

iwm0 at pci0 dev 20 function 3 "Intel Dual Band Wireless AC 9560" rev 0x30, msi
iwm0: hw rev 0x310, fw ver 34.3125811985.0, address xxx
iwm0: fatal firmware error
iwm0: could not remove MAC context (error 35)

I didn't notice any problem though. Other than that it seems ok!

-ml



Re: iked(8): fix error handling in msg_send

2019-11-15 Thread Mike Belopuhov


Tobias Heider writes:

> On Thu, Nov 14, 2019 at 09:57:27AM -0700, Theo de Raadt wrote:
>> > 
>> > The problem here is that log_warn can change errno,
>> 
>> No, it specifically avoids touching errno.
>> 
>> log_warn(const char *emsg, ...)
>> {
>> char*nfmt;
>> va_list  ap;
>> int  saved_errno = errno;
>> ...
>> errno = saved_errno;
>> }
>> 
>
> Good to know, thanks! In that case I really prefer Mike's diff.
> Here is an update with msg->msg_sa used consistently. We can also do it
> the other way around, but I would prefer to use either sa or msg_sa.

I'm sorry, it was a bit irresponsible of me to put msg_sa everywhere
in my diff.  In fact, almost all of the code in this file uses 'sa'
so it would be consistent with other functions if 'sa' would be used
instead of 'msg_sa'.  If you don't mind changing it to 'sa', I would
appreciate it.  OK mikeb either way.

>
> Index: ikev2_msg.c
> ===
> RCS file: /cvs/src/sbin/iked/ikev2_msg.c,v
> retrieving revision 1.58
> diff -u -p -r1.58 ikev2_msg.c
> --- ikev2_msg.c   13 Nov 2019 12:24:40 -  1.58
> +++ ikev2_msg.c   14 Nov 2019 17:15:42 -
> @@ -303,7 +303,6 @@ ikev2_msg_valid_ike_sa(struct iked *env,
>  int
>  ikev2_msg_send(struct iked *env, struct iked_message *msg)
>  {
> - struct iked_sa  *sa = msg->msg_sa;
>   struct ibuf *buf = msg->msg_data;
>   uint32_t natt = 0x;
>   int  isnatt = 0;
> @@ -338,7 +337,8 @@ ikev2_msg_send(struct iked *env, struct 
>   if (sendtofrom(msg->msg_fd, ibuf_data(buf), ibuf_size(buf), 0,
>   (struct sockaddr *)>msg_peer, msg->msg_peerlen,
>   (struct sockaddr *)>msg_local, msg->msg_locallen) == -1) {
> - if (errno == EADDRNOTAVAIL) {
> + log_warn("%s: sendtofrom", __func__);
> + if (msg->msg_sa != NULL && errno == EADDRNOTAVAIL) {
>   sa_state(env, msg->msg_sa, IKEV2_STATE_CLOSING);
>   timer_del(env, >msg_sa->sa_timer);
>   timer_set(env, >msg_sa->sa_timer,
> @@ -346,11 +346,11 @@ ikev2_msg_send(struct iked *env, struct 
>   timer_add(env, >msg_sa->sa_timer,
>   IKED_IKE_SA_DELETE_TIMEOUT);
>   }
> - log_warn("%s: sendtofrom", __func__);
> - return (-1);
> + if (msg->msg_sa != NULL)
> + return (-1);
>   }
>  
> - if (!sa)
> + if (msg->msg_sa == NULL)
>   return (0);
>  
>   if ((m = ikev2_msg_copy(env, msg)) == NULL) {
> @@ -360,11 +360,11 @@ ikev2_msg_send(struct iked *env, struct 
>   m->msg_exchange = exchange;
>  
>   if (flags & IKEV2_FLAG_RESPONSE) {
> - TAILQ_INSERT_TAIL(>sa_responses, m, msg_entry);
> + TAILQ_INSERT_TAIL(>msg_sa->sa_responses, m, msg_entry);
>   timer_set(env, >msg_timer, ikev2_msg_response_timeout, m);
>   timer_add(env, >msg_timer, IKED_RESPONSE_TIMEOUT);
>   } else {
> - TAILQ_INSERT_TAIL(>sa_requests, m, msg_entry);
> + TAILQ_INSERT_TAIL(>msg_sa->sa_requests, m, msg_entry);
>   timer_set(env, >msg_timer, ikev2_msg_retransmit_timeout, m);
>   timer_add(env, >msg_timer, IKED_RETRANSMIT_TIMEOUT);
>   }



Re: iked(8): fix error handling in msg_send

2019-11-14 Thread Mike Belopuhov


Tobias Heider writes:

> Hi,
>
> in the error case ikev2_msg_send accesses the sa before checking for
> NULL. The diff adds explicit checks in those cases.
> If sendtofrom fails for any other reason than EADDRNOTAVAIL and sa is not NULL
> we should continue instead of returning (-1) so that the error is handled with
> retransmission.
>
> ok?
>

Hi Tobias,

you can write a simpler diff w/o repeating log_warn:

diff --git a/sbin/iked/ikev2_msg.c b/sbin/iked/ikev2_msg.c
index 2baea5f5508..396fea88c16 100644
--- a/sbin/iked/ikev2_msg.c
+++ b/sbin/iked/ikev2_msg.c
@@ -338,7 +338,8 @@ ikev2_msg_send(struct iked *env, struct iked_message *msg)
if (sendtofrom(msg->msg_fd, ibuf_data(buf), ibuf_size(buf), 0,
(struct sockaddr *)>msg_peer, msg->msg_peerlen,
(struct sockaddr *)>msg_local, msg->msg_locallen) == -1) {
-   if (errno == EADDRNOTAVAIL) {
+   log_warn("%s: sendtofrom", __func__);
+   if (msg->msg_sa != NULL && errno == EADDRNOTAVAIL) {
sa_state(env, msg->msg_sa, IKEV2_STATE_CLOSING);
timer_del(env, >msg_sa->sa_timer);
timer_set(env, >msg_sa->sa_timer,
@@ -346,8 +347,8 @@ ikev2_msg_send(struct iked *env, struct iked_message *msg)
timer_add(env, >msg_sa->sa_timer,
IKED_IKE_SA_DELETE_TIMEOUT);
}
-   log_warn("%s: sendtofrom", __func__);
-   return (-1);
+   if (msg->msg_sa != NULL)
+   return (-1);
}
 
if (!sa)


Regards,
Mike


> Index: ikev2_msg.c
> ===
> RCS file: /mount/openbsd/cvs/src/sbin/iked/ikev2_msg.c,v
> retrieving revision 1.58
> diff -u -p -r1.58 ikev2_msg.c
> --- ikev2_msg.c   13 Nov 2019 12:24:40 -  1.58
> +++ ikev2_msg.c   14 Nov 2019 15:37:11 -
> @@ -339,15 +339,20 @@ ikev2_msg_send(struct iked *env, struct 
>   (struct sockaddr *)>msg_peer, msg->msg_peerlen,
>   (struct sockaddr *)>msg_local, msg->msg_locallen) == -1) {
>   if (errno == EADDRNOTAVAIL) {
> - sa_state(env, msg->msg_sa, IKEV2_STATE_CLOSING);
> - timer_del(env, >msg_sa->sa_timer);
> - timer_set(env, >msg_sa->sa_timer,
> - ikev2_ike_sa_timeout, msg->msg_sa);
> - timer_add(env, >msg_sa->sa_timer,
> - IKED_IKE_SA_DELETE_TIMEOUT);
> + if (sa != NULL) {
> + sa_state(env, sa, IKEV2_STATE_CLOSING);
> + timer_del(env, >sa_timer);
> + timer_set(env, >sa_timer,
> + ikev2_ike_sa_timeout, sa);
> + timer_add(env, >sa_timer,
> + IKED_IKE_SA_DELETE_TIMEOUT);
> + }
> + log_warn("%s: sendtofrom", __func__);
> + return (-1);
>   }
>   log_warn("%s: sendtofrom", __func__);
> - return (-1);
> + if (!sa)
> + return (-1);
>   }
>  
>   if (!sa)



Re: [PATCH: 1/3] MMIO handler in vmm(4)

2019-11-13 Thread Mike Larkin
On Sat, Nov 02, 2019 at 06:40:52AM +0900, Iori YONEJI wrote:
> On Tue, Oct 29, 2019 at 02:17:28AM -0700, Mike Larkin wrote:
> > On Thu, Oct 24, 2019 at 08:54:58AM +0900, Iori YONEJI wrote:
> > > Hello tech@,
> > > 
> > > Here is the patch discussed in the previous email. This part mainly
> > > covers changes in the declaration part and fault handlers.
> > > 
> > 
> > Hello,
> > 
> >  I read through the three diffs and have some feedback.
> > 
> > First, please reformat all 3 diffs using style(9) guidelines. There are many
> > spaces vs tabs issues in the diffs.
> Thank you for your review. I'll fix the issues on this weekend.
> 
> First of all, the spaces vs. tabs issues were due to
> mangling by the mailer. I have fixed this now. I'm not aware of other
> types of style issues, but I will go through the patches to find any other
> style mismatches once I start working on them.
> 
> > Second, there seems to be quite a bit of important code missing here:
> Most of them convinced me, but let me leave a few comments on them.
> 

Iori Yoneji and I are discussing this in another thread, but for the benefit
of searchability/posterity, see below for a summary.

> 1. Segment base and limit check
> > 1. There appears to be no handling of segment base and limits for 32 bit
> >instructions. These need to be read from the gdt and strictly adhered
> >to. For 64 bit mode, it's not as important, unless the guest has enabled
> >LMSLE (long mode segment limit enable) on AMD CPUs, in which case the
> >limits need to be checked also. If I recall correctly, there are also
> >some different rules that need to be followed for 32 bit segment use
> >relating to permission checks and segment types.
> I wasn't aware of the limit check, but I think a #GP would be triggered on
> the vcpu if the segment was out of range, instead of an EPT violation, wouldn't it?
>

The SDM seems to be a bit vague here about the rules for checking segment limits
in VMX non-root mode (eg, inside a VM) and whether that causes an exit or not
based on various configurations, specifically around segment limits. This is
likely an issue only for 32 bit guests (we can block LMSLE).

> 2. Privilege level check
> > 2. For that matter, there appears to be no handling of any permission or
> >privilege checks in the instruction emulator. This means any privilege
> >level can read or write any memory in the VM.
> Sorry, it is my very big fault. I must check CPU mode in next post.
> 

In the generic EPT case (not a segment limit issue), this is likely handled
properly already. We are discussing if it makes sense to put the permission
checks in anyway, in case this code ends up being used in other places.

> 3-4. 'A'ccessed and 'D'irty bit management
> > 3. There appears to be no handling of the updating of the 'A'ccessed or
> >'D'irty bits on successful page table walk and writing to a page
> >computed by that translation. Granted, this probably needs to go into
> >the existing translate_gva function for the 'A' bit, but the 'D' bit
> >needs to also be handled here.
> > 
> > 4. There appears to be no handling of the updating of the 'A' bit in the
> >GDT for the segment descriptor in use.
> Yes, but I'm not sure what would be the expected use of A/D bit
> management on MMIO region because it wouldn't be subject to swap in/out.
> I think I will understand the expected behavior after working on it,
> however, I couldn't see how it compromises the virtual machine features.
> 

There are certain degenerate cases that might require this, and we are unclear
under these circumstances if the CPU does this for us or if the emulator
needs to. As pointed out above, the need for A/D bits in this particular case
is likely minimal, but in the same theme as above, we may want to make the
code work generically instead of assuming the emulator will only ever be
used for accesses in the MMIO region.

-ml


> 5-7. Prefixes
> > 5. The code seems to ignore any segment prefix override bytes.
> > 
> > 6. The code seems to ignore forbidden instruction prefixes (like rex.w
> >or lock prefixes being placed before instructions where they don't
> >make sense or are prohibited by the SDM)
> > 
> > 7. For that matter, operand size encoding prefixes don't seem to be
> >honoured at all
> Yes, there are some prefixes I missed in the patches. I will enhance
> them to handle those prefixes.
> One thing to confirm: Is it OK not to calculate in the segment registers
> (whether override-ed or not) but just count the bytes, because we know
> the address the instruction intended to access?
> 
> 8. %rflags
> 

Re: iked(8): add configuration option for esn

2019-11-12 Thread Mike Belopuhov
On Tue, 12 Nov 2019 at 16:08, Tobias Heider  wrote:

> On Tue, Nov 12, 2019 at 09:57:31AM +0100, Mike Belopuhov wrote:
> > Hi Tobias,
> >
> > I see, however, I don't think iked would negotiate an SA
> > without ESN support if the other side supports ESN, so I'm
> > not sure how "enforcing" changes that.
>
> It doesn't, but if I have an iked on both sides one will have to
> make the decision. I have another case where I actually can not
> use ESN, with two ikeds this can not be configured currently.
>
> > In any case, I'm not opposed to adding a toggle if you guys
> > need it, but could you please adjust the grammar so that "esn"
> > and "no esn" are used instead of "on" and "off" since that's
> > what we're normally doing.  "on" and "off" are crutches for
> > simple file formats, parse.y allows you to make it a bit nicer.
>
> Makes sense. Here is the updated diff including a fix for bluhm's
> comment.
>
>
While I meant "no esn" with a space, I see that you and Patrick have
been adding things like "nofragmentation" and "nomobike".  These
should be written with a space as well.

Nevertheless ok mikeb; hopefully you'll get around to fixing the grammar
later on.


  1   2   3   4   5   6   7   8   9   10   >