This patch adds PCI passthrough support to VMM. I have tested it on both
Intel and AMD boxes; it requires the DMAR/IOMMU diff.

It has worked so far with re0 network devices and azalia sound, but I would
like more eyes and testing on this as well.

In order to build this you need to copy arch/amd64/include/vmmvar.h to
/usr/include/machine and rebuild usr.sbin/vmd and usr.sbin/vmctl.

usage:
vmctl start -c -d <disk.img> -p <bus:dev:func> vm1

The device should now show up in the PCI scan of the guest, and normal
pcidump functions should work. Interrupts are still a bit of a hack; I am
open to better ideas on how to implement them.


---
 sys/arch/amd64/amd64/conf.c      |   2 +-
 sys/arch/amd64/amd64/vmm.c       | 357 +++++++++++++++++---
 sys/arch/amd64/include/vmmvar.h  |  58 ++++
 sys/arch/amd64/pci/pci_machdep.c |   3 +
 sys/dev/pci/pci.c                |  21 ++
 sys/sys/pciio.h                  |   2 +
 usr.sbin/vmctl/main.c            |  37 ++-
 usr.sbin/vmctl/vmctl.c           |   7 +-
 usr.sbin/vmctl/vmctl.h           |   6 +-
 usr.sbin/vmd/Makefile            |   2 +-
 usr.sbin/vmd/control.c           |   2 +
 usr.sbin/vmd/pci.c               | 549 ++++++++++++++++++++++++++++---
 usr.sbin/vmd/pci.h               |  34 +-
 usr.sbin/vmd/virtio.c            |  25 +-
 usr.sbin/vmd/vm.c                | 167 +++++++++-
 usr.sbin/vmd/vmd.c               |   2 +
 usr.sbin/vmd/vmm.c               |   2 +
 usr.sbin/vmd/vmm.h               |  14 +
 18 files changed, 1147 insertions(+), 143 deletions(-)

diff --git a/sys/arch/amd64/amd64/conf.c b/sys/arch/amd64/amd64/conf.c
index ece073225..ad10a38a1 100644
--- a/sys/arch/amd64/amd64/conf.c
+++ b/sys/arch/amd64/amd64/conf.c
@@ -103,7 +103,7 @@ int nblkdev = nitems(bdevsw);
        (dev_type_write((*))) enodev, \
         dev_init(c,n,ioctl), \
        (dev_type_stop((*))) enodev, 0, seltrue, \
-       (dev_type_mmap((*))) enodev, 0, 0, seltrue_kqfilter }
+       dev_init(c,n,mmap) }
 
 #define        mmread  mmrw
 #define        mmwrite mmrw
diff --git a/sys/arch/amd64/amd64/vmm.c b/sys/arch/amd64/amd64/vmm.c
index 84fcb23a5..43ec3d9ac 100644
--- a/sys/arch/amd64/amd64/vmm.c
+++ b/sys/arch/amd64/amd64/vmm.c
@@ -41,10 +41,17 @@
 #include <dev/isa/isareg.h>
 #include <dev/pv/pvreg.h>
 
+#include <dev/pci/pcireg.h>
+#include <dev/pci/pcivar.h>
+#include <dev/pci/pcidevs.h>
+
 /* #define VMM_DEBUG */
 
 void *l1tf_flush_region;
 
+extern void *_iommu_domain(int segment, int bus, int dev, int func, int *id);
+extern void  _iommu_map(void *dom, vaddr_t va, bus_addr_t pa, bus_size_t len);
+
 #ifdef VMM_DEBUG
 #define DPRINTF(x...)  do { printf(x); } while(0)
 #else
@@ -114,6 +121,7 @@ void vmm_attach(struct device *, struct device *, void *);
 int vmmopen(dev_t, int, int, struct proc *);
 int vmmioctl(dev_t, u_long, caddr_t, int, struct proc *);
 int vmmclose(dev_t, int, int, struct proc *);
+paddr_t vmmmmap(dev_t dev, off_t off, int prot);
 int vmm_start(void);
 int vmm_stop(void);
 size_t vm_create_check_mem_ranges(struct vm_create_params *);
@@ -127,6 +135,7 @@ int vm_rwregs(struct vm_rwregs_params *, int);
 int vm_mprotect_ept(struct vm_mprotect_ept_params *);
 int vm_rwvmparams(struct vm_rwvmparams_params *, int);
 int vm_find(uint32_t, struct vm **);
+struct vcpu *vm_find_vcpu(struct vm *, uint32_t);
 int vcpu_readregs_vmx(struct vcpu *, uint64_t, struct vcpu_reg_state *);
 int vcpu_readregs_svm(struct vcpu *, uint64_t, struct vcpu_reg_state *);
 int vcpu_writeregs_vmx(struct vcpu *, uint64_t, int, struct vcpu_reg_state *);
@@ -303,6 +312,247 @@ extern struct gate_descriptor *idt;
 #define CR_CLTS                2
 #define CR_LMSW                3
 
+int vm_pciio(struct vm_pciio *ptd);
+int vm_pio(struct vm_pio *pio);
+int vm_getbar(struct vm_barinfo *bi);
+int vm_getintr(struct vm_getintr *mi);
+
+/*
+ * Per-device passthrough record.  vmm_mapintr() links one onto vppts for
+ * each device whose interrupt it manages to map.
+ */
+struct vppt {
+       pci_chipset_tag_t pc;           /* chipset tag given to vmm_mapintr() */
+       pcitag_t          tag;          /* pa_tag of the device */
+       pci_intr_handle_t ih;           /* handle filled in by vmm_mapintr() */
+       uint32_t          pending;      /* bumped by vmm_intr() */
+       void              *cookie;      /* pci_intr_establish() result */
+       TAILQ_ENTRY(vppt) next;         /* linkage on vppts */
+};
+/* All passthrough records; nothing visible here ever removes an entry. */
+TAILQ_HEAD(,vppt) vppts = TAILQ_HEAD_INITIALIZER(vppts);
+
+/*
+ * vmm_mapintr
+ *
+ * Maps the interrupt of a PCI device and remembers the resulting handle in
+ * a vppt record on the vppts list.  A device that already has a record is
+ * left alone.
+ *
+ * Parameters:
+ *  pc: chipset tag of the device
+ *  pa: attach arguments of the device
+ */
+void vmm_mapintr(pci_chipset_tag_t pc, struct pci_attach_args *pa)
+{
+       int bus, dev, fun;
+       struct vppt *ppt;
+
+       /* Nothing to do if this device was recorded earlier. */
+       TAILQ_FOREACH(ppt, &vppts, next) {
+               if (ppt->pc == pc && ppt->tag == pa->pa_tag)
+                       return;
+       }
+
+       /* M_WAITOK allocations sleep instead of failing: no NULL check. */
+       ppt = malloc(sizeof(*ppt), M_DEVBUF, M_ZERO | M_WAITOK);
+       ppt->pc = pc;
+       ppt->tag = pa->pa_tag;
+       pci_decompose_tag(pc, pa->pa_tag, &bus, &dev, &fun);
+
+       /*
+        * Try MSI first and fall back to a legacy interrupt; give up only
+        * when both mappings fail.  (The previous "||" gave up as soon as
+        * either one failed.)
+        */
+       if (pci_intr_map_msi(pa, &ppt->ih) != 0 &&
+           pci_intr_map(pa, &ppt->ih) != 0) {
+               printf("Couldn't map %d/%d/%d\n", bus, dev, fun);
+               free(ppt, M_DEVBUF, sizeof(*ppt));
+               return;
+       }
+       printf("Mapped %d/%d/%d intr %d/%d\n", bus, dev, fun,
+           ppt->ih.line, ppt->ih.pin);
+       TAILQ_INSERT_TAIL(&vppts, ppt, next);
+}
+
+/*
+ * vm_pciio
+ *
+ * Reads or writes one 32-bit PCI configuration register of a host device.
+ *
+ * Parameters:
+ *  ptd: bus/dev/func and reg select the register.  For VEI_DIR_OUT,
+ *      ptd->val is written to it; for any other direction the register
+ *      is read and the result stored in ptd->val.
+ *
+ * Return values:
+ *  0: success
+ *  EINVAL: bus/dev/func out of range, or reg unaligned or outside the
+ *      device's configuration space
+ */
+int vm_pciio(struct vm_pciio *ptd)
+{
+       pci_chipset_tag_t pc = NULL;
+       pcitag_t tag;
+
+       /* Everything in *ptd comes from userland: check before use. */
+       if (ptd->bus > 255 || ptd->dev > 31 || ptd->func > 7)
+               return EINVAL;
+       if (ptd->reg & 3)
+               return EINVAL;
+       tag = pci_make_tag(pc, ptd->bus, ptd->dev, ptd->func);
+       if (ptd->reg >= (uint32_t)pci_conf_size(pc, tag))
+               return EINVAL;
+
+       if (ptd->dir == VEI_DIR_OUT) {
+               pci_conf_write(pc, tag, ptd->reg, ptd->val);
+       } else {
+               ptd->val = pci_conf_read(pc, tag, ptd->reg);
+       }
+       return 0;
+}
+
+/*
+ * vm_pio
+ *
+ * Performs a single 1-, 2- or 4-byte access to host I/O-port or memory
+ * space.  The range is mapped only for the duration of the access.
+ *
+ * Parameters:
+ *  pio: type selects the space (1 = I/O ports, anything else = memory),
+ *      base/size the location, dir the direction.  For VEI_DIR_OUT
+ *      pio->data is written; otherwise the value read is stored there.
+ *
+ * Return values:
+ *  0: success
+ *  EINVAL: unsupported access size, or the range could not be mapped
+ */
+/* Probably should pre-register bus_space_map/bus_space_read_xx? */
+int vm_pio(struct vm_pio *pio)
+{
+       bus_space_tag_t iot;
+       bus_space_handle_t ioh;
+       int rc;
+
+       /*
+        * Check the width before mapping so that no error path can return
+        * with the range still mapped.
+        */
+       if (pio->size != 1 && pio->size != 2 && pio->size != 4) {
+               printf("pio: bad size %u\n", pio->size);
+               return EINVAL;
+       }
+
+       iot = (pio->type == 1 ? X86_BUS_SPACE_IO : X86_BUS_SPACE_MEM);
+       rc = bus_space_map(iot, pio->base, pio->size, 0, &ioh);
+       if (rc != 0) {
+               printf("iomap of %x fails %x\n", pio->base, rc);
+               /* errno values handed back from an ioctl are positive */
+               return EINVAL;
+       }
+
+       if (pio->dir == VEI_DIR_OUT) {
+               switch (pio->size) {
+               case 1:
+                       bus_space_write_1(iot, ioh, 0, pio->data);
+                       break;
+               case 2:
+                       bus_space_write_2(iot, ioh, 0, pio->data);
+                       break;
+               default:        /* 4, validated above */
+                       bus_space_write_4(iot, ioh, 0, pio->data);
+                       break;
+               }
+       } else {
+               switch (pio->size) {
+               case 1:
+                       pio->data = bus_space_read_1(iot, ioh, 0);
+                       break;
+               case 2:
+                       pio->data = bus_space_read_2(iot, ioh, 0);
+                       break;
+               default:        /* 4, validated above */
+                       pio->data = bus_space_read_4(iot, ioh, 0);
+                       break;
+               }
+       }
+       bus_space_unmap(iot, ioh, pio->size);
+
+       DPRINTF("%zu pio; %s(%x,%llx)\n", sizeof(*pio),
+           pio->dir == VEI_DIR_OUT ? "out" : "in", pio->base, pio->data);
+       return 0;
+}
+
+int vmm_intr(void *arg);
+
+/*
+ * vmm_intr
+ *
+ * Interrupt handler: counts one more pending interrupt on the vppt record
+ * passed as 'arg'.  Always returns 1.
+ */
+int vmm_intr(void *arg)
+{
+       struct vppt *rec = (struct vppt *)arg;
+
+       rec->pending += 1;
+       return 1;
+}
+
+/*
+ * vm_getintr
+ *
+ * Reports the pending-interrupt count recorded for a passthrough device.
+ *
+ * Parameters:
+ *  gi: bus/dev/func select the device; gi->pending receives the count
+ *      of its vppt record, or 0 when the device has no record
+ *
+ * Return values:
+ *  0: success
+ *  EINVAL: bus/dev/func out of range
+ */
+int vm_getintr(struct vm_getintr *gi)
+{
+       pci_chipset_tag_t pc = NULL;
+       pcitag_t tag;
+       struct vppt *ppt;
+
+       /* bus/dev/func come straight from userland. */
+       if (gi->bus > 255 || gi->dev > 31 || gi->func > 7)
+               return EINVAL;
+
+       /* Give the caller a defined answer even for an unknown device. */
+       gi->pending = 0;
+
+       tag = pci_make_tag(pc, gi->bus, gi->dev, gi->func);
+       TAILQ_FOREACH(ppt, &vppts, next) {
+               if (ppt->tag == tag) {
+                       gi->pending = ppt->pending;
+               }
+       }
+       return (0);
+}
+
+/*
+ * vm_getbar
+ *
+ * Reports the BARs of a host PCI device and prepares the device for
+ * passthrough: guest memory ranges are mapped through the device's IOMMU
+ * domain and vmm_intr() is established as its interrupt handler.
+ *
+ * Parameters:
+ *  bi: bus/dev/func select the device; bars[] receives the type, size
+ *      and address of every BAR that could be probed (others stay zero)
+ *
+ * Return values:
+ *  0: success; also returned when the device has no IOMMU domain, in
+ *      which case neither DMA mappings nor the interrupt are set up
+ *  EINVAL: bus/dev/func out of range
+ *  ENODEV: no device answers at that address
+ */
+int vm_getbar(struct vm_barinfo *bi)
+{
+       pci_chipset_tag_t pc = NULL;
+       pcitag_t tag;
+       bus_addr_t base;
+       bus_size_t size;
+       pcireg_t   type = 0;
+       int i, reg, did;
+       void *dom;
+       struct vm *vm;
+       struct vppt *ppt;
+       uint32_t id_reg;
+
+       /* bus/dev/func come straight from userland. */
+       if (bi->bus > 255 || bi->dev > 31 || bi->func > 7)
+               return EINVAL;
+
+       tag = pci_make_tag(pc, bi->bus, bi->dev, bi->func);
+       id_reg = pci_conf_read(pc, tag, PCI_ID_REG);
+       printf("getbar: %d.%d.%d %x\n",
+               bi->bus, bi->dev, bi->func, id_reg);
+       if (PCI_VENDOR(id_reg) == PCI_VENDOR_INVALID)
+               return ENODEV;
+       if (PCI_VENDOR(id_reg) == 0)
+               return ENODEV;
+
+       memset(&bi->bars, 0, sizeof(bi->bars));
+       for (i = 0, reg = PCI_MAPREG_START; reg < PCI_MAPREG_END;
+           i++, reg += 4) {
+               if (!pci_mapreg_probe(pc, tag, reg, &type))
+                       continue;
+               if (pci_mapreg_info(pc, tag, reg, type, &base, &size, NULL))
+                       continue;
+               printf("  %d: %x %.8lx %.16lx\n", i, type, size, base);
+               bi->bars[i].type = type;
+               bi->bars[i].size = size;
+               bi->bars[i].addr = base;
+               /* A 64-bit BAR occupies two registers: skip the upper half. */
+               if (type & PCI_MAPREG_MEM_TYPE_64BIT) {
+                       i++;
+                       reg += 4;
+               }
+       }
+
+       /* don't support if mmio and no domain? */
+       did = 0xdeadcafe;
+       dom = _iommu_domain(0, bi->bus, bi->dev, bi->func, &did);
+       printf("domain is: %p:%x\n", dom, did);
+       if (!dom) {
+               return 0;
+       }
+
+       /*
+        * Map DMA.
+        * NOTE(review): this always uses the first VM on the list, and
+        * reads the list without taking vm_lock -- confirm both.
+        */
+       vm = SLIST_FIRST(&vmm_softc->vm_list);
+       if (vm != NULL) {
+               for (i = 0; i < vm->vm_nmemranges; i++) {
+                       /* Log the values actually handed to _iommu_map(). */
+                       printf("mapping va:%lx pa:%lx\n",
+                           vm->vm_memranges[i].vmr_va,
+                           vm->vm_memranges[i].vmr_gpa);
+                       _iommu_map(dom,
+                                  vm->vm_memranges[i].vmr_va,
+                                  vm->vm_memranges[i].vmr_gpa,
+                                  vm->vm_memranges[i].vmr_size);
+               }
+       }
+
+       /* Setup interrupt; the handler is established only once per record. */
+       TAILQ_FOREACH(ppt, &vppts, next) {
+               if (ppt->tag == tag) {
+                       if (!ppt->cookie) {
+                               ppt->cookie = pci_intr_establish(ppt->pc,
+                                   ppt->ih, IPL_BIO, vmm_intr, ppt, "ppt");
+                       }
+                       printf("Establish intr : %p\n", ppt->cookie);
+                       ppt->pending = 0;
+               }
+       }
+       return 0;
+}
+
 /*
  * vmm_enabled
  *
@@ -506,7 +756,18 @@ vmmioctl(dev_t dev, u_long cmd, caddr_t data, int flag, 
struct proc *p)
        case VMM_IOC_WRITEVMPARAMS:
                ret = vm_rwvmparams((struct vm_rwvmparams_params *)data, 1);
                break;
-
+       case VMM_IOC_BARINFO:
+               ret = vm_getbar((struct vm_barinfo *)data);
+               break;
+       case VMM_IOC_GETINTR:
+               ret = vm_getintr((struct vm_getintr *)data);
+               break;
+       case VMM_IOC_PCIIO:
+               ret = vm_pciio((struct vm_pciio *)data);
+               break;
+       case VMM_IOC_PIO:
+               ret = vm_pio((struct vm_pio *)data);
+               break;
        default:
                DPRINTF("%s: unknown ioctl code 0x%lx\n", __func__, cmd);
                ret = ENOTTY;
@@ -515,6 +776,12 @@ vmmioctl(dev_t dev, u_long cmd, caddr_t data, int flag, 
struct proc *p)
        return (ret);
 }
 
+/*
+ * vmmmmap
+ *
+ * Hands the requested offset back unchanged as a physical address; 'dev'
+ * and 'prot' are ignored and no range check is made.
+ *
+ * NOTE(review): as written this looks like it lets whoever can mmap the
+ * device reach arbitrary physical addresses -- confirm, and restrict to
+ * the BARs of passed-through devices before this goes in.
+ */
+paddr_t
+vmmmmap(dev_t dev, off_t off, int prot)
+{
+       return ((paddr_t)off);
+}
+
 /*
  * pledge_ioctl_vmm
  *
@@ -541,6 +808,10 @@ pledge_ioctl_vmm(struct proc *p, long com)
        case VMM_IOC_MPROTECT_EPT:
        case VMM_IOC_READVMPARAMS:
        case VMM_IOC_WRITEVMPARAMS:
+       case VMM_IOC_BARINFO:
+       case VMM_IOC_PCIIO:
+       case VMM_IOC_PIO:
+       case VMM_IOC_GETINTR:
                return (0);
        }
 
@@ -591,13 +862,7 @@ vm_resetcpu(struct vm_resetcpu_params *vrp)
                return (error);
        }
 
-       rw_enter_read(&vm->vm_vcpu_lock);
-       SLIST_FOREACH(vcpu, &vm->vm_vcpu_list, vc_vcpu_link) {
-               if (vcpu->vc_id == vrp->vrp_vcpu_id)
-                       break;
-       }
-       rw_exit_read(&vm->vm_vcpu_lock);
-
+       vcpu = vm_find_vcpu(vm, vrp->vrp_vcpu_id);
        if (vcpu == NULL) {
                DPRINTF("%s: vcpu id %u of vm %u not found\n", __func__,
                    vrp->vrp_vcpu_id, vrp->vrp_vm_id);
@@ -656,13 +921,8 @@ vm_intr_pending(struct vm_intr_params *vip)
                rw_exit_read(&vmm_softc->vm_lock);
                return (error);
        }
-
-       rw_enter_read(&vm->vm_vcpu_lock);
-       SLIST_FOREACH(vcpu, &vm->vm_vcpu_list, vc_vcpu_link) {
-               if (vcpu->vc_id == vip->vip_vcpu_id)
-                       break;
-       }
-       rw_exit_read(&vm->vm_vcpu_lock);
+       
+       vcpu = vm_find_vcpu(vm, vip->vip_vcpu_id);
        rw_exit_read(&vmm_softc->vm_lock);
 
        if (vcpu == NULL)
@@ -721,13 +981,8 @@ vm_rwvmparams(struct vm_rwvmparams_params *vpp, int dir) {
                rw_exit_read(&vmm_softc->vm_lock);
                return (error);
        }
-
-       rw_enter_read(&vm->vm_vcpu_lock);
-       SLIST_FOREACH(vcpu, &vm->vm_vcpu_list, vc_vcpu_link) {
-               if (vcpu->vc_id == vpp->vpp_vcpu_id)
-                       break;
-       }
-       rw_exit_read(&vm->vm_vcpu_lock);
+       
+       vcpu = vm_find_vcpu(vm, vpp->vpp_vcpu_id);
        rw_exit_read(&vmm_softc->vm_lock);
 
        if (vcpu == NULL)
@@ -786,12 +1041,7 @@ vm_rwregs(struct vm_rwregs_params *vrwp, int dir)
                return (error);
        }
 
-       rw_enter_read(&vm->vm_vcpu_lock);
-       SLIST_FOREACH(vcpu, &vm->vm_vcpu_list, vc_vcpu_link) {
-               if (vcpu->vc_id == vrwp->vrwp_vcpu_id)
-                       break;
-       }
-       rw_exit_read(&vm->vm_vcpu_lock);
+       vcpu = vm_find_vcpu(vm, vrwp->vrwp_vcpu_id);
        rw_exit_read(&vmm_softc->vm_lock);
 
        if (vcpu == NULL)
@@ -858,13 +1108,7 @@ vm_mprotect_ept(struct vm_mprotect_ept_params *vmep)
                return (ret);
        }
 
-       rw_enter_read(&vm->vm_vcpu_lock);
-       SLIST_FOREACH(vcpu, &vm->vm_vcpu_list, vc_vcpu_link) {
-               if (vcpu->vc_id == vmep->vmep_vcpu_id)
-                       break;
-       }
-       rw_exit_read(&vm->vm_vcpu_lock);
-
+       vcpu = vm_find_vcpu(vm, vmep->vmep_vcpu_id);
        if (vcpu == NULL) {
                DPRINTF("%s: vcpu id %u of vm %u not found\n", __func__,
                    vmep->vmep_vcpu_id, vmep->vmep_vm_id);
@@ -1137,6 +1381,22 @@ vm_find(uint32_t id, struct vm **res)
        return (ENOENT);
 }
 
+/*
+ * vm_find_vcpu
+ *
+ * Looks up a vcpu of 'vm' by id.  vm_vcpu_lock is held for reading while
+ * the vcpu list is walked and released before returning.
+ *
+ * Parameters:
+ *  vm: the VM to search; may be NULL
+ *  id: vc_id to look for
+ *
+ * Return values:
+ *  the matching vcpu, or NULL if 'vm' is NULL or no vcpu has that id
+ */
+struct vcpu *
+vm_find_vcpu(struct vm *vm, uint32_t id)
+{
+       struct vcpu *cur, *match = NULL;
+
+       if (vm == NULL)
+               return NULL;
+
+       rw_enter_read(&vm->vm_vcpu_lock);
+       SLIST_FOREACH(cur, &vm->vm_vcpu_list, vc_vcpu_link) {
+               if (cur->vc_id == id) {
+                       match = cur;
+                       break;
+               }
+       }
+       rw_exit_read(&vm->vm_vcpu_lock);
+
+       return match;
+}
+
 /*
  * vmm_start
  *
@@ -1539,7 +1799,7 @@ vm_impl_init_vmx(struct vm *vm, struct proc *p)
        vm->vm_map = &vm->vm_vmspace->vm_map;
 
        /* Map the new map with an anon */
-       DPRINTF("%s: created vm_map @ %p\n", __func__, vm->vm_map);
+       printf("%s: created vm_map @ %p\n", __func__, vm->vm_map);
        for (i = 0; i < vm->vm_nmemranges; i++) {
                vmr = &vm->vm_memranges[i];
                ret = uvm_share(vm->vm_map, vmr->vmr_gpa,
@@ -1907,6 +2167,7 @@ vcpu_readregs_svm(struct vcpu *vcpu, uint64_t regmask,
                gprs[VCPU_REGS_R14] = vcpu->vc_gueststate.vg_r14;
                gprs[VCPU_REGS_R15] = vcpu->vc_gueststate.vg_r15;
                gprs[VCPU_REGS_RBP] = vcpu->vc_gueststate.vg_rbp;
+               gprs[VCPU_REGS_RAX] = vmcb->v_rax;
                gprs[VCPU_REGS_RIP] = vmcb->v_rip;
                gprs[VCPU_REGS_RSP] = vmcb->v_rsp;
                gprs[VCPU_REGS_RFLAGS] = vmcb->v_rflags;
@@ -2055,6 +2316,7 @@ vcpu_writeregs_vmx(struct vcpu *vcpu, uint64_t regmask, 
int loadvmcs,
                vcpu->vc_gueststate.vg_r15 = gprs[VCPU_REGS_R15];
                vcpu->vc_gueststate.vg_rbp = gprs[VCPU_REGS_RBP];
                vcpu->vc_gueststate.vg_rip = gprs[VCPU_REGS_RIP];
+
                if (vmwrite(VMCS_GUEST_IA32_RIP, gprs[VCPU_REGS_RIP]))
                        goto errout;
                if (vmwrite(VMCS_GUEST_IA32_RSP, gprs[VCPU_REGS_RSP]))
@@ -2186,6 +2448,7 @@ vcpu_writeregs_svm(struct vcpu *vcpu, uint64_t regmask,
                vcpu->vc_gueststate.vg_rbp = gprs[VCPU_REGS_RBP];
                vcpu->vc_gueststate.vg_rip = gprs[VCPU_REGS_RIP];
 
+               vmcb->v_rax = gprs[VCPU_REGS_RAX];
                vmcb->v_rip = gprs[VCPU_REGS_RIP];
                vmcb->v_rsp = gprs[VCPU_REGS_RSP];
                vmcb->v_rflags = gprs[VCPU_REGS_RFLAGS];
@@ -4199,6 +4462,7 @@ vm_run(struct vm_run_params *vrp)
 
                if (copyout(&vcpu->vc_exit, vrp->vrp_exit,
                    sizeof(struct vm_exit)) == EFAULT) {
+                       printf("efault\n");
                        ret = EFAULT;
                } else
                        ret = 0;
@@ -4925,6 +5189,8 @@ vmx_handle_intr(struct vcpu *vcpu)
        }
 
        vec = eii & 0xFF;
+       if (vec != 0x6b)
+               printf("vec: %x\n", vec);
 
        /* XXX check "error valid" code in eii, abort if 0 */
        idte=&idt[vec];
@@ -5077,6 +5343,8 @@ svm_handle_exit(struct vcpu *vcpu)
                break;
        case SVM_VMEXIT_NPF:
                ret = svm_handle_np_fault(vcpu);
+               if (ret == EAGAIN)
+                       update_rip = 1;
                break;
        case SVM_VMEXIT_CPUID:
                ret = vmm_handle_cpuid(vcpu);
@@ -5169,6 +5437,8 @@ vmx_handle_exit(struct vcpu *vcpu)
                break;
        case VMX_EXIT_EPT_VIOLATION:
                ret = vmx_handle_np_fault(vcpu);
+               if (ret == EAGAIN)
+                       update_rip = 1;
                break;
        case VMX_EXIT_CPUID:
                ret = vmm_handle_cpuid(vcpu);
@@ -5348,7 +5618,6 @@ vmm_get_guest_memtype(struct vm *vm, paddr_t gpa)
        struct vm_mem_range *vmr;
 
        if (gpa >= VMM_PCI_MMIO_BAR_BASE && gpa <= VMM_PCI_MMIO_BAR_END) {
-               DPRINTF("guest mmio access @ 0x%llx\n", (uint64_t)gpa);
                return (VMM_MEM_TYPE_REGULAR);
        }
 
@@ -5457,6 +5726,12 @@ svm_fault_page(struct vcpu *vcpu, paddr_t gpa)
 
        fault_type = svm_get_guest_faulttype(vmcb);
 
+       vcpu->vc_exit.vee.vee_gpa = gpa;
+       vcpu->vc_exit.vee.vee_fault_type = VEE_FAULT_PROTECT;
+       if ((gpa >= VMM_PCI_MMIO_BAR_BASE && gpa <= VMM_PCI_MMIO_BAR_END) || 
fault_type == VM_FAULT_PROTECT) {
+               return (EAGAIN);
+       }
+
        ret = uvm_fault(vcpu->vc_parent->vm_map, gpa, fault_type,
            PROT_READ | PROT_WRITE | PROT_EXEC);
        if (ret)
@@ -5517,7 +5792,8 @@ vmx_fault_page(struct vcpu *vcpu, paddr_t gpa)
                return (EINVAL);
        }
 
-       if (fault_type == VM_FAULT_PROTECT) {
+       if ((gpa >= VMM_PCI_MMIO_BAR_BASE && gpa <= VMM_PCI_MMIO_BAR_END) || 
fault_type == VM_FAULT_PROTECT) {
+               vcpu->vc_exit.vee.vee_gpa = gpa;
                vcpu->vc_exit.vee.vee_fault_type = VEE_FAULT_PROTECT;
                return (EAGAIN);
        }
@@ -5526,7 +5802,7 @@ vmx_fault_page(struct vcpu *vcpu, paddr_t gpa)
            PROT_READ | PROT_WRITE | PROT_EXEC);
 
        if (ret)
-               printf("%s: uvm_fault returns %d, GPA=0x%llx, rip=0x%llx\n",
+               printf("%s: uvm_fault returns %d, GPA=0x%llx, rip=0x%llx :(\n",
                    __func__, ret, (uint64_t)gpa, vcpu->vc_gueststate.vg_rip);
 
        return (ret);
@@ -6951,6 +7227,7 @@ vcpu_run_svm(struct vcpu *vcpu, struct vm_run_params *vrp)
                                    vcpu->vc_exit.vei.vei_data;
                                vmcb->v_rax = vcpu->vc_gueststate.vg_rax;
                        }
+                       break;
                }
        }
 
@@ -6990,7 +7267,7 @@ vcpu_run_svm(struct vcpu *vcpu, struct vm_run_params *vrp)
 
                /* Handle vmd(8) injected interrupts */
                /* Is there an interrupt pending injection? */
-               if (irq != 0xFFFF && vcpu->vc_irqready) {
+               if (irq != 0xFFFF && vcpu->vc_irqready) {       
                        vmcb->v_eventinj = (irq & 0xFF) | (1 << 31);
                        irq = 0xFFFF;
                }
diff --git a/sys/arch/amd64/include/vmmvar.h b/sys/arch/amd64/include/vmmvar.h
index 4990a5c53..e788a3630 100644
--- a/sys/arch/amd64/include/vmmvar.h
+++ b/sys/arch/amd64/include/vmmvar.h
@@ -32,6 +32,7 @@
 #define VMM_MAX_VCPUS_PER_VM   64
 #define VMM_MAX_VM_MEM_SIZE    32768
 #define VMM_MAX_NICS_PER_VM    4
+#define VMM_MAX_PCI_PTHRU      4
 
 #define VMM_PCI_MMIO_BAR_BASE  0xF0000000ULL
 #define VMM_PCI_MMIO_BAR_END   0xFFFFFFFFULL
@@ -359,6 +360,7 @@ struct vm_exit_inout {
  */
 struct vm_exit_eptviolation {
        uint8_t         vee_fault_type;
+       uint64_t        vee_gpa;
 };
 
 /*
@@ -480,6 +482,9 @@ struct vm_create_params {
 
        /* Output parameter from VMM_IOC_CREATE */
        uint32_t        vcp_id;
+
+       size_t                  vcp_npcis;
+       uint32_t                vcp_pcis[VMM_MAX_PCI_PTHRU];
 };
 
 struct vm_run_params {
@@ -578,6 +583,54 @@ struct vm_mprotect_ept_params {
        int                     vmep_prot;
 };
 
+/* Argument of VMM_IOC_PCIIO: one PCI configuration-space access. */
+struct vm_pciio {
+       /* input: device address (seg is not read by vm_pciio()) */
+       uint32_t seg;
+       uint32_t bus;
+       uint32_t dev;
+       uint32_t func;
+
+       uint32_t dir;           /* VEI_DIR_OUT writes, anything else reads */
+       uint32_t reg;           /* register offset; must be 4-byte aligned */
+
+       /* value to write for VEI_DIR_OUT, otherwise output: value read */
+       uint32_t val;
+};
+
+/* Argument of VMM_IOC_GETINTR: pending-interrupt query for one device. */
+struct vm_getintr {
+       /* input: device address (seg is not read by vm_getintr()) */
+       uint32_t seg;
+       uint32_t bus;
+       uint32_t dev;
+       uint32_t func;
+
+       /* output: pending count of the matching device record */
+       uint32_t pending;
+};
+
+/* Number of bars[] slots in struct vm_barinfo. */
+#define MAXBAR 6
+/* Argument of VMM_IOC_BARINFO: BAR layout of one host PCI device. */
+struct vm_barinfo {
+       /* input: device address (vm_getbar() does not read seg) */
+       uint32_t seg;
+       uint32_t bus;
+       uint32_t dev;
+       uint32_t func;
+
+       /* output: one slot per BAR register; unprobed slots are zeroed */
+       struct {
+               uint32_t   type;        /* type from pci_mapreg_probe() */
+               uint32_t   size;        /* vm_getbar() stores a bus_size_t here */
+               uint64_t   addr;        /* base from pci_mapreg_info() */
+       } bars[MAXBAR];
+};
+
+/* Argument of VMM_IOC_PIO: one access to host I/O-port or memory space. */
+struct vm_pio {
+       uint32_t type;          /* 1 = I/O port space, otherwise memory space */
+       uint32_t dir;           /* VEI_DIR_OUT writes, anything else reads */
+       uint32_t size;          /* access width in bytes: 1, 2 or 4 */
+       uint32_t base;          /* address handed to bus_space_map() */
+       uint64_t data;          /* value to write, or the value read back */
+};
+
 /* IOCTL definitions */
 #define VMM_IOC_CREATE _IOWR('V', 1, struct vm_create_params) /* Create VM */
 #define VMM_IOC_RUN _IOWR('V', 2, struct vm_run_params) /* Run VCPU */
@@ -594,6 +647,11 @@ struct vm_mprotect_ept_params {
 /* Control the protection of ept pages*/
 #define VMM_IOC_MPROTECT_EPT _IOW('V', 11, struct vm_mprotect_ept_params)
 
+#define VMM_IOC_BARINFO             _IOWR('V', 12, struct vm_barinfo)
+#define VMM_IOC_PCIIO       _IOWR('V', 13, struct vm_pciio)
+#define VMM_IOC_PIO          _IOWR('V', 14, struct vm_pio)
+#define VMM_IOC_GETINTR      _IOWR('V', 15, struct vm_getintr)
+
 /* CPUID masks */
 /*
  * clone host capabilities minus:
diff --git a/sys/arch/amd64/pci/pci_machdep.c b/sys/arch/amd64/pci/pci_machdep.c
index 9de760ab4..e9902a231 100644
--- a/sys/arch/amd64/pci/pci_machdep.c
+++ b/sys/arch/amd64/pci/pci_machdep.c
@@ -802,6 +802,8 @@ pci_init_extents(void)
        }
 }
 
+extern void vmm_mapintr(pci_chipset_tag_t pc, struct pci_attach_args *pa);
+
 int
 pci_probe_device_hook(pci_chipset_tag_t pc, struct pci_attach_args *pa)
 {
@@ -809,6 +811,7 @@ pci_probe_device_hook(pci_chipset_tag_t pc, struct 
pci_attach_args *pa)
        if (acpidmar_sc)
                acpidmar_pci_hook(pc, pa);
 #endif
+       vmm_mapintr(pc, pa);
        return 0;
 }
 
diff --git a/sys/dev/pci/pci.c b/sys/dev/pci/pci.c
index bf75f875e..546927971 100644
--- a/sys/dev/pci/pci.c
+++ b/sys/dev/pci/pci.c
@@ -1211,6 +1211,8 @@ pciioctl(dev_t dev, u_long cmd, caddr_t data, int flag, 
struct proc *p)
                if (pci_vga_pci == NULL)
                        return EINVAL;
                break;
+       case PCIOCUNBIND:
+               break;
        default:
                return ENOTTY;
        }
@@ -1234,6 +1236,25 @@ pciioctl(dev_t dev, u_long cmd, caddr_t data, int flag, 
struct proc *p)
        tag = pci_make_tag(pc, sel->pc_bus, sel->pc_dev, sel->pc_func);
 
        switch (cmd) {
+       case PCIOCUNBIND:
+               {
+                       struct pci_dev *pd, *pdt;
+                       uint32_t val;
+                       int i, n;
+
+                       /*
+                        * Detach the driver from the selected device and
+                        * take the device off this bus's sc_devs list.
+                        */
+                       LIST_FOREACH_SAFE(pd, &pci->sc_devs, pd_next, pdt) {
+                               if (tag != pd->pd_tag)
+                                       continue;
+                               /*
+                                * PCI_MAPREG_END is treated as exclusive,
+                                * so pd_mask[] is never indexed past the
+                                * last BAR.
+                                */
+                               for (i = PCI_MAPREG_START;
+                                   i < PCI_MAPREG_END; i += 4) {
+                                       n = (i - PCI_MAPREG_START) / 4;
+                                       val = pci_conf_read(NULL, tag, i);
+                                       printf(" bar%d: %x %x\n", n,
+                                           val, pd->pd_mask[n]);
+                               }
+                               /*
+                                * NOTE(review): pd_dev is presumably NULL
+                                * when no driver attached -- confirm.
+                                */
+                               if (pd->pd_dev != NULL)
+                                       config_detach(pd->pd_dev, 0);
+                               LIST_REMOVE(pd, pd_next);
+                       }
+               }
+               break;
        case PCIOCREAD:
                io = (struct pci_io *)data;
                switch (io->pi_width) {
diff --git a/sys/sys/pciio.h b/sys/sys/pciio.h
index 394dd083d..2237f6784 100644
--- a/sys/sys/pciio.h
+++ b/sys/sys/pciio.h
@@ -83,4 +83,6 @@ struct pci_vga {
 #define        PCIOCREADMASK   _IOWR('p', 8, struct pci_io)
 #define        PCIOCGETVPD     _IOWR('p', 9, struct pci_vpd_req)
 
+#define        PCIOCUNBIND     _IOWR('p', 10, struct pcisel)   /* 9 is PCIOCGETVPD */
+
 #endif /* !_SYS_PCIIO_H_ */
diff --git a/usr.sbin/vmctl/main.c b/usr.sbin/vmctl/main.c
index 249eaa3de..ae46b950f 100644
--- a/usr.sbin/vmctl/main.c
+++ b/usr.sbin/vmctl/main.c
@@ -83,7 +83,7 @@ struct ctl_command ctl_commands[] = {
        { "show",       CMD_STATUS,     ctl_status,     "[id]" },
        { "start",      CMD_START,      ctl_start,
            "[-cL] [-B device] [-b path] [-d disk] [-i count]\n"
-           "\t\t[-m size] [-n switch] [-r path] [-t name] id | name" },
+           "\t\t[-m size] [-n switch] [-r path] [-t name] [-p bus:dev:func] id 
| name" },
        { "status",     CMD_STATUS,     ctl_status,     "[id]" },
        { "stop",       CMD_STOP,       ctl_stop,       "[-fw] [id | -a]" },
        { "unpause",    CMD_UNPAUSE,    ctl_unpause,    "id" },
@@ -224,7 +224,8 @@ vmmaction(struct parse_result *res)
        case CMD_START:
                ret = vm_start(res->id, res->name, res->size, res->nifs,
                    res->nets, res->ndisks, res->disks, res->disktypes,
-                   res->path, res->isopath, res->instance, res->bootdevice);
+                   res->path, res->isopath, res->instance, res->bootdevice,
+                   res->npcis, res->pcis);
                if (ret) {
                        errno = ret;
                        err(1, "start VM operation failed");
@@ -480,6 +481,32 @@ parse_disktype(const char *s, const char **ret)
        return (VMDF_RAW);
 }
 
+/*
+ * parse_pcis
+ *
+ * Parses a decimal "bus:dev:func" string and appends the packed address
+ * (bus << 8 | dev << 3 | func) to res->pcis, growing the array by one.
+ *
+ * Parameters:
+ *  res: parse state; pcis/npcis are updated on success
+ *  pcipath: the string to parse
+ *
+ * Return values:
+ *  0: success
+ *  -1: too many devices, malformed or out-of-range address, or out of
+ *      memory (a warning has been printed)
+ */
+int
+parse_pcis(struct parse_result *res, char *pcipath)
+{
+       uint32_t        *pcis;
+       unsigned int     bus, dev, func;
+
+       /* errno is not meaningful for the first three checks: use warnx. */
+       if (res->npcis >= VMM_MAX_PCI_PTHRU) {
+               warnx("too many pci");
+               return -1;
+       }
+       if (sscanf(pcipath, "%u:%u:%u", &bus, &dev, &func) != 3) {
+               warnx("pci format b:d:f");
+               return -1;
+       }
+       /* Reject rather than silently mask values that do not fit. */
+       if (bus > 0xff || dev > 0x1f || func > 0x7) {
+               warnx("pci b:d:f out of range");
+               return -1;
+       }
+       if ((pcis = reallocarray(res->pcis, res->npcis + 1,
+           sizeof(*pcis))) == NULL) {
+               warn("reallocarray");
+               return -1;
+       }
+       pcis[res->npcis] = (bus << 8) | (dev << 3) | func;
+       res->pcis = pcis;
+       res->npcis++;
+
+       return (0);
+}
+
 int
 parse_disk(struct parse_result *res, char *word, int type)
 {
@@ -835,7 +862,7 @@ ctl_start(struct parse_result *res, int argc, char *argv[])
        char             path[PATH_MAX];
        const char      *s;
 
-       while ((ch = getopt(argc, argv, "b:B:cd:i:Lm:n:r:t:")) != -1) {
+       while ((ch = getopt(argc, argv, "b:B:cd:i:Lm:n:r:t:p:")) != -1) {
                switch (ch) {
                case 'b':
                        if (res->path)
@@ -899,6 +926,10 @@ ctl_start(struct parse_result *res, int argc, char *argv[])
                        if (parse_instance(res, optarg) == -1)
                                errx(1, "invalid name: %s", optarg);
                        break;
+               case 'p':
+                       if (parse_pcis(res, optarg) == -1)
+                               errx(1, "invalid pci entry: %s", optarg);
+                       break;
                default:
                        ctl_usage(res->ctl);
                        /* NOTREACHED */
diff --git a/usr.sbin/vmctl/vmctl.c b/usr.sbin/vmctl/vmctl.c
index dcded0760..4de6dd6f9 100644
--- a/usr.sbin/vmctl/vmctl.c
+++ b/usr.sbin/vmctl/vmctl.c
@@ -73,7 +73,8 @@ unsigned int info_flags;
 int
 vm_start(uint32_t start_id, const char *name, int memsize, int nnics,
     char **nics, int ndisks, char **disks, int *disktypes, char *kernel,
-    char *iso, char *instance, unsigned int bootdevice)
+    char *iso, char *instance, unsigned int bootdevice,
+    int npcis, uint32_t *pcis)
 {
        struct vmop_create_params *vmc;
        struct vm_create_params *vcp;
@@ -128,6 +129,7 @@ vm_start(uint32_t start_id, const char *name, int memsize, 
int nnics,
        vcp->vcp_ncpus = 1;
        vcp->vcp_ndisks = ndisks;
        vcp->vcp_nnics = nnics;
+       vcp->vcp_npcis = npcis;
        vcp->vcp_id = start_id;
 
        for (i = 0 ; i < ndisks; i++) {
@@ -153,6 +155,9 @@ vm_start(uint32_t start_id, const char *name, int memsize, 
int nnics,
                                errx(1, "interface name too long");
                }
        }
+       for (i = 0; i < npcis; i++)
+               vcp->vcp_pcis[i] = pcis[i];
+
        if (name != NULL) {
                /*
                 * Allow VMs names with alphanumeric characters, dot, hyphen
diff --git a/usr.sbin/vmctl/vmctl.h b/usr.sbin/vmctl/vmctl.h
index beb65eae6..aa9cbcba7 100644
--- a/usr.sbin/vmctl/vmctl.h
+++ b/usr.sbin/vmctl/vmctl.h
@@ -55,6 +55,8 @@ struct parse_result {
        size_t                   ndisks;
        char                    **disks;
        int                     *disktypes;
+       int                      npcis;
+       uint32_t                *pcis;
        int                      verbose;
        char                    *instance;
        unsigned int             flags;
@@ -80,6 +82,7 @@ int    parse_network(struct parse_result *, char *);
 int     parse_size(struct parse_result *, char *);
 int     parse_disktype(const char *, const char **);
 int     parse_disk(struct parse_result *, char *, int);
+int     parse_pcis(struct parse_result *, char *);
 int     parse_vmid(struct parse_result *, char *, int);
 int     parse_instance(struct parse_result *, char *);
 void    parse_free(struct parse_result *);
@@ -94,7 +97,8 @@ int    create_imagefile(int, const char *, const char *, 
long, const char **);
 int     create_raw_imagefile(const char *, long);
 int     create_qc2_imagefile(const char *, const char *, long);
 int     vm_start(uint32_t, const char *, int, int, char **, int,
-           char **, int *, char *, char *, char *, unsigned int);
+           char **, int *, char *, char *, char *, unsigned int,
+           int, uint32_t *);
 int     vm_start_complete(struct imsg *, int *, int);
 void    terminate_vm(uint32_t, const char *, unsigned int);
 int     terminate_vm_complete(struct imsg *, int *, unsigned int);
diff --git a/usr.sbin/vmd/Makefile b/usr.sbin/vmd/Makefile
index 8645df7ae..c819599d2 100644
--- a/usr.sbin/vmd/Makefile
+++ b/usr.sbin/vmd/Makefile
@@ -4,7 +4,7 @@
 
 PROG=          vmd
 SRCS=          vmd.c control.c log.c priv.c proc.c config.c vmm.c
-SRCS+=         vm.c loadfile_elf.c pci.c virtio.c i8259.c mc146818.c
+SRCS+=         vm.c loadfile_elf.c pci.c virtio.c i8259.c mc146818.c x86emu.c
 SRCS+=         ns8250.c i8253.c vmboot.c ufs.c disklabel.c dhcp.c packet.c
 SRCS+=         parse.y atomicio.c vioscsi.c vioraw.c vioqcow2.c fw_cfg.c
 
diff --git a/usr.sbin/vmd/control.c b/usr.sbin/vmd/control.c
index 119fe460c..55bd9a638 100644
--- a/usr.sbin/vmd/control.c
+++ b/usr.sbin/vmd/control.c
@@ -39,6 +39,8 @@
 
 #define        CONTROL_BACKLOG 5
 
+/*
+ * NOTE(review): this replaces every pledge() call in this file with the
+ * constant 0, so the calls no longer reach pledge(2). Presumably a temporary
+ * measure for passthrough testing -- confirm before commit.
+ */
+#define pledge(x...) 0
+
 struct ctl_connlist ctl_conns;
 
 void
diff --git a/usr.sbin/vmd/pci.c b/usr.sbin/vmd/pci.c
index 954235eb6..4fd2a262f 100644
--- a/usr.sbin/vmd/pci.c
+++ b/usr.sbin/vmd/pci.c
@@ -18,6 +18,8 @@
 
 #include <sys/types.h>
 
+#include <sys/ioctl.h>
+#include <sys/pciio.h>
 #include <dev/pci/pcireg.h>
 #include <dev/pci/pcidevs.h>
 #include <dev/pv/virtioreg.h>
@@ -25,20 +27,251 @@
 
 #include <string.h>
 #include <unistd.h>
+#include <fcntl.h>
 #include "vmd.h"
 #include "pci.h"
 #include "vmm.h"
 #include "i8259.h"
 #include "atomicio.h"
+#include <sys/pciio.h>
+#include <sys/mman.h>
 
 struct pci pci;
 
+extern struct vmd *env;
+
 extern char *__progname;
 
 /* PIC IRQs, assigned to devices in order */
 const uint8_t pci_pic_irqs[PCI_MAX_PIC_IRQS] = {3, 5, 6, 7, 9, 10, 11, 12,
     14, 15};
 
+#define PTD_DEVID(d,b)   (void *)(uintptr_t)(((d) << 8) | (b))
+#define PTD_BAR(x)       ((uintptr_t)(x) & 0xFF)
+#define PTD_DEV(x)       ((uintptr_t)(x) >> 8)
+
+/*
+ * Bookkeeping for the passthrough device. This file keeps a single global
+ * instance (`ptd`), so only one passthrough device is tracked at a time.
+ */
+struct pci_ptd {
+       uint8_t bus;            /* bus/dev/fun as given to pci_add_pthru(); */
+       uint8_t dev;            /* used as the target of the VMM_IOC_* ioctls */
+       uint8_t fun;
+       uint8_t id;             /* index into pci.pci_devices[], set by pci_add_device() */
+       uint32_t pending;       /* last `pending` value seen from VMM_IOC_GETINTR */
+       
+       struct {
+               uint32_t  type;         /* copied from the VMM_IOC_BARINFO result */
+               uint32_t  size;         /* copied from the VMM_IOC_BARINFO result */
+               uint64_t  addr;         /* copied from the VMM_IOC_BARINFO result */
+               void     *va;           /* mapbar() result for memory BARs */
+       } barinfo[MAXBAR];
+};
+
+struct pci_ptd ptd;
+
+uint32_t ptd_conf_read(int bus, int dev, int func, uint32_t reg);
+void ptd_conf_write(int bus, int dev, int func, uint32_t reg, uint32_t val);
+
+/* Some helper functions */
+void _pcicfgwr32(int id, int reg, uint32_t data);
+uint32_t _pcicfgrd32(int id, int reg);
+
+/* Store a 32-bit word at byte offset `reg` of device `id`'s config space. */
+void
+_pcicfgwr32(int id, int reg, uint32_t data)
+{
+       uint32_t *cfg = pci.pci_devices[id].pd_cfg_space;
+
+       cfg[reg / 4] = data;
+}
+/* Fetch the 32-bit word at byte offset `reg` of device `id`'s config space. */
+uint32_t
+_pcicfgrd32(int id, int reg)
+{
+       const uint32_t *cfg = pci.pci_devices[id].pd_cfg_space;
+
+       return cfg[reg / 4];
+}
+
+int pci_memh2(int, uint64_t, uint32_t, void *, void *);
+uint64_t mbar(uint64_t *base, uint32_t size, uint32_t align);
+
+#define PAGE_MASK 0xFFF
+
+void dump(void *ptr, int len);
+
+void *mapbar(int, uint64_t base, uint64_t size);
+void unmapbar(void *va, uint64_t size);
+void showremap(struct pci_ptd *pd);
+void remap_io(int dev, int bar, int reg, int *hport, int *gport);
+void do_pio(int type, int dir, int port, int size, uint32_t *data);
+
+/*
+ * mapbar
+ *
+ * Maps the memory region of a passthrough BAR into this process by calling
+ * mmap(2) on the vmm device descriptor (env->vmd_fd).
+ *
+ * Parameters:
+ *  bar: BAR index, used only to print the config register offset
+ *  base: BAR address on the native device, passed as the mmap(2) offset
+ *  size: length in bytes; rounded up to a whole number of 4K pages
+ *
+ * Returns the mapped address, or NULL if mmap(2) failed.
+ */
+void *
+mapbar(int bar, uint64_t base, uint64_t size)
+{
+       void *va;
+
+       size = (size + PAGE_MASK) & ~(uint64_t)PAGE_MASK;
+       va = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_SHARED, env->vmd_fd,
+           base);
+       /* mmap(2) reports failure with MAP_FAILED, never with NULL. */
+       if (va == MAP_FAILED) {
+               fprintf(stderr, "Unable to mmap bar: %.16llx/%.8llx\n",
+                       base, size);
+               return NULL;
+       }
+       fprintf(stderr, "0x%.2x: Mapped bar: %.16llx/%.8llx to %p\n",
+               (bar * 4) + 0x10, base, size, va);
+       return va;
+}
+
+/*
+ * unmapbar
+ *
+ * Releases a mapping created by mapbar(). The length is rounded up to whole
+ * pages the same way mapbar() does. A NULL va is ignored.
+ */
+void
+unmapbar(void *va, uint64_t size)
+{
+       uint64_t len;
+
+       if (va != NULL) {
+               len = (size + PAGE_MASK) & ~PAGE_MASK;
+               munmap(va, len);
+               fprintf(stderr, "unmapping bar: %p/%.8llx\n", va, len);
+       }
+}
+
+/*
+ * showremap
+ *
+ * Prints, for every BAR of `pd` with a non-zero size, the guest BAR register
+ * value next to the recorded size, native address and local mapping.
+ */
+void
+showremap(struct pci_ptd *pd)
+{
+       const uint32_t *cfg = pci.pci_devices[pd->id].pd_cfg_space;
+       int bar;
+
+       fprintf(stderr,"================= device %d\n", pd->id);
+       for (bar = 0; bar < MAXBAR; bar++) {
+               if (pd->barinfo[bar].size == 0)
+                       continue;
+               /* BAR registers start at 32-bit word 4 (offset 0x10). */
+               fprintf(stderr,"  Bar%d: %.16x/%.8x -> %.16llx:%p\n",
+                   bar, cfg[bar + 4],
+                   pd->barinfo[bar].size,
+                   pd->barinfo[bar].addr,
+                   pd->barinfo[bar].va);
+       }
+}
+
+/*
+ * remap_io
+ *
+ * Translates offset `reg` within I/O BAR `bar` into two port numbers:
+ * *hport is based on the address recorded in ptd.barinfo[], *gport on the
+ * BAR register in the emulated config space of device `dev` (I/O indicator
+ * bit 0 stripped).
+ */
+void
+remap_io(int dev, int bar, int reg, int *hport, int *gport)
+{
+       uint32_t guest_bar;
+
+       *hport = ptd.barinfo[bar].addr + reg;
+
+       guest_bar = _pcicfgrd32(dev, 0x10 + (bar * 4));
+       *gport = (guest_bar & ~0x1) + reg;
+}
+
+/*
+ * do_pio
+ *
+ * Issues one port I/O access through the VMM_IOC_PIO ioctl on env->vmd_fd.
+ *
+ * Parameters:
+ *  type: unused
+ *  dir: VEI_DIR_OUT writes *data to the port, any other value reads
+ *  port: port number placed in the request
+ *  size: access width in bytes; must be 1, 2 or 4
+ *  data: value to write; on a read the low `size` bytes are replaced by the
+ *      value read and the remaining bytes are preserved
+ *
+ * An unsupported size or a failed ioctl is reported on stderr and leaves
+ * *data untouched.
+ */
+void
+do_pio(int type, int dir, int port, int size, uint32_t *data)
+{
+       struct vm_pio pio;
+       uint64_t mask;
+
+       switch (size) {
+       case 1:
+               mask = 0xff;
+               break;
+       case 2:
+               mask = 0xffff;
+               break;
+       case 4:
+               mask = 0xffffffff;
+               break;
+       default:
+               /* No mask exists for this width; refuse rather than guess. */
+               fprintf(stderr, "pio: unsupported access size %d\n", size);
+               return;
+       }
+
+       /* Zero the request so no stack garbage is handed to the kernel. */
+       memset(&pio, 0, sizeof(pio));
+       pio.dir = dir;
+       pio.size = size;
+       pio.base = port;
+       if (dir == VEI_DIR_OUT)
+               pio.data = *data & mask;
+
+       if (ioctl(env->vmd_fd, VMM_IOC_PIO, &pio) == -1) {
+               fprintf(stderr, "pio: VMM_IOC_PIO failed: %d/%.4x\n", dir,
+                   port);
+               return;
+       }
+
+       if (dir != VEI_DIR_OUT)
+               *data = (*data & ~mask) | (pio.data & mask);
+       fprintf(stderr, "pio: %d/%.4x %.8x\n", dir, port, *data);
+}
+
+/*
+ * ptd_conf_read
+ *
+ * Reads one 32-bit PCI config register of bus:dev:func through the
+ * VMM_IOC_PCIIO ioctl. `reg` is rounded down to a 4-byte boundary.
+ *
+ * Returns the value read. If the ioctl fails the failure is reported on
+ * stderr and 0 is returned (the request is zeroed beforehand).
+ */
+uint32_t
+ptd_conf_read(int bus, int dev, int func, uint32_t reg)
+{
+       struct vm_pciio pio;
+
+       memset(&pio, 0, sizeof(pio));
+       pio.bus = bus;
+       pio.dev = dev;
+       pio.func = func;
+       pio.dir = VEI_DIR_IN;
+       pio.reg = reg & ~0x3;
+       if (ioctl(env->vmd_fd, VMM_IOC_PCIIO, &pio) == -1) {
+               fprintf(stderr, "pci cfg read failed: %d:%d:%d reg %.2x\n",
+                   bus, dev, func, reg);
+       }
+       return pio.val;
+}
+
+/*
+ * ptd_conf_write
+ *
+ * Writes `val` to one 32-bit PCI config register of bus:dev:func through the
+ * VMM_IOC_PCIIO ioctl. `reg` is rounded down to a 4-byte boundary. A failed
+ * ioctl is reported on stderr.
+ */
+void
+ptd_conf_write(int bus, int dev, int func, uint32_t reg, uint32_t val)
+{
+       struct vm_pciio pio;
+
+       memset(&pio, 0, sizeof(pio));
+       pio.bus = bus;
+       pio.dev = dev;
+       pio.func = func;
+       pio.dir = VEI_DIR_OUT;
+       pio.reg = reg & ~0x3;
+       pio.val = val;
+       if (ioctl(env->vmd_fd, VMM_IOC_PCIIO, &pio) == -1) {
+               fprintf(stderr, "pci cfg write failed: %d:%d:%d reg %.2x\n",
+                   bus, dev, func, reg);
+       }
+}
+
+int mem_chkint(void);
+
+/*
+ * mem_chkint
+ *
+ * Queries VMM_IOC_GETINTR for the passthrough device and checks whether the
+ * returned `pending` value differs from the one remembered in ptd.pending.
+ *
+ * Returns the pd_irq of the passthrough device when the value changed (the
+ * new value is then remembered), or 0xff when there is no passthrough
+ * device (ptd.id == 0), the ioctl failed, or nothing changed.
+ */
+int
+mem_chkint(void)
+{
+       struct vm_getintr si;
+       uint8_t intr = 0xff;
+
+       if (ptd.id == 0)
+               return intr;
+
+       memset(&si, 0, sizeof(si));
+       si.bus = ptd.bus;
+       si.dev = ptd.dev;
+       si.func = ptd.fun;
+       /* On failure si.pending is meaningless; do not compare it. */
+       if (ioctl(env->vmd_fd, VMM_IOC_GETINTR, &si) == -1)
+               return intr;
+
+       if (ptd.pending != si.pending) {
+               intr = pci.pci_devices[ptd.id].pd_irq;
+               ptd.pending = si.pending;
+       }
+       return intr;
+}
+
+void io_copy(void *dest, void *src, int size);
+
+/*
+ * io_copy
+ *
+ * Copies exactly `size` bytes from src to dest using a single access of
+ * that width. Only widths of 1, 2, 4 and 8 bytes are handled; any other
+ * size copies nothing.
+ */
+void
+io_copy(void *dest, void *src, int size)
+{
+       switch (size) {
+       case 1:
+               *(uint8_t *)dest = *(uint8_t *)src;
+               break;
+       case 2:
+               *(uint16_t *)dest = *(uint16_t *)src;
+               break;
+       case 4:
+               *(uint32_t *)dest = *(uint32_t *)src;
+               break;
+       case 8:
+               *(uint64_t *)dest = *(uint64_t *)src;
+               break;
+       default:
+               break;
+       }
+}
+
+/*
+ * pci_memh2
+ *
+ * Memory access handler for passthrough BARs: forwards the access to the
+ * local mapping stored in ptd.barinfo[].va.
+ *
+ * Parameters:
+ *  dir: VEI_DIR_IN copies from the mapping into *data, any other value
+ *      copies *data into the mapping
+ *  base: accessed address; the offset within the BAR is taken as
+ *      base & (BAR size - 1), which assumes a power-of-two BAR size
+ *  size: access width in bytes (io_copy() handles 1, 2, 4 and 8)
+ *  data: buffer holding the value to write or receiving the value read
+ *  cookie: encoded with PTD_DEVID(); the low byte is the BAR index
+ *
+ * Returns 0 on success, -1 if the BAR index is out of range or the BAR has
+ * no mapping.
+ */
+int
+pci_memh2(int dir, uint64_t base, uint32_t size, void *data, void *cookie)
+{
+       uint8_t barid = PTD_BAR(cookie);
+       uint64_t off;
+       uint8_t *va;
+
+       /* The cookie is an opaque value; never index barinfo[] blindly. */
+       if (barid >= MAXBAR)
+               return -1;
+
+       va = ptd.barinfo[barid].va;
+       if (va == NULL)
+               return -1;
+
+       off = base & (ptd.barinfo[barid].size - 1);
+       if (dir == VEI_DIR_IN)
+               io_copy(data, va + off, size);
+       else
+               io_copy(va + off, data, size);
+       return 0;
+}
+
 /*
  * pci_add_bar
  *
@@ -58,9 +291,10 @@ const uint8_t pci_pic_irqs[PCI_MAX_PIC_IRQS] = {3, 5, 6, 7, 
9, 10, 11, 12,
  * Returns 0 if the BAR was added successfully, 1 otherwise.
  */
 int
-pci_add_bar(uint8_t id, uint32_t type, void *barfn, void *cookie)
+pci_add_bar(uint8_t id, uint32_t type, uint32_t sz, void *barfn, void *cookie)
 {
        uint8_t bar_reg_idx, bar_ct;
+       uint64_t base = 0;
 
        /* Check id */
        if (id >= pci.pci_dev_ct)
@@ -73,35 +307,54 @@ pci_add_bar(uint8_t id, uint32_t type, void *barfn, void 
*cookie)
 
        /* Compute BAR address and add */
        bar_reg_idx = (PCI_MAPREG_START + (bar_ct * 4)) / 4;
-       if (type == PCI_MAPREG_TYPE_MEM) {
-               if (pci.pci_next_mmio_bar >= VMM_PCI_MMIO_BAR_END)
+       if (type == (PCI_MAPREG_TYPE_MEM | PCI_MAPREG_MEM_TYPE_64BIT)) {
+               if (pci.pci_next_mmio_bar + sz >= VMM_PCI_MMIO_BAR_END)
+                       return (1);
+
+               fprintf(stderr, "Adding 64-bit MMIO\n");
+               // Page align makes easier mapping
+               base = mbar(&pci.pci_next_mmio_bar, sz, 4096);
+               pci.pci_devices[id].pd_cfg_space[bar_reg_idx] = 
+                   PCI_MAPREG_MEM_ADDR(base) | PCI_MAPREG_MEM_TYPE_64BIT;
+               pci.pci_devices[id].pd_barfunc[bar_ct] = barfn;
+               pci.pci_devices[id].pd_bar_cookie[bar_ct] = cookie;
+               pci.pci_devices[id].pd_bartype[bar_ct] = PCI_BAR_TYPE_MMIO;
+               pci.pci_devices[id].pd_barsize[bar_ct] = sz;
+               pci.pci_devices[id].pd_bar_ct++;
+               pci.pci_devices[id].pd_bartype[bar_ct+1] = PCI_BAR_TYPE_MMIO;
+       } else if (type == PCI_MAPREG_TYPE_MEM) {
+               if (pci.pci_next_mmio_bar + sz >= VMM_PCI_MMIO_BAR_END)
                        return (1);
 
-               pci.pci_devices[id].pd_cfg_space[bar_reg_idx] =
-                   PCI_MAPREG_MEM_ADDR(pci.pci_next_mmio_bar);
-               pci.pci_next_mmio_bar += VMM_PCI_MMIO_BAR_SIZE;
+               // Page align makes easier mapping
+               base = mbar(&pci.pci_next_mmio_bar, sz, 4096);
+               pci.pci_devices[id].pd_cfg_space[bar_reg_idx] = 
+                   PCI_MAPREG_MEM_ADDR(base);
                pci.pci_devices[id].pd_barfunc[bar_ct] = barfn;
                pci.pci_devices[id].pd_bar_cookie[bar_ct] = cookie;
                pci.pci_devices[id].pd_bartype[bar_ct] = PCI_BAR_TYPE_MMIO;
-               pci.pci_devices[id].pd_barsize[bar_ct] = VMM_PCI_MMIO_BAR_SIZE;
+               pci.pci_devices[id].pd_barsize[bar_ct] = sz;
                pci.pci_devices[id].pd_bar_ct++;
        } else if (type == PCI_MAPREG_TYPE_IO) {
-               if (pci.pci_next_io_bar >= VMM_PCI_IO_BAR_END)
+               if (pci.pci_next_io_bar + sz >= VMM_PCI_IO_BAR_END)
                        return (1);
 
-               pci.pci_devices[id].pd_cfg_space[bar_reg_idx] =
-                   PCI_MAPREG_IO_ADDR(pci.pci_next_io_bar) |
+               base = mbar(&pci.pci_next_io_bar, sz, 4);
+               pci.pci_devices[id].pd_cfg_space[bar_reg_idx] = 
+                   PCI_MAPREG_IO_ADDR(base) |
                    PCI_MAPREG_TYPE_IO;
-               pci.pci_next_io_bar += VMM_PCI_IO_BAR_SIZE;
                pci.pci_devices[id].pd_barfunc[bar_ct] = barfn;
                pci.pci_devices[id].pd_bar_cookie[bar_ct] = cookie;
                DPRINTF("%s: adding pci bar cookie for dev %d bar %d = %p",
                    __progname, id, bar_ct, cookie);
                pci.pci_devices[id].pd_bartype[bar_ct] = PCI_BAR_TYPE_IO;
-               pci.pci_devices[id].pd_barsize[bar_ct] = VMM_PCI_IO_BAR_SIZE;
+               pci.pci_devices[id].pd_barsize[bar_ct] = sz;
                pci.pci_devices[id].pd_bar_ct++;
        }
 
+       fprintf(stderr, "@@@ PCI_ADDBAR(%d, %d, %x, %x)\n",
+               bar_ct, type, pci.pci_devices[id].pd_cfg_space[bar_reg_idx], 
sz);
+
        return (0);
 }
 
@@ -165,8 +418,10 @@ pci_get_dev_irq(uint8_t id)
 int
 pci_add_device(uint8_t *id, uint16_t vid, uint16_t pid, uint8_t class,
     uint8_t subclass, uint16_t subsys_vid, uint16_t subsys_id,
-    uint8_t irq_needed, pci_cs_fn_t csfunc)
+    uint8_t irq_needed, pci_cs_fn_t csfunc, void *cookie)
 {
+       log_warnx("%s: add_pci: %x.%x.%x\n", __progname, vid, pid, class);
+
        /* Exceeded max devices? */
        if (pci.pci_dev_ct >= PCI_CONFIG_MAX_DEV)
                return (1);
@@ -186,6 +441,7 @@ pci_add_device(uint8_t *id, uint16_t vid, uint16_t pid, 
uint8_t class,
        pci.pci_devices[*id].pd_subsys_id = subsys_id;
 
        pci.pci_devices[*id].pd_csfunc = csfunc;
+       pci.pci_devices[*id].pd_cookie = cookie;
 
        if (irq_needed) {
                pci.pci_devices[*id].pd_irq =
@@ -202,6 +458,173 @@ pci_add_device(uint8_t *id, uint16_t vid, uint16_t pid, 
uint8_t class,
        return (0);
 }
 
+/*
+ * mbar
+ *
+ * Allocates an aligned BAR address from a bump allocator.
+ *
+ * Parameters:
+ *  base: in/out cursor; on return it points just past the allocated region
+ *  size: BAR size in bytes (expected to be a power of two)
+ *  align: minimum alignment in bytes (expected to be a power of two)
+ *
+ * The region is aligned to the larger of `size` and `align`, so a BAR
+ * smaller than `align` still honours the requested alignment.
+ *
+ * Returns the start address of the allocated region.
+ */
+uint64_t
+mbar(uint64_t *base, uint32_t size, uint32_t align)
+{
+       uint64_t mask, cbase;
+
+       mask = (size > align) ? size : align;
+       if (mask == 0)
+               mask = 1;       /* avoid wrapping to an all-ones mask */
+       mask--;
+
+       cbase = (*base + mask) & ~mask;
+       *base = cbase + size;
+       fprintf(stderr,"make mbar: %llx/%x\n", (unsigned long long)cbase, size);
+       return cbase;
+}
+
+void pci_add_pthru(struct vmd_vm *vm, int bus, int dev, int fun);
+
+#define PCIOCUNBIND    _IOWR('p', 9, struct pcisel)
+
+int ppt_csfn(int dir, uint8_t reg, uint8_t sz, uint32_t *data, void *cookie);
+int ppt_iobar(int dir, uint16_t reg, uint32_t *data, uint8_t *intr, void 
*cookie, uint8_t size);
+int ppt_mmiobar(int dir, uint32_t ofs, uint32_t *data);
+
+/* Only certain PCI values are writeable and most are already cached
+ * 00h vendor   : ro
+ * 02h device   : ro
+ * 04h command  : rw
+ * 06h status   : rw
+ * 08h revision : ro
+ * 09h class    : ro
+ * 0ah subclass : ro
+ * 0bh interface: ro
+ * 0ch
+ * 0dh
+ * 0eh hdr type : ro
+ * 0fh
+ * 10h bar      : rw
+ * 14h bar      : rw
+ * 18h bar      : rw
+ * 1ch bar      : rw
+ * 20h bar      : rw
+ * 24h bar      : rw
+ */
+/*
+ * ppt_csfn
+ *
+ * Config space callback for the passthrough device. It only logs the access
+ * (direction, register, size, data) to stderr and reports success; the
+ * cookie is not used.
+ *
+ * Returns 0.
+ */
+int
+ppt_csfn(int dir, uint8_t reg, uint8_t sz, uint32_t *data, void *cookie)
+{
+       /* Not dereferenced: logging must not depend on a non-NULL cookie. */
+       (void)cookie;
+
+       fprintf(stderr, "@pciio: %c:%.2x %d %.8x\n",
+           dir == VEI_DIR_IN ? 'r' : 'w', reg, sz, *data);
+       return 0;
+}
+
+/*
+ * ppt_iobar
+ *
+ * I/O BAR callback for the passthrough device. Decodes the device and BAR
+ * index from the cookie (see PTD_DEVID), translates the register offset
+ * with remap_io() and performs the access with do_pio(). *intr is always
+ * set to 0xFF.
+ *
+ * Returns 0.
+ */
+int
+ppt_iobar(int dir, uint16_t reg, uint32_t *data, uint8_t *intr, void *cookie,
+    uint8_t size)
+{
+       int host_port, guest_port;
+       uint8_t devno, bar;
+
+       devno = PTD_DEV(cookie);
+       bar = PTD_BAR(cookie);
+
+       *intr = 0xFF;
+       remap_io(devno, bar, reg, &host_port, &guest_port);
+       do_pio(1, dir, host_port, size, data);
+
+       return 0;
+}
+
+/*
+ * ppt_mmiobar
+ *
+ * MMIO BAR callback for the passthrough device. It only logs the direction
+ * and offset to stderr; `data` is not touched.
+ *
+ * Returns 0.
+ */
+int
+ppt_mmiobar(int dir, uint32_t ofs, uint32_t *data)
+{
+       int rv = 0;
+
+       fprintf(stderr,"mmiobar: %d.%x\n", dir, ofs);
+
+       return rv;
+}
+
+/*
+ * dump
+ *
+ * Writes a hex/ASCII dump of `len` bytes starting at `ptr` to stderr,
+ * 16 bytes per row. Bytes below ' ' or above 'z' are shown as '.' in the
+ * ASCII column. A short final row is padded in the hex column so the
+ * ASCII column stays aligned; no byte at or beyond `len` is read.
+ */
+void
+dump(void *ptr, int len)
+{
+       const unsigned char *b = ptr;
+       int i, j, c;
+
+       for (i = 0; i < len; i += 16) {
+               fprintf(stderr,"%.4x ", i);
+               for (j = 0; j < 16; j++) {
+                       if (i + j < len)
+                               fprintf(stderr,"%.2x ", b[i + j]);
+                       else
+                               fprintf(stderr,"   ");
+               }
+               fprintf(stderr,"  ");
+               for (j = 0; j < 16 && i + j < len; j++) {
+                       c = b[i + j];
+                       if (c < ' ' || c > 'z')
+                               c = '.';
+                       fprintf(stderr,"%c", c);
+               }
+               fprintf(stderr,"\n");
+       }
+}
+
+/*
+ * pci_add_pthru
+ *
+ * Adds the native PCI device at bus:dev:fun to the emulated PCI bus as the
+ * passthrough device, recording its state in the global `ptd`.
+ *
+ * The function reads the device's identification registers, clears the I/O
+ * and memory enable bits in its command register, drops any mappings left
+ * from a previous passthrough device, registers a new emulated device, and
+ * then mirrors each BAR reported by VMM_IOC_BARINFO: memory BARs are added
+ * and mapped with mapbar(), I/O BARs are added with ppt_iobar as handler,
+ * and empty BARs only advance the emulated device's BAR counter.
+ *
+ * Parameters:
+ *  vm: unused
+ *  bus, dev, fun: location of the native device
+ *
+ * Failures are reported on stderr; the function returns without adding
+ * (further) BARs.
+ */
+void
+pci_add_pthru(struct vmd_vm *vm, int bus, int dev, int fun)
+{
+       struct vm_barinfo bif;
+       uint32_t id_reg, subid_reg, class_reg, cmd_reg, intr_reg;
+       int i, rc;
+
+       /* Read PCI config space */
+       id_reg = ptd_conf_read(bus, dev, fun, PCI_ID_REG);
+       if (PCI_VENDOR(id_reg) == PCI_VENDOR_INVALID ||
+           PCI_VENDOR(id_reg) == 0x0000) {
+               fprintf(stderr, "Error: No PCI device @ %u:%u:%u\n", bus, dev,
+                   fun);
+               return;
+       }
+       subid_reg = ptd_conf_read(bus, dev, fun, PCI_SUBSYS_ID_REG);
+       class_reg = ptd_conf_read(bus, dev, fun, PCI_CLASS_REG);
+       cmd_reg = ptd_conf_read(bus, dev, fun, PCI_COMMAND_STATUS_REG);
+       intr_reg = ptd_conf_read(bus, dev, fun, PCI_INTERRUPT_REG);
+
+       fprintf(stderr, "intr: pin:%.2x line:%.2x\n",
+               PCI_INTERRUPT_PIN(intr_reg), PCI_INTERRUPT_LINE(intr_reg));
+       ptd_conf_write(bus, dev, fun, 0x4, cmd_reg &
+           ~(PCI_COMMAND_IO_ENABLE|PCI_COMMAND_MEM_ENABLE));
+
+       /* Unregister previous VMM */
+       for (i = 0; i < MAXBAR; i++) {
+               if (ptd.barinfo[i].va) {
+                       unmapbar(ptd.barinfo[i].va, ptd.barinfo[i].size);
+                       /* Do not leave a pointer to unmapped memory behind. */
+                       ptd.barinfo[i].va = NULL;
+               }
+       }
+       ptd.bus = bus;
+       ptd.dev = dev;
+       ptd.fun = fun;
+       if (pci_add_device(&ptd.id, PCI_VENDOR(id_reg), PCI_PRODUCT(id_reg),
+           PCI_CLASS(class_reg), PCI_SUBCLASS(class_reg),
+           PCI_VENDOR(subid_reg), PCI_PRODUCT(subid_reg),
+           1, NULL, &ptd)) {
+               /* ptd.id was not assigned; nothing below may index with it. */
+               fprintf(stderr, "%d:%d:%d can't add passthrough device\n",
+                   bus, dev, fun);
+               return;
+       }
+
+       /* Get BARs of native device */
+       bif.seg = 0;
+       bif.bus = bus;
+       bif.dev = dev;
+       bif.func = fun;
+       rc = ioctl(env->vmd_fd, VMM_IOC_BARINFO,  &bif);
+       if (rc != 0) {
+               fprintf(stderr, "%d:%d:%d not valid pci device\n", bus, dev,
+                   fun);
+               return;
+       }
+       for (i = 0; i < MAXBAR; i++) {
+               int type;
+
+               type = bif.bars[i].type;
+               ptd.barinfo[i].type = bif.bars[i].type;
+               ptd.barinfo[i].size = bif.bars[i].size;
+               ptd.barinfo[i].addr = bif.bars[i].addr;
+
+               fprintf(stderr," Bar%d: type:%x base:%llx size:%x\n",
+                       i, ptd.barinfo[i].type, ptd.barinfo[i].addr,
+                       ptd.barinfo[i].size);
+               if (!ptd.barinfo[i].size) {
+                       /* Kick bar index */
+                       pci.pci_devices[ptd.id].pd_bar_ct++;
+               }
+               else if (PCI_MAPREG_TYPE(type) == PCI_MAPREG_TYPE_MEM) {
+                       if (pci_add_bar(ptd.id, type, ptd.barinfo[i].size,
+                           ppt_mmiobar, PTD_DEVID(ptd.id, i))) {
+                               fprintf(stderr, " Bar%d: can't add mem bar\n",
+                                   i);
+                               continue;
+                       }
+                       ptd.barinfo[i].va = mapbar(i, ptd.barinfo[i].addr,
+                           ptd.barinfo[i].size);
+               }
+               else if (PCI_MAPREG_TYPE(type) == PCI_MAPREG_TYPE_IO) {
+                       /* This will get callback via pci_handle_io */
+                       if (pci_add_bar(ptd.id, PCI_MAPREG_TYPE_IO,
+                           ptd.barinfo[i].size,
+                           ppt_iobar, PTD_DEVID(ptd.id, i))) {
+                               fprintf(stderr, " Bar%d: can't add io bar\n",
+                                   i);
+                       }
+               }
+       }
+       showremap(&ptd);
+}
+
 /*
  * pci_init
  *
@@ -219,7 +642,7 @@ pci_init(void)
 
        if (pci_add_device(&id, PCI_VENDOR_OPENBSD, PCI_PRODUCT_OPENBSD_PCHB,
            PCI_CLASS_BRIDGE, PCI_SUBCLASS_BRIDGE_HOST,
-           PCI_VENDOR_OPENBSD, 0, 0, NULL)) {
+           PCI_VENDOR_OPENBSD, 0, 0, NULL, NULL)) {
                log_warnx("%s: can't add PCI host bridge", __progname);
                return;
        }
@@ -264,6 +687,8 @@ pci_handle_io(struct vm_run_params *vrp)
 
        for (i = 0 ; i < pci.pci_dev_ct ; i++) {
                for (j = 0 ; j < pci.pci_devices[i].pd_bar_ct; j++) {
+                       if (pci.pci_devices[i].pd_bartype[j] != PCI_BAR_TYPE_IO)
+                               continue;
                        b_lo = PCI_MAPREG_IO_ADDR(pci.pci_devices[i].pd_bar[j]);
                        b_hi = b_lo + VMM_PCI_IO_BAR_SIZE;
                        if (reg >= b_lo && reg < b_hi) {
@@ -286,7 +711,7 @@ pci_handle_io(struct vm_run_params *vrp)
                            __progname);
                }
        } else {
-               DPRINTF("%s: no pci i/o function for reg 0x%llx (dir=%d "
+               fprintf(stderr,"%s: no pci i/o function for reg 0x%llx (dir=%d "
                    "guest %%rip=0x%llx", __progname, (uint64_t)reg, dir,
                    vei->vrs.vrs_gprs[VCPU_REGS_RIP]);
                /* Reads from undefined ports return 0xFF */
@@ -306,6 +731,8 @@ pci_handle_data_reg(struct vm_run_params *vrp)
 {
        struct vm_exit *vei = vrp->vrp_exit;
        uint8_t b, d, f, o, baridx, ofs, sz;
+       uint32_t data, barval, barsize, bartype;
+       uint64_t wrdata;
        int ret;
        pci_cs_fn_t csfunc;
 
@@ -328,9 +755,27 @@ pci_handle_data_reg(struct vm_run_params *vrp)
        f = (pci.pci_addr_reg >> 8) & 0x7;
        o = (pci.pci_addr_reg & 0xfc);
 
+       if ((o == 0x04 || o == 0x08 || o == 0x34 || o >= 0x40) && (d == ptd.id 
&& d > 0)) {
+               /* Passthrough PCI Cfg Space */
+               if (vei->vei.vei_dir == VEI_DIR_IN) {
+                       data = ptd_conf_read(ptd.bus, ptd.dev, ptd.fun, o);
+                       _pcicfgwr32(d, o, data);
+               }
+               else {
+                       data = vei->vei.vei_data;
+                       ptd_conf_write(ptd.bus, ptd.dev, ptd.fun, o, data);
+               }
+       }
+
+       wrdata = vei->vei.vei_data;
+       data = 0;
+       if (d < pci.pci_dev_ct && !b && !f) {
+               data = _pcicfgrd32(d, o);
+       }
+       
        csfunc = pci.pci_devices[d].pd_csfunc;
        if (csfunc != NULL) {
-               ret = csfunc(vei->vei.vei_dir, (o / 4), &vei->vei.vei_data);
+               ret = csfunc(vei->vei.vei_dir, o, sz, &vei->vei.vei_data, 
pci.pci_devices[d].pd_cookie);
                if (ret)
                        log_warnx("cfg space access function failed for "
                            "pci device %d", d);
@@ -340,39 +785,46 @@ pci_handle_data_reg(struct vm_run_params *vrp)
        /* No config space function, fallback to default simple r/w impl. */
 
        o += ofs;
-
+       
        /*
         * vei_dir == VEI_DIR_OUT : out instruction
         *
         * The guest wrote to the config space location denoted by the current
         * value in the address register.
         */
+       if (o >= 0x10 && o <= 0x24 && 0) {
+               fprintf(stderr, "%d %.2x bar: %c %.8x\n", 
+                       d, o, (vei->vei.vei_dir == VEI_DIR_OUT) ? 'w' : 'r',
+                       (vei->vei.vei_dir == VEI_DIR_OUT) ? wrdata : data);
+       }                       
        if (vei->vei.vei_dir == VEI_DIR_OUT) {
-               if ((o >= 0x10 && o <= 0x24) &&
-                   vei->vei.vei_data == 0xffffffff) {
-                       /*
-                        * Compute BAR index:
-                        * o = 0x10 -> baridx = 0
-                        * o = 0x14 -> baridx = 1
-                        * o = 0x18 -> baridx = 2
-                        * o = 0x1c -> baridx = 3
-                        * o = 0x20 -> baridx = 4
-                        * o = 0x24 -> baridx = 5
-                        */
-                       baridx = (o / 4) - 4;
-                       if (baridx < pci.pci_devices[d].pd_bar_ct)
-                               vei->vei.vei_data = 0xfffff000;
-                       else
-                               vei->vei.vei_data = 0;
-               }
-
-               /* IOBAR registers must have bit 0 set */
                if (o >= 0x10 && o <= 0x24) {
-                       baridx = (o / 4) - 4;
-                       if (baridx < pci.pci_devices[d].pd_bar_ct &&
-                           pci.pci_devices[d].pd_bartype[baridx] ==
-                           PCI_BAR_TYPE_IO)
-                               vei->vei.vei_data |= 1;
+                       baridx = (o - 0x10) / 4;
+                       barval = pci.pci_devices[d].pd_cfg_space[o/4];
+                       barsize = pci.pci_devices[d].pd_barsize[baridx];
+                       bartype = pci.pci_devices[d].pd_bartype[baridx];
+
+                       if (barsize) {  
+                               /* Mask off invalid bits */
+                               wrdata &= ~(barsize - 1);
+                               if (bartype == PCI_BAR_TYPE_IO)
+                                       wrdata |= (barval & 0x1);
+                               else {
+                                       /* Modify memory handler */
+                                       unregister_mem(barval & ~0xF);
+                                       register_mem(wrdata, barsize, 
pci_memh2, 
+                                                       PTD_DEVID(d, baridx));  
+                                       wrdata |= (barval & 0xF);
+                               }
+                       }
+                       else if (bartype != PCI_BAR_TYPE_MMIO)
+                               wrdata = 0;
+#if 0
+                       fprintf(stderr, "%d %.2x val:%.8x/%.8llx new:%.8x 
[%.8x] ip:%.16llx\n", 
+                               d, o, barval, barsize, wrdata, 
vei->vei.vei_data,
+                               vei->vrs.vrs_gprs[VCPU_REGS_RIP]);
+#endif
+                       vei->vei.vei_data = wrdata;
                }
 
                /*
@@ -381,8 +833,7 @@ pci_handle_data_reg(struct vm_run_params *vrp)
                 * writes and copy data to config space registers.
                 */
                if (o != PCI_EXROMADDR_0)
-                       get_input_data(vei,
-                           &pci.pci_devices[d].pd_cfg_space[o / 4]);
+                       get_input_data(vei, 
&pci.pci_devices[d].pd_cfg_space[o/4]);
        } else {
                /*
                 * vei_dir == VEI_DIR_IN : in instruction
@@ -395,20 +846,16 @@ pci_handle_data_reg(struct vm_run_params *vrp)
                else {
                        switch (sz) {
                        case 4:
-                               set_return_data(vei,
-                                   pci.pci_devices[d].pd_cfg_space[o / 4]);
+                               set_return_data(vei, data);
                                break;
                        case 2:
                                if (ofs == 0)
-                                       set_return_data(vei, pci.pci_devices[d].
-                                           pd_cfg_space[o / 4]);
+                                       set_return_data(vei, data);
                                else
-                                       set_return_data(vei, pci.pci_devices[d].
-                                           pd_cfg_space[o / 4] >> 16);
+                                       set_return_data(vei, data >> 16);
                                break;
                        case 1:
-                               set_return_data(vei, pci.pci_devices[d].
-                                   pd_cfg_space[o / 4] >> (ofs * 8));
+                               set_return_data(vei, data >> (ofs * 8));
                                break;
                        }
                }
diff --git a/usr.sbin/vmd/pci.h b/usr.sbin/vmd/pci.h
index 01902d77d..9cd624779 100644
--- a/usr.sbin/vmd/pci.h
+++ b/usr.sbin/vmd/pci.h
@@ -27,15 +27,16 @@
 
 #define PCI_MAX_PIC_IRQS       10
 
-typedef int (*pci_cs_fn_t)(int dir, uint8_t reg, uint32_t *data);
+typedef int (*pci_cs_fn_t)(int dir, uint8_t reg, uint8_t sz, uint32_t *data, 
void *cookie);
 typedef int (*pci_iobar_fn_t)(int dir, uint16_t reg, uint32_t *data, uint8_t *,
     void *, uint8_t);
 typedef int (*pci_mmiobar_fn_t)(int dir, uint32_t ofs, uint32_t *data);
 
-union pci_dev {
-       uint32_t pd_cfg_space[PCI_CONFIG_SPACE_SIZE / 4];
+struct pci_dev {
+       union {
+               uint32_t pd_cfg_space[PCI_CONFIG_SPACE_SIZE / 4];
 
-       struct {
+               struct {
                uint16_t pd_vid;
                uint16_t pd_did;
                uint16_t pd_cmd;
@@ -60,17 +61,20 @@ union pci_dev {
                uint8_t pd_int;
                uint8_t pd_min_grant;
                uint8_t pd_max_grant;
+               } __packed;
+       };
+       uint8_t pd_bar_ct;
+       pci_cs_fn_t pd_csfunc;
 
-               uint8_t pd_bar_ct;
-               pci_cs_fn_t pd_csfunc;
-
-               uint8_t pd_bartype[PCI_MAX_BARS];
-               uint32_t pd_barsize[PCI_MAX_BARS];
-               void *pd_barfunc[PCI_MAX_BARS];
-               void *pd_bar_cookie[PCI_MAX_BARS];
-       } __packed;
+       uint8_t pd_bartype[PCI_MAX_BARS];
+       uint32_t pd_barsize[PCI_MAX_BARS];
+       void *pd_barfunc[PCI_MAX_BARS];
+       void *pd_bar_cookie[PCI_MAX_BARS];
+       void *pd_cookie;
 };
 
+typedef struct pci_dev pcidev_t;
+
 struct pci {
        uint8_t pci_dev_ct;
        uint64_t pci_next_mmio_bar;
@@ -79,7 +83,7 @@ struct pci {
        uint32_t pci_addr_reg;
        uint32_t pci_data_reg;
 
-       union pci_dev pci_devices[PCI_CONFIG_MAX_DEV];
+       pcidev_t pci_devices[PCI_CONFIG_MAX_DEV];
 };
 
 void pci_handle_address_reg(struct vm_run_params *);
@@ -87,8 +91,8 @@ void pci_handle_data_reg(struct vm_run_params *);
 uint8_t pci_handle_io(struct vm_run_params *);
 void pci_init(void);
 int pci_add_device(uint8_t *, uint16_t, uint16_t, uint8_t, uint8_t, uint16_t,
-    uint16_t, uint8_t, pci_cs_fn_t);
-int pci_add_bar(uint8_t, uint32_t, void *, void *);
+    uint16_t, uint8_t, pci_cs_fn_t, void *);
+int pci_add_bar(uint8_t, uint32_t, uint32_t, void *, void *);
 int pci_set_bar_fn(uint8_t, uint8_t, void *, void *);
 uint8_t pci_get_dev_irq(uint8_t);
 int pci_dump(int);
diff --git a/usr.sbin/vmd/virtio.c b/usr.sbin/vmd/virtio.c
index 8800594fc..430f41995 100644
--- a/usr.sbin/vmd/virtio.c
+++ b/usr.sbin/vmd/virtio.c
@@ -1797,13 +1797,13 @@ virtio_init(struct vmd_vm *vm, int child_cdrom,
            PCI_PRODUCT_QUMRANET_VIO_RNG, PCI_CLASS_SYSTEM,
            PCI_SUBCLASS_SYSTEM_MISC,
            PCI_VENDOR_OPENBSD,
-           PCI_PRODUCT_VIRTIO_ENTROPY, 1, NULL)) {
+           PCI_PRODUCT_VIRTIO_ENTROPY, 1, NULL, NULL)) {
                log_warnx("%s: can't add PCI virtio rng device",
                    __progname);
                return;
        }
 
-       if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_rnd_io, NULL)) {
+       if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, VMM_PCI_IO_BAR_SIZE, 
virtio_rnd_io, NULL)) {
                log_warnx("%s: can't add bar for virtio rng device",
                    __progname);
                return;
@@ -1835,14 +1835,14 @@ virtio_init(struct vmd_vm *vm, int child_cdrom,
                            PCI_PRODUCT_QUMRANET_VIO_NET, PCI_CLASS_SYSTEM,
                            PCI_SUBCLASS_SYSTEM_MISC,
                            PCI_VENDOR_OPENBSD,
-                           PCI_PRODUCT_VIRTIO_NETWORK, 1, NULL)) {
+                           PCI_PRODUCT_VIRTIO_NETWORK, 1, NULL, NULL)) {
                                log_warnx("%s: can't add PCI virtio net device",
                                    __progname);
                                return;
                        }
 
-                       if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_net_io,
-                           &vionet[i])) {
+                       if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, 
VMM_PCI_IO_BAR_SIZE,
+                           virtio_net_io, &vionet[i])) {
                                log_warnx("%s: can't add bar for virtio net "
                                    "device", __progname);
                                return;
@@ -1923,13 +1923,13 @@ virtio_init(struct vmd_vm *vm, int child_cdrom,
                            PCI_CLASS_MASS_STORAGE,
                            PCI_SUBCLASS_MASS_STORAGE_SCSI,
                            PCI_VENDOR_OPENBSD,
-                           PCI_PRODUCT_VIRTIO_BLOCK, 1, NULL)) {
+                           PCI_PRODUCT_VIRTIO_BLOCK, 1, NULL, NULL)) {
                                log_warnx("%s: can't add PCI virtio block "
                                    "device", __progname);
                                return;
                        }
-                       if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_blk_io,
-                           &vioblk[i])) {
+                       if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, 
VMM_PCI_IO_BAR_SIZE,
+                           virtio_blk_io, &vioblk[i])) {
                                log_warnx("%s: can't add bar for virtio block "
                                    "device", __progname);
                                return;
@@ -1971,13 +1971,14 @@ virtio_init(struct vmd_vm *vm, int child_cdrom,
                    PCI_CLASS_MASS_STORAGE,
                    PCI_SUBCLASS_MASS_STORAGE_SCSI,
                    PCI_VENDOR_OPENBSD,
-                   PCI_PRODUCT_VIRTIO_SCSI, 1, NULL)) {
+                   PCI_PRODUCT_VIRTIO_SCSI, 1, NULL, NULL)) {
                        log_warnx("%s: can't add PCI vioscsi device",
                            __progname);
                        return;
                }
 
-               if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, vioscsi_io, vioscsi)) {
+               if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, VMM_PCI_IO_BAR_SIZE,
+                   vioscsi_io, vioscsi)) {
                        log_warnx("%s: can't add bar for vioscsi device",
                            __progname);
                        return;
@@ -2013,13 +2014,13 @@ virtio_init(struct vmd_vm *vm, int child_cdrom,
            PCI_CLASS_COMMUNICATIONS,
            PCI_SUBCLASS_COMMUNICATIONS_MISC,
            PCI_VENDOR_OPENBSD,
-           PCI_PRODUCT_VIRTIO_VMMCI, 1, NULL)) {
+           PCI_PRODUCT_VIRTIO_VMMCI, 1, NULL, NULL)) {
                log_warnx("%s: can't add PCI vmm control device",
                    __progname);
                return;
        }
 
-       if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, vmmci_io, NULL)) {
+       if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, VMM_PCI_IO_BAR_SIZE, vmmci_io, 
NULL)) {
                log_warnx("%s: can't add bar for vmm control device",
                    __progname);
                return;
diff --git a/usr.sbin/vmd/vm.c b/usr.sbin/vmd/vm.c
index a9fcce4fa..7284b9927 100644
--- a/usr.sbin/vmd/vm.c
+++ b/usr.sbin/vmd/vm.c
@@ -24,6 +24,7 @@
 #include <sys/socket.h>
 #include <sys/time.h>
 #include <sys/mman.h>
+#include <sys/queue.h>
 
 #include <dev/ic/i8253reg.h>
 #include <dev/isa/isareg.h>
@@ -63,6 +64,9 @@
 #include "mc146818.h"
 #include "fw_cfg.h"
 #include "atomicio.h"
+#include "x86emu.h"
+
+#define pledge(x...) 0 /* NOTE(review): replaces every pledge() call below this line in vm.c with the constant 0, i.e. no pledge is applied; debugging aid, should not be committed */
 
 io_fn_t ioports_map[MAX_PORTS];
 
@@ -289,6 +293,7 @@ start_vm(struct vmd_vm *vm, int fd)
        if (!(vm->vm_state & VM_STATE_RECEIVED))
                create_memory_map(vcp);
 
+       fprintf(stderr, "---- alloc guest mem\n");
        ret = alloc_guest_mem(vcp);
 
        if (ret) {
@@ -857,6 +862,7 @@ create_memory_map(struct vm_create_params *vcp)
 {
        size_t len, mem_bytes, mem_mb;
 
+       fprintf(stderr, "create memory map\n");
        mem_mb = vcp->vcp_memranges[0].vmr_size;
        vcp->vcp_nmemranges = 0;
        if (mem_mb < 1 || mem_mb > VMM_MAX_VM_MEM_SIZE)
@@ -946,7 +952,7 @@ alloc_guest_mem(struct vm_create_params *vcp)
 
                        return (ret);
                }
-
+               memset(p, 0, vmr->vmr_size);
                vmr->vmr_va = (vaddr_t)p;
        }
 
@@ -991,6 +997,7 @@ vmm_create_vm(struct vm_create_params *vcp)
        return (0);
 }
 
+void pci_add_pthru(struct vmd_vm *vm, int bus, int dev, int fun);
 /*
  * init_emulated_hw
  *
@@ -1001,7 +1008,7 @@ init_emulated_hw(struct vmop_create_params *vmc, int 
child_cdrom,
     int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
 {
        struct vm_create_params *vcp = &vmc->vmc_params;
-       int i;
+       size_t i;
        uint64_t memlo, memhi;
 
        /* Calculate memory size for NVRAM registers */
@@ -1062,6 +1069,13 @@ init_emulated_hw(struct vmop_create_params *vmc, int 
child_cdrom,
 
        /* Initialize virtio devices */
        virtio_init(current_vm, child_cdrom, child_disks, child_taps);
+
+       for (i = 0; i < vcp->vcp_npcis; i++) {
+               int bus = (vcp->vcp_pcis[i] >> 8);
+               int dev = (vcp->vcp_pcis[i] >> 3) & 0x1F;
+               int fun = (vcp->vcp_pcis[i] >> 0) & 0x7;
+               pci_add_pthru(current_vm, bus, dev, fun);
+       }
 }
 /*
  * restore_emulated_hw
@@ -1584,13 +1598,69 @@ vcpu_exit_inout(struct vm_run_params *vrp)
 
        if (ioports_map[vei->vei.vei_port] != NULL)
                intr = ioports_map[vei->vei.vei_port](vrp);
-       else if (vei->vei.vei_dir == VEI_DIR_IN)
-                       set_return_data(vei, 0xFFFFFFFF);
-
+       else if (vei->vei.vei_dir == VEI_DIR_IN) {
+               fprintf(stderr, "exit_inout: %x\n", vei->vei.vei_port);
+               set_return_data(vei, 0xFFFFFFFF);
+       }
        if (intr != 0xFF)
                vcpu_assert_pic_irq(vrp->vrp_vm_id, vrp->vrp_vcpu_id, intr);
 }
 
+TAILQ_HEAD(,iohandler) memh = TAILQ_HEAD_INITIALIZER(memh);
+
+/*
+ * register_mem
+ *
+ * Appends a memory region to the 'memh' handler list so that mem_handler()
+ * can dispatch accesses within [base, base + len - 1] to 'handler'.
+ *
+ * Parameters:
+ *  base: first address of the region
+ *  len: length of the region in bytes; a zero length is ignored
+ *  handler: callback stored for the region
+ *  cookie: opaque pointer stored alongside the callback
+ *
+ * The request is dropped if the region lies entirely inside an already
+ * registered region, or if the list entry cannot be allocated.
+ */
+void
+register_mem(uint64_t base, uint32_t len, iocb_t handler, void *cookie)
+{
+       struct iohandler *mem;
+       uint64_t last;
+
+       /* An empty region would make 'end' wrap below 'start'. */
+       if (len == 0)
+               return;
+
+       /* 'end' is stored inclusive, so compare against the last byte. */
+       last = base + len - 1;
+       TAILQ_FOREACH(mem, &memh, next) {
+               if (base >= mem->start && last <= mem->end) {
+                       fprintf(stderr,"already registered\n");
+                       return;
+               }
+       }
+
+       mem = calloc(1, sizeof(*mem));
+       if (mem == NULL) {
+               log_warn("%s: calloc", __func__);
+               return;
+       }
+       mem->start = base;
+       mem->end = last;
+       mem->handler = handler;
+       mem->cookie = cookie;
+       TAILQ_INSERT_TAIL(&memh, mem, next);
+}
+
+/*
+ * unregister_mem
+ *
+ * Removes and frees every entry on the 'memh' list whose start address
+ * equals 'base', printing each removed range to stderr.
+ *
+ * Parameters:
+ *  base: start address of the region(s) to remove
+ */
+void
+unregister_mem(uint64_t base)
+{
+       struct iohandler *cur, *following;
+
+       for (cur = TAILQ_FIRST(&memh); cur != NULL; cur = following) {
+               /* Fetch the successor first: 'cur' may be freed below. */
+               following = TAILQ_NEXT(cur, next);
+               if (cur->start != base)
+                       continue;
+               fprintf(stderr, "  removed:%llx-%llx\n", cur->start, cur->end);
+               TAILQ_REMOVE(&memh, cur, next);
+               free(cur);
+       }
+}
+
+/*
+ * mem_handler
+ *
+ * Dispatches a memory access to the callback of the first registered
+ * region on 'memh' that fully contains it.
+ *
+ * Parameters:
+ *  dir: access direction, passed through to the callback
+ *  addr: address of the access
+ *  size: width of the access in bytes
+ *  data: data pointer, passed through to the callback
+ *
+ * Return values:
+ *  the callback's return value if a containing region was found
+ *  -1: no registered region contains the access
+ */
+int
+mem_handler(int dir, uint64_t addr, uint32_t size, void *data)
+{
+       struct iohandler *mem;
+       uint64_t last;
+       int rc;
+
+       /*
+        * Region ends are inclusive (register_mem stores base+len-1), so
+        * the bound check must use the last byte touched, not addr+size;
+        * otherwise accesses reaching the final byte of a region are lost.
+        */
+       last = (size != 0) ? addr + size - 1 : addr;
+       TAILQ_FOREACH(mem, &memh, next) {
+               if (addr < mem->start || last > mem->end)
+                       continue;
+               rc = mem->handler(dir, addr, size, data, mem->cookie);
+               if (rc != 0)
+                       fprintf(stderr, "Error mem handler: %llx\n", addr);
+               return rc;
+       }
+       return -1;
+}
+
 /*
  * vcpu_exit_eptviolation
  *
@@ -1604,10 +1674,66 @@ vcpu_exit_inout(struct vm_run_params *vrp)
  *  0: no action required
  *  EAGAIN: a protection fault occured, kill the vm.
  */
+
+extern int mem_chkint(void);
+
 int
 vcpu_exit_eptviolation(struct vm_run_params *vrp)
 {
        struct vm_exit *ve = vrp->vrp_exit;
+       uint64_t gip, gpa;
+       uint8_t instr[16] = { 0 };
+       struct vm_rwregs_params vrwp = { 0 };
+       uint64_t *rax;
+       struct insn ix;
+
+       translate_gva(ve, ve->vrs.vrs_gprs[VCPU_REGS_RIP], &gip, PROT_READ);
+       read_mem(gip, instr, sizeof(instr));    
+       fprintf(stderr, "===============\nept violation: %llx  rip:0x%llx %.2x 
%.2x %.2x %.2x %.2x\n",
+               ve->vee.vee_gpa, ve->vrs.vrs_gprs[VCPU_REGS_RIP], instr[0], 
instr[1], instr[2],
+               instr[3], instr[4]);
+       fprintf(stderr, "  rax:0x%.16llx rbx:0x%.16llx rcx:0x%.16llx 
rdx:0x%.16llx\n",
+               ve->vrs.vrs_gprs[VCPU_REGS_RAX],
+               ve->vrs.vrs_gprs[VCPU_REGS_RBX],
+               ve->vrs.vrs_gprs[VCPU_REGS_RCX],
+               ve->vrs.vrs_gprs[VCPU_REGS_RDX]);
+       fprintf(stderr, "  rsi:0x%.16llx rdi:0x%.16llx rbp:0x%.16llx 
rsp:0x%.16llx\n",
+               ve->vrs.vrs_gprs[VCPU_REGS_RSI],
+               ve->vrs.vrs_gprs[VCPU_REGS_RDI],
+               ve->vrs.vrs_gprs[VCPU_REGS_RBP],
+               ve->vrs.vrs_gprs[VCPU_REGS_RSP]);
+       fprintf(stderr, "  r8: 0x%.16llx r9: 0x%.16llx r10:0x%.16llx 
r11:0x%.16llx\n",
+               ve->vrs.vrs_gprs[VCPU_REGS_R8],
+               ve->vrs.vrs_gprs[VCPU_REGS_R9],
+               ve->vrs.vrs_gprs[VCPU_REGS_R10],
+               ve->vrs.vrs_gprs[VCPU_REGS_R11]);
+       fprintf(stderr, "  r12:0x%.16llx r13:0x%.16llx r14:0x%.16llx 
r15:0x%.16llx\n",
+               ve->vrs.vrs_gprs[VCPU_REGS_R12],
+               ve->vrs.vrs_gprs[VCPU_REGS_R13],
+               ve->vrs.vrs_gprs[VCPU_REGS_R14],
+               ve->vrs.vrs_gprs[VCPU_REGS_R15]);
+
+       vrwp.vrwp_mask = VM_RWREGS_GPRS;
+       vrwp.vrwp_vm_id = vrp->vrp_vm_id;
+       vrwp.vrwp_vcpu_id = vrp->vrp_vcpu_id;
+       vrwp.vrwp_regs = ve->vrs;
+       gpa = ve->vee.vee_gpa;
+
+       memset(&ix, 0, sizeof(ix));
+       dodis(instr, &ix, ve->vrs.vrs_sregs[VCPU_REGS_CS].vsi_ar & 0x2000 ?
+               SIZE_QWORD : SIZE_DWORD);
+       if (ix.incr && (gpa >= VMM_PCI_MMIO_BAR_BASE && gpa <= 
VMM_PCI_MMIO_BAR_END)) {
+               rax = &vrwp.vrwp_regs.vrs_gprs[ix.reg];
+               mem_handler(ix.dir, gpa, ix.size, rax);
+               fprintf(stderr, "memhandler : %.8llx %d\n", *rax, ix.incr);
+               /* skip this instruction when returning to vm */
+               vrwp.vrwp_regs.vrs_gprs[VCPU_REGS_RIP] += ix.incr;
+               if (ioctl(env->vmd_fd, VMM_IOC_WRITEREGS, &vrwp))
+                       fprintf(stderr,"writeregs fails\n");
+               return 0;
+       }
+       fprintf(stderr, "nothandled\n");
+
        /*
         * vmd may be exiting to vmd to handle a pending interrupt
         * but last exit type may have bee VMX_EXIT_EPT_VIOLATION,
@@ -1645,15 +1771,12 @@ vcpu_exit_eptviolation(struct vm_run_params *vrp)
 int
 vcpu_exit(struct vm_run_params *vrp)
 {
-       int ret;
+       int ret = 0;
 
        switch (vrp->vrp_exit_reason) {
        case VMX_EXIT_INT_WINDOW:
        case SVM_VMEXIT_VINTR:
        case VMX_EXIT_CPUID:
-       case VMX_EXIT_EXTINT:
-       case SVM_VMEXIT_INTR:
-       case SVM_VMEXIT_NPF:
        case SVM_VMEXIT_MSR:
        case SVM_VMEXIT_CPUID:
                /*
@@ -1664,11 +1787,13 @@ vcpu_exit(struct vm_run_params *vrp)
                 * in more vmd log spam).
                 */
                break;
+       case SVM_VMEXIT_INTR:
+       case VMX_EXIT_EXTINT:
+               //fprintf(stderr, "extint...\n");
+               break;
        case VMX_EXIT_EPT_VIOLATION:
+       case SVM_VMEXIT_NPF:
                ret = vcpu_exit_eptviolation(vrp);
-               if (ret)
-                       return (ret);
-
                break;
        case VMX_EXIT_IO:
        case SVM_VMEXIT_IOIO:
@@ -1701,7 +1826,12 @@ vcpu_exit(struct vm_run_params *vrp)
 
        /* Process any pending traffic */
        vionet_process_rx(vrp->vrp_vm_id);
-
+       {
+               uint8_t intr;
+               if ((intr = mem_chkint()) != 0xff) {
+                       vcpu_assert_pic_irq(vrp->vrp_vm_id, vrp->vrp_vcpu_id, 
intr);
+               }
+       }
        vrp->vrp_continue = 1;
 
        return (0);
@@ -2216,12 +2346,13 @@ translate_gva(struct vm_exit* exit, uint64_t va, 
uint64_t* pa, int mode)
                        return (EPERM);
 
                pte = pte | PG_U;
-               if (mode == PROT_WRITE)
+               if (mode == PROT_WRITE) {
                        pte = pte | PG_M;
-               if (write_mem(pte_paddr, &pte, pte_size)) {
-                       log_warn("%s: failed to write back flags to pte",
-                           __func__);
-                       return (EIO);
+                       if (write_mem(pte_paddr, &pte, pte_size)) {
+                               log_warn("%s: failed to write back flags to 
pte",
+                                   __func__);
+                               return (EIO);
+                       }
                }
 
                /* XXX: EINVAL if in 32bit and  PG_PS is 1 but CR4.PSE is 0 */
diff --git a/usr.sbin/vmd/vmd.c b/usr.sbin/vmd/vmd.c
index 634c81e79..69bc38411 100644
--- a/usr.sbin/vmd/vmd.c
+++ b/usr.sbin/vmd/vmd.c
@@ -49,6 +49,8 @@
 #include "atomicio.h"
 #include "vmd.h"
 
+#define pledge(x...) 0 /* NOTE(review): replaces every pledge() call below this line in vmd.c with the constant 0, i.e. no pledge is applied; debugging aid, should not be committed */
+
 __dead void usage(void);
 
 int     main(int, char **);
diff --git a/usr.sbin/vmd/vmm.c b/usr.sbin/vmd/vmm.c
index 67b7f5250..920c533f0 100644
--- a/usr.sbin/vmd/vmm.c
+++ b/usr.sbin/vmd/vmm.c
@@ -53,6 +53,8 @@
 #include "vmd.h"
 #include "vmm.h"
 
+#define pledge(x...) 0 /* NOTE(review): replaces every pledge() call below this line in vmm.c with the constant 0, i.e. no pledge is applied; debugging aid, should not be committed */
+
 void vmm_sighdlr(int, short, void *);
 int vmm_start_vm(struct imsg *, uint32_t *, pid_t *);
 int vmm_dispatch_parent(int, struct privsep_proc *, struct imsg *);
diff --git a/usr.sbin/vmd/vmm.h b/usr.sbin/vmd/vmm.h
index 214d41d01..a92ed720b 100644
--- a/usr.sbin/vmd/vmm.h
+++ b/usr.sbin/vmd/vmm.h
@@ -22,3 +22,17 @@ void vcpu_assert_pic_irq(uint32_t, uint32_t, int);
 void vcpu_deassert_pic_irq(uint32_t, uint32_t, int);
 void set_return_data(struct vm_exit *, uint32_t);
 void get_input_data(struct vm_exit *, uint32_t *);
+
+typedef int (*iocb_t)(int, uint64_t, uint32_t, void *, void *);
+
+struct iohandler { /* one registered memory region and its access callback */
+       uint64_t start; /* first address of the region (register_mem: base) */
+       uint64_t end; /* last address, inclusive (register_mem: base+len-1) */
+       iocb_t handler; /* callback invoked by mem_handler() for this region */
+       void *cookie; /* opaque pointer passed as the callback's last argument */
+       TAILQ_ENTRY(iohandler) next; /* linkage on the handler list */
+};
+
+void register_mem(uint64_t base, uint32_t len, iocb_t handler, void *cookie);
+void unregister_mem(uint64_t base);
+int mem_handler(int dir, uint64_t addr, uint32_t size, void *data);
-- 
2.25.1

Reply via email to