From: Ben-Ami Yassour <[EMAIL PROTECTED]>

Enable a guest to access a device's memory-mapped I/O regions directly.
Userspace registers the MMIO regions that the guest is allowed to access.
On the first page fault for an access to an MMIO address, the host
translates the GVA to an HPA and updates the SPTEs.
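To illustrate the intended flow from userspace, here is a minimal sketch
(illustration only, not part of the patch) of registering one region
through the new ioctl.  It assumes a kernel and <linux/kvm.h> with this
patch applied; the gpa/hpa/size values are made-up placeholders, not real
device addresses.

/* sketch: map one passthrough MMIO range into a newly created VM */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>          /* with this patch applied */

int main(void)
{
        int kvm_fd = open("/dev/kvm", O_RDWR);
        int vm_fd = kvm_fd < 0 ? -1 : ioctl(kvm_fd, KVM_CREATE_VM, 0);
        struct kvm_pt_memory_mapping map = {
                .gpa  = 0xf0000000ULL,  /* placeholder guest-physical address */
                .hpa  = 0xfebf0000ULL,  /* placeholder host-physical BAR address */
                .size = 0x1000,         /* in bytes */
        };

        if (vm_fd < 0) {
                perror("kvm");
                return EXIT_FAILURE;
        }
        if (ioctl(vm_fd, KVM_PT_MEMORY_MAPPING_ADD, &map) < 0) {
                perror("KVM_PT_MEMORY_MAPPING_ADD");
                return EXIT_FAILURE;
        }
        return EXIT_SUCCESS;
}

The mapping takes effect lazily: no sptes are touched here; they are
installed by page_fault_pt_mmio() on the guest's first access to the range.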
Signed-off-by: Ben-Ami Yassour <[EMAIL PROTECTED]>
Signed-off-by: Muli Ben-Yehuda <[EMAIL PROTECTED]>
---
 arch/x86/kvm/Makefile      |    2 +-
 arch/x86/kvm/mmu.c         |   27 ++++++++
 arch/x86/kvm/mmu.h         |    2 +
 arch/x86/kvm/paging_tmpl.h |   71 +++++++++++++++++++---
 arch/x86/kvm/passthrough.c |  144 ++++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/passthrough.h |   23 +++++++
 arch/x86/kvm/x86.c         |    2 +-
 include/asm-x86/kvm_host.h |   15 +++++
 include/linux/kvm.h        |   13 ++++
 include/linux/kvm_host.h   |   10 +++
 virt/kvm/kvm_main.c        |   24 +++++++
 11 files changed, 323 insertions(+), 10 deletions(-)
 create mode 100644 arch/x86/kvm/passthrough.c
 create mode 100644 arch/x86/kvm/passthrough.h

diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 4d0c22e..2fa4932 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -7,7 +7,7 @@ common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o)
 EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
 
 kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \
-        i8254.o
+        i8254.o passthrough.o
 obj-$(CONFIG_KVM) += kvm.o
 kvm-intel-objs = vmx.o
 obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 8a6a4f9..dccd898 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -34,6 +34,8 @@
 #include <asm/cmpxchg.h>
 #include <asm/io.h>
 
+#include "passthrough.h"
+
 /*
  * When setting this variable to true it enables Two-Dimensional-Paging
  * where the hardware walks 2 page tables:
@@ -112,6 +114,8 @@ static int dbg = 1;
 #define PT_FIRST_AVAIL_BITS_SHIFT 9
 #define PT64_SECOND_AVAIL_BITS_SHIFT 52
 
+#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
+
 #define VALID_PAGE(x) ((x) != INVALID_PAGE)
 
 #define PT64_LEVEL_BITS 9
@@ -545,6 +549,11 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
         unsigned long *rmapp;
         int i;
 
+        /* bail out if this is an spte mapping an MMIO region */
+        if (kvm->arch.pt.pt_mmio_mapped)
+                if (*spte & PT_SHADOW_IO_MARK)
+                        return;
+
         if (!is_rmap_pte(*spte))
                 return;
         sp = page_header(__pa(spte));
@@ -1273,6 +1282,24 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
         vcpu->arch.mmu.root_hpa = INVALID_PAGE;
 }
 
+/*
+ * This is a very big sledgehammer, it should be called very rarely.
+ * We call this when a passthrough mmio mapping changes in order to
+ * remove the old passthrough mmio sptes.  This usually only occurs on
+ * guest initialization.
+ */
+void mmu_invalidate_all_sptes(struct kvm *kvm)
+{
+        int i;
+
+        for (i = 0 ; i < KVM_MAX_VCPUS ; i++) {
+                if (kvm->vcpus[i])
+                        mmu_free_roots(kvm->vcpus[i]);
+        }
+
+        kvm_flush_remote_tlbs(kvm);
+}
+
 static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
 {
         int i;
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index e64e9f5..5c9e33e 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -47,4 +47,6 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
         return vcpu->arch.cr0 & X86_CR0_PG;
 }
 
+void mmu_invalidate_all_sptes(struct kvm *kvm);
+
 #endif
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 57d872a..c33b6cf 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -73,6 +73,8 @@ struct guest_walker {
         u32 error_code;
 };
 
+static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr);
+
 static gfn_t gpte_to_gfn(pt_element_t gpte)
 {
         return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
@@ -275,7 +277,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
 static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
                          struct guest_walker *walker,
                          int user_fault, int write_fault, int largepage,
-                         int *ptwrite, struct page *page)
+                         int *ptwrite, int pt_mmio, struct page *page)
 {
         hpa_t shadow_addr;
         int level;
@@ -346,15 +348,61 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
                 *shadow_ent = shadow_pte;
         }
 
-        mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
-                     user_fault, write_fault,
-                     walker->ptes[walker->level-1] & PT_DIRTY_MASK,
-                     ptwrite, largepage, walker->gfn, page, false);
-
+        /* for the passthrough mmio case, spte is set by page_fault_pt_mmio */
+        if (!pt_mmio)
+                mmu_set_spte(vcpu, shadow_ent, access,
+                             walker->pte_access & access,
+                             user_fault, write_fault,
+                             walker->ptes[walker->level-1] & PT_DIRTY_MASK,
+                             ptwrite, largepage, walker->gfn, page, false);
 	return shadow_ent;
 }
 
 /*
+ * Handle pagefault for passthrough mmio
+ */
+static int FNAME(page_fault_pt_mmio)(struct kvm_vcpu *vcpu, gva_t addr,
+                                     struct guest_walker *walker,
+                                     int user_fault, int write_fault,
+                                     int largepage, int *ptwrite,
+                                     struct page *page)
+{
+        u64 *shadow_pte;
+        gpa_t gpa;
+        hpa_t hpa = UNMAPPED_HPA;
+        u64 spte;
+        int rc = 1;
+        int write_pt = 0;
+
+        gpa = FNAME(gva_to_gpa)(vcpu, addr);
+        if (gpa != UNMAPPED_GVA)
+                hpa = pt_mmio_gpa_to_hpa(vcpu->kvm, gpa);
+
+        if (hpa != UNMAPPED_HPA) {
+                spin_lock(&vcpu->kvm->mmu_lock);
+                kvm_mmu_free_some_pages(vcpu);
+                shadow_pte = FNAME(fetch)(vcpu, addr, walker, user_fault,
+                                          write_fault, largepage, &write_pt,
+                                          1, page);
+
+                if (shadow_pte) {
+                        set_shadow_pte(&spte, *shadow_pte);
+                        spte = hpa;
+                        spte |= PT_PRESENT_MASK | PT_WRITABLE_MASK |
+                                PT_USER_MASK | PT_PWT_MASK | PT_PCD_MASK |
+                                PT_ACCESSED_MASK | PT_SHADOW_IO_MARK;
+                        set_shadow_pte(shadow_pte, spte);
+                        rc = 0;
+                } else {
+                        pgprintk("fetch failed to return shadow_pte "
+                                 "for address 0x%x", (unsigned int)addr);
+                }
+                spin_unlock(&vcpu->kvm->mmu_lock);
+        }
+        return rc;
+}
+
+/*
  * Page fault handler.  There are several causes for a page fault:
  *   - there is no shadow pte for the guest pte
  *   - write access through a shadow pte marked read only so that we can set
@@ -418,15 +466,22 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 
         /* mmio */
         if (is_error_page(page)) {
+                int rc = 1;
                 pgprintk("gfn %x is mmio\n", walker.gfn);
+                if (vcpu->kvm->arch.pt.pt_mmio_mapped) {
+                        rc = FNAME(page_fault_pt_mmio)(vcpu, addr, &walker,
+                                                       user_fault, write_fault,
+                                                       largepage, &write_pt,
+                                                       page);
+                }
                 kvm_release_page_clean(page);
-                return 1;
+                return rc;
         }
 
         spin_lock(&vcpu->kvm->mmu_lock);
         kvm_mmu_free_some_pages(vcpu);
         shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
-                                  largepage, &write_pt, page);
+                                  largepage, &write_pt, 0, page);
         pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
                  shadow_pte, *shadow_pte, write_pt);
 
diff --git a/arch/x86/kvm/passthrough.c b/arch/x86/kvm/passthrough.c
new file mode 100644
index 0000000..654d1fe
--- /dev/null
+++ b/arch/x86/kvm/passthrough.c
@@ -0,0 +1,144 @@
+/*
+ * This module enables guest shadow page tables to map host mmio regions of
+ * passthrough devices directly.
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ *      Ben-Ami Yassour <[EMAIL PROTECTED]>
+ *      Muli Ben-Yehuda <[EMAIL PROTECTED]>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "linux/kvm_host.h"
+#include "passthrough.h"
+#include "mmu.h"
+
+void pt_init_vm(struct pt *pt)
+{
+        spin_lock_init(&pt->pt_list_lock);
+        INIT_LIST_HEAD(&pt->mmio_range_list);
+        pt->pt_mmio_mapped = 0;
+}
+
+hpa_t pt_mmio_gpa_to_hpa(struct kvm *kvm, gpa_t gpa)
+{
+        hpa_t hpa = UNMAPPED_HPA;
+        unsigned long flags;
+        struct mmio_range *mmio = NULL;
+
+        spin_lock_irqsave(&kvm->arch.pt.pt_list_lock, flags);
+
+        /* loop the list of regions */
+        list_for_each_entry(mmio, &kvm->arch.pt.mmio_range_list, list) {
+                if ((gpa >= mmio->gpa) && (gpa < mmio->gpa + mmio->size)) {
+                        hpa = mmio->hpa + (gpa - mmio->gpa);
+                        break;
+                }
+        }
+
+        spin_unlock_irqrestore(&kvm->arch.pt.pt_list_lock, flags);
+
+        return hpa;
+}
+
+int kvm_vm_ioctl_pt_memory_mapping_add(struct kvm *kvm,
+                                       struct kvm_pt_memory_mapping*
+                                       memory_mapping)
+{
+        gpa_t gpa = memory_mapping->gpa;
+        hpa_t hpa = memory_mapping->hpa;
+        u64 size = memory_mapping->size;
+        int ret = 0;
+        struct mmio_range *mmio_new;
+        struct mmio_range *mmio_curr = NULL;
+        struct mmio_range *mmio_prev = NULL;
+        unsigned long flags;
+
+        mmio_new = kmalloc(sizeof(*mmio_new), GFP_KERNEL);
+        if (!mmio_new)
+                return -ENOMEM;
+
+        mmio_new->gpa = gpa;
+        mmio_new->hpa = hpa;
+        mmio_new->size = size;
+
+        spin_lock_irqsave(&kvm->arch.pt.pt_list_lock, flags);
+
+        /* search for the location to add this range */
+        ret = 0;
+        list_for_each_entry(mmio_curr, &kvm->arch.pt.mmio_range_list, list) {
+                if ((mmio_curr->gpa + mmio_curr->size) <= gpa) {
+                        mmio_prev = mmio_curr;
+                        continue;
+                } else if ((gpa + size) <= mmio_curr->gpa) {
+                        /* no intersection between ranges */
+                        break;
+                } else {
+                        /* ranges intersect */
+                        ret = -EINVAL;
+                        break;
+                }
+        }
+
+        if (!ret) {
+                struct mmio_range *place;
+                place = mmio_prev ? mmio_prev :
+                        (struct mmio_range *)&kvm->arch.pt.mmio_range_list;
+                list_add((struct list_head *)mmio_new,
+                         (struct list_head *)place);
+
+                kvm->arch.pt.pt_mmio_mapped = 1;
+        }
+
+        spin_unlock_irqrestore(&kvm->arch.pt.pt_list_lock, flags);
+
+        if (ret)
+                kfree(mmio_new);
+
+        return ret;
+}
+
+int kvm_vm_ioctl_pt_memory_mapping_remove(struct kvm *kvm,
+                                          struct kvm_pt_memory_mapping*
+                                          memory_mapping)
+{
+        gpa_t gpa = memory_mapping->gpa;
+        u64 size = memory_mapping->size;
+        int ret = 0;
+        struct mmio_range *mmio_curr = NULL;
+        unsigned long flags;
+
+        /* search the range to remove */
+        ret = 0;
+        spin_lock_irqsave(&kvm->arch.pt.pt_list_lock, flags);
+        list_for_each_entry(mmio_curr, &kvm->arch.pt.mmio_range_list, list) {
+                if ((mmio_curr->gpa + mmio_curr->size) <= gpa) {
+                        continue;
+                } else if ((gpa + size) <= mmio_curr->gpa) {
+                        /* not found */
+                        ret = -EINVAL;
+                        break;
+                } else {
+                        /* ranges intersect */
+                        if ((gpa != mmio_curr->gpa) ||
+                            (size != mmio_curr->size)) {
+                                /* ranges are not equal */
+                                ret = -EINVAL;
+                        }
+                        break;
+                }
+        }
+
+        if (!ret)
+                list_del((struct list_head *)mmio_curr);
+
+        spin_unlock_irqrestore(&kvm->arch.pt.pt_list_lock, flags);
+
+        mmu_invalidate_all_sptes(kvm);
+
+        return ret;
+}
diff --git a/arch/x86/kvm/passthrough.h b/arch/x86/kvm/passthrough.h
new file mode 100644
index 0000000..68810c6
--- /dev/null
+++ b/arch/x86/kvm/passthrough.h
@@ -0,0 +1,23 @@
+/*
+ * This module enables guest shadow page tables to map host mmio regions of
+ * passthrough devices directly.
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ *      Ben-Ami Yassour <[EMAIL PROTECTED]>
+ *      Muli Ben-Yehuda <[EMAIL PROTECTED]>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef PASSTHROUGH_H
+#define PASSTHROUGH_H
+
+#include <asm/kvm_host.h>
+
+hpa_t pt_mmio_gpa_to_hpa(struct kvm *kvm, gpa_t gpa);
+
+#endif
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 94d77e8..3398ae7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3811,7 +3811,7 @@ struct kvm *kvm_arch_create_vm(void)
                 return ERR_PTR(-ENOMEM);
 
         INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
-
+        pt_init_vm(&(kvm->arch.pt));
         return kvm;
 }
 
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index e6d8df6..01e96f7 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -44,6 +44,7 @@
 
 #define INVALID_PAGE (~(hpa_t)0)
 #define UNMAPPED_GVA (~(gpa_t)0)
+#define UNMAPPED_HPA (~(hpa_t)0)
 
 /* shadow tables are PAE even on non-PAE hosts */
 #define KVM_HPAGE_SHIFT 21
@@ -296,6 +297,19 @@ struct kvm_mem_alias {
         gfn_t target_gfn;
 };
 
+struct mmio_range {
+        struct list_head list;
+        gpa_t gpa;
+        hpa_t hpa;
+        u64 size;
+};
+
+struct pt {
+        int pt_mmio_mapped;     /* guest sptes map host mmio regions directly */
+        spinlock_t pt_list_lock;        /* protect pt specific lists */
+        struct list_head mmio_range_list;       /* ordered list of mmio mappings */
+};
+
 struct kvm_arch{
         int naliases;
         struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
@@ -317,6 +331,7 @@ struct kvm_arch{
         struct page *apic_access_page;
 
         gpa_t wall_clock;
+        struct pt pt;
 };
 
 struct kvm_vm_stat {
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 37b963e..956512d 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -289,6 +289,19 @@ struct kvm_s390_interrupt {
 #define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46,\
                                         struct kvm_userspace_memory_region)
 #define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47)
+
+/* Bind host I/O address range to guest address range. */
+struct kvm_pt_memory_mapping {
+        __u64 gpa;
+        __u64 hpa;
+        __u64 size;     /* in bytes */
+};
+
+#define KVM_PT_MEMORY_MAPPING_ADD _IOWR(KVMIO, 0x48, \
+                                        struct kvm_pt_memory_mapping)
+#define KVM_PT_MEMORY_MAPPING_REMOVE _IOWR(KVMIO, 0x49, \
+                                        struct kvm_pt_memory_mapping)
+
 /*
  * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns
  * a vcpu fd.
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index f4e1436..3b97624 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -200,6 +200,14 @@ int kvm_get_dirty_log(struct kvm *kvm,
 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
                                struct kvm_dirty_log *log);
 
+int kvm_vm_ioctl_pt_memory_mapping_add(struct kvm *kvm,
+                                       struct kvm_pt_memory_mapping
+                                       *memory_mapping);
+
+int kvm_vm_ioctl_pt_memory_mapping_remove(struct kvm *kvm,
+                                          struct kvm_pt_memory_mapping
+                                          *memory_mapping);
+
 int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
                                    struct
                                    kvm_userspace_memory_region *mem,
@@ -294,4 +302,6 @@ struct kvm_stats_debugfs_item {
 };
 extern struct kvm_stats_debugfs_item debugfs_entries[];
 
+void pt_init_vm(struct pt *pt);
+
 #endif
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 30bf832..c2c6c05 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1030,6 +1030,30 @@ static long kvm_vm_ioctl(struct file *filp,
                         goto out;
                 break;
         }
+        case KVM_PT_MEMORY_MAPPING_ADD: {
+                struct kvm_pt_memory_mapping input;
+
+                r = -EFAULT;
+                if (copy_from_user(&input, argp,
+                                   sizeof(struct kvm_pt_memory_mapping)))
+                        goto out;
+                r = kvm_vm_ioctl_pt_memory_mapping_add(kvm, &input);
+                if (r)
+                        goto out;
+                break;
+        }
+        case KVM_PT_MEMORY_MAPPING_REMOVE: {
+                struct kvm_pt_memory_mapping input;
+
+                r = -EFAULT;
+                if (copy_from_user(&input, argp,
+                                   sizeof(struct kvm_pt_memory_mapping)))
+                        goto out;
+                r = kvm_vm_ioctl_pt_memory_mapping_remove(kvm, &input);
+                if (r)
+                        goto out;
+                break;
+        }
         default:
                 r = kvm_arch_vm_ioctl(filp, ioctl, arg);
         }
-- 
1.5.0.3
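For completeness, the matching teardown from userspace might look like the
sketch below (illustration only, reusing the placeholder vm_fd and addresses
from the example near the top).  Per kvm_vm_ioctl_pt_memory_mapping_remove()
above, the (gpa, size) pair is expected to exactly match a previously added
range (mismatches are rejected with -EINVAL), and a successful removal
flushes all old sptes via mmu_invalidate_all_sptes().

#include <sys/ioctl.h>
#include <linux/kvm.h>          /* with this patch applied */

/* sketch: undo the mapping added in the earlier example;
 * hpa is not examined by the remove path, only gpa and size are. */
static int unmap_device_bar(int vm_fd)
{
        struct kvm_pt_memory_mapping map = {
                .gpa  = 0xf0000000ULL,  /* same placeholder as before */
                .size = 0x1000,         /* must match the range that was added */
        };

        return ioctl(vm_fd, KVM_PT_MEMORY_MAPPING_REMOVE, &map);
}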