[PATCH 1/2] add irq priodrop support
This is the same Interrupt Priority Drop/Deactivation patch emailed some time back (except for 3.10-rc4) used by the initial device pass-through support. When enabled all IRQs on host write to distributor EOIR and DIR reg to dr-prioritize/de-activate an interrupt. For device that's passed through only the EOIR is written to drop the priority, the Guest deactivates it when it handles its EOI. This supports exitless EOI that's agnostic to bus type (i.e. PCI) The patch has been tested for all configurations: Host: No Prio Drop Guest: No Prio Drop Host: Prio DROP Guest: No Prio Drop Host: Prio Drop Guest: Prio Drop - Mario Signed-off-by: Mario Smarduch mario.smard...@huawei.com --- arch/arm/kvm/Kconfig|8 +++ drivers/irqchip/irq-gic.c | 145 ++- include/linux/irqchip/arm-gic.h |6 ++ 3 files changed, 156 insertions(+), 3 deletions(-) diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig index 370e1a8..c0c9f3c 100644 --- a/arch/arm/kvm/Kconfig +++ b/arch/arm/kvm/Kconfig @@ -59,6 +59,14 @@ config KVM_ARM_VGIC ---help--- Adds support for a hardware assisted, in-kernel GIC emulation. +config KVM_ARM_INT_PRIO_DROP +bool KVM support for Interrupt pass-through +depends on KVM_ARM_VGIC OF +default n +---help--- + Seperates interrupt priority drop and deactivation to enable device + pass-through to Guests. 
+ config KVM_ARM_TIMER bool KVM support for Architected Timers depends on KVM_ARM_VGIC ARM_ARCH_TIMER diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c index 1760ceb..9fb4ef3 100644 --- a/drivers/irqchip/irq-gic.c +++ b/drivers/irqchip/irq-gic.c @@ -41,10 +41,13 @@ #include linux/slab.h #include linux/irqchip/chained_irq.h #include linux/irqchip/arm-gic.h +#include linux/irqflags.h +#include linux/bitops.h #include asm/irq.h #include asm/exception.h #include asm/smp_plat.h +#include asm/virt.h #include irqchip.h @@ -99,6 +102,20 @@ struct irq_chip gic_arch_extn = { static struct gic_chip_data gic_data[MAX_GIC_NR] __read_mostly; +#ifdef CONFIG_KVM_ARM_INT_PRIO_DROP +/* + * Priority drop/deactivation bit map, 1st 16 bits used for SGIs, this bit map + * is shared by several guests. If bit is set only execute EOI which drops + * current priority but not deactivation. + */ +static u32 gic_irq_prio_drop[DIV_ROUND_UP(1020, 32)] __read_mostly; +static void gic_eoi_irq_priodrop(struct irq_data *); +#endif + +static void gic_enable_gicc(void __iomem *); +static void gic_eoi_sgi(u32, void __iomem *); +static void gic_priodrop_remap_eoi(struct irq_chip *); + #ifdef CONFIG_GIC_NON_BANKED static void __iomem *gic_get_percpu_base(union gic_base *base) { @@ -296,7 +313,7 @@ static asmlinkage void __exception_irq_entry gic_handle_irq(struct pt_regs *regs continue; } if (irqnr 16) { - writel_relaxed(irqstat, cpu_base + GIC_CPU_EOI); + gic_eoi_sgi(irqstat, cpu_base); #ifdef CONFIG_SMP handle_IPI(irqnr, regs); #endif @@ -450,7 +467,7 @@ static void __cpuinit gic_cpu_init(struct gic_chip_data *gic) writel_relaxed(0xa0a0a0a0, dist_base + GIC_DIST_PRI + i * 4 / 4); writel_relaxed(0xf0, base + GIC_CPU_PRIMASK); - writel_relaxed(1, base + GIC_CPU_CTRL); + gic_enable_gicc(base); } #ifdef CONFIG_CPU_PM @@ -585,7 +602,7 @@ static void gic_cpu_restore(unsigned int gic_nr) writel_relaxed(0xa0a0a0a0, dist_base + GIC_DIST_PRI + i * 4); writel_relaxed(0xf0, cpu_base + 
GIC_CPU_PRIMASK); - writel_relaxed(1, cpu_base + GIC_CPU_CTRL); + gic_enable_gicc(cpu_base); } static int gic_notifier(struct notifier_block *self, unsigned long cmd, void *v) @@ -666,6 +683,7 @@ void gic_raise_softirq(const struct cpumask *mask, unsigned int irq) static int gic_irq_domain_map(struct irq_domain *d, unsigned int irq, irq_hw_number_t hw) { + gic_priodrop_remap_eoi(gic_chip); if (hw 32) { irq_set_percpu_devid(irq); irq_set_chip_and_handler(irq, gic_chip, @@ -857,4 +875,125 @@ IRQCHIP_DECLARE(cortex_a9_gic, arm,cortex-a9-gic, gic_of_init); IRQCHIP_DECLARE(msm_8660_qgic, qcom,msm-8660-qgic, gic_of_init); IRQCHIP_DECLARE(msm_qgic2, qcom,msm-qgic2, gic_of_init); +#ifdef CONFIG_KVM_ARM_INT_PRIO_DROP +/* If HYP mode enabled and PRIO DROP set EOIR function to handle PRIO DROP */ +static inline void gic_priodrop_remap_eoi(struct irq_chip *chip) +{ + if (is_hyp_mode_available()) + chip-irq_eoi = gic_eoi_irq_priodrop; +} + +/* If HYP mode set enable interrupt priority drop/deactivation, and mark + * SGIs to deactive through writes to GCICC_DIR. For Guest only enable normal + * mode. + */ +static void gic_enable_gicc(void __iomem *gicc_base
[PATCH 2/2] add initial kvm dev passthrough support
This is the initial device pass through support. At this time host == guest only is supported. Basic Operation: - QEMU parameters: -device kvm-device-assign,host=device name for example - kvm-device-assign,host='arm-sp804'. Essentially any device that does PIO should be supported. - Host DTS contains the node for device to be passed through The host driver is unbound or not compiled in. - For Guest the intent is to add a DTS node that QEMU can parse and find the guest attributes (Mem. resource, IRQs) For now these values default to host. This is a future work item to get this working on board other then vexpress. - The physical interrupt is always passed through to CPU where the target vCPU executes or will execute. Current approach - pins vCPUs to physical CPUs, when Guest updates CPU affinity is updated in KVM vgic dist code. Future work item for IRQ affinity allow vCPU to float and on schedule in handle IRQ affinity. For high IRQ rates (i.e. wireless NEs) static binding may be used. For some other device (env. mgmt IPMI)where latency is not important dynamic may be used, it should be upto the user. - To support flexible affinity a mask is introduced (QEMU param0 (although not used here yet) o vCPU affinity - vCPU -- CPU binding, the IRQ physical CPU binding follows vCPU binding dynamically. - Obviously DMA is not supported - early DMA may be supported through a 1:1 mapping but it's unsafe and so far we don't know of any hardware that's not behind SMMU. This option may be useful in some embedded/wireless environments, where the guest may want to swap, secure isolation may not be an issue or device like look aside crypto engine is not behind IOMMU. - IOMMU/VFIO support is key and next item for us to work on. Especially for ETSI NFV VFIO is key since 4G/IMS NE pull packets of wire and switch them directly in user space. 
The patch has been tested on fast models in couple ways: - UP Guest with sp804 timer only - works consistently - SMP Guest with sp804 timer works consistently. Writes to '/proc/irq/sp804 irq/smp_affinity' confirm dynamic CPU affinity. - IRQ rates (maybe not that important give its emulated env) reached excess of 500. There is a QEMU piece very simple for now that I will email later, in case someone would like to test. - Mario Signed-off-by: Mario Smarduch mario.smard...@huawei.com --- arch/arm/include/asm/kvm_host.h | 14 +++ arch/arm/include/asm/kvm_vgic.h | 10 +++ arch/arm/kvm/Makefile |1 + arch/arm/kvm/arm.c | 60 + arch/arm/kvm/assign-dev.c | 189 +++ arch/arm/kvm/vgic.c | 106 ++ include/linux/irqchip/arm-gic.h |1 + include/uapi/linux/kvm.h| 33 +++ 8 files changed, 414 insertions(+) create mode 100644 arch/arm/kvm/assign-dev.c diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 57cb786..c6ad3a3 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -67,6 +67,10 @@ struct kvm_arch { /* Interrupt controller */ struct vgic_distvgic; + + /* Device Passthrough Fields */ + struct list_headassigned_dev_head; + struct mutexdev_pasthru_lock; }; #define KVM_NR_MEM_OBJS 40 @@ -146,6 +150,13 @@ struct kvm_vcpu_stat { u32 halt_wakeup; }; +struct kvm_arm_assigned_dev_kernel { + struct list_head list; + struct kvm_arm_assigned_device dev; + irqreturn_t (*irq_handler)(int, void *); + void *irq_arg; +}; + struct kvm_vcpu_init; int kvm_vcpu_set_target(struct kvm_vcpu *vcpu, const struct kvm_vcpu_init *init); @@ -156,6 +167,9 @@ int kvm_arm_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg); int kvm_arm_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg); u64 kvm_call_hyp(void *hypfn, ...); void force_vm_exit(const cpumask_t *mask); +int kvm_arm_get_device_resources(struct kvm *, + struct kvm_arm_get_device_resources *); +int kvm_arm_assign_device(struct kvm *, struct kvm_arm_assigned_device *); 
#define KVM_ARCH_WANT_MMU_NOTIFIER struct kvm; diff --git a/arch/arm/include/asm/kvm_vgic.h b/arch/arm/include/asm/kvm_vgic.h index 343744e..c4370ae 100644 --- a/arch/arm/include/asm/kvm_vgic.h +++ b/arch/arm/include/asm/kvm_vgic.h @@ -107,6 +107,16 @@ struct vgic_dist { /* Bitmap indicating which CPU has something pending */ unsigned long irq_pending_on_cpu; + + /* Device passthrough fields */ + /* Host irq to guest irq mapping */ + u8 guest_irq[VGIC_NR_SHARED_IRQS]; + + /* Pending passthruogh irq */ + struct vgic_bitmap pasthru_spi_pending; + + /* At least one passthrough IRQ pending for some vCPU */ + u32 pasthru_pending; #endif }; diff
Dev Passthrough QEMU patch
This patch is for testing only and goes along with other two patches for priodrop and dev passthrough, it should apply against 1.4.5. diff --git a/cpus.c b/cpus.c index c15ff6c..0c19214 100644 --- a/cpus.c +++ b/cpus.c @@ -737,6 +737,26 @@ static void *qemu_kvm_cpu_thread_fn(void *arg) CPUState *cpu = ENV_GET_CPU(env); int r; +/* For now just do a 1:1 vCPU binding as they come online for device + * pass through + */ +cpu_set_t cpuset; +int ret, i; +unsigned long cpu_index = kvm_arch_vcpu_id(cpu); + +CPU_ZERO(cpuset); +CPU_SET(cpu_index, cpuset); +ret = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), cpuset); +if(ret != 0) { + printf(pthread_setaffinity_np failed to setaffinity to CPU 0\n); +exit(-1); +} + +CPU_ZERO(cpuset); +pthread_getaffinity_np(pthread_self(), sizeof(cpu_set_t), cpuset); +if(CPU_ISSET(cpu_index,cpuset)) +printf(Binding: vCPU %ld -- CPU %d\n, cpu_index, i); + qemu_mutex_lock(qemu_global_mutex); qemu_thread_get_self(cpu-thread); cpu-thread_id = qemu_get_thread_id(); diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h index caca979..46c2c59 100644 --- a/linux-headers/linux/kvm.h +++ b/linux-headers/linux/kvm.h @@ -904,6 +904,8 @@ struct kvm_s390_ucas_mapping { #define KVM_PPC_GET_HTAB_FD _IOW(KVMIO, 0xaa, struct kvm_get_htab_fd) /* Available with KVM_CAP_ARM_SET_DEVICE_ADDR */ #define KVM_ARM_SET_DEVICE_ADDR _IOW(KVMIO, 0xab, struct kvm_arm_device_addr) +#define KVM_ARM_GET_DEVICE_RESOURCES _IOW(KVMIO, 0xe1, struct kvm_arm_get_device_resources) +#define KVM_ARM_ASSIGN_DEVICE_IOW(KVMIO, 0xe2, struct kvm_arm_assigned_device) /* * ioctls for vcpu fds @@ -1013,6 +1015,7 @@ struct kvm_assigned_irq { }; }; + struct kvm_assigned_msix_nr { __u32 assigned_dev_id; __u16 entry_nr; @@ -1027,4 +1030,33 @@ struct kvm_assigned_msix_entry { __u16 padding[3]; }; + +/* MAX 6 MMIO resources per device */ +#define MAX_RES_PER_DEVICE 6 +struct kvm_arm_get_device_resources { +chardevname[128]; +__u32 resource_cnt; +struct { +__u64 hpa; +__u32 
size; +__u32 attr; + charhost_name[64]; +} host_resources[MAX_RES_PER_DEVICE]; + struct { + __u32 hwirq; + __u32 attr; + charhost_name[64]; + } hostirq; +}; + +struct kvm_guest_device_resources { +__u64 gpa[MAX_RES_PER_DEVICE]; +__u32 girq; +}; + +struct kvm_arm_assigned_device { +struct kvm_arm_get_device_resources dev_res; +struct kvm_guest_device_resources guest_res; +}; + #endif /* __LINUX_KVM_H */ diff --git a/target-arm/Makefile.objs b/target-arm/Makefile.objs index d89b57c..9aee84e 100644 --- a/target-arm/Makefile.objs +++ b/target-arm/Makefile.objs @@ -1,5 +1,5 @@ obj-y += arm-semi.o obj-$(CONFIG_SOFTMMU) += machine.o -obj-$(CONFIG_KVM) += kvm.o +obj-$(CONFIG_KVM) += kvm.o device-assign.o obj-y += translate.o op_helper.o helper.o cpu.o obj-y += neon_helper.o iwmmxt_helper.o diff --git a/target-arm/device-assign.c b/target-arm/device-assign.c new file mode 100644 index 000..e4d0e97 --- /dev/null +++ b/target-arm/device-assign.c @@ -0,0 +1,118 @@ + +#include hw/sysbus.h +#include qemu-common.h +#include hw/qdev.h +#include hw/ptimer.h +#include kvm_arm.h +#include qemu/error-report.h + +#define IORESOURCE_TYPE_BITS0x1f00 /* Resource type */ +#define IORESOURCE_IO 0x0100 /* PCI/ISA I/O ports */ +#define IORESOURCE_MEM 0x0200 +#define IORESOURCE_REG 0x0300 /* Register offsets */ +#define IORESOURCE_IRQ 0x0400 +#define IORESOURCE_DMA 0x0800 + +#define IORESOURCE_PREFETCH 0x2000 /* No side effects */ +#define IORESOURCE_READONLY 0x4000 +#define IORESOURCE_CACHEABLE0x8000 + +typedef struct { +SysBusDevice busdev; +char *devname; +uint64_t hpa, gpa; +uint32_t dev_size; +uint32_t hirq,girq; +} AssignedDevice; + +static Property device_assign_properties[] = { +DEFINE_PROP_STRING(host, AssignedDevice, devname), +DEFINE_PROP_UINT64(hpa, AssignedDevice, hpa, 0), +DEFINE_PROP_UINT64(gpa, AssignedDevice, gpa, 0), +DEFINE_PROP_UINT32(size, AssignedDevice, dev_size, 0), +DEFINE_PROP_UINT32(hostirq, AssignedDevice, hirq, 0), +DEFINE_PROP_UINT32(guestirq, AssignedDevice, 
girq, 0), +DEFINE_PROP_END_OF_LIST(), +}; + +static int assign_device(AssignedDevice *dev) +{ +int ret,i; +struct kvm_arm_get_device_resources dev_res; +struct kvm_arm_assigned_device dev_assigned; +char *p, c='-'; + +memset(dev_res,0,sizeof(dev_res)); +memset(dev_assigned,0,sizeof(dev_assigned)); + +if((p = strstr(dev-devname, (char *)c)) != (char *) NULL) + *p = ','; +
Re: [PATCH 2/2] add initial kvm dev passthrough support
On 6/11/2013 10:28 AM, Alexander Graf wrote: Is there any particular reason you're not going down that path for your ARM implementation? We see this as a good starting point to build on, we need baseline numbers for performance, latency, interrupt throughput on real hardware ASAP to build competency for NFV, which has demanding Dev. Passthrough requirements. Over time we plan contributing to SMMU and VFIO as well (we're looking into this now). FYI NFV is an initiative wireless/fixed network operators are working towards - to virtualize Core, likely Radio Access and even Home Network equipment, this is an epic undertaking (i.e. Network Function Virtualization). So far VMware has taken the lead (mostly x86). On the embedded PPC side we've been discussing vfio and how it fits into a device tree, non-PCI world for a while. If you like, we can dive into more detail on that, either via email or via phone. I'll email you offline, I'd like to know more what you've done on this and see where we can align/leverage the effort. - Mario Alex -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/2] add initial kvm dev passthrough support
I know Antonios very well. Yes our intent is definitely to use VFIO. - Mario On 6/11/2013 4:52 PM, Alex Williamson wrote: On Tue, 2013-06-11 at 16:13 +0200, Mario Smarduch wrote: On 6/11/2013 10:28 AM, Alexander Graf wrote: Is there any particular reason you're not going down that path for your ARM implementation? We see this as a good starting point to build on, we need baseline numbers for performance, latency, interrupt throughput on real hardware ASAP to build competency for NFV, which has demanding Dev. Passthrough requirements. Over time we plan contributing to SMMU and VFIO as well (we're looking into this now). FYI NFV is an initiative wireless/fixed network operators are working towards - to virtualize Core, likely Radia Access and even Home Network equipment, this is a epic undertaking (i.e. Network Function Virtualization). So far VMware has taken the lead (mostly x86). On the embedded PPC side we've been discussing vfio and how it fits into a device tree, non-PCI world for a while. If you like, we can dive into more detail on that, either via email or via phone. I'll email you offline, I'd like to know more what you've done on this and see where we can align/leverage the effort. Yes, please let's use VFIO rather than continue to use or invent new device assignment interfaces for KVM. Antonios Motakis (cc'd) already contacted me about VFIO for ARM. IIRC, his initial impression was that the IOMMU backend was almost entirely reusable for ARM (a couple PCI assumptions implicit in the IOMMU API to handle) and my hope was that ARM and PPC could work together on a common VFIO device tree backend. Thanks, Alex -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/2] add irq priodrop support
Hi Grant, appreciate the strong feedback, I agree with all the coding observations will make the changes. I have few inline responses. +static u32 gic_irq_prio_drop[DIV_ROUND_UP(1020, 32)] __read_mostly; I believe it is possible to have more than one GIC in a system. This map assumes only one. The prio_drop map should probably be part of gic_chip_data so that it is per-instance. Also, as discussed below, the code should be using DECLARE_BITMAP() Agree. gic_priodrop_remap_eoi() is used exactly once. You should instead put the body of it inline like so: if (IS_ENABLED(CONFIG_KVM_ARM_INT_PRIO_DROP) is_hyp_mode_available()) chip-irq_eoi = gic_eoi_irq_priodrop; Yes much cleaner. However, this block is problematic. For each map call it modifies the /global/ gic_chip. It's not a per-interrupt thing, but rather changes the callback for all gic interrupts, on *any* gic in the system. Is this really what you want? If it is, then I would expect the callback to be modified once sometime around gic_init_bases() time. Yes need to move it up, now its being set for each IRQ domain mapping call. If it is not, and what you really want is per-irq behaviour, then what you need to do is have a separate gic_priodrop_chip that can be used on a per-irq basis instead of the gic_chip. Prio drop/deactivate is per CPU and all IRQs are affected including SGIs. It's possible to run mixed CPU modes, but this patch enables all CPUs for device passthrough, similar to hyp mode enable. Another way would be the reverse - set all non-passthrough irqs to gic_priodrop_chip and the passed through IRQ to gic_chip. I think keeping it in one function and just setting a bit to enable/disable is cleaner. 
if (hw 32) { irq_set_percpu_devid(irq); irq_set_chip_and_handler(irq, gic_chip, @@ -857,4 +875,125 @@ IRQCHIP_DECLARE(cortex_a9_gic, arm,cortex-a9-gic, gic_of_init); IRQCHIP_DECLARE(msm_8660_qgic, qcom,msm-8660-qgic, gic_of_init); IRQCHIP_DECLARE(msm_qgic2, qcom,msm-qgic2, gic_of_init); +#ifdef CONFIG_KVM_ARM_INT_PRIO_DROP +/* If HYP mode enabled and PRIO DROP set EOIR function to handle PRIO DROP */ +static inline void gic_priodrop_remap_eoi(struct irq_chip *chip) +{ +if (is_hyp_mode_available()) +chip-irq_eoi = gic_eoi_irq_priodrop; +} + +/* If HYP mode set enable interrupt priority drop/deactivation, and mark + * SGIs to deactive through writes to GCICC_DIR. For Guest only enable normal + * mode. + */ Nit: Read Documentation/kernel-doc-nano-HOWTO.txt. It's a good idea to stick to that format when writing function documenation. Also, convention is for multiline comments to have an empty /* line before the first line of text. Will do. +} + +void gic_spi_clr_priodrop(int irq) +{ +struct irq_data *d = irq_get_irq_data(irq); +if (likely(irq = 32 irq 1019)) { 1019 ... +clear_bit(irq % 32, (void *) gic_irq_prio_drop[irq/32]); +writel_relaxed(irq, gic_cpu_base(d) + GIC_CPU_DIR); +} +} + +int gic_spi_get_priodrop(int irq) +{ +if (likely(irq = 32 irq = 1019)) ... = 1019 Looks like some off-by-one errors going on here. Also, the rest of the gic code uses 1020, not 1019 as the upper limit. What is the reason for being difference in this code block? Hmmm a mistake. ___ linux-arm-kernel mailing list linux-arm-ker...@lists.infradead.org http://lists.infradead.org/mailman/listinfo/linux-arm-kernel -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/2] add initial kvm dev passthrough support
Resending, initial email from my exchange client got rejected due to HTML content On 6/12/2013 8:45 AM, Mario Smarduch wrote: Hi Antonios, thanks for your feedback, initially we’ll work with static binding gain performance data given latency/throughput is key, later add dynamic binding (as well as re-optimize affinity code). And as you already know move towards VFIO, which is a longer term effort. +struct kvm_arm_assigned_dev_kernel { + struct list_head list; + struct kvm_arm_assigned_device dev; + irqreturn_t (*irq_handler)(int, void *); + void *irq_arg; +}; + Instead of irq_arg, isn't something such as target_vcpu more clear? MS Agree. diff --git a/arch/arm/kvm/vgic.c b/arch/arm/kvm/vgic.c index 17c5ac7..f4cb804 100644 --- a/arch/arm/kvm/vgic.c +++ b/arch/arm/kvm/vgic.c @@ -449,6 +449,41 @@ static u32 vgic_get_target_reg(struct kvm *kvm, int irq) return val; } +/* Follow the IRQ vCPU affinity so passthrough device interrupts are injected + * on physical CPU they execute. + */ +static void vgic_set_passthru_affinity(struct kvm *kvm, int irq, u32 target) +{ + struct list_head *dev_list_ptr = kvm-arch.assigned_dev_head; + struct list_head *ptr; + struct kvm_arm_assigned_dev_kernel *assigned_dev; + struct vgic_dist *dist = kvm-arch.vgic; + char *buf; + int cpu, hwirq; + + mutex_lock(kvm-arch.dev_pasthru_lock); + list_for_each(ptr, dev_list_ptr) { + assigned_dev = list_entry(ptr, + struct kvm_arm_assigned_dev_kernel, list); + if (assigned_dev-dev.guest_res.girq == irq) { + if (assigned_dev-irq_arg) + free_irq(irq, assigned_dev-irq_arg); + cpu = kvm-vcpus[target]-cpu; + hwirq = assigned_dev-dev.dev_res.hostirq.hwirq; + irq_set_affinity(hwirq, cpumask_of(cpu)); + assigned_dev-irq_arg = kvm-vcpus[target]; + buf = assigned_dev-dev.dev_res.hostirq.host_name; + sprintf(buf, %s-KVM Pass-through, + assigned_dev-dev.dev_res.devname); + gic_spi_set_priodrop(hwirq); + dist-guest_irq[hwirq - VGIC_NR_PRIVATE_IRQS] = irq; + request_irq(hwirq, assigned_dev-irq_handler, 0, buf, + 
assigned_dev-irq_arg); + } + } + mutex_unlock(kvm-arch.dev_pasthru_lock); +} + Maybe vgic_set_pasthru_affinity is not an ideal name for the function, since you do more than that here. After looking at your code I think things will be much easier if you decouple the host irq affinity bits from here. After that there is not much stopping from affinity following the CPU a vCPU will execute. I would rename this to something to reflect that you enable priodrop for this IRQ here, for example only vgic_set_passthrough could suffice (I'm don't like the pasthru abbreviation a lot). Then the affinity bits can be put in a different function. MJS Agree naming could be better. In arch/arm/kvm/arm.c kvm_arch_vcpu_load() you can follow up whenever a vcpu is moved to a different cpu. However in practice I don't know if the additional complexity of having the irq affinity follow the vcpu significantly improves irq latency. MJS This should save a costly IPI if for example Phys IRQ is taken on CPU 0 and target vCPU on CPU 1. I agree kvm_arch_vcpu_load() is a good place if you let vCPUs float. vigic_set_passthrough_affinity can be optimized more to eliminate the free_irq(), requesnt_irq(). For now it’s a simple implementation we’re assuming static binding, start gathering performance/latency data. Will change the name as you suggest. -- *Antonios Motakis*, Virtual Open Systems* */Open Source KVM Virtualization Development /www.virtualopensystems.com http://www.virtualopensystems.com -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/2] armv7 initial device passthrough support
Updated Device Passthrough Patch. - optimized IRQ-CPU-vCPU binding, irq is installed once - added dynamic IRQ affinity on schedule in - added documentation and few other coding recommendations. Per earlier discussion VFIO is our target but we like something earlier to work with to tackle performance latency issue (some ARM related) for device passthrough while we migrate towards VFIO. - Mario Signed-off-by: Mario Smarduch mario.smard...@huawei.com --- arch/arm/include/asm/kvm_host.h | 31 + arch/arm/include/asm/kvm_vgic.h | 10 ++ arch/arm/kvm/Makefile |1 + arch/arm/kvm/arm.c | 80 + arch/arm/kvm/assign-dev.c | 248 +++ arch/arm/kvm/vgic.c | 134 + include/linux/irqchip/arm-gic.h |1 + include/uapi/linux/kvm.h| 33 ++ 8 files changed, 538 insertions(+) create mode 100644 arch/arm/kvm/assign-dev.c diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 57cb786..c85c3a0 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -67,6 +67,10 @@ struct kvm_arch { /* Interrupt controller */ struct vgic_distvgic; + + /* Device Passthrough Fields */ + struct list_headassigned_dev_head; + struct mutexdev_passthrough_lock; }; #define KVM_NR_MEM_OBJS 40 @@ -146,6 +150,13 @@ struct kvm_vcpu_stat { u32 halt_wakeup; }; +struct kvm_arm_assigned_dev_kernel { + struct list_head list; + struct kvm_arm_assigned_device dev; + irqreturn_t (*irq_handler)(int, void *); + unsigned long vcpuid_irq_arg; +}; + struct kvm_vcpu_init; int kvm_vcpu_set_target(struct kvm_vcpu *vcpu, const struct kvm_vcpu_init *init); @@ -157,6 +168,26 @@ int kvm_arm_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg); u64 kvm_call_hyp(void *hypfn, ...); void force_vm_exit(const cpumask_t *mask); +#ifdef CONFIG_KVM_ARM_INT_PRIO_DROP +int kvm_arm_get_device_resources(struct kvm *, + struct kvm_arm_get_device_resources *); +int kvm_arm_assign_device(struct kvm *, struct kvm_arm_assigned_device *); +void kvm_arm_setdev_irq_affinity(struct kvm_vcpu *vcpu, int 
cpu); +#else +static inline int kvm_arm_get_device_resources(struct kvm *k, struct kvm_arm_get_device_resources *r) +{ + return -1; +} +static inline int kvm_arm_assign_device(struct kvm *k, struct kvm_arm_assigned_device *d) +{ + return -1; +} + +static inline void kvm_arm_setdev_irq_affinity(struct kvm_vcpu *vcpu, int cpu) +{ +} +#endif + #define KVM_ARCH_WANT_MMU_NOTIFIER struct kvm; int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); diff --git a/arch/arm/include/asm/kvm_vgic.h b/arch/arm/include/asm/kvm_vgic.h index 343744e..fb6afd2 100644 --- a/arch/arm/include/asm/kvm_vgic.h +++ b/arch/arm/include/asm/kvm_vgic.h @@ -107,6 +107,16 @@ struct vgic_dist { /* Bitmap indicating which CPU has something pending */ unsigned long irq_pending_on_cpu; + + /* Device passthrough fields */ + /* Host irq to guest irq mapping */ + u8 guest_irq[VGIC_NR_SHARED_IRQS]; + + /* Pending passthruogh irq */ + struct vgic_bitmap passthrough_spi_pending; + + /* At least one passthrough IRQ pending for some vCPU */ + u32 passthrough_pending; #endif }; diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile index 53c5ed8..823fc38 100644 --- a/arch/arm/kvm/Makefile +++ b/arch/arm/kvm/Makefile @@ -21,3 +21,4 @@ obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o obj-y += coproc.o coproc_a15.o mmio.o psci.o perf.o obj-$(CONFIG_KVM_ARM_VGIC) += vgic.o obj-$(CONFIG_KVM_ARM_TIMER) += arch_timer.o +obj-$(CONFIG_KVM_ARM_INT_PRIO_DROP) += assign-dev.o diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 37d216d..ba54c64 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -26,6 +26,8 @@ #include linux/mman.h #include linux/sched.h #include linux/kvm.h +#include linux/interrupt.h +#include linux/ioport.h #include trace/events/kvm.h #define CREATE_TRACE_POINTS @@ -43,6 +45,7 @@ #include asm/kvm_emulate.h #include asm/kvm_coproc.h #include asm/kvm_psci.h +#include asm/kvm_host.h #ifdef REQUIRES_VIRT __asm__(.arch_extension virt); @@ -139,6 +142,11 @@ int 
kvm_arch_init_vm(struct kvm *kvm, unsigned long type) /* Mark the initial VMID generation invalid */ kvm-arch.vmid_gen = 0; + /* +* Initialize Dev Passthrough Fields +*/ + INIT_LIST_HEAD(kvm-arch.assigned_dev_head); + mutex_init(kvm-arch.dev_passthrough_lock); return ret; out_free_stage2_pgd: @@ -169,6 +177,40 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) void kvm_arch_destroy_vm(struct kvm *kvm) { int i; + struct list_head
Re: [PATCH 2/2] armv7 initial device passthrough support
On 6/15/2013 5:47 PM, Paolo Bonzini wrote: Il 13/06/2013 11:19, Mario Smarduch ha scritto: Updated Device Passthrough Patch. - optimized IRQ-CPU-vCPU binding, irq is installed once - added dynamic IRQ affinity on schedule in - added documentation and few other coding recommendations. Per earlier discussion VFIO is our target but we like something earlier to work with to tackle performance latency issue (some ARM related) for device passthrough while we migrate towards VFIO. I don't think this is acceptable upstream, unfortunately. KVM device assignment is deprecated and we should not add more users. That's fine we'll work our way towards dev-tree VFIO reusing what we can working with the community. At this point we're more concerned with numbers and best practices as opposed to mechanism this part will be time consuming. VFIO will be more background for us. What are the latency issues you have? Our focus now is on IRQ latency and throughput. Right now it appears lowest latency is 2x + exit/enter + IRQ injection overhead. We can't tolerate additional IPIs or deferred IRQ injection approaches. We're looking for numbers closer to what IBMs ELI managed. Also high res timers which ARM Virt. Ext supports very well. Exitless interrupts which ARM handles very well too. There are some future hw ARM interrupt enhancements coming up which may help a lot as well. There are many other latency/perf. reqs for NFV related to RT, essentially Guest must run near native. In the end it may turn out this may need to be outside of main tree we'll see. - Mario Paolo - Mario -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/2] armv7 initial device passthrough support
On 6/24/2013 10:01 PM, Christoffer Dall wrote: There are many other latency/perf. reqs for NFV related to RT, essentially Guest must run near native. In the end it may turn out this may need to be outside of main tree we'll see. It doesn't sound like this will be the end result. Everything that you try to do in your patch set can be accomplished using VFIO and a more generic infrastructure for virtual IRQ integration with KVM and user space. I mentioned in previous email we will pursue VFIO, but even at that VFIO is a starting point for NFV. We should avoid creating an environment with important functionality outside of the main tree, if at all possible. Of course that would be ideal but with NFV it may be more involved. This is similar to Linux and TEM adaption around 04/05. We wanted to adapt Linux but it lacked required features that's when CGL specifications came into play to provide guidance a lot of features (TIPC, OpenIMPI, preempt_rt, AEM) lived outside mainline, supported by OS vendors delivering CGL compliant distro, while others decided to stick with IT, penetrating some applications like HLR. With NFV a likely scenario may evolve, TEMs need to start demonstrating to operators fixed and wireless virtualization use cases. The only significant difference is that unlike CGL for Linux, KVM has no real representation and understanding of NFV reqs (as opposed to proprietary vendors). I can't speak for all TEMs but it's likely they will go off on their own to demo/proto-type and worry about Open Source acceptance later. -Christoffer -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/2] armv7 initial device passthrough support
On 6/25/2013 12:27 AM, Stuart Yoder wrote: We should avoid creating an environment with important functionality outside of the main tree, if at all possible. Also, as we architect that generic infrastructure we need to keep in mind that there are important use cases for doing I/O in user space that are not KVM guests-- just normal applications that need direct device access. Yes that's a good point especially data plane NE, also LTE has these use cases at the radio side. Stuart -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: RFC: vfio interface for platform devices (v2)
I'm having trouble understanding how this works where the Guest Device Model != Host. How do you inform the guest where the device is mapped in its physical address space, and handle GPA faults? - Mario On 7/3/2013 11:40 PM, Yoder Stuart-B08248 wrote: Version 2 -VFIO_GROUP_GET_DEVICE_FD-- specified that the path is a sysfs path -VFIO_DEVICE_GET_INFO-- defined 2 flags instead of 1 -deleted VFIO_DEVICE_GET_DEVTREE_INFO ioctl -VFIO_DEVICE_GET_REGION_INFO-- updated as per AlexW's suggestion, defined 5 new flags and associated structs -VFIO_DEVICE_GET_IRQ_INFO-- updated as per AlexW's suggestion, defined 1 new flag and associated struct -removed redundant example -- VFIO for Platform Devices The existing kernel interface for vfio-pci is pretty close to what is needed for platform devices: -mechanism to create a container -add groups/devices to a container -set the IOMMU model -map DMA regions -get an fd for a specific device, which allows user space to determine info about device regions (e.g. registers) and interrupt info -support for mmapping device regions -mechanism to set how interrupts are signaled Many platform device are simple and consist of a single register region and a single interrupt. For these types of devices the existing vfio interfaces should be sufficient. However, platform devices can get complicated-- logically represented as a device tree hierarchy of nodes. For devices with multiple regions and interrupts, new mechanisms are needed in vfio to correlate the regions/interrupts with the device tree structure that drivers use to determine the meaning of device resources. In some cases there are relationships between device, and devices reference other devices using phandle links. The kernel won't expose relationships between devices, but just exposes mappable register regions and interrupts. The changes needed for vfio are around some of the device tree related info that needs to be available with the device fd. 1. 
VFIO_GROUP_GET_DEVICE_FD User space knows by out-of-band means which device it is accessing and will call VFIO_GROUP_GET_DEVICE_FD passing a specific sysfs path to get the device information: fd = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, /sys/bus/platform/devices/ffe21.usb)); 2. VFIO_DEVICE_GET_INFO The number of regions corresponds to the regions defined in reg and ranges in the device tree. Two new flags are added to struct vfio_device_info: #define VFIO_DEVICE_FLAGS_PLATFORM (1 ?) /* A platform bus device */ #define VFIO_DEVICE_FLAGS_DEVTREE (1 ?) /* device tree info available */ It is possible that there could be platform bus devices that are not in the device tree, so we use 2 flags to allow for that. If just VFIO_DEVICE_FLAGS_PLATFORM is set, it means that there are regions and IRQs but no device tree info available. If just VFIO_DEVICE_FLAGS_DEVTREE is set, it means there is device tree info available. 3. VFIO_DEVICE_GET_REGION_INFO For platform devices with multiple regions, information is needed to correlate the regions with the device tree structure that drivers use to determine the meaning of device resources. The VFIO_DEVICE_GET_REGION_INFO is extended to provide device tree information. The following information is needed: -the device tree path to the node corresponding to the region -whether it corresponds to a reg or ranges property -there could be multiple sub-regions per reg or ranges and the sub-index within the reg/ranges is needed There are 5 new flags added to vfio_region_info : struct vfio_region_info { __u32 argsz; __u32 flags; #define VFIO_REGION_INFO_FLAG_CACHEABLE (1 ?) #define VFIO_DEVTREE_REGION_INFO_FLAG_REG (1 ?) #define VFIO_DEVTREE_REGION_INFO_FLAG_RANGE (1 ?) #define VFIO_DEVTREE_REGION_INFO_FLAG_INDEX (1 ?) #define VFIO_DEVTREE_REGION_INFO_FLAG_PATH (1 ?) 
__u32 index; /* Region index */ __u32 resv; /* Reserved for alignment */ __u64 size; /* Region size (bytes) */ __u64 offset; /* Region offset from start of device fd */ }; VFIO_REGION_INFO_FLAG_CACHEABLE -if set indicates that the region must be mapped as cacheable VFIO_DEVTREE_REGION_INFO_FLAG_REG -if set indicates that the region corresponds to a reg property in the device tree representation of the device VFIO_DEVTREE_REGION_INFO_FLAG_RANGE -if set indicates that the region corresponds to a ranges property in the device tree representation of the device VFIO_DEVTREE_REGION_INFO_FLAG_INDEX -if set indicates that there is a
huge 2nd stage pages and live migration
Hello I've been working on live migration for ARM-KVM, and noticed a problem completing migration with huge 2nd stage tables. After write protecting the VM, for write fault 512 page bits are set in dirty_bitmap[] to take into account future writes to huge page. The pmd is write protected again when QEMU reads the dirty log, and the cycle repeats. With this not even an idle 32MB VM completes live migration. If QEMU uses THPs, and 2nd stage tables use pte's, then there is no problem, live migration is quick. I'm assuming QEMU and Guest huge pages with 2nd stage page table pte's should work fine too. I'm wondering how this has been solved (for any architecture)? - Mario -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 0/3] migration dirty bitmap support ARMv7
The patch set supports migration dirty bitmap support implementation for arm-kvm. Splitting of pmd's to pte's as suggested is implemented on demand when migration is started. I tested it on 4-way SMP ARMv7, with SMP guests. 2GB VMs with dirty shared memory segments up to 1.8 GB and relatively fast update rates 16Mb/5mS. Next course of action would be rmap support which scales much better on bigger systems. Although one thing that confused me, x86 migrations were sometimes 10 to 15 times slower, I think it must be something wrong with my configuration. Mario Smarduch (3): headers for migration dirtybitmap support initial write protect of VM address space and on dirty log read hooks to interface with QEMU for initial write protect, dirty log read arch/arm/include/asm/kvm_host.h |9 +++ arch/arm/kvm/arm.c | 62 ++- arch/arm/kvm/mmu.c | 158 ++- 3 files changed, 226 insertions(+), 3 deletions(-) -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/3] migration dirtybitmap support ARMv7
- support QEMU interface for initial VM Write Protect - QEMU Dirty bit map log retrieval Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/kvm/arm.c | 62 +++- 1 file changed, 61 insertions(+), 1 deletion(-) diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index bd18bb8..9076e3d 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -241,6 +241,8 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old, enum kvm_mr_change change) { + if ((change != KVM_MR_DELETE) (mem-flags KVM_MEM_LOG_DIRTY_PAGES)) + kvm_mmu_slot_remove_write_access(kvm, mem-slot); } void kvm_arch_flush_shadow_all(struct kvm *kvm) @@ -773,9 +775,67 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } } +/* + * Walks the memslot dirty bitmap, write protects dirty pages for next rount, + * and stores the dirty bitmap fo QEMU retrieval. + * + */ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { - return -EINVAL; + int r; + struct kvm_memory_slot *memslot; + unsigned long n, i; + unsigned long *dirty_bitmap; + unsigned long *dirty_bitmap_buffer; + bool is_dirty = false; + gfn_t offset; + + mutex_lock(kvm-slots_lock); + r = -EINVAL; + + if (log-slot = KVM_USER_MEM_SLOTS) + goto out; + + memslot = id_to_memslot(kvm-memslots, log-slot); + dirty_bitmap = memslot-dirty_bitmap; + + r = -ENOENT; + if (!dirty_bitmap) + goto out; + + n = kvm_dirty_bitmap_bytes(memslot); + dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long); + memset(dirty_bitmap_buffer, 0, n); + + spin_lock(kvm-mmu_lock); + for (i = 0; i n / sizeof(long); i++) { + unsigned long mask; + + if (!dirty_bitmap[i]) + continue; + + is_dirty = true; + offset = i * BITS_PER_LONG; + kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, + dirty_bitmap[i]); + mask = dirty_bitmap[i]; + dirty_bitmap_buffer[i] = mask; + dirty_bitmap[i] = 0; + } + + if (is_dirty) + kvm_tlb_flush_vmid(kvm); + + spin_unlock(kvm-mmu_lock); + r = -EFAULT; + + if (copy_to_user(log-dirty_bitmap, 
dirty_bitmap_buffer, n)) + goto out; + + r = 0; +out: + mutex_unlock(kvm-slots_lock); + return r; } static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/3] migration dirtybitmap support ARMv7
- Support write protection of entire VM address space - Split pmds section in migration mode - Write protect dirty pages on Dirty log read Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/kvm/mmu.c | 158 +++- 1 file changed, 156 insertions(+), 2 deletions(-) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 7789857..502e776 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -56,6 +56,13 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa); } +void kvm_tlb_flush_vmid(struct kvm *kvm) +{ + phys_addr_t x; + /* based on function description 2nd argument is irrelevent */ + kvm_tlb_flush_vmid_ipa(kvm, x); +} + static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, int min, int max) { @@ -639,6 +646,143 @@ static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap) return false; } +/* + * Called when QEMU retrieves the dirty log and write protects dirty pages + * for next QEMU call to retrieve the dirty logn + */ +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + phys_addr_t ipa; + pgd_t *pgdp = kvm-arch.pgd, *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte, new_pte; + + while (mask) { + ipa = (slot-base_gfn + gfn_offset + __ffs(mask)) PAGE_SHIFT; + pgd = pgdp + pgd_index(ipa); + if (!pgd_present(*pgd)) + goto update_mask; + pud = pud_offset(pgd, ipa); + if (!pud_present(*pud)) + goto update_mask; + pmd = pmd_offset(pud, ipa); + if (!pmd_present(*pmd)) + goto update_mask; + pte = pte_offset_kernel(pmd, ipa); + if (!pte_present(*pte)) + goto update_mask; + if ((*pte L_PTE_S2_RDWR) == L_PTE_S2_RDONLY) + goto update_mask; + new_pte = pfn_pte(pte_pfn(*pte), PAGE_S2); + *pte = new_pte; +update_mask: + mask = mask - 1; + } +} + +/* + * In migration splits PMDs into PTEs to keep track of dirty pages. Without + * spliting light execution prevents migration. 
+ */ +bool split_pmd(struct kvm *kvm, pmd_t *pmd, u64 addr) +{ + struct page *page; + pfn_t pfn = pmd_pfn(*pmd); + pte_t *pte, new_pte; + int i; + + page = alloc_page(GFP_KERNEL); + if (page == NULL) + return false; + + pte = page_address(page); + for (i = 0; i PMD_SIZE/PAGE_SIZE; i++) { + new_pte = pfn_pte(pfn+i, PAGE_S2); + pte[i] = new_pte; + } + kvm_clean_pte(pte); + pmd_populate_kernel(NULL, pmd, pte); + + /* + * flush the whole TLB for VM relying on hardware broadcast + */ + kvm_tlb_flush_vmid(kvm); + get_page(virt_to_page(pte)); + return true; +} + +/* + * Called from QEMU when migration dirty logging is started. Write the protect + * current set. Future faults writes are tracked through WP of when dirty log + * log. + */ + +void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte, new_pte; + pgd_t *pgdp = kvm-arch.pgd; + struct kvm_memory_slot *memslot = id_to_memslot(kvm-memslots, slot); + u64 start = memslot-base_gfn PAGE_SHIFT; + u64 end = (memslot-base_gfn + memslot-npages) PAGE_SHIFT; + u64 addr = start, addr1; + + spin_lock(kvm-mmu_lock); + kvm-arch.migration_in_progress = true; + while (addr end) { + if (need_resched() || spin_needbreak(kvm-mmu_lock)) { + kvm_tlb_flush_vmid(kvm); + cond_resched_lock(kvm-mmu_lock); + } + + pgd = pgdp + pgd_index(addr); + if (!pgd_present(*pgd)) { + addr = pgd_addr_end(addr, end); + continue; + } + + pud = pud_offset(pgd, addr); + if (pud_huge(*pud) || !pud_present(*pud)) { + addr = pud_addr_end(addr, end); + continue; + } + + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) { + addr = pmd_addr_end(addr, end); + continue; + } + + if (kvm_pmd_huge(*pmd)) { + if (!split_pmd(kvm, pmd, addr)) { + kvm-arch.migration_in_progress = false; + return; + } + addr = pmd_addr_end(addr
[PATCH 1/3] migration dirtybitmap support ARMv7
Headers for migration, prototypes Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |9 + 1 file changed, 9 insertions(+) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 098f7dd..9b71f13 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -67,6 +67,7 @@ struct kvm_arch { /* Interrupt controller */ struct vgic_distvgic; + int migration_in_progress; }; #define KVM_NR_MEM_OBJS 40 @@ -228,4 +229,12 @@ int kvm_perf_teardown(void); u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); +void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); + +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask); + +void kvm_tlb_flush_vmid(struct kvm *kvm); + #endif /* __ARM_KVM_HOST_H__ */ -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 0/3] migration dirty bitmap support ARMv7
Hi Marc, Thanks for the feedback, very valuable going forward. I'll clean the patch up and repost with much more documentation. One key error you pointed out, and I overlooked but knew about it, is to error out from get dirty log if whole VM can't be WPed, at that point QEMU will abort. - Mario -Original Message- From: Marc Zyngier [mailto:marc.zyng...@arm.com] Sent: Tuesday, April 15, 2014 2:13 AM To: Mario Smarduch Cc: kvm...@lists.cs.columbia.edu; christoffer.d...@linaro.org; 이정석; 정성진; kvm@vger.kernel.org Subject: Re: [PATCH 0/3] migration dirty bitmap support ARMv7 Mario, On 15/04/14 02:24, Mario Smarduch wrote: The patch set supports migration dirty bitmap support implementation for arm-kvm. Spliting of pmd's to pte's as suggested is implemented on demand when migration is started. I tested it on 4-way SMP ARMv7, with SMP guests. 2GB VMs with dirty shared memory segments upto 1.8 GB and relatively fast update rates 16Mb/5mS. Next course of action would be rmap support which scales much better on bigger systems. Although one think that confused me, x86 migrations were sometimes 10 to 15 times slower, I think it must be something wrong with my configuration. Mario Smarduch (3): headers for migration dirtybitmap support initial write protect of VM address space and on dirty log read hooks to interface with QEMU for initial write protect, dirty log read arch/arm/include/asm/kvm_host.h |9 +++ arch/arm/kvm/arm.c | 62 ++- arch/arm/kvm/mmu.c | 158 ++- 3 files changed, 226 insertions(+), 3 deletions(-) Overall, I think this patch series requires some work, the most glaring issue being the total lack of documentation (even by my own standards). Also, it is fairly obvious that ARMv8 should be done at the same time (as all that code is shared, and will avoid API abuse...), and that a fair amount of the code is actually generic across architecture, and should be shared. I'm very much looking forward to the next iteration of this series. Thanks, M. -- Jazz is not dead. 
It just smells funny...
Re: [PATCH 2/3] migration dirtybitmap support ARMv7
On 04/15/2014 01:58 AM, Marc Zyngier wrote: Why do you nuke the whole TLBs for this VM? I assume you're going to repeatedly call this for all the huge pages, aren't you? Can you delay this flush to do it only once? +get_page(virt_to_page(pte)); +return true; +} + +/* + * Called from QEMU when migration dirty logging is started. Write the protect + * current set. Future faults writes are tracked through WP of when dirty log + * log. Same as above. + */ + +void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) +{ +pgd_t *pgd; +pud_t *pud; +pmd_t *pmd; +pte_t *pte, new_pte; +pgd_t *pgdp = kvm-arch.pgd; +struct kvm_memory_slot *memslot = id_to_memslot(kvm-memslots, slot); +u64 start = memslot-base_gfn PAGE_SHIFT; +u64 end = (memslot-base_gfn + memslot-npages) PAGE_SHIFT; +u64 addr = start, addr1; + +spin_lock(kvm-mmu_lock); +kvm-arch.migration_in_progress = true; +while (addr end) { +if (need_resched() || spin_needbreak(kvm-mmu_lock)) { +kvm_tlb_flush_vmid(kvm); Looks like you're extremely flush happy. If you're holding the lock, why do you need all the extra flushes in the previous function? Reduced it to one flush, upon termination of the write protect loop. + +if (kvm_pmd_huge(*pmd)) { +if (!split_pmd(kvm, pmd, addr)) { +kvm-arch.migration_in_progress = false; +return; Bang, you're dead. Yes added the unlock, also added return code in get dirty log function to abort migration. pte_t new_pte = pfn_pte(pfn, PAGE_S2); if (writable) { +if (migration_active hugetlb) { +/* get back pfn from fault_ipa */ +pfn += (fault_ipa PAGE_SHIFT) +((1 (PMD_SHIFT - PAGE_SHIFT))-1); +new_pte = pfn_pte(pfn, PAGE_S2); Please explain this. Next patch series will update this, there was another problem of handling pmd huge pages and directing them to pte handling. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 3/3] migration dirtybitmap support ARMv7
Hi Eric, Mark - what repository should I use to pick up Eric patches? For kvm_vm_ioctl_get_dirty_log() not sure what to make generic it appears generic enough and it does what it needs to do? Thanks, Mario On 04/15/2014 02:06 AM, Marc Zyngier wrote: On 15/04/14 02:24, Mario Smarduch wrote: - support QEMU interface for initial VM Write Protect - QEMU Dirty bit map log retrieval Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/kvm/arm.c | 62 +++- 1 file changed, 61 insertions(+), 1 deletion(-) diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index bd18bb8..9076e3d 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -241,6 +241,8 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old, enum kvm_mr_change change) { +if ((change != KVM_MR_DELETE) (mem-flags KVM_MEM_LOG_DIRTY_PAGES)) +kvm_mmu_slot_remove_write_access(kvm, mem-slot); } There is a patch by Eric Auger doing the same thing. Please use it as a dependency. void kvm_arch_flush_shadow_all(struct kvm *kvm) @@ -773,9 +775,67 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } } +/* + * Walks the memslot dirty bitmap, write protects dirty pages for next rount, + * and stores the dirty bitmap fo QEMU retrieval. 
+ * + */ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { -return -EINVAL; +int r; +struct kvm_memory_slot *memslot; +unsigned long n, i; +unsigned long *dirty_bitmap; +unsigned long *dirty_bitmap_buffer; +bool is_dirty = false; +gfn_t offset; + +mutex_lock(kvm-slots_lock); +r = -EINVAL; + +if (log-slot = KVM_USER_MEM_SLOTS) +goto out; + +memslot = id_to_memslot(kvm-memslots, log-slot); +dirty_bitmap = memslot-dirty_bitmap; + +r = -ENOENT; +if (!dirty_bitmap) +goto out; + +n = kvm_dirty_bitmap_bytes(memslot); +dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long); +memset(dirty_bitmap_buffer, 0, n); + +spin_lock(kvm-mmu_lock); +for (i = 0; i n / sizeof(long); i++) { +unsigned long mask; + +if (!dirty_bitmap[i]) +continue; + +is_dirty = true; +offset = i * BITS_PER_LONG; +kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, +dirty_bitmap[i]); +mask = dirty_bitmap[i]; +dirty_bitmap_buffer[i] = mask; +dirty_bitmap[i] = 0; +} + +if (is_dirty) +kvm_tlb_flush_vmid(kvm); + +spin_unlock(kvm-mmu_lock); +r = -EFAULT; + +if (copy_to_user(log-dirty_bitmap, dirty_bitmap_buffer, n)) +goto out; + +r = 0; +out: +mutex_unlock(kvm-slots_lock); +return r; } This is a direct copy of the x86 code. Please make it generic. static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, Thanks, M. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 0/5] live migration dirty bitmap support for ARMv7
Revised iteration after initial comments. Still just for ARMv7. I looked at the ARMv8 code and yes it practically appears to reuse most of fault handling in ARMv7, I wasn't aware so much code was in common. But before then want to make sure it's reliable on real hardware. This patch adds support for ARMv7 Live Migration, primarily dirty bit map management is added. The patch follows the normal migration flow managed by user space, first write protecting the entire address space and later keeping track of dirty pages. In the process of initial write protection, and first time faults huge pages are broken up into small pages to support migration on loaded systems. Mario Smarduch (5): add ARMv7 HYP API to flush VM TLBs without address param live migration support for initial write protect of VM to manage dirty pages live migration support for VM dirty log management add 2nd stage page fault handling during live migration add kvm_arch glogal live migration variable arch/arm/include/asm/kvm_asm.h |1 + arch/arm/include/asm/kvm_host.h |7 ++ arch/arm/kvm/arm.c | 75 - arch/arm/kvm/interrupts.S |5 + arch/arm/kvm/mmu.c | 220 ++- 5 files changed, 305 insertions(+), 3 deletions(-) -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/5] live migration support for initial write protect of VM
Add support for initial write protection of guest VM, to later manage dirty pages. Reduced TLB flushing to one flush after memory region is write protected. This is based on Erics patch, which applied cleanly. The only patch I found in the archives was the memory region delete, but still in arm.c. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |1 + arch/arm/kvm/arm.c |4 ++ arch/arm/kvm/mmu.c | 125 +++ 3 files changed, 130 insertions(+) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 315e3f3..7ac1fdc 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -229,5 +229,6 @@ u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); void kvm_tlb_flush_vm(struct kvm *kvm); +void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 9a4bc10..7714cc6 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -249,6 +249,10 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, unmap_stage2_range(kvm, gpa, size); spin_unlock(kvm-mmu_lock); } + + /* Request has been issued to migrate the guest, 1st write protect VM */ + if ((change != KVM_MR_DELETE) (mem-flags KVM_MEM_LOG_DIRTY_PAGES)) + kvm_mmu_slot_remove_write_access(kvm, mem-slot); } void kvm_arch_flush_shadow_all(struct kvm *kvm) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index d7a1846..b85ab56 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -648,6 +648,131 @@ static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap) return false; } +/** + * split_pmd - splits huge pages into small pages, required to keep a dirty + * log of small memory granules, otherwise huge pages would need to be + * migrated. Practically an idle system has problems migrating with + * huge pages. 
Called during WP of entire VM address space, done + * initially when migration thread isses the KVM_MEM_LOG_DIRTY_PAGES ioctl. + * mmu_lock lock must be acquired by caller + * + * @kvm:The KVM pointer + * @pmd:pmd to 2nd stage huge page + * @addr: ` Guest Physical Address + */ +static bool split_pmd(struct kvm *kvm, pmd_t *pmd, u64 addr) +{ + struct page *page; + pfn_t pfn = pmd_pfn(*pmd); + pte_t *pte, new_pte; + int i; + + page = alloc_page(GFP_KERNEL); + if (page == NULL) + return false; + + pte = page_address(page); + /* first break up the huge page into small page pte's */ + for (i = 0; i PTRS_PER_PMD; i++) { + new_pte = pfn_pte(pfn+i, PAGE_S2); + pte[i] = new_pte; + } + kvm_clean_pte(pte); + /* now set the pmd to pte table */ + pmd_populate_kernel(NULL, pmd, pte); + + get_page(virt_to_page(pte)); + return true; +} + + +/** + * kvm_mmu_slot_remove_access - write protects entire VM address space. + * Called at start of migration when KVM_MEM_LOG_DIRTY_PAGES ioctl is + * issued. After this function returns all pages - minus the ones faulted + * in when mmu_lock is released, but those pages will be marked in dirty log + * and are not forgotten. + * + * Initial VM write protect sweep is required to keep track of dirty pages for + * subsequent memory region dirty log retrieval. 
+ * - mmu_lock is held during - protect against concurent faults, mmu notifier + *invalidate/unmap/update user pte, or direct device write to guest memory + * + * @kvm:The KVM pointer + * @slot: The memory slot the dirty log is retrieved for + */ +void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte, new_pte; + pgd_t *pgdp = kvm-arch.pgd; + struct kvm_memory_slot *memslot = id_to_memslot(kvm-memslots, slot); + u64 start = memslot-base_gfn PAGE_SHIFT; + u64 end = (memslot-base_gfn + memslot-npages) PAGE_SHIFT; + u64 addr = start; + + spin_lock(kvm-mmu_lock); + kvm-arch.migration_in_progress = 1; + while (addr end) { + /* Relieve contention for mmu_lock. there is no need to flush +* TLBs here. TLB updates will be picked up on TLB refills or +* flush of VM TLBs. The important things is after you terminate +* loop all pmds have been split, write protected and visible +*/ + if (need_resched() || spin_needbreak(kvm-mmu_lock)) + cond_resched_lock(kvm-mmu_lock); + + pgd = pgdp + pgd_index(addr); + if (!pgd_present(*pgd)) { + addr = pgd_addr_end(addr, end
[PATCH 1/5] add ARMv7 HYP API to flush VM TLBs without address param
Add HYP API to invalidate all VM TLBs without passing address parameter, that kvm_tlb_flush_vmid_ipa() uses. Hopefully this is a valid way to do it. Tests show nothing is broken. The address parameter is confusing since whole VM is being invalidated. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_asm.h |1 + arch/arm/include/asm/kvm_host.h |2 ++ arch/arm/kvm/interrupts.S |5 + arch/arm/kvm/mmu.c |9 + 4 files changed, 17 insertions(+) diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index 661da11..090398d 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -76,6 +76,7 @@ extern char __kvm_hyp_code_end[]; extern void __kvm_flush_vm_context(void); extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa); +extern void __kvm_tlb_flush_vm(struct kvm *kvm); extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); #endif diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 098f7dd..315e3f3 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -228,4 +228,6 @@ int kvm_perf_teardown(void); u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); +void kvm_tlb_flush_vm(struct kvm *kvm); + #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S index 0d68d40..f81c228 100644 --- a/arch/arm/kvm/interrupts.S +++ b/arch/arm/kvm/interrupts.S @@ -45,8 +45,13 @@ __kvm_hyp_code_start: * * As v7 does not support flushing per IPA, just nuke the whole TLB * instead, ignoring the ipa value. + * + * void __kvm_tlb_flush_vm(struct kvm *kvm) - alias on ARMv7 to flush all VM + * TLBs, with no need to pass IPA. Eliminate confusing code which flushes + * whole VM but still requires an IPA which is unused. 
*/ ENTRY(__kvm_tlb_flush_vmid_ipa) +ENTRY(__kvm_tlb_flush_vm) push{r2, r3} dsb ishst diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index e8580e2..d7a1846 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -56,6 +56,15 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa); } +/* Flushes entire VMs TLBs, for ARMv7 reuses __kvm_tlb_flush_vmid_ipa + * interface without the misleading address argument + */ +void kvm_tlb_flush_vm(struct kvm *kvm) +{ + if (kvm) + kvm_call_hyp(__kvm_tlb_flush_vm, kvm); +} + static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, int min, int max) { -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/5] live migration support for VM dirty log management
Add support for dirty bitmap management. Wanted to make it generic but function does a couple things different then the x86 version. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |3 ++ arch/arm/kvm/arm.c | 71 ++- arch/arm/kvm/mmu.c | 53 + 3 files changed, 126 insertions(+), 1 deletion(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 7ac1fdc..16ed4e4 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -230,5 +230,8 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); void kvm_tlb_flush_vm(struct kvm *kvm); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask); #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 7714cc6..7882343 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -785,9 +785,78 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } } + +/** + * kvm_mmu_slot_remove_access - retrieves the log of dirty pages for a memslot. + * It's itteratively during migration to retrieve pages written since + * last call. In the process write protects ptes that are dirty for next + * time, holds the mmu_lock while write protecting dirty pages. + * + * @kvm:The KVM pointer + * @log:Bitmap of dirty pages return. 
+ */ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { - return -EINVAL; + int r; + struct kvm_memory_slot *memslot; + unsigned long n, i; + unsigned long *dirty_bitmap; + unsigned long *dirty_bitmap_buffer; + bool is_dirty = false; + gfn_t offset; + + mutex_lock(kvm-slots_lock); + r = -EINVAL; + + /* Return with error code will cause migration to abort, this happens +* when initial write protection of VM to manage dirty pages fails +*/ + if (kvm-arch.migration_in_progress == -1) + goto out; + + if (log-slot = KVM_USER_MEM_SLOTS) + goto out; + + memslot = id_to_memslot(kvm-memslots, log-slot); + dirty_bitmap = memslot-dirty_bitmap; + + r = -ENOENT; + if (!dirty_bitmap) + goto out; + + n = kvm_dirty_bitmap_bytes(memslot); + dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long); + memset(dirty_bitmap_buffer, 0, n); + + spin_lock(kvm-mmu_lock); + for (i = 0; i n / sizeof(long); i++) { + unsigned long mask; + + if (!dirty_bitmap[i]) + continue; + + is_dirty = true; + offset = i * BITS_PER_LONG; + kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, + dirty_bitmap[i]); + mask = dirty_bitmap[i]; + dirty_bitmap_buffer[i] = mask; + dirty_bitmap[i] = 0; + } + + if (is_dirty) + kvm_tlb_flush_vm(kvm); + + spin_unlock(kvm-mmu_lock); + r = -EFAULT; + + if (copy_to_user(log-dirty_bitmap, dirty_bitmap_buffer, n)) + goto out; + + r = 0; +out: + mutex_unlock(kvm-slots_lock); + return r; } static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index b85ab56..47bec1c 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -773,6 +773,59 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) spin_unlock(kvm-mmu_lock); } + +/** + * kvm_mmu_write_protected_pt_masked - after migration thread write protects + * the entire VM address space itterative calls are made to get diry pages + * as the VM pages are being migrated. 
New dirty pages may be subset + * of initial WPed VM or new writes faulted in. Here write protect new + * dirty pages again in preparation of next dirty log read. This function is + * called as a result KVM_GET_DIRTY_LOG ioctl, to determine what pages + * need to be migrated. + * 'kvm-mmu_lock' must be held to protect against concurrent modification + * of page tables (2nd stage fault, mmu modifiers, device writes) + * + * @kvm:The KVM pointer + * @slot: The memory slot the dirty log is retrieved for + * @gfn_offset: The gfn offset in memory slot + * @mask: The mask of dirty pages at offset 'gnf_offset in this memory + * slot to be writ protect + */ + +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + phys_addr_t ipa; + pgd_t *pgdp = kvm-arch.pgd, *pgd; + pud_t *pud; + pmd_t *pmd
[PATCH 4/5] add 2nd stage page fault handling during live migration
Additional logic to handle second stage page faults during migration. Primarily page faults are prevented from creating huge pages. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/kvm/mmu.c | 33 +++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 47bec1c..ebec33c 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -839,6 +839,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_mmu_memory_cache *memcache = vcpu-arch.mmu_page_cache; struct vm_area_struct *vma; pfn_t pfn; + bool migration_active; write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu)); if (fault_status == FSC_PERM !write_fault) { @@ -890,12 +891,22 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, return -EFAULT; spin_lock(kvm-mmu_lock); + /* place inside lock to prevent race condition when whole VM is being +* write proteced initially, prevent pmd update if it's split up. +*/ + migration_active = vcpu-kvm-arch.migration_in_progress; + if (mmu_notifier_retry(kvm, mmu_seq)) goto out_unlock; - if (!hugetlb !force_pte) + + /* During migration don't rebuild huge pages */ + if (!hugetlb !force_pte !migration_active) hugetlb = transparent_hugepage_adjust(pfn, fault_ipa); - if (hugetlb) { + /* Steer away from installing PMDs if migrating, migration failed, +* or this an initial page fault. Migrating huge pages is too slow. +*/ + if (!migration_active hugetlb) { pmd_t new_pmd = pfn_pmd(pfn, PAGE_S2); new_pmd = pmd_mkhuge(new_pmd); if (writable) { @@ -907,6 +918,22 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } else { pte_t new_pte = pfn_pte(pfn, PAGE_S2); if (writable) { + /* First convert huge page pfn to normal 4k page pfn, +* while migration is in progress. 
+* Second in migration mode and rare case where +* splitting of huge pages fails check if pmd is +* mapping a huge page if it is then clear it so +* stage2_set_pte() can map in a small page. +*/ + if (migration_active hugetlb) { + pmd_t *pmd; + pfn += (fault_ipa PAGE_SHIFT) + (PTRS_PER_PMD-1); + new_pte = pfn_pte(pfn, PAGE_S2); + pmd = stage2_get_pmd(kvm, NULL, fault_ipa); + if (pmd kvm_pmd_huge(*pmd)) + clear_pmd_entry(kvm, pmd, fault_ipa); + } kvm_set_s2pte_writable(new_pte); kvm_set_pfn_dirty(pfn); } @@ -914,6 +941,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, ret = stage2_set_pte(kvm, memcache, fault_ipa, new_pte, false); } + if (writable) + mark_page_dirty(kvm, gfn); out_unlock: spin_unlock(kvm-mmu_lock); -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 5/5] add kvm_arch global live migration variable
This should be in an earlier patch, omitted by mistake. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 16ed4e4..d77c425 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -67,6 +67,7 @@ struct kvm_arch { /* Interrupt controller */ struct vgic_dist vgic; + int migration_in_progress; }; #define KVM_NR_MEM_OBJS 40 -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 3/5] live migration support for VM dirty log management
MZ So let's play the difference game with x86: int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log kvm_vm_ioctl_get_dirty_log() is identical now to x86 version moved it to kvm_main.c, to make it generic, it's declared weak. Do I go into x86 and remove that function? Or leave it to x86 folks to do it? -Original Message- + * + } + + if (is_dirty) + kvm_tlb_flush_vm(kvm); MZ This can be easily abstracted to be a kvm_flush_remote_tlbs on x86, and a HW broadcast on ARM. Kvm_tlb_flush_vm() is replaced with kvm_flush_remote_tlbs() I made that function weak and declared a ARM version, in arm mmu.c The current version sends IPIs to vCPU running the guest, ARMv7+ does not need that. Is that ok? + /* walk set bits in the mask and write protect corresponding pages */ + while (mask) { + ipa = (slot-base_gfn + gfn_offset + __ffs(mask)) PAGE_SHIFT; + pgd = pgdp + pgd_index(ipa); + if (!pgd_present(*pgd)) + goto update_mask; MZ I think something is wrong in your logic. If there is no PGD, it means a whole 1GB isn't present. Yet you're just clearing one bit from the mask and doing it again. As you're only looking at BITS_PER_LONG MZ contiguous pages at a time, it is likely that the same thing will happen for the other pages, and you're just wasting precious CPU cycles here. Yes this is grossly inefficient, I updated it to walk ptes only, after first determining if it straddles a pmd. Should mostly be pte walks with maybe one PMD walk but unlikely. + new_pte = pfn_pte(pte_pfn(*pte), PAGE_S2); + *pte = new_pte; MZ I'd like to see these two lines in a separate function (something like stage2_mark_pte_ro)... Yes ok. (emailed from outlook client) -- Jazz is not dead. It just smells funny. N떑꿩�r툤y鉉싕b쾊Ф푤v�^�)頻{.n�+돴ㅎh㎍썳變}찠꼿쟺�j:+v돣�쳭喩zZ+�+zf"톒쉱�~넮녬i鎬z�췿ⅱ�?솳鈺��)刪f
RE: [PATCH 4/5] add 2nd stage page fault handling during live migration
Hi Marc, + if (migration_active && hugetlb) { + pmd_t *pmd; + pfn += (fault_ipa >> PAGE_SHIFT) + (PTRS_PER_PMD-1); MZ> Shouldn't that be pfn += pte_index(fault_addr);? I'll change it — much cleaner. } + if (writable) MZ> Shouldn't that be done only when migration is active? The convention in other architectures is to call it any time a page is dirty; the function checks whether the dirty map is allocated, and if not it simply returns. + mark_page_dirty(kvm, gfn); out_unlock: spin_unlock(&kvm->mmu_lock); -- Jazz is not dead. It just smells funny.
[PATCH v3 0/4] live migration dirty bitmap support for ARMv7
Hi, this is the third iteration of live migration support, for the time being on ARMv7. The patches depend on Eric Auger's patch for memory regions. Changes since v2: - move initial VM write protect to memory region architecture prepare function (needed to make dirty logging function generic) - added stage2_mark_pte_ro() - to mark ptes ro - Marc's comment - optimized initial VM memory region write protect to do fewer table lookups - applied Marc's comment for walking dirty bitmap mask - added pud_addr_end() for stage2 tables, to make the walk 4-level - added kvm_flush_remote_tlbs() to use ARM TLB invalidation, made the generic one weak - Marc's comment, for generic dirty bitmap log function - optimized walking dirty bitmap mask to skip upper tables - Marc's comment - deleted x86,arm kvm_vm_ioctl_get_dirty_log(), moved to kvm_main.c and tagged the function weak - Marc's comment - changed Data Abort handler pte index handling - Marc's comment Mario Smarduch (4): add ARMv7 HYP API to flush VM TLBs without address param live migration support for initial write protect of VM to track dirty pages live migration support for VM dirty log management add 2nd stage page fault handling during live migration arch/arm/include/asm/kvm_asm.h |1 + arch/arm/include/asm/kvm_host.h | 13 ++ arch/arm/kvm/arm.c |8 +- arch/arm/kvm/interrupts.S |5 + arch/arm/kvm/mmu.c | 303 ++- arch/x86/kvm/x86.c | 78 -- virt/kvm/kvm_main.c | 87 ++- 7 files changed, 409 insertions(+), 86 deletions(-) -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3 1/4] add ARMv7 HYP API to flush VM TLBs without address param
Add HYP interface for global VM TLB invalidation without address parameter. - Added ARM version of kvm_flush_remote_tlbs() Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_asm.h |1 + arch/arm/include/asm/kvm_host.h |2 ++ arch/arm/kvm/interrupts.S |5 + arch/arm/kvm/mmu.c | 10 ++ 4 files changed, 18 insertions(+) diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index 661da11..0eeaca1 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -76,6 +76,7 @@ extern char __kvm_hyp_code_end[]; extern void __kvm_flush_vm_context(void); extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa); +extern void __kvm_tlb_flush_vmid(struct kvm *kvm); extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); #endif diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 098f7dd..1e739f9 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -228,4 +228,6 @@ int kvm_perf_teardown(void); u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); +void kvm_tlb_flush_vmid(struct kvm *kvm); + #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S index 0d68d40..8620280 100644 --- a/arch/arm/kvm/interrupts.S +++ b/arch/arm/kvm/interrupts.S @@ -45,8 +45,12 @@ __kvm_hyp_code_start: * * As v7 does not support flushing per IPA, just nuke the whole TLB * instead, ignoring the ipa value. + * + * void __kvm_tlb_flush_vm(struct kvm *kvm) - alias on ARMv7 for global VM TLB + * flush with no address parameters. 
*/ ENTRY(__kvm_tlb_flush_vmid_ipa) +ENTRY(__kvm_tlb_flush_vmid) push{r2, r3} dsb ishst @@ -65,6 +69,7 @@ ENTRY(__kvm_tlb_flush_vmid_ipa) pop {r2, r3} bx lr ENDPROC(__kvm_tlb_flush_vmid_ipa) +ENDPROC(__kvm_tlb_flush_vmid) / * Flush TLBs and instruction caches of all CPUs inside the inner-shareable diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index e8580e2..7ab77f3 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -56,6 +56,16 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa); } +/* Function reuses __kvm_tlb_flush_vmid_ipa() HYP interface without additional + * address argument to flush entire VM TLBs. + */ +void kvm_flush_remote_tlbs(struct kvm *kvm) +{ + if (kvm) + kvm_call_hyp(__kvm_tlb_flush_vmid, kvm); +} + + static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, int min, int max) { -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3 2/4] live migration support for initial write protect of VM
Support for live migration initial write protect. - moved write protect to architecture memory region prepare function. This way you can fail, abort migration without keep track of migration status. - Above also allows to generalize read dirty log function with x86 - Added stage2_mark_pte_ro() - optimized initial write protect, skip upper table lookups - added stage2pmd_addr_end() to do generic 4 level table walk - changed kvm_flush_remote_tlbs() to weak function Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |8 ++ arch/arm/kvm/arm.c |3 + arch/arm/kvm/mmu.c | 163 +++ virt/kvm/kvm_main.c |5 +- 4 files changed, 178 insertions(+), 1 deletion(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 1e739f9..9f827c8 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -67,6 +67,12 @@ struct kvm_arch { /* Interrupt controller */ struct vgic_distvgic; + + /* Marks start of migration, used to handle 2nd stage page faults +* during migration, prevent installing huge pages and split huge pages +* to small pages. 
+*/ + int migration_in_progress; }; #define KVM_NR_MEM_OBJS 40 @@ -230,4 +236,6 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); void kvm_tlb_flush_vmid(struct kvm *kvm); +int kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); + #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 9a4bc10..b916478 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -233,6 +233,9 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, enum kvm_mr_change change) { + /* Request for migration issued by user, write protect memory slot */ + if ((change != KVM_MR_DELETE) (mem-flags KVM_MEM_LOG_DIRTY_PAGES)) + return kvm_mmu_slot_remove_write_access(kvm, mem-slot); return 0; } diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 7ab77f3..4d029a6 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -31,6 +31,11 @@ #include trace.h +#define stage2pud_addr_end(addr, end) \ +({ u64 __boundary = ((addr) + PUD_SIZE) PUD_MASK;\ + (__boundary - 1 (end) - 1) ? __boundary : (end); \ +}) + extern char __hyp_idmap_text_start[], __hyp_idmap_text_end[]; static pgd_t *boot_hyp_pgd; @@ -569,6 +574,15 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, return 0; } +/* Write protect page */ +static void stage2_mark_pte_ro(pte_t *pte) +{ + pte_t new_pte; + + new_pte = pfn_pte(pte_pfn(*pte), PAGE_S2); + *pte = new_pte; +} + /** * kvm_phys_addr_ioremap - map a device range to guest IPA * @@ -649,6 +663,155 @@ static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap) return false; } +/** + * split_pmd - splits huge pages to small pages, required to keep a dirty log of + * smaller memory granules, otherwise huge pages would need to be + * migrated. Practically an idle system has problems migrating with + * huge pages. 
Called during WP of entire VM address space, done + * initially when migration thread isses the KVM_MEM_LOG_DIRTY_PAGES + * ioctl. + * The mmu_lock is held during splitting. + * + * @kvm:The KVM pointer + * @pmd:Pmd to 2nd stage huge page + * @addr: ` Guest Physical Address + */ +int split_pmd(struct kvm *kvm, pmd_t *pmd, u64 addr) +{ + struct page *page; + pfn_t pfn = pmd_pfn(*pmd); + pte_t *pte; + int i; + + page = alloc_page(GFP_KERNEL); + if (page == NULL) + return -ENOMEM; + + pte = page_address(page); + /* cycle through ptes first, use pmd pfn */ + for (i = 0; i PTRS_PER_PMD; i++) { + pte[i] = pfn_pte(pfn+i, 0); + stage2_mark_pte_ro(pte[i]); + } + kvm_clean_pte(pte); + /* After page table setup set pmd */ + pmd_populate_kernel(NULL, pmd, pte); + + /* get reference on pte page */ + get_page(virt_to_page(pte)); + return 0; +} + +/** + * kvm_mmu_slot_remove_access - write protects entire VM address space. + * Called at start of migration when KVM_MEM_LOG_DIRTY_PAGES ioctl is + * issued. After this function returns all pages (minus the ones faulted + * in when mmu_lock is released) must be write protected to keep track of + * dirty pages to migrate on subsequent dirty log retrieval. + * mmu_lock is held during write protecting, released on contention. + * + * @kvm:The KVM pointer + * @slot: The memory slot the dirty log is retrieved for + */ +int
[PATCH v3 4/4] add 2nd stage page fault handling during live migration
- added pte_index() to add to pmd pfn Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/kvm/mmu.c | 31 +-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 52d4dd6..61ee812 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -924,6 +924,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_mmu_memory_cache *memcache = vcpu-arch.mmu_page_cache; struct vm_area_struct *vma; pfn_t pfn; + bool migration_active; write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu)); if (fault_status == FSC_PERM !write_fault) { @@ -975,12 +976,21 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, return -EFAULT; spin_lock(kvm-mmu_lock); + /* place inside lock to prevent race condition when whole VM is being +* write proteced. Prevent race of huge page install when migration is +* active. +*/ + migration_active = vcpu-kvm-arch.migration_in_progress; + if (mmu_notifier_retry(kvm, mmu_seq)) goto out_unlock; - if (!hugetlb !force_pte) + + /* During migration don't rebuild huge pages */ + if (!hugetlb !force_pte !migration_active) hugetlb = transparent_hugepage_adjust(pfn, fault_ipa); - if (hugetlb) { + /* During migration don't install new huge pages */ + if (hugetlb !migration_active) { pmd_t new_pmd = pfn_pmd(pfn, PAGE_S2); new_pmd = pmd_mkhuge(new_pmd); if (writable) { @@ -992,6 +1002,21 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } else { pte_t new_pte = pfn_pte(pfn, PAGE_S2); if (writable) { + /* First convert huge page pfn to normal 4k page pfn, +* while migration is in progress. +* Second in migration mode and rare case where +* splitting of huge pages fails check if pmd is +* mapping a huge page if it is then clear it so +* stage2_set_pte() can map in a small page. 
+*/ + if (migration_active hugetlb) { + pmd_t *pmd; + pfn += pte_index(fault_ipa); + new_pte = pfn_pte(pfn, PAGE_S2); + pmd = stage2_get_pmd(kvm, NULL, fault_ipa); + if (pmd kvm_pmd_huge(*pmd)) + clear_pmd_entry(kvm, pmd, fault_ipa); + } kvm_set_s2pte_writable(new_pte); kvm_set_pfn_dirty(pfn); } @@ -999,6 +1024,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, ret = stage2_set_pte(kvm, memcache, fault_ipa, new_pte, false); } + if (writable) + mark_page_dirty(kvm, gfn); out_unlock: spin_unlock(kvm-mmu_lock); -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3 3/4] live migration support for VM dirty log management
- made kvm_vm_ioctl_get_dirty_log() generic moved to kvm_main.c, deleted arm,x86 versions - optimized kvm_mmu_write_protected_pt_masked() to skip upper table lookups Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |3 ++ arch/arm/kvm/arm.c |5 -- arch/arm/kvm/mmu.c | 99 +++ arch/x86/kvm/x86.c | 78 -- virt/kvm/kvm_main.c | 82 5 files changed, 184 insertions(+), 83 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 9f827c8..c5c27d8 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -237,5 +237,8 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); void kvm_tlb_flush_vmid(struct kvm *kvm); int kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask); #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index b916478..6ca3e84 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -784,11 +784,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } } -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) -{ - return -EINVAL; -} - static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev_addr) { diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 4d029a6..52d4dd6 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -812,6 +812,105 @@ int kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) return 0; } +/** + * kvm_mmu_write_protected_pt_masked - after migration thread write protects + * the entire VM address space itterative call are made to get diry pags + * as the VM pages are being migrated. New dirty pages may be subset + * of initial WPed VM or new writes faulted in. Here write protect new + * dirty pages again in preparation of next dirty log read. 
This function is + * called as a result KVM_GET_DIRTY_LOG ioctl, to determine what pages + * need to be migrated. + * 'kvm-mmu_lock' must be held to protect against concurrent modification + * of page tables (2nd stage fault, mmu modifiers, ...) + * + * @kvm:The KVM pointer + * @slot: The memory slot the dirty log is retrieved for + * @gfn_offset: The gfn offset in memory slot + * @mask: The mask of dirty pages at offset 'gnf_offset in this memory + * slot to be writ protect + */ +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + phys_addr_t ipa, next, offset_ipa; + pgd_t *pgdp = kvm-arch.pgd, *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + gfn_t gfnofst = slot-base_gfn + gfn_offset; + bool crosses_pmd; + + ipa = (gfnofst + __ffs(mask)) PAGE_SHIFT; + offset_ipa = gfnofst PAGE_SHIFT; + next = (gfnofst + (BITS_PER_LONG - 1)) PAGE_SHIFT; + + /* check if mask width crosses 2nd level page table range, and +* possibly 3rd, 4th. If not skip upper table lookups. Unlikely +* to be true machine memory regions tend to start on atleast PMD +* boundary and mask is a power of 2. +*/ + crosses_pmd = ((offset_ipa PMD_MASK) ^ (next PMD_MASK)) ? true : + false; + /* If pgd, pud, pmd not present and you cross pmd range check next +* index. Unlikely that pgd and pud would be not present. Between +* dirty page marking and now page tables may have been altered. 
+*/ + pgd = pgdp + pgd_index(ipa); + if (unlikely(crosses_pmd !pgd_present(*pgd))) { + pgd = pgdp + pgd_index(next); + if (!pgd_present(*pgd)) + return; + } + + pud = pud_offset(pgd, ipa); + if (unlikely(crosses_pmd !pud_present(*pud))) { + pud = pud_offset(pgd, next); + if (!pud_present(*pud)) + return; + } + + pmd = pmd_offset(pud, ipa); + if (unlikely(crosses_pmd !pmd_present(*pmd))) { + pmd = pmd_offset(pud, next); + if (!pmd_present(*pmd)) + return; + } + + for (;;) { + pte = pte_offset_kernel(pmd, ipa); + if (!pte_present(*pte)) + goto next_ipa; + + if ((*pte L_PTE_S2_RDWR) == L_PTE_S2_RDONLY) + goto next_ipa; + + stage2_mark_pte_ro(pte); + +next_ipa: + mask = mask - 1; + if (!mask
Re: [PATCH v3 2/4] live migration support for initial write protect of VM
On 04/24/2014 09:39 AM, Steve Capper wrote: On Wed, Apr 23, 2014 at 12:18:07AM +0100, Mario Smarduch wrote: Support for live migration initial write protect. - moved write protect to architecture memory region prepare function. This way you can fail, abort migration without keep track of migration status. - Above also allows to generalize read dirty log function with x86 - Added stage2_mark_pte_ro() - optimized initial write protect, skip upper table lookups - added stage2pmd_addr_end() to do generic 4 level table walk - changed kvm_flush_remote_tlbs() to weak function Hello Mario, I've taken a quick look at this and have a few suggestions below. (I'm not a KVM expert, but took a look at the memory manipulation). Hi Steve, your suggestions are very helpful, my response inline. Thanks. Mario Future versions of this series could probably benefit from being sent to lakml too? Cheers, -- Steve Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |8 ++ arch/arm/kvm/arm.c |3 + arch/arm/kvm/mmu.c | 163 +++ virt/kvm/kvm_main.c |5 +- 4 files changed, 178 insertions(+), 1 deletion(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 1e739f9..9f827c8 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -67,6 +67,12 @@ struct kvm_arch { /* Interrupt controller */ struct vgic_distvgic; + + /* Marks start of migration, used to handle 2nd stage page faults +* during migration, prevent installing huge pages and split huge pages +* to small pages. 
+*/ + int migration_in_progress; }; #define KVM_NR_MEM_OBJS 40 @@ -230,4 +236,6 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); void kvm_tlb_flush_vmid(struct kvm *kvm); +int kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); + #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 9a4bc10..b916478 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -233,6 +233,9 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, enum kvm_mr_change change) { + /* Request for migration issued by user, write protect memory slot */ + if ((change != KVM_MR_DELETE) (mem-flags KVM_MEM_LOG_DIRTY_PAGES)) + return kvm_mmu_slot_remove_write_access(kvm, mem-slot); return 0; } diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 7ab77f3..4d029a6 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -31,6 +31,11 @@ #include trace.h +#define stage2pud_addr_end(addr, end) \ +({ u64 __boundary = ((addr) + PUD_SIZE) PUD_MASK;\ + (__boundary - 1 (end) - 1) ? __boundary : (end); \ +}) A matter of personal preference: can this be a static inline function instead? That way you could avoid ambiguity with the parameter types. (not an issue here, but this has bitten me in the past). Yes good point, will change. + extern char __hyp_idmap_text_start[], __hyp_idmap_text_end[]; static pgd_t *boot_hyp_pgd; @@ -569,6 +574,15 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, return 0; } +/* Write protect page */ +static void stage2_mark_pte_ro(pte_t *pte) +{ + pte_t new_pte; + + new_pte = pfn_pte(pte_pfn(*pte), PAGE_S2); + *pte = new_pte; +} This isn't making the pte read only. It's nuking all the flags from the pte and replacing them with factory settings. (In this case the PAGE_S2 pgprot). If we had other attributes that we later wish to retain this could be easily overlooked. Perhaps a new name for the function? 
Yes that's pretty bad, I'll clear the write protect bit only. + /** * kvm_phys_addr_ioremap - map a device range to guest IPA * @@ -649,6 +663,155 @@ static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap) return false; } +/** + * split_pmd - splits huge pages to small pages, required to keep a dirty log of + * smaller memory granules, otherwise huge pages would need to be + * migrated. Practically an idle system has problems migrating with + * huge pages. Called during WP of entire VM address space, done + * initially when migration thread isses the KVM_MEM_LOG_DIRTY_PAGES + * ioctl. + * The mmu_lock is held during splitting. + * + * @kvm:The KVM pointer + * @pmd:Pmd to 2nd stage huge page + * @addr: ` Guest Physical Address Nitpick: typo ` Yes overlooked it, will delete. + */ +int split_pmd(struct kvm *kvm, pmd_t *pmd, u64 addr) Maybe worth renaming
[PATCH v4 2/5] live migration support for initial write protect of VM
Patch adds support for live migration initial split up of huge pages in memory slot and write protection of all pages in memory slot. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |8 ++ arch/arm/include/asm/kvm_mmu.h | 11 ++ arch/arm/kvm/arm.c |3 + arch/arm/kvm/mmu.c | 215 +++ virt/kvm/kvm_main.c |5 +- 5 files changed, 241 insertions(+), 1 deletion(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 1e739f9..9f827c8 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -67,6 +67,12 @@ struct kvm_arch { /* Interrupt controller */ struct vgic_distvgic; + + /* Marks start of migration, used to handle 2nd stage page faults +* during migration, prevent installing huge pages and split huge pages +* to small pages. +*/ + int migration_in_progress; }; #define KVM_NR_MEM_OBJS 40 @@ -230,4 +236,6 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); void kvm_tlb_flush_vmid(struct kvm *kvm); +int kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); + #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index a91c863..342ae81 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -111,6 +111,17 @@ static inline void kvm_set_s2pte_writable(pte_t *pte) pte_val(*pte) |= L_PTE_S2_RDWR; } +static inline void kvm_set_s2pte_readonly(pte_t *pte) +{ + pte_val(*pte) = ~(L_PTE_S2_RDONLY ^ L_PTE_S2_RDWR); +} + +static inline bool kvm_s2pte_readonly(pte_t *pte) +{ + return (pte_val(*pte) L_PTE_S2_RDWR) == L_PTE_S2_RDONLY; +} + + static inline void kvm_set_s2pmd_writable(pmd_t *pmd) { pmd_val(*pmd) |= L_PMD_S2_RDWR; diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 9a4bc10..b916478 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -233,6 +233,9 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, enum 
kvm_mr_change change) { + /* Request for migration issued by user, write protect memory slot */ + if ((change != KVM_MR_DELETE) (mem-flags KVM_MEM_LOG_DIRTY_PAGES)) + return kvm_mmu_slot_remove_write_access(kvm, mem-slot); return 0; } diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 7ab77f3..15bbca2 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -44,6 +44,41 @@ static phys_addr_t hyp_idmap_vector; #define kvm_pmd_huge(_x) (pmd_huge(_x) || pmd_trans_huge(_x)) +/* Used for 2nd stage and identity mappings. For stage 2 mappings + * instead of unsigned long, u64 is use which won't overflow on ARMv7 for + * IPAs above 4GB. For ARMv8 use default functions. + */ + +static phys_addr_t kvm_pgd_addr_end(phys_addr_t addr, phys_addr_t end) +{ +#if BITS_PER_LONG == 32 + u64 __boundary = ((addr) + PGDIR_SIZE) PGDIR_MASK; + return __boundary - 1 end - 1 ? __boundary : end; +#else + return pgd_addr_end(addr, end); +#endif +} + +static phys_addr_t kvm_pud_addr_end(phys_addr_t addr, phys_addr_t end) +{ +#if BITS_PER_LONG == 32 + u64 __boundary = ((addr) + PUD_SIZE) PUD_MASK; + return __boundary - 1 end - 1 ? __boundary : end; +#else + return pud_addr_end(addr, end); +#endif +} + +static phys_addr_t kvm_pmd_addr_end(phys_addr_t addr, phys_addr_t end) +{ +#if BITS_PER_LONG == 32 + u64 __boundary = ((addr) + PMD_SIZE) PMD_MASK; + return __boundary - 1 end - 1 ? __boundary : end; +#else + return pmd_addr_end(addr, end); +#endif +} + static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) { /* @@ -649,6 +684,186 @@ static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap) return false; } +/** + * kvm_split_pmd - splits huge pages to small pages, required to keep a dirty + * log of smaller memory granules, otherwise huge pages would need to be + * migrated. Practically an idle system has problems migrating with + * huge pages. 
Called during WP of entire VM address space, done + * initially when migration thread isses the KVM_MEM_LOG_DIRTY_PAGES + * ioctl. + * The mmu_lock is held during splitting. + * + * @kvm:The KVM pointer + * @pmd:Pmd to 2nd stage huge page + * @addr: Guest Physical Address + */ +static int kvm_split_pmd(struct kvm *kvm, pmd_t *pmd, u64 addr) +{ + struct page *page; + pfn_t pfn = pmd_pfn(*pmd); + pte_t *pte; + int i; + + page = alloc_page(GFP_KERNEL); + if (page == NULL) + return -ENOMEM; + + pte = page_address(page); + /* cycle through ptes first, use pmd pfn */ + for (i = 0; i
[PATCH v4 3/5] live migration support for VM dirty log management
This patch adds support for keeping track of VM dirty pages, by updating per memslot dirty bitmap and write protecting the page again. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |3 ++ arch/arm/kvm/arm.c |5 -- arch/arm/kvm/mmu.c | 101 +++ arch/x86/kvm/x86.c | 78 -- virt/kvm/kvm_main.c | 84 5 files changed, 188 insertions(+), 83 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 9f827c8..c5c27d8 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -237,5 +237,8 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); void kvm_tlb_flush_vmid(struct kvm *kvm); int kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask); #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index b916478..6ca3e84 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -784,11 +784,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } } -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) -{ - return -EINVAL; -} - static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev_addr) { diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 15bbca2..3442594 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -864,6 +864,107 @@ out: return ret; } + +/** + * kvm_mmu_write_protected_pt_masked - after migration thread write protects + * the entire VM address space itterative call are made to get diry pags + * as the VM pages are being migrated. New dirty pages may be subset + * of initial WPed VM or new writes faulted in. Here write protect new + * dirty pages again in preparation of next dirty log read. This function is + * called as a result KVM_GET_DIRTY_LOG ioctl, to determine what pages + * need to be migrated. 
+ * 'kvm-mmu_lock' must be held to protect against concurrent modification + * of page tables (2nd stage fault, mmu modifiers, ...) + * + * @kvm:The KVM pointer + * @slot: The memory slot the dirty log is retrieved for + * @gfn_offset: The gfn offset in memory slot + * @mask: The mask of dirty pages at offset 'gnf_offset in this memory + * slot to be writ protect + */ + +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + phys_addr_t ipa, next, offset_ipa; + pgd_t *pgdp = kvm-arch.pgd, *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + gfn_t gfnofst = slot-base_gfn + gfn_offset; + bool crosses_pmd; + + ipa = (gfnofst + __ffs(mask)) PAGE_SHIFT; + offset_ipa = gfnofst PAGE_SHIFT; + next = (gfnofst + (BITS_PER_LONG - 1)) PAGE_SHIFT; + + /* check if mask width crosses 2nd level page table range, and +* possibly 3rd, 4th. If not skip upper table lookups. Unlikely +* to be true machine memory regions tend to start on atleast PMD +* boundary and mask is a power of 2. +*/ + crosses_pmd = ((offset_ipa PMD_MASK) ^ (next PMD_MASK)) ? true : + false; + + /* If pgd, pud, pmd not present and you cross pmd range check next +* index. Unlikely that pgd and pud would be not present. Between +* dirty page marking and now page tables may have been altered. 
+*/ + pgd = pgdp + pgd_index(ipa); + if (unlikely(crosses_pmd !pgd_present(*pgd))) { + pgd = pgdp + pgd_index(next); + if (!pgd_present(*pgd)) + return; + } + + pud = pud_offset(pgd, ipa); + if (unlikely(crosses_pmd !pud_present(*pud))) { + pud = pud_offset(pgd, next); + if (!pud_present(*pud)) + return; + } + + pmd = pmd_offset(pud, ipa); + if (unlikely(crosses_pmd !pmd_present(*pmd))) { + pmd = pmd_offset(pud, next); + if (!pmd_present(*pmd)) + return; + } + + for (;;) { + pte = pte_offset_kernel(pmd, ipa); + if (!pte_present(*pte)) + goto next_ipa; + + if (kvm_s2pte_readonly(pte)) + goto next_ipa; + kvm_set_s2pte_readonly(pte); +next_ipa: + mask = mask - 1; + if (!mask) + break; + + /* find next page */ + ipa = (gfnofst
[PATCH v4 0/5] live migration dirty bitmap support for ARMv7
Hi, this is the fourth iteration of live migration support, for the time being tested on ARMv7. The patches depend on Eric Auger's patch for memory regions. - Tested on two 4-way A15 systems, 2-way/4-way SMP guest up to 2GB memory - Various dirty data rates tested - 2GB/1s ... 2048 pgs/5ms - validated source/destination memory image integrity - Issue: time skips a few seconds on dest., timekeeper offset from last cycle appears too big, need to investigate further. Changes since v3: - changed pte updates to reset write bit instead of setting default value for existing pte's - Steve's comment - In addition to PUD add 2nd stage above-4GB range functions - Steve's suggestion - Restructured initial memory slot write protect function for PGD, PUD, PMD table walking - Steve's suggestion - Renamed variable types to resemble their use - Steve's suggestions - Added a couple of pte helpers for 2nd stage tables - Steve's suggestion - Updated unmap_range() that handles 2nd stage tables and identity mappings to handle 2nd stage addresses above 4GB. Left ARMv8 unchanged. 
Changes since v2: - move initial VM write protect to memory region architecture prepare function (needed to make dirty logging function generic) - added stage2_mark_pte_ro() - to mark ptes ro - Marc's comment - optimized initial VM memory region write protect to do fewer table lookups - applied Marc's comment for walking dirty bitmap mask - added pud_addr_end() for stage2 tables, to make the walk 4-level - added kvm_flush_remote_tlbs() to use ARM TLB invalidation, made the generic one weak, Marc's comment to for generic dirty bitmap log function - optimized walking dirty bit map mask to skip upper tables - Marc's comment - deleted x86,arm kvm_vm_ioctl_get_dirty_log(), moved to kvm_main.c tagged the function weak - Marc's comment - changed Data Abort handler pte index handling - Marc's comment Mario Smarduch (5): add ARMv7 HYP API to flush VM TLBs without address param live migration support for initial write protect of VM live migration support for VM dirty log management add 2nd stage page fault handling during live migration change update_range to handle 4GB 2nd stage range for ARMv7 arch/arm/include/asm/kvm_asm.h |1 + arch/arm/include/asm/kvm_host.h | 13 ++ arch/arm/include/asm/kvm_mmu.h | 11 ++ arch/arm/kvm/arm.c |8 +- arch/arm/kvm/interrupts.S |5 + arch/arm/kvm/mmu.c | 377 +-- arch/x86/kvm/x86.c | 78 virt/kvm/kvm_main.c | 89 - 8 files changed, 488 insertions(+), 94 deletions(-) -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v4 4/5] add 2nd stage page fault handling during live migration
This patch add support for handling 2nd stage page faults during migration, it disables faulting in huge pages, and splits up existing huge pages. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/kvm/mmu.c | 31 +-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 3442594..88f5503 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -978,6 +978,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_mmu_memory_cache *memcache = vcpu-arch.mmu_page_cache; struct vm_area_struct *vma; pfn_t pfn; + bool migration_active; write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu)); if (fault_status == FSC_PERM !write_fault) { @@ -1029,12 +1030,21 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, return -EFAULT; spin_lock(kvm-mmu_lock); + /* place inside lock to prevent race condition when whole VM is being +* write proteced. Prevent race of huge page install when migration is +* active. +*/ + migration_active = vcpu-kvm-arch.migration_in_progress; + if (mmu_notifier_retry(kvm, mmu_seq)) goto out_unlock; - if (!hugetlb !force_pte) + + /* During migration don't rebuild huge pages */ + if (!hugetlb !force_pte !migration_active) hugetlb = transparent_hugepage_adjust(pfn, fault_ipa); - if (hugetlb) { + /* During migration don't install new huge pages */ + if (hugetlb !migration_active) { pmd_t new_pmd = pfn_pmd(pfn, PAGE_S2); new_pmd = pmd_mkhuge(new_pmd); if (writable) { @@ -1046,6 +1056,21 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } else { pte_t new_pte = pfn_pte(pfn, PAGE_S2); if (writable) { + /* First convert huge page pfn to normal 4k page pfn, +* while migration is in progress. +* Second in migration mode and rare case where +* splitting of huge pages fails check if pmd is +* mapping a huge page if it is then clear it so +* stage2_set_pte() can map in a small page. 
+*/ + if (migration_active hugetlb) { + pmd_t *pmd; + pfn += pte_index(fault_ipa); + new_pte = pfn_pte(pfn, PAGE_S2); + pmd = stage2_get_pmd(kvm, NULL, fault_ipa); + if (pmd kvm_pmd_huge(*pmd)) + clear_pmd_entry(kvm, pmd, fault_ipa); + } kvm_set_s2pte_writable(new_pte); kvm_set_pfn_dirty(pfn); } @@ -1053,6 +1078,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, ret = stage2_set_pte(kvm, memcache, fault_ipa, new_pte, false); } + if (writable) + mark_page_dirty(kvm, gfn); out_unlock: spin_unlock(kvm-mmu_lock); -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v4 5/5] change update_range to handle 4GB 2nd stage range for ARMv7
This patch adds support for unmapping 2nd stage page tables for addresses 4GB on ARMv7. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/kvm/mmu.c | 20 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 88f5503..afbf8ba 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -176,21 +176,25 @@ static void clear_pte_entry(struct kvm *kvm, pte_t *pte, phys_addr_t addr) } } +/* Function shared between identity and 2nd stage mappings. For 2nd stage + * the IPA may be 4GB on ARMv7, and page table range functions + * will fail. kvm_xxx_addr_end() is used to handle both cases. + */ static void unmap_range(struct kvm *kvm, pgd_t *pgdp, - unsigned long long start, u64 size) + phys_addr_t start, u64 size) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *pte; - unsigned long long addr = start, end = start + size; - u64 next; + phys_addr_t addr = start, end = start + size; + phys_addr_t next; while (addr end) { pgd = pgdp + pgd_index(addr); pud = pud_offset(pgd, addr); if (pud_none(*pud)) { - addr = pud_addr_end(addr, end); + addr = kvm_pud_addr_end(addr, end); continue; } @@ -200,13 +204,13 @@ static void unmap_range(struct kvm *kvm, pgd_t *pgdp, * move on. 
*/ clear_pud_entry(kvm, pud, addr); - addr = pud_addr_end(addr, end); + addr = kvm_pud_addr_end(addr, end); continue; } pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) { - addr = pmd_addr_end(addr, end); + addr = kvm_pmd_addr_end(addr, end); continue; } @@ -221,10 +225,10 @@ static void unmap_range(struct kvm *kvm, pgd_t *pgdp, */ if (kvm_pmd_huge(*pmd) || page_empty(pte)) { clear_pmd_entry(kvm, pmd, addr); - next = pmd_addr_end(addr, end); + next = kvm_pmd_addr_end(addr, end); if (page_empty(pmd) !page_empty(pud)) { clear_pud_entry(kvm, pud, addr); - next = pud_addr_end(addr, end); + next = kvm_pud_addr_end(addr, end); } } -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v4 5/5] change update_range to handle 4GB 2nd stage range for ARMv7
Hi Gavin, thanks, didn't catch that, I'll remove these calls. - Mario On 05/05/2014 04:34 PM, Gavin Guo wrote: Hi Mario, On Tue, Apr 29, 2014 at 9:06 AM, Mario Smarduch m.smard...@samsung.com wrote: This patch adds support for unmapping 2nd stage page tables for addresses 4GB on ARMv7. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/kvm/mmu.c | 20 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 88f5503..afbf8ba 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -176,21 +176,25 @@ static void clear_pte_entry(struct kvm *kvm, pte_t *pte, phys_addr_t addr) } } +/* Function shared between identity and 2nd stage mappings. For 2nd stage + * the IPA may be 4GB on ARMv7, and page table range functions + * will fail. kvm_xxx_addr_end() is used to handle both cases. + */ static void unmap_range(struct kvm *kvm, pgd_t *pgdp, - unsigned long long start, u64 size) + phys_addr_t start, u64 size) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *pte; - unsigned long long addr = start, end = start + size; - u64 next; + phys_addr_t addr = start, end = start + size; + phys_addr_t next; while (addr end) { pgd = pgdp + pgd_index(addr); pud = pud_offset(pgd, addr); if (pud_none(*pud)) { - addr = pud_addr_end(addr, end); + addr = kvm_pud_addr_end(addr, end); continue; } @@ -200,13 +204,13 @@ static void unmap_range(struct kvm *kvm, pgd_t *pgdp, * move on. 
*/ clear_pud_entry(kvm, pud, addr); - addr = pud_addr_end(addr, end); + addr = kvm_pud_addr_end(addr, end); continue; } pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) { - addr = pmd_addr_end(addr, end); + addr = kvm_pmd_addr_end(addr, end); continue; } @@ -221,10 +225,10 @@ static void unmap_range(struct kvm *kvm, pgd_t *pgdp, */ if (kvm_pmd_huge(*pmd) || page_empty(pte)) { clear_pmd_entry(kvm, pmd, addr); - next = pmd_addr_end(addr, end); + next = kvm_pmd_addr_end(addr, end); if (page_empty(pmd) !page_empty(pud)) { clear_pud_entry(kvm, pud, addr); - next = pud_addr_end(addr, end); + next = kvm_pud_addr_end(addr, end); } } -- 1.7.9.5 It seems that your adding kvm_pmd_addr_end(addr, end) already exists in the following patch and may need to remove these parts from your patch. commit a3c8bd31af260a17d626514f636849ee1cd1f63e Author: Marc Zyngier marc.zyng...@arm.com Date: Tue Feb 18 14:29:03 2014 + ARM: KVM: introduce kvm_p*d_addr_end The use of p*d_addr_end with stage-2 translation is slightly dodgy, as the IPA is 40bits, while all the p*d_addr_end helpers are taking an unsigned long (arm64 is fine with that as unligned long is 64bit). The fix is to introduce 64bit clean versions of the same helpers, and use them in the stage-2 page table code. Signed-off-by: Marc Zyngier marc.zyng...@arm.com Acked-by: Catalin Marinas catalin.mari...@arm.com Reviewed-by: Christoffer Dall christoffer.d...@linaro.org Gavin -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 1/4] add ARMv7 HYP API to flush VM TLBs without address param
Patch adds HYP interface for global VM TLB invalidation without address parameter. - Added ARM version of kvm_flush_remote_tlbs() Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_asm.h |1 + arch/arm/include/asm/kvm_host.h |2 ++ arch/arm/kvm/interrupts.S |5 + arch/arm/kvm/mmu.c | 10 ++ 4 files changed, 18 insertions(+) diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index 53b3c4a..21bc519 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -78,6 +78,7 @@ extern char __kvm_hyp_code_end[]; extern void __kvm_flush_vm_context(void); extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa); +extern void __kvm_tlb_flush_vmid(struct kvm *kvm); extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); #endif diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 193ceaf..ac3bb65 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -231,4 +231,6 @@ int kvm_perf_teardown(void); u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); +void kvm_tlb_flush_vmid(struct kvm *kvm); + #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S index 0d68d40..8620280 100644 --- a/arch/arm/kvm/interrupts.S +++ b/arch/arm/kvm/interrupts.S @@ -45,8 +45,12 @@ __kvm_hyp_code_start: * * As v7 does not support flushing per IPA, just nuke the whole TLB * instead, ignoring the ipa value. + * + * void __kvm_tlb_flush_vm(struct kvm *kvm) - alias on ARMv7 for global VM TLB + * flush with no address parameters. 
*/ ENTRY(__kvm_tlb_flush_vmid_ipa) +ENTRY(__kvm_tlb_flush_vmid) push{r2, r3} dsb ishst @@ -65,6 +69,7 @@ ENTRY(__kvm_tlb_flush_vmid_ipa) pop {r2, r3} bx lr ENDPROC(__kvm_tlb_flush_vmid_ipa) +ENDPROC(__kvm_tlb_flush_vmid) / * Flush TLBs and instruction caches of all CPUs inside the inner-shareable diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 80bb1e6..95c172a 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -56,6 +56,16 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa); } +/* Function reuses __kvm_tlb_flush_vmid_ipa() HYP interface without additional + * address argument to flush entire VM TLBs. + */ +void kvm_flush_remote_tlbs(struct kvm *kvm) +{ + if (kvm) + kvm_call_hyp(__kvm_tlb_flush_vmid, kvm); +} + + static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, int min, int max) { -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 4/4] add 2nd stage page fault handling during live migration
This patch adds support for handling 2nd stage page faults during migration, it disables faulting in huge pages, and splits up existing huge pages. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/kvm/mmu.c | 30 -- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 1458b6e..b0633dc 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -1034,6 +1034,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_mmu_memory_cache *memcache = vcpu-arch.mmu_page_cache; struct vm_area_struct *vma; pfn_t pfn; + bool migration_active; write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu)); if (fault_status == FSC_PERM !write_fault) { @@ -1085,12 +1086,22 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, return -EFAULT; spin_lock(kvm-mmu_lock); + + /* place inside lock to prevent race condition when whole VM is being +* write proteced. Prevent race of huge page install when migration is +* active. +*/ + migration_active = vcpu-kvm-arch.migration_in_progress; + if (mmu_notifier_retry(kvm, mmu_seq)) goto out_unlock; - if (!hugetlb !force_pte) + + /* During migration no need rebuild huge pages */ + if (!hugetlb !force_pte !migration_active) hugetlb = transparent_hugepage_adjust(pfn, fault_ipa); - if (hugetlb) { + /* During migration don't install new huge pages */ + if (hugetlb !migration_active) { pmd_t new_pmd = pfn_pmd(pfn, PAGE_S2); new_pmd = pmd_mkhuge(new_pmd); if (writable) { @@ -1102,6 +1113,19 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } else { pte_t new_pte = pfn_pte(pfn, PAGE_S2); if (writable) { + /* First convert huge page pfn to small page pfn, +* while migration is in progress. +* Second if pmd is mapping a huge page then +* clear pmd so stage2_set_pte() can split the pmd. 
+*/ + if (migration_active hugetlb) { + pmd_t *pmd; + pfn += pte_index(fault_ipa); + new_pte = pfn_pte(pfn, PAGE_S2); + pmd = stage2_get_pmd(kvm, NULL, fault_ipa); + if (pmd kvm_pmd_huge(*pmd)) + clear_pmd_entry(kvm, pmd, fault_ipa); + } kvm_set_s2pte_writable(new_pte); kvm_set_pfn_dirty(pfn); } @@ -1109,6 +1133,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, ret = stage2_set_pte(kvm, memcache, fault_ipa, new_pte, false); } + if (writable) + mark_page_dirty(kvm, gfn); out_unlock: spin_unlock(kvm-mmu_lock); -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v5 1/4] add ARMv7 HYP API to flush VM TLBs without address param
On 05/14/2014 09:47 AM, Christoffer Dall wrote: On Wed, May 07, 2014 at 05:40:13PM -0700, Mario Smarduch wrote: Patch adds HYP interface for global VM TLB invalidation without address parameter. - Added ARM version of kvm_flush_remote_tlbs() Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_asm.h |1 + arch/arm/include/asm/kvm_host.h |2 ++ arch/arm/kvm/interrupts.S |5 + arch/arm/kvm/mmu.c | 10 ++ 4 files changed, 18 insertions(+) diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index 53b3c4a..21bc519 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -78,6 +78,7 @@ extern char __kvm_hyp_code_end[]; extern void __kvm_flush_vm_context(void); extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa); +extern void __kvm_tlb_flush_vmid(struct kvm *kvm); extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); #endif diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 193ceaf..ac3bb65 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -231,4 +231,6 @@ int kvm_perf_teardown(void); u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); +void kvm_tlb_flush_vmid(struct kvm *kvm); + #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S index 0d68d40..8620280 100644 --- a/arch/arm/kvm/interrupts.S +++ b/arch/arm/kvm/interrupts.S @@ -45,8 +45,12 @@ __kvm_hyp_code_start: * * As v7 does not support flushing per IPA, just nuke the whole TLB * instead, ignoring the ipa value. + * + * void __kvm_tlb_flush_vm(struct kvm *kvm) - alias on ARMv7 for global VM TLB + * flush with no address parameters. 
*/ ENTRY(__kvm_tlb_flush_vmid_ipa) +ENTRY(__kvm_tlb_flush_vmid) push {r2, r3} dsb ishst @@ -65,6 +69,7 @@ ENTRY(__kvm_tlb_flush_vmid_ipa) pop {r2, r3} bx lr ENDPROC(__kvm_tlb_flush_vmid_ipa) +ENDPROC(__kvm_tlb_flush_vmid) yikes, can you please make this a separate function that calls the other one? Done separate function, got the idea from entry-common.s ENTRY(ret_to_user), ENTRY(ret_to_user_from_irq) and others. / * Flush TLBs and instruction caches of all CPUs inside the inner-shareable diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 80bb1e6..95c172a 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -56,6 +56,16 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa); } +/* Function reuses __kvm_tlb_flush_vmid_ipa() HYP interface without additional + * address argument to flush entire VM TLBs. + */ This is not proper kernel commenting formatting, please see Documentation/CodingStyle. For new exported functions in the KVM/ARM code, please add kdocs style documentation to the functions. Done. +void kvm_flush_remote_tlbs(struct kvm *kvm) This doesn't build?: I reworked the patch series to build successfully after applying each patch. This patch was missing a weak declaration of the function in virt/kvm/kvm_main.c. I simplified some related code for PMD splitting reusing current mmu.c code, instead of reinventing. I'll email new patch series tomorrow, you might not want to waste your time on 2-4. Thanks. - Mario arch/arm/kvm/mmu.o: In function `kvm_flush_remote_tlbs': mmu.c:(.text+0xc7c): multiple definition of `kvm_flush_remote_tlbs' +{ + if (kvm) + kvm_call_hyp(__kvm_tlb_flush_vmid, kvm); +} + + static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, int min, int max) { -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v6 1/4] add ARMv7 HYP API to flush VM TLBs without address param
Patch adds HYP interface for global VM TLB invalidation without address parameter. Added ARM version of kvm_flush_remote_tlbs(), made generic one weak. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_asm.h |1 + arch/arm/kvm/interrupts.S | 11 +++ arch/arm/kvm/mmu.c | 15 +++ virt/kvm/kvm_main.c|2 +- 4 files changed, 28 insertions(+), 1 deletion(-) diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index 53b3c4a..21bc519 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -78,6 +78,7 @@ extern char __kvm_hyp_code_end[]; extern void __kvm_flush_vm_context(void); extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa); +extern void __kvm_tlb_flush_vmid(struct kvm *kvm); extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); #endif diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S index 0d68d40..bddc66b 100644 --- a/arch/arm/kvm/interrupts.S +++ b/arch/arm/kvm/interrupts.S @@ -66,6 +66,17 @@ ENTRY(__kvm_tlb_flush_vmid_ipa) bx lr ENDPROC(__kvm_tlb_flush_vmid_ipa) +/** + * void __kvm_tlb_flush_vmid(struct kvm *kvm) - Flush per-VMID TLBs + * + * Reuses __kvm_tlb_flush_vmid_ipa() for ARMv7, without passing address + * parameter + */ + +ENTRY(__kvm_tlb_flush_vmid) + b __kvm_tlb_flush_vmid_ipa +ENDPROC(__kvm_tlb_flush_vmid) + / * Flush TLBs and instruction caches of all CPUs inside the inner-shareable * domain, for all VMIDs diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 80bb1e6..eea3f0a 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -56,6 +56,21 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa); } +/** + * kvm_flush_remote_tlbs() - flush all VM TLB entries + * + * Interface to HYP function to flush all VM TLB entries without address + * parameter. In HYP mode reuses __kvm_tlb_flush_vmid_ipa() function used by + * kvm_tlb_flush_vmid_ipa(). 
+ * + * @kvm: pointer to kvm structure. + */ +void kvm_flush_remote_tlbs(struct kvm *kvm) +{ + if (kvm) + kvm_call_hyp(__kvm_tlb_flush_vmid, kvm); +} + static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, int min, int max) { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index fa70c6e..ba25765 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -184,7 +184,7 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) return called; } -void kvm_flush_remote_tlbs(struct kvm *kvm) +void __weak kvm_flush_remote_tlbs(struct kvm *kvm) { long dirty_count = kvm-tlbs_dirty; -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v6 0/4] live migration dirty bitmap support for ARMv7
This is the v6 patch set of live migration support for ARMv7. - Tested on two 4-way A15 hardware, QEMU 2-way/4-way SMP guest up to 2GB - Various dirty data rates tested - 2GB/1s ... 2048 pgs/5ms - validated source/destination memory image integrity Changes since v1: - add unlock of VM mmu_lock to prevent a deadlock - moved migration active inside mmu_lock acquire for visibility in 2nd stage data abort handler - Added comments Changes since v2: - move initial VM write protect to memory region architecture prepare function (needed to make dirty logging function generic) - added stage2_mark_pte_ro() - to mark ptes ro - Marc's comment - optimized initial VM memory region write protect to do fewer table lookups - applied Marc's comment for walking dirty bitmap mask - added pud_addr_end() for stage2 tables, to make the walk 4-level - added kvm_flush_remote_tlbs() to use ARM TLB invalidation, made the generic one weak, Marc's comment for generic dirty bitmap log function - optimized walking dirty bit map mask to skip upper tables - Marc's comment - deleted x86,arm kvm_vm_ioctl_get_dirty_log(), moved to kvm_main.c tagged the function weak - Marc's comment - changed Data Abort handler pte index handling - Marc's comment Changes since v3: - changed pte updates to reset write bit instead of setting default value for existing pte's - Steve's comment - In addition to PUD add 2nd stage above-4GB range functions - Steve's suggestion - Restructured initial memory slot write protect function for PGD, PUD, PMD table walking - Steve's suggestion - Renamed variable types to resemble their use - Steve's suggestions - Added a couple of pte helpers for 2nd stage tables - Steve's suggestion - Updated unmap_range() that handles 2nd stage tables and identity mappings to handle 2nd stage addresses above 4GB. Left ARMv8 unchanged. 
Changes since v4: - rebased to 3.15.0-rc1 - 'next' to pick up p*addr_end patches - Gavin's comment - Update PUD address end function to support 4-level page table walk - Eliminated 5th patch of the series that fixed unmap_range(), since it was fixed by Marc's patches. Changes since v5: - Created separate entry point for VMID TLB flush with no param - Christoffer's comment - Update documentation for kvm_flush_remote_tlbs() - Christoffer's comment - Simplified splitting of huge pages - initial WP and 2nd stage DABT handler clear the huge page PMD, and use current code to fault in small pages. Removed kvm_split_pmd(). Mario Smarduch (4): add ARMv7 HYP API to flush VM TLBs without address param live migration support for initial write protect of VM live migration support for VM dirty log management add 2nd stage page fault handling during live migration arch/arm/include/asm/kvm_asm.h |1 + arch/arm/include/asm/kvm_host.h | 11 ++ arch/arm/include/asm/kvm_mmu.h | 10 ++ arch/arm/kvm/arm.c |8 +- arch/arm/kvm/interrupts.S | 11 ++ arch/arm/kvm/mmu.c | 292 ++- arch/x86/kvm/x86.c | 86 virt/kvm/kvm_main.c | 84 ++- 8 files changed, 409 insertions(+), 94 deletions(-) -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v6 4/4] add 2nd stage page fault handling during live migration
This patch adds support for handling 2nd stage page faults during migration, it disables faulting in huge pages, and splits up existing huge pages. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/kvm/mmu.c | 36 ++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index b939312..10e7bf6 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -1002,6 +1002,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_mmu_memory_cache *memcache = vcpu-arch.mmu_page_cache; struct vm_area_struct *vma; pfn_t pfn; + bool migration_active; write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu)); if (fault_status == FSC_PERM !write_fault) { @@ -1053,12 +1054,23 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, return -EFAULT; spin_lock(kvm-mmu_lock); + + /* +* Place inside lock to prevent race condition when whole VM is being +* write proteced. Prevent race of huge page install when migration is +* active. +*/ + migration_active = vcpu-kvm-arch.migration_in_progress; + if (mmu_notifier_retry(kvm, mmu_seq)) goto out_unlock; - if (!hugetlb !force_pte) + + /* When migrating don't spend cycles coalescing huge pages */ + if (!hugetlb !force_pte !migration_active) hugetlb = transparent_hugepage_adjust(pfn, fault_ipa); - if (hugetlb) { + /* During migration don't install huge pages */ + if (hugetlb !migration_active) { pmd_t new_pmd = pfn_pmd(pfn, PAGE_S2); new_pmd = pmd_mkhuge(new_pmd); if (writable) { @@ -1069,6 +1081,23 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, new_pmd); } else { pte_t new_pte = pfn_pte(pfn, PAGE_S2); + + /* +* If pmd is mapping a huge page then split it up into +* small pages, when doing live migration. 
+*/ + if (migration_active) { + pmd_t *pmd; + if (hugetlb) { + pfn += pte_index(fault_ipa); + gfn = fault_ipa PAGE_SHIFT; + } + new_pte = pfn_pte(pfn, PAGE_S2); + pmd = stage2_get_pmd(kvm, NULL, fault_ipa); + if (pmd kvm_pmd_huge(*pmd)) + clear_pmd_entry(kvm, pmd, fault_ipa); + } + if (writable) { kvm_set_s2pte_writable(new_pte); kvm_set_pfn_dirty(pfn); @@ -1077,6 +1106,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, ret = stage2_set_pte(kvm, memcache, fault_ipa, new_pte, false); } + /* Assuming 4k pages, set one bit/page in memslot dirty_bitmap[] */ + if (writable) + mark_page_dirty(kvm, gfn); out_unlock: spin_unlock(kvm-mmu_lock); -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v6 2/4] live migration support for initial write protect of VM
Patch adds memslot support for initial write protection and split up of huge pages Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |8 +++ arch/arm/include/asm/kvm_mmu.h | 10 +++ arch/arm/kvm/arm.c |3 + arch/arm/kvm/mmu.c | 143 +++ 4 files changed, 164 insertions(+) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 193ceaf..0e55b17 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -67,6 +67,12 @@ struct kvm_arch { /* Interrupt controller */ struct vgic_distvgic; + /* +* Marks start of migration, used to handle 2nd stage page faults +* during migration, prevent installing huge pages and split huge pages +* to small pages. +*/ + int migration_in_progress; }; #define KVM_NR_MEM_OBJS 40 @@ -231,4 +237,6 @@ int kvm_perf_teardown(void); u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); +int kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); + #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 5c7aa3c..7f9d9d3 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -114,6 +114,16 @@ static inline void kvm_set_s2pmd_writable(pmd_t *pmd) pmd_val(*pmd) |= L_PMD_S2_RDWR; } +static inline void kvm_set_s2pte_readonly(pte_t *pte) +{ + pte_val(*pte) = ~(L_PTE_S2_RDONLY ^ L_PTE_S2_RDWR); +} + +static inline bool kvm_s2pte_readonly(pte_t *pte) +{ + return (pte_val(*pte) L_PTE_S2_RDWR) == L_PTE_S2_RDONLY; +} + /* Open coded p*d_addr_end that can deal with 64bit addresses */ #define kvm_pgd_addr_end(addr, end)\ ({ u64 __boundary = ((addr) + PGDIR_SIZE) PGDIR_MASK;\ diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 3c82b37..1055266 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -234,6 +234,9 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region 
*mem, enum kvm_mr_change change) { + /* Request for migration issued by user, write protect memory slot */ + if ((change != KVM_MR_DELETE) (mem-flags KVM_MEM_LOG_DIRTY_PAGES)) + return kvm_mmu_slot_remove_write_access(kvm, mem-slot); return 0; } diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index eea3f0a..b71ad27 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -748,6 +748,149 @@ static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap) return false; } + +/* + * Walks PMD page table range and write protects it. Called with + * 'kvm-mmu_lock' * held + */ +static void stage2_wp_pmd_range(phys_addr_t addr, phys_addr_t end, pmd_t *pmd) +{ + pte_t *pte; + + while (addr end) { + pte = pte_offset_kernel(pmd, addr); + addr += PAGE_SIZE; + if (!pte_present(*pte)) + continue; + /* skip write protected pages */ + if (kvm_s2pte_readonly(pte)) + continue; + kvm_set_s2pte_readonly(pte); + } +} + +/* + * Walks PUD page table range to write protects it , if necessary spluts up + * huge pages to small pages. Called with 'kvm-mmu_lock' held. + */ +static void stage2_wp_pud_range(struct kvm *kvm, phys_addr_t addr, + phys_addr_t end, pud_t *pud) +{ + pmd_t *pmd; + phys_addr_t pmd_end; + + while (addr end) { + /* If needed give up CPU during PUD page table walk */ + if (need_resched() || spin_needbreak(kvm-mmu_lock)) + cond_resched_lock(kvm-mmu_lock); + + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) { + addr = kvm_pmd_addr_end(addr, end); + continue; + } + + if (kvm_pmd_huge(*pmd)) { + /* +* Clear pmd entry DABT handler will install smaller +* pages. +*/ + clear_pmd_entry(kvm, pmd, addr); + addr = kvm_pmd_addr_end(addr, end); + continue; + } + + pmd_end = kvm_pmd_addr_end(addr, end); + stage2_wp_pmd_range(addr, pmd_end, pmd); + addr = pmd_end; + } +} + +/* + * Walks PGD page table range to write protect it. Called with 'kvm-mmu_lock' + * held. 
+ */ +static int stage2_wp_pgd_range(struct kvm *kvm, phys_addr_t addr, + phys_addr_t end, pgd_t *pgd) +{ + phys_addr_t pud_end
[PATCH v6 3/4] live migration support for VM dirty log management
This patch adds support for keeping track of VM dirty pages, by updating per memslot dirty bitmap and write protecting the page again. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |3 ++ arch/arm/kvm/arm.c |5 -- arch/arm/kvm/mmu.c | 98 +++ arch/x86/kvm/x86.c | 86 -- virt/kvm/kvm_main.c | 82 5 files changed, 183 insertions(+), 91 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 0e55b17..4fef77d 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -238,5 +238,8 @@ u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); int kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask); #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 1055266..0b847b5 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -777,11 +777,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } } -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) -{ - return -EINVAL; -} - static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev_addr) { diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index b71ad27..b939312 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -891,6 +891,104 @@ out: return ret; } + +/** + * kvm_mmu_write_protected_pt_masked - walk mask relative start of memslot and + * write protect again for next dirty log read. + * + * After migration thread write protects entire VM iterative calls are made + * to get diry page log. The log is returned and dirty pages are write + * protected again. This function is called as a result KVM_GET_DIRTY_LOG + * ioctl. 
+ * 'kvm-mmu_lock' must be held to protect against concurrent modification + * of page tables (2nd stage fault, mmu modifiers, ...) + * + * @kvm:The KVM pointer + * @slot: The memory slot the dirty log is retrieved for + * @gfn_offset: The gfn offset in memory slot + * @mask: The mask of dirty pages at offset 'gnf_offset in this memory + * slot to be writ protect + */ + +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + phys_addr_t ipa, next, offset_ipa; + pgd_t *pgdp = kvm-arch.pgd, *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + gfn_t gfnofst = slot-base_gfn + gfn_offset; + bool crosses_pmd; + + ipa = (gfnofst + __ffs(mask)) PAGE_SHIFT; + offset_ipa = gfnofst PAGE_SHIFT; + next = (gfnofst + (BITS_PER_LONG - 1)) PAGE_SHIFT; + + /* check if mask width crosses 2nd level page table range, and +* possibly 3rd, 4th. If not skip upper table lookups. Unlikely +* to be true. +*/ + crosses_pmd = ((offset_ipa PMD_MASK) ^ (next PMD_MASK)) ? true : + false; + + /* If pgd, pud, pmd not present and you cross pmd range check next +* index. 
+*/ + pgd = pgdp + pgd_index(ipa); + if (unlikely(crosses_pmd !pgd_present(*pgd))) { + pgd = pgdp + pgd_index(next); + if (!pgd_present(*pgd)) + return; + } + + pud = pud_offset(pgd, ipa); + if (unlikely(crosses_pmd !pud_present(*pud))) { + pud = pud_offset(pgd, next); + if (!pud_present(*pud)) + return; + } + + pmd = pmd_offset(pud, ipa); + if (unlikely(crosses_pmd !pmd_present(*pmd))) { + pmd = pmd_offset(pud, next); + if (!pmd_present(*pmd)) + return; + } + + for (;;) { + pte = pte_offset_kernel(pmd, ipa); + if (!pte_present(*pte)) + goto next_ipa; + + if (kvm_s2pte_readonly(pte)) + goto next_ipa; + kvm_set_s2pte_readonly(pte); +next_ipa: + mask = mask - 1; + if (!mask) + break; + + /* find next page */ + ipa = (gfnofst + __ffs(mask)) PAGE_SHIFT; + + /* skip upper page table lookups */ + if (!crosses_pmd) + continue; + + pgd = pgdp + pgd_index(ipa); + if (unlikely(!pgd_present(*pgd))) + goto next_ipa; + pud = pud_offset(pgd, ipa
Re: [PATCH v5 2/4] live migration support for initial write protect of VM
On 05/15/2014 11:53 AM, Christoffer Dall wrote: [I know you sent out a newer version but I already reviewed some of this patch on the plane today but couldn't send it out before I got home. Anyway, here it is:] On Wed, May 07, 2014 at 05:40:14PM -0700, Mario Smarduch wrote: Patch adds support for live migration initial split up of huge pages in memory slot and write protection of all pages in memory slot. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |7 ++ arch/arm/include/asm/kvm_mmu.h | 16 +++- arch/arm/kvm/arm.c |3 + arch/arm/kvm/mmu.c | 179 +++ virt/kvm/kvm_main.c |6 +- 5 files changed, 209 insertions(+), 2 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index ac3bb65..91744c3 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -67,6 +67,11 @@ struct kvm_arch { /* Interrupt controller */ struct vgic_distvgic; +/* Marks start of migration, used to handle 2nd stage page faults + * during migration, prevent installing huge pages and split huge pages + * to small pages. + */ commenting style this is a bit verbose for a field in a struct, perhaps moving the longer version to where you set this? Will do. 
+int migration_in_progress; }; #define KVM_NR_MEM_OBJS 40 @@ -233,4 +238,6 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); void kvm_tlb_flush_vmid(struct kvm *kvm); +int kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); + #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 5c7aa3c..b339fa9 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -114,13 +114,27 @@ static inline void kvm_set_s2pmd_writable(pmd_t *pmd) pmd_val(*pmd) |= L_PMD_S2_RDWR; } +static inline void kvm_set_s2pte_readonly(pte_t *pte) +{ +pte_val(*pte) = ~(L_PTE_S2_RDONLY ^ L_PTE_S2_RDWR); This relies on the pte already having been set as RDONLY or RDWR, if you are creating a new pte and calling this function it could be easy to miss that distinction, I would prefer: pte_val(*pte) = L_PTE_S2_RDWR; pte_val(*pte) |= L_PTE_S2_RDONLY; Currently it's called only on set, or live pte's, I'll change it so it's applicate to all cases. +} + +static inline bool kvm_s2pte_readonly(pte_t *pte) +{ +return (pte_val(*pte) L_PTE_S2_RDWR) == L_PTE_S2_RDONLY; +} + /* Open coded p*d_addr_end that can deal with 64bit addresses */ #define kvm_pgd_addr_end(addr, end) \ ({ u64 __boundary = ((addr) + PGDIR_SIZE) PGDIR_MASK;\ (__boundary - 1 (end) - 1)? __boundary: (end);\ }) -#define kvm_pud_addr_end(addr,end) (end) +/* For - 4-level table walk return PUD range end if end 1GB */ not sure you need this comment, the scheme is very common all over the kernel. Yes. +#define kvm_pud_addr_end(addr, end) \ +({ u64 __boundary = ((addr) + PUD_SIZE) PUD_MASK;\ +(__boundary - 1 (end) - 1) ? __boundary : (end); \ +}) why do we need this? We should only ever have 3 levels of page tables, right? I removed in v6 patch. 
#define kvm_pmd_addr_end(addr, end) \ ({ u64 __boundary = ((addr) + PMD_SIZE) PMD_MASK;\ diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 3c82b37..1055266 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -234,6 +234,9 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, enum kvm_mr_change change) { +/* Request for migration issued by user, write protect memory slot */ Does this necessarily only happen when there's a request for migration? Isn't it just a log call that could be used for other things (potentially)? From QEMU view migration thread calls KVM memory listener kvm_log_global_start and that kicks off dirty log tracking for each memslot. There are other operations like region add (kvm_region_add) that starts kvm_log_start for that memslot, or other odd case if you add a region that overlaps regions you may start logging the whole region. But in either case it appears you're migrating already. But no I don't see any other feature that triggers this. +if ((change != KVM_MR_DELETE) (mem-flags KVM_MEM_LOG_DIRTY_PAGES)) +return kvm_mmu_slot_remove_write_access(kvm, mem-slot); return 0; } diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 95c172a..85145d8 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -743,6 +743,185
Re: [PATCH v6 0/4] live migration dirty bitmap support for ARMv7
Will do that, I'm sure there will be another iteration :). On 05/15/2014 11:51 AM, Christoffer Dall wrote: On Thu, May 15, 2014 at 11:27:27AM -0700, Mario Smarduch wrote: This is v6 patcheset of live mgiration support for ARMv7. migration This is an extremely terse cover letter. It would have been nice with a few sentences of which existing features this leverages, which support was missing, what the preferred approach is, etc. Also, links to a wiki page or just a few notes on how you did the testing below with which user space tools etc. would also have been great. - Tested on two 4-way A15 hardware, QEMU 2-way/4-way SMP guest upto 2GB - Various dirty data rates tested - 2GB/1s ... 2048 pgs/5ms - validated source/destination memory image integrity Changes since v1: - add unlock of VM mmu_lock to prevent a deadlock - moved migratiion active inside mmu_lock acquire for visibility in 2nd stage data abort handler - Added comments Changes since v2: - move initial VM write protect to memory region architecture prepare function (needed to make dirty logging function generic) - added stage2_mark_pte_ro() - to mark ptes ro - Marc's comment - optimized initial VM memory region write protect to do fewer table lookups - applied Marc's comment for walking dirty bitmap mask - added pud_addr_end() for stage2 tables, to make the walk 4-level - added kvm_flush_remote_tlbs() to use ARM TLB invalidation, made the generic one weak, Marc's comment to for generic dirty bitmap log function - optimized walking dirty bit map mask to skip upper tables - Marc's comment - deleted x86,arm kvm_vm_ioctl_get_dirty_log(), moved to kvm_main.c tagged the function weak - Marc's comment - changed Data Abort handler pte index handling - Marc's comment Changes since v3: - changed pte updates to reset write bit instead of setting default value for existing pte's - Steve's comment - In addition to PUD add 2nd stage 4GB range functions - Steves suggestion - Restructured initial memory slot write protect 
function for PGD, PUD, PMD table walking - Steves suggestion - Renamed variable types to resemble their use - Steves suggestions - Added couple pte helpers for 2nd stage tables - Steves suggestion - Updated unmap_range() that handles 2nd stage tables and identity mappings to handle 2nd stage addresses 4GB. Left ARMv8 unchanged. Changes since v4: - rebased to 3.15.0-rc1 - 'next' to pickup p*addr_end patches - Gavins comment - Update PUD address end function to support 4-level page table walk - Elimiated 5th patch of the series that fixed unmap_range(), since it was fixed by Marcs patches. Changes since v5: - Created seperate entry point for VMID TLB flush with no param - Christoffers comment - Update documentation for kvm_flush_remote_tlbs() - Christoffers comment - Simplified splitting of huge pages - inittial WP and 2nd stage DABT handler clear the huge page PMD, and use current code to fault in small pages. Removed kvm_split_pmd(). Mario Smarduch (4): add ARMv7 HYP API to flush VM TLBs without address param live migration support for initial write protect of VM live migration support for VM dirty log management add 2nd stage page fault handling during live migration arch/arm/include/asm/kvm_asm.h |1 + arch/arm/include/asm/kvm_host.h | 11 ++ arch/arm/include/asm/kvm_mmu.h | 10 ++ arch/arm/kvm/arm.c |8 +- arch/arm/kvm/interrupts.S | 11 ++ arch/arm/kvm/mmu.c | 292 ++- arch/x86/kvm/x86.c | 86 virt/kvm/kvm_main.c | 84 ++- 8 files changed, 409 insertions(+), 94 deletions(-) -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v5 2/4] live migration support for initial write protect of VM
Hi Christoffer, few more comments struct vgic_distvgic; + /* Marks start of migration, used to handle 2nd stage page faults +* during migration, prevent installing huge pages and split huge pages +* to small pages. +*/ commenting style this is a bit verbose for a field in a struct, perhaps moving the longer version to where you set this? Will do. + int migration_in_progress; }; I think this flag could be removed all together. Migration can be stopped at any time (started too), through user request or other events. When that happens (like migrate_cancel) migrate cleanup bh runs and eventually calls KVM memory listener kvm_log_global_start() (cancel handler) that stops logging, clears KVM_MEM_LOG_DIRTY_PAGES, and region ops ioctl, clears dirty_bitmap. In either case dirty_bitmap for memslot is set or unset during migration to track dirty pages, following that field seems to be a better way to keep track of migration. This again is QEMU view but it appears all these policies are driven from user space. +/* kvm_split_pmd - splits huge pages to small pages, required to keep a dirty + * log of smaller memory granules, otherwise huge pages would need to be + * migrated. Practically an idle system has problems migrating with This seems abrupt. Why can't we just represent a 2M huge page as 512 4K bits and write protect the huge pages, if you take a write fault on a 2M page, then split it then. That's one alternative the one I put into v6 is clear the PMD and force user_mem_abort() to fault in 4k pages, and mark the dirty_bitmap[] for that page, reuse the current code. Have not checked the impact on performance, it takes few seconds longer to converge for the tests I'm running. I was thinking about this and if PMD attributes need to be passed onto the PTEs then it appears what you recommend is required. But during run time I don't see how 2nd stage attributes can change, could the guest do anything to change them (SH, Memattr)? 
Performance may also be other reason but that always depends on the load, clearing a PMD seems easier and reuses current code. Probably several load tests/benchmarks can help here. Also noticed hw PMD/PTE attributes differ a little which is not significant now, but moving forward different page size and any new revisions to fields may require additional maintenance. I'll be out next week and back 26'th, I'll create a link with details on test environment and tests. The cover letter will will go through general overview only. Thanks, Mario If your use case is HA, then you will be doing this a lot, and you don't want to hurt performance of your main live system more than necessary. + * huge pages. Called during WP of entire VM address space, done -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v6 2/4] live migration support for initial write protect of VM
Hi Christoffer, I was out traveling last week + holiday. You had lots of comments in last version (incl. below), reworking to submit a new series. Un-clutter from basic issues, and update current logic. In next couple days I'll submit new series. Also looking into a wiki to document test env (but may windup with a github link). Thanks, Mario On 05/27/2014 12:58 PM, Christoffer Dall wrote: On Thu, May 15, 2014 at 11:27:29AM -0700, Mario Smarduch wrote: Patch adds memslot support for initial write protection and split up of huge pages I lost track of where we are with these patches, but I see a lot of issues in this patch that I believe I already commented on (but I may not have had time to comment before you sent out v6). In any case, I'm going to wait with reviewing things carefully until you send out a v7, but for v7: - Please document the rationale and design behind what you're doing in the commit text of each patch. Each of these patches are quite large, but the commit messages are barely two lines. I suggest you take a look at 'git log arch/arm/kvm' for example to get a feel for what I'm looking for. - There is nothing specific in the interface to KVM discussing migration or live migration, it is only used as an example for features in trying to stay generic. Please use similar generic concepts in the kernel to make things coherent. 'git grep migration arch/x86/kvm' also tells you that x86 gets away with full support for live migration without referring to migration except as examples of how features might be useful. Thanks for the work, looking forward to seeing a new revision. -Christoffer -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v6 3/4] live migration support for VM dirty log management
On 05/27/2014 01:12 PM, Christoffer Dall wrote: On Thu, May 15, 2014 at 11:27:30AM -0700, Mario Smarduch wrote: This patch adds support for keeping track of VM dirty pages, by updating per memslot dirty bitmap and write protecting the page again. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |3 ++ arch/arm/kvm/arm.c |5 -- arch/arm/kvm/mmu.c | 98 +++ arch/x86/kvm/x86.c | 86 -- virt/kvm/kvm_main.c | 82 5 files changed, 183 insertions(+), 91 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 0e55b17..4fef77d 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -238,5 +238,8 @@ u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); int kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, +struct kvm_memory_slot *slot, +gfn_t gfn_offset, unsigned long mask); #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 1055266..0b847b5 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -777,11 +777,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } } -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) -{ -return -EINVAL; -} - static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev_addr) { diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index b71ad27..b939312 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -891,6 +891,104 @@ out: return ret; } + +/** + * kvm_mmu_write_protected_pt_masked - walk mask relative start of memslot and + * write protect again for next dirty log read. + * + * After migration thread write protects entire VM iterative calls are made + * to get diry page log. The log is returned and dirty pages are write + * protected again. 
This function is called as a result KVM_GET_DIRTY_LOG + * ioctl. + * 'kvm-mmu_lock' must be held to protect against concurrent modification + * of page tables (2nd stage fault, mmu modifiers, ...) + * + * @kvm:The KVM pointer + * @slot: The memory slot the dirty log is retrieved for + * @gfn_offset: The gfn offset in memory slot + * @mask: The mask of dirty pages at offset 'gnf_offset in this memory + * slot to be writ protect + */ + +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, +struct kvm_memory_slot *slot, +gfn_t gfn_offset, unsigned long mask) +{ +phys_addr_t ipa, next, offset_ipa; +pgd_t *pgdp = kvm-arch.pgd, *pgd; +pud_t *pud; +pmd_t *pmd; +pte_t *pte; +gfn_t gfnofst = slot-base_gfn + gfn_offset; +bool crosses_pmd; + +ipa = (gfnofst + __ffs(mask)) PAGE_SHIFT; +offset_ipa = gfnofst PAGE_SHIFT; +next = (gfnofst + (BITS_PER_LONG - 1)) PAGE_SHIFT; + +/* check if mask width crosses 2nd level page table range, and + * possibly 3rd, 4th. If not skip upper table lookups. Unlikely + * to be true. + */ +crosses_pmd = ((offset_ipa PMD_MASK) ^ (next PMD_MASK)) ? true : +false; you can just assign the value, no need for the tertiary operator, a bool will always be true or false. (Marc wanted to make this explicit elsewhere in the code, an uses the 'val = !!(expression)' syntax). Ah ok. + +/* If pgd, pud, pmd not present and you cross pmd range check next + * index. 
+ */ +pgd = pgdp + pgd_index(ipa); +if (unlikely(crosses_pmd !pgd_present(*pgd))) { +pgd = pgdp + pgd_index(next); +if (!pgd_present(*pgd)) +return; +} + +pud = pud_offset(pgd, ipa); +if (unlikely(crosses_pmd !pud_present(*pud))) { +pud = pud_offset(pgd, next); +if (!pud_present(*pud)) +return; +} + +pmd = pmd_offset(pud, ipa); +if (unlikely(crosses_pmd !pmd_present(*pmd))) { +pmd = pmd_offset(pud, next); +if (!pmd_present(*pmd)) +return; +} + +for (;;) { +pte = pte_offset_kernel(pmd, ipa); +if (!pte_present(*pte)) +goto next_ipa; + +if (kvm_s2pte_readonly(pte)) +goto next_ipa; +kvm_set_s2pte_readonly(pte); +next_ipa: +mask = mask - 1; +if (!mask) +break; + +/* find next page */ +ipa = (gfnofst + __ffs(mask)) PAGE_SHIFT
Re: [PATCH v6 4/4] add 2nd stage page fault handling during live migration
On 05/27/2014 01:19 PM, Christoffer Dall wrote: On Thu, May 15, 2014 at 11:27:31AM -0700, Mario Smarduch wrote: This patch adds support for handling 2nd stage page faults during migration, it disables faulting in huge pages, and splits up existing huge pages. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/kvm/mmu.c | 36 ++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index b939312..10e7bf6 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -1002,6 +1002,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_mmu_memory_cache *memcache = vcpu-arch.mmu_page_cache; struct vm_area_struct *vma; pfn_t pfn; +bool migration_active; write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu)); if (fault_status == FSC_PERM !write_fault) { @@ -1053,12 +1054,23 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, return -EFAULT; spin_lock(kvm-mmu_lock); + +/* + * Place inside lock to prevent race condition when whole VM is being + * write proteced. Prevent race of huge page install when migration is + * active. + */ +migration_active = vcpu-kvm-arch.migration_in_progress; + if (mmu_notifier_retry(kvm, mmu_seq)) goto out_unlock; -if (!hugetlb !force_pte) + +/* When migrating don't spend cycles coalescing huge pages */ +if (!hugetlb !force_pte !migration_active) hugetlb = transparent_hugepage_adjust(pfn, fault_ipa); -if (hugetlb) { +/* During migration don't install huge pages */ again, all this is not about migration per se, it's about when logging dirty pages, (which may be commonly used for migration). Yes that's true , I'll update but until recently (new RFC on qemu list) where dirty logging is used for getting VM RSS or hot memory regions, I don't see any other use case. 
+if (hugetlb !migration_active) { pmd_t new_pmd = pfn_pmd(pfn, PAGE_S2); new_pmd = pmd_mkhuge(new_pmd); if (writable) { @@ -1069,6 +1081,23 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, new_pmd); } else { pte_t new_pte = pfn_pte(pfn, PAGE_S2); + +/* + * If pmd is mapping a huge page then split it up into + * small pages, when doing live migration. + */ +if (migration_active) { +pmd_t *pmd; +if (hugetlb) { +pfn += pte_index(fault_ipa); +gfn = fault_ipa PAGE_SHIFT; +} how can you have hugetlb when we entered this else-clause conditional on having !hugetlb? - if(hugetlb !migration_active) forces all page faults to enter here while in migration. Huge page entries are cleared and stage2_set_pte() splits the huge page, and installs the pte for the fault_ipa. I placed that there since it flows with installing a pte as well as splitting a huge page. But your comment on performance split up huge page vs. deferred page faulting should move it out of here. +new_pte = pfn_pte(pfn, PAGE_S2); +pmd = stage2_get_pmd(kvm, NULL, fault_ipa); +if (pmd kvm_pmd_huge(*pmd)) +clear_pmd_entry(kvm, pmd, fault_ipa); If we have a huge pmd entry, how did we take a fault on there? Would that be if a different CPU inserted a huge page entry since we got here, is this what you're trying to handle? I'm confused. I thing this related to the above. +} + if (writable) { kvm_set_s2pte_writable(new_pte); kvm_set_pfn_dirty(pfn); @@ -1077,6 +1106,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, ret = stage2_set_pte(kvm, memcache, fault_ipa, new_pte, false); } +/* Assuming 4k pages, set one bit/page in memslot dirty_bitmap[] */ Assuming? this makes me nervous. The point is probably that it's harmless if we're not logging dirty pages, because then nobody reads teh data structure, and if we are logging, then we are mapping everything using 4K pages? 
It's probably clearer code-wise to condition this on whether or not we are logging dirty page, and the branch is also likely to be much faster than the function call to mark_page_dirty. I'm not sure I get the point. The call is always safe, you either have old copy or new copy of memory slot with dirty_bitmap set or not set. The log read is done while holding kvm slots_lock. Is the comment related to performance, not supporting multiple page sizes, or it's unsafe to call
Re: [PATCH v6 3/4] live migration support for VM dirty log management
On 05/28/2014 02:08 AM, Christoffer Dall wrote: On Tue, May 27, 2014 at 02:55:21PM -0700, Mario Smarduch wrote: On 05/27/2014 01:12 PM, Christoffer Dall wrote: On Thu, May 15, 2014 at 11:27:30AM -0700, Mario Smarduch wrote: [...] + + /* If pgd, pud, pmd not present and you cross pmd range check next + * index. + */ + pgd = pgdp + pgd_index(ipa); + if (unlikely(crosses_pmd !pgd_present(*pgd))) { + pgd = pgdp + pgd_index(next); + if (!pgd_present(*pgd)) + return; + } + + pud = pud_offset(pgd, ipa); + if (unlikely(crosses_pmd !pud_present(*pud))) { + pud = pud_offset(pgd, next); + if (!pud_present(*pud)) + return; + } + + pmd = pmd_offset(pud, ipa); + if (unlikely(crosses_pmd !pmd_present(*pmd))) { + pmd = pmd_offset(pud, next); + if (!pmd_present(*pmd)) + return; + } + + for (;;) { + pte = pte_offset_kernel(pmd, ipa); + if (!pte_present(*pte)) + goto next_ipa; + + if (kvm_s2pte_readonly(pte)) + goto next_ipa; + kvm_set_s2pte_readonly(pte); +next_ipa: + mask = mask - 1; + if (!mask) + break; + + /* find next page */ + ipa = (gfnofst + __ffs(mask)) PAGE_SHIFT; + + /* skip upper page table lookups */ + if (!crosses_pmd) + continue; + + pgd = pgdp + pgd_index(ipa); + if (unlikely(!pgd_present(*pgd))) + goto next_ipa; + pud = pud_offset(pgd, ipa); + if (unlikely(!pud_present(*pud))) + goto next_ipa; + pmd = pmd_offset(pud, ipa); + if (unlikely(!pmd_present(*pmd))) + goto next_ipa; + } So I think the reason this is done separately on x86 is that they have an rmap structure for their gfn mappings so that they can quickly lookup ptes based on a gfn and write-protect it without having to walk the stage-2 page tables. Yes, they also use rmapps for mmu notifiers, invalidations on huge VMs and large ranges resulted in excessive times. Unless you want to introduce this on ARM, I think you will be much Eventually yes but that would also require reworking mmu notifiers. I had two step approach in mind. 
Initially get the dirty page marking to work, TLB flushing, GIC/arch-timer migration, validate migration under various stress loads (page reclaim) with mmu notifiers, test several VMs and migration times. Then get rmapp (or something similar) working - eventually for huge VMs it's needed. In short two phases. better off just having a single (properly written) iterating write-protect function, that takes a start and end IPA and a bitmap for which pages to actually write-protect, which can then handle the generic case (either NULL or all-ones bitmap) or a specific case, which just traverses the IPA range given as input. Such a function should follow the model of page table walk functions discussed previously (separate functions: wp_pgd_enties(), wp_pud_entries(), wp_pmd_entries(), wp_pte_entries()). However, you may want to verify my assumption above with the x86 people and look at sharing the rmap logic between architectures. In any case, this code is very difficult to read and understand, and it doesn't look at all like the other code we have to walk page tables. I understand you are trying to optimize for performance (by skipping some intermediate page table level lookups), but you never declare that goal anywhere in the code or in the commit message. Marc's comment noticed I was walking a small range (128k), using upper table iterations that covered 1G, 2MB ranges. As you mention the code tries to optimize upper table lookups. Yes the function is too bulky, but I'm not sure how to remove the upper table checks since page tables may change between the time pages are marked dirty and the log is retrieved. And if a memory slot is very dirty walking upper tables will impact performance. I'll think some more on this function. I think you should aim at the simplest possible implementation that functionally works, first. Let's verify that this thing works, have clean working code that implementation-wise is as minimal as possible. 
Then we can run perf on that and see if our migrations are very slow, where we are actually spending time, and only then optimize it. The solution to this specific problem for the time being appears quite clear to me: Follow the exact same scheme as for unmap_range (the one I sent out here: https://lists.cs.columbia.edu/pipermail/kvmarm/2014-May/009592.html, the diff is hard to read, so I recommend you apply the patch and look at the resulting code). Have a similar scheme, call it wp_ipa_range() or something like that, and use that for now. Ok I'll reuse that code. I'll need
Re: [PATCH v6 4/4] add 2nd stage page fault handling during live migration
memslot dirty_bitmap during and after write protect. -Christoffer Regarding huge pud that's causing some design problems, should huge PUD pages be considered at all? Thanks, Mario ___ kvmarm mailing list kvm...@lists.cs.columbia.edu https://lists.cs.columbia.edu/mailman/listinfo/kvmarm -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v6 4/4] add 2nd stage page fault handling during live migration
Little bit more details on this question - For 2nd stage 3-level tables PUD blocks don't exist - although it appears you can have a PGD block but I don't see any support for that. But should the code still work as if PUDs (4-level table) are used and check for pud_huge()? Looking at ARMv8 there are several block formats, I don't know which one will be use for 2nd stage (4KB, 16,...) but one of them supports 4-level table (have not looked at this in detail, could be wrong here). Should pud_huge() be supported for future compatibility? This impacts logging - - Some decisions are needed either clear the PUD entry and force them to pages or mark dirty bit map for each 4k page in the PUD Block range, IA64 appears to that in mark_pages_dirty(). - If you assume pud_huge() then you probably have to support the logic for PUD Block descriptor even though it's not used in 3-level table at this time. I think until PUD Blocks are actually used it's maybe better to ignore them. - Mario On 05/28/2014 11:42 AM, Mario Smarduch wrote: emslot dirty_bitmap during and after write protect. -Christoffer Regarding huge pud that's causing some design problems, should huge PUD pages be considered at all? Thanks, Mario ___ kvmarm mailing list kvm...@lists.cs.columbia.edu https://lists.cs.columbia.edu/mailman/listinfo/kvmarm ___ kvmarm mailing list kvm...@lists.cs.columbia.edu https://lists.cs.columbia.edu/mailman/listinfo/kvmarm -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v6 4/4] add 2nd stage page fault handling during live migration
So this needs to be cleared up given this is key to logging. Cases this code handles during migration - 1. huge page fault described above - write protect fault so you break up the huge page. 2. All other faults - first time access, pte write protect you again wind up in stage2_set_pte(). Am I missing something here? no, I forgot about the fact that we can take the permission fault now. Hmm, ok, so either we need to use the original approach of always splitting up huge pages or we need to just follow the regular huge page path here and just mark all 512 4K pages dirty in the log, or handle it in stage2_set_pte(). I would say go with the most simple approach for now (which may be going back to splitting all pmd_huge() into regular pte's), and we can take a more careful look in the next patch iteration. Looking at the overall memslot update architecture and various fail scenarios - user_mem_abort() appears to be the most optimal and reliable place. First Write Protect huge pages after memslots are committed and deal with rest in user_mem_abort(). Still need some feedback on the pud_huge() before revising for next iteration? - Mario -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v6 4/4] add 2nd stage page fault handling during live migration
On 05/29/2014 10:57 AM, Christoffer Dall wrote: On Thu, May 29, 2014 at 10:08:07AM -0700, Mario Smarduch wrote: So this needs to be cleared up given this is key to logging. Cases this code handles during migration - 1. huge page fault described above - write protect fault so you breakup the huge page. 2. All other faults - first time access, pte write protect you again wind up in stage2_set_pte(). Am I missing something here? no, I forgot about the fact that we can take the permission fault now. Hmm, ok, so either we need to use the original approach of always splitting up huge pages or we need to just follow the regular huge page path here and just mark all 512 4K pages dirty in the log, or handle it in stage2_set_pte(). I would say go with the most simple appraoch for now (which may be going back to splitting all pmd_huge() into regular pte's), and we can take a more careful look in the next patch iteration. Looking at the overall memslot update architecture and various fail scenarios - user_mem_abort() appears to be the most optimal and reliable place. First Write Protect huge pages after memslots are committed and deal with rest in user_mem_abort(). Still need some feedback on the pud_huge() before revising for next iteration? Just assume it's not used for now, and that you don't have to consider it, and make that assumption clear in the commit message, so it doesn't block this work. I have a feeling we need to go through a few iterations here, so let's get that rolling. Thanks. Ok thanks I'm on it now. - Mario -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v5 2/4] live migration support for initial write protect of VM
+static inline void kvm_set_s2pte_readonly(pte_t *pte) +{ +pte_val(*pte) &= ~(L_PTE_S2_RDONLY ^ L_PTE_S2_RDWR); This relies on the pte already having been set as RDONLY or RDWR, if you are creating a new pte and calling this function it could be easy to miss that distinction, I would prefer: pte_val(*pte) &= ~L_PTE_S2_RDWR; pte_val(*pte) |= L_PTE_S2_RDONLY; Confused on this comment, this appears to just add the read-only permission. But will leave other permission bits intact, and clears out the rest of the pte? - Mario -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v7 4/4] arm: dirty page logging 2nd stage page fault handling support
This patch adds support for handling 2nd stage page faults during migration, it disables faulting in huge pages, and disolves huge pages to page tables. In case migration is canceled huge pages will be used again. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/kvm/mmu.c | 36 ++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 1c546c9..aca4fbf 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -966,6 +966,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_mmu_memory_cache *memcache = vcpu-arch.mmu_page_cache; struct vm_area_struct *vma; pfn_t pfn; + /* Get logging status, if dirty_bitmap is not NULL then logging is on */ + bool logging_active = !!memslot-dirty_bitmap; write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu)); if (fault_status == FSC_PERM !write_fault) { @@ -1019,10 +1021,16 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, spin_lock(kvm-mmu_lock); if (mmu_notifier_retry(kvm, mmu_seq)) goto out_unlock; - if (!hugetlb !force_pte) + + /* When logging don't spend cycles to check for huge pages */ + if (!hugetlb !force_pte !logging_active) hugetlb = transparent_hugepage_adjust(pfn, fault_ipa); - if (hugetlb) { + /* +* Force all not present/perm faults to PTE handling, address both +* PMD and PTE faults +*/ + if (hugetlb !logging_active) { pmd_t new_pmd = pfn_pmd(pfn, PAGE_S2); new_pmd = pmd_mkhuge(new_pmd); if (writable) { @@ -1034,6 +1042,22 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } else { pte_t new_pte = pfn_pte(pfn, PAGE_S2); if (writable) { + /* +* If pmd is mapping a huge page then clear it and let +* stage2_set_pte() create a pte table. At the sametime +* you write protect the pte (PAGE_S2 pgprot_t). 
+*/ + if (logging_active) { + pmd_t *pmd; + if (hugetlb) { + pfn += pte_index(fault_ipa); + gfn = fault_ipa PAGE_SHIFT; + new_pte = pfn_pte(pfn, PAGE_S2); + } + pmd = stage2_get_pmd(kvm, NULL, fault_ipa); + if (pmd kvm_pmd_huge(*pmd)) + clear_pmd_entry(kvm, pmd, fault_ipa); + } kvm_set_s2pte_writable(new_pte); kvm_set_pfn_dirty(pfn); } @@ -1041,6 +1065,14 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, ret = stage2_set_pte(kvm, memcache, fault_ipa, new_pte, false); } + /* +* Log the dirty page in dirty_bitmap[], call regardless if logging is +* disabled or enabled both cases handled safely. +* TODO: for larger page size mark mulitple dirty page bits for each +* 4k page. +*/ + if (writable) + mark_page_dirty(kvm, gfn); out_unlock: spin_unlock(kvm-mmu_lock); -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v7 0/4] arm: dirty page logging support for ARMv7
is 409600, 8192, 5 o QEMU is instrumented to save RAM memory regions on source and destination after memory is migrated, but before guest started. Later files are checksummed on both ends for correctness, given VMs are small this works. o Guest kernel is instrumented to capture current cycle counter - last cycle and compare to qemu down time to test arch timer accuracy. o Network failover is at L3 due to interface limitations, ping continues working transparently o Also tested 'migrate_cancel' to test reassemble of huge pages (inserted low level instrumentation code). Changes since v6: - primarily reworked initial write protect, and write protect of dirty pages on logging request - Only code logic change, disolve huge pages to page tables in page fault handler - Made many many changes based on Christoffers comments. Mario Smarduch (4): add ARMv7 HYP API to flush VM TLBs without address param dirty page logging inital mem region write protect (w/no huge PUD support) dirty log write protect management sppport dirt page logging 2nd stage page fault handling support arch/arm/include/asm/kvm_asm.h|1 + arch/arm/include/asm/kvm_host.h |5 + arch/arm/include/asm/kvm_mmu.h| 20 +++ arch/arm/include/asm/pgtable-3level.h |1 + arch/arm/kvm/arm.c| 11 +- arch/arm/kvm/interrupts.S | 11 ++ arch/arm/kvm/mmu.c| 243 - arch/x86/kvm/x86.c| 86 virt/kvm/kvm_main.c | 83 ++- 9 files changed, 367 insertions(+), 94 deletions(-) -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v7 1/4] arm: add ARMv7 HYP API to flush VM TLBs without address param
Patch adds HYP interface for global VM TLB invalidation without address parameter. Added ARM version of kvm_flush_remote_tlbs(), made the generic implementation a weak symbol. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_asm.h |1 + arch/arm/kvm/interrupts.S | 11 +++ arch/arm/kvm/mmu.c | 14 ++ virt/kvm/kvm_main.c|2 +- 4 files changed, 27 insertions(+), 1 deletion(-) diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index 53b3c4a..21bc519 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -78,6 +78,7 @@ extern char __kvm_hyp_code_end[]; extern void __kvm_flush_vm_context(void); extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa); +extern void __kvm_tlb_flush_vmid(struct kvm *kvm); extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); #endif diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S index 0d68d40..bddc66b 100644 --- a/arch/arm/kvm/interrupts.S +++ b/arch/arm/kvm/interrupts.S @@ -66,6 +66,17 @@ ENTRY(__kvm_tlb_flush_vmid_ipa) bx lr ENDPROC(__kvm_tlb_flush_vmid_ipa) +/** + * void __kvm_tlb_flush_vmid(struct kvm *kvm) - Flush per-VMID TLBs + * + * Reuses __kvm_tlb_flush_vmid_ipa() for ARMv7, without passing address + * parameter + */ + +ENTRY(__kvm_tlb_flush_vmid) + b __kvm_tlb_flush_vmid_ipa +ENDPROC(__kvm_tlb_flush_vmid) + / * Flush TLBs and instruction caches of all CPUs inside the inner-shareable * domain, for all VMIDs diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 2ac9588..ef29540 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -56,6 +56,20 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa); } +/** + * kvm_flush_remote_tlbs() - flush all VM TLB entries + * @kvm: pointer to kvm structure. + * + * Interface to HYP function to flush all VM TLB entries without address + * parameter. 
In HYP mode reuses __kvm_tlb_flush_vmid_ipa() function used by + * kvm_tlb_flush_vmid_ipa(). + */ +void kvm_flush_remote_tlbs(struct kvm *kvm) +{ + if (kvm) + kvm_call_hyp(__kvm_tlb_flush_vmid, kvm); +} + static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, int min, int max) { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index fa70c6e..ba25765 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -184,7 +184,7 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) return called; } -void kvm_flush_remote_tlbs(struct kvm *kvm) +void __weak kvm_flush_remote_tlbs(struct kvm *kvm) { long dirty_count = kvm-tlbs_dirty; -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v7 2/4] arm: dirty page logging inital mem region write protect (w/no huge PUD support)
Patch adds memslot support for initial write protection and split up of huge pages. This patch series assumes that huge PUDs will not be used to map VM memory. This patch depends on the unmap_range() patch, it needs to be applied first. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |2 + arch/arm/include/asm/kvm_mmu.h| 20 ++ arch/arm/include/asm/pgtable-3level.h |1 + arch/arm/kvm/arm.c|6 ++ arch/arm/kvm/mmu.c| 114 + 5 files changed, 143 insertions(+) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 193ceaf..59565f5 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -231,4 +231,6 @@ int kvm_perf_teardown(void); u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); +void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); + #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 5cc0b0f..08ab5e8 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -114,6 +114,26 @@ static inline void kvm_set_s2pmd_writable(pmd_t *pmd) pmd_val(*pmd) |= L_PMD_S2_RDWR; } +static inline void kvm_set_s2pte_readonly(pte_t *pte) +{ + pte_val(*pte) = (pte_val(*pte) ~L_PTE_S2_RDWR) | L_PTE_S2_RDONLY; +} + +static inline bool kvm_s2pte_readonly(pte_t *pte) +{ + return (pte_val(*pte) L_PTE_S2_RDWR) == L_PTE_S2_RDONLY; +} + +static inline void kvm_set_s2pmd_readonly(pmd_t *pmd) +{ + pmd_val(*pmd) = (pmd_val(*pmd) ~L_PMD_S2_RDWR) | L_PMD_S2_RDONLY; +} + +static inline bool kvm_s2pmd_readonly(pmd_t *pmd) +{ + return (pmd_val(*pmd) L_PMD_S2_RDWR) == L_PMD_S2_RDONLY; +} + /* Open coded p*d_addr_end that can deal with 64bit addresses */ #define kvm_pgd_addr_end(addr, end)\ ({ u64 __boundary = ((addr) + PGDIR_SIZE) PGDIR_MASK;\ diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index 
85c60ad..d8bb40b 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -129,6 +129,7 @@ #define L_PTE_S2_RDONLY(_AT(pteval_t, 1) 6) /* HAP[1] */ #define L_PTE_S2_RDWR (_AT(pteval_t, 3) 6) /* HAP[2:1] */ +#define L_PMD_S2_RDONLY(_AT(pteval_t, 1) 6) /* HAP[1] */ #define L_PMD_S2_RDWR (_AT(pmdval_t, 3) 6) /* HAP[2:1] */ /* diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 3c82b37..dfd63ac 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -242,6 +242,12 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old, enum kvm_mr_change change) { + /* +* At this point memslot has been committed and the there is an +* allocated dirty_bitmap[] so marking of diryt pages works now on. +*/ + if ((change != KVM_MR_DELETE) (mem-flags KVM_MEM_LOG_DIRTY_PAGES)) + kvm_mmu_wp_memory_region(kvm, mem-slot); } void kvm_arch_flush_shadow_all(struct kvm *kvm) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index ef29540..e5dff85 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -760,6 +760,120 @@ static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap) return false; } + +/** + * stage2_wp_pte_range - write protect PTE range + * @pmd: pointer to pmd entry + * @addr: range start address + * @end: range end address + */ +static void stage2_wp_pte_range(pmd_t *pmd, phys_addr_t addr, phys_addr_t end) +{ + pte_t *pte; + + pte = pte_offset_kernel(pmd, addr); + do { + if (!pte_none(*pte)) { + if (!kvm_s2pte_readonly(pte)) + kvm_set_s2pte_readonly(pte); + } + } while (pte++, addr += PAGE_SIZE, addr != end); +} + +/** + * stage2_wp_pmd_range - write protect PMD range + * @pud: pointer to pud entry + * @addr: range start address + * @end: range end address + */ +static void stage2_wp_pmd_range(pud_t *pud, phys_addr_t addr, phys_addr_t end) +{ + pmd_t *pmd; + phys_addr_t next; + + pmd = pmd_offset(pud, addr); + + do { + next = kvm_pmd_addr_end(addr, end); + if (!pmd_none(*pmd)) { + 
if (kvm_pmd_huge(*pmd)) { + /* +* Write Protect the PMD, give user_mem_abort() +* a choice to clear and fault on demand or +* break up the huge page
[PATCH v7 3/4] arm: dirty log write protect management support
This patch adds support for keeping track of VM dirty pages. As dirty page log is retrieved, the pages that have been written are write protected again for next write and log read. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |3 ++ arch/arm/kvm/arm.c |5 --- arch/arm/kvm/mmu.c | 79 +++ arch/x86/kvm/x86.c | 86 --- virt/kvm/kvm_main.c | 81 5 files changed, 163 insertions(+), 91 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 59565f5..b760f9c 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -232,5 +232,8 @@ u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask); #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index dfd63ac..f06fb21 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -780,11 +780,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } } -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) -{ - return -EINVAL; -} - static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev_addr) { diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index e5dff85..1c546c9 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -874,6 +874,85 @@ void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) spin_unlock(kvm-mmu_lock); } +/** + * stage2_wp_mask_range() - write protect memslot pages set in mask + * @pmd - pointer to page table + * @start_ipa - the start range of mask + * @addr - start_ipa or start range of adjusted mask if crossing PMD range + * @mask - mask of dirty pages + * + * Walk mask and write protect the associated dirty pages in the memory region. 
+ * If mask crosses a PMD range adjust it to next page table and return. + */ +static void stage2_wp_mask_range(pmd_t *pmd, phys_addr_t start_ipa, + phys_addr_t *addr, unsigned long *mask) +{ + pte_t *pte; + bool crosses_pmd; + int i; + + for (i = __ffs(*mask), *addr = start_ipa + i * PAGE_SIZE; + *mask; + i = __ffs(*mask), *addr = start_ipa + i * PAGE_SIZE) { + crosses_pmd = !!((start_ipa PMD_MASK) ^ (*addr PMD_MASK)); + if (unlikely(crosses_pmd)) { + /* Adjust mask dirty bits relative to next page table */ + *mask = (PTRS_PER_PTE - pte_index(start_ipa)); + return; + } + + pte = pte_offset_kernel(pmd, *addr); + if (!pte_none(*pte)) + kvm_set_s2pte_readonly(pte); + *mask = ~(1 i); + } +} + +/** + * kvm_mmu_write_protected_pt_masked() - write protect dirty pages set in mask + * @kvm:The KVM pointer + * @slot: The memory slot associated with mask + * @gfn_offset: The gfn offset in memory slot + * @mask: The mask of dirty pages at offset 'gnf_offset' in this memory + * slot to be write protected + * + * Called from dirty page logging read function to write protect bits set in + * mask to record future writes to these pages in dirty page log. This function + * uses simplified page table walk knowing that mask spawns range of two PMDs. + * + * 'kvm-mmu_lock' must be held to protect against concurrent modification + * of page tables (2nd stage fault, mmu modifiers, ...) 
+ * + */ +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + pud_t *pud; + pmd_t *pmd; + phys_addr_t start_ipa = (slot-base_gfn + gfn_offset) PAGE_SHIFT; + phys_addr_t end_ipa = start_ipa + BITS_PER_LONG * PAGE_SIZE; + phys_addr_t addr = start_ipa; + pgd_t *pgdp = kvm-arch.pgd, *pgd; + + do { + pgd = pgdp + pgd_index(addr); + if (pgd_present(*pgd)) { + pud = pud_offset(pgd, addr); + if (!pud_none(*pud) !pud_huge(*pud)) { + pmd = pmd_offset(pud, addr); + if (!pmd_none(*pmd) !kvm_pmd_huge(*pmd)) + stage2_wp_mask_range(pmd, start_ipa, + addr, mask); + else + addr += PMD_SIZE; + } else
[RESEND PATCH v7 3/4] arm: dirty log write protect management support
Resending patch, noticed I forgot to adjust start_ipa properly in stage2_wp_mask_range() and then noticed that pte's can be indexed directly. The patch applies cleanly after 2/4 and 4/4 applies cleanly after this patch. This patch adds support for keeping track of VM dirty pages. As dirty page log is retrieved, the pages that have been written are write protected again for next write and log read. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |3 ++ arch/arm/kvm/arm.c |5 --- arch/arm/kvm/mmu.c | 79 +++ arch/x86/kvm/x86.c | 86 --- virt/kvm/kvm_main.c | 81 5 files changed, 163 insertions(+), 91 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 59565f5..b760f9c 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -232,5 +232,8 @@ u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask); #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index dfd63ac..f06fb21 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -780,11 +780,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } } -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) -{ - return -EINVAL; -} - static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev_addr) { diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index e5dff85..5ede813 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -874,6 +874,85 @@ void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) spin_unlock(kvm-mmu_lock); } +/** + * stage2_wp_mask_range() - write protect memslot pages set in mask + * @pmd - pointer to page table + * @start_ipa - the start range of mask + * 
@addr - start_ipa or start range of adjusted mask if crossing PMD range + * @mask - mask of dirty pages + * + * Walk mask and write protect the associated dirty pages in the memory region. + * If mask crosses a PMD range adjust it to next page table and return. + */ +static void stage2_wp_mask_range(pmd_t *pmd, phys_addr_t start_ipa, + phys_addr_t *addr, unsigned long *mask) +{ + pte_t *pte; + bool crosses_pmd; + int i = __ffs(*mask); + + if (unlikely(*addr start_ipa)) + start_ipa = *addr - i * PAGE_SIZE; + pte = pte_offset_kernel(pmd, start_ipa); + for (*addr = start_ipa + i * PAGE_SIZE; *mask; + i = __ffs(*mask), *addr = start_ipa + i * PAGE_SIZE) { + crosses_pmd = !!((start_ipa PMD_MASK) ^ (*addr PMD_MASK)); + if (unlikely(crosses_pmd)) { + /* Adjust mask dirty bits relative to next page table */ + *mask = (PTRS_PER_PTE - pte_index(start_ipa)); + return; + } + if (!pte_none(pte[i])) + kvm_set_s2pte_readonly(pte[i]); + *mask = ~(1 i); + } +} + +/** + * kvm_mmu_write_protected_pt_masked() - write protect dirty pages set in mask + * @kvm:The KVM pointer + * @slot: The memory slot associated with mask + * @gfn_offset: The gfn offset in memory slot + * @mask: The mask of dirty pages at offset 'gnf_offset' in this memory + * slot to be write protected + * + * Called from dirty page logging read function to write protect bits set in + * mask to record future writes to these pages in dirty page log. This function + * uses simplified page table walk given mask can spawn no more then 2 PMD + * table range. + * 'kvm-mmu_lock' must be held to protect against concurrent modification + * of page tables (2nd stage fault, mmu modifiers, ...) 
+ * + */ +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + pud_t *pud; + pmd_t *pmd; + phys_addr_t start_ipa = (slot-base_gfn + gfn_offset) PAGE_SHIFT; + phys_addr_t end_ipa = start_ipa + BITS_PER_LONG * PAGE_SIZE; + phys_addr_t addr = start_ipa; + pgd_t *pgdp = kvm-arch.pgd, *pgd; + + do { + pgd = pgdp + pgd_index(addr); + if (pgd_present(*pgd)) { + pud = pud_offset(pgd, addr); + if (!pud_none(*pud) !pud_huge(*pud)) { + pmd = pmd_offset(pud, addr); + if (!pmd_none(*pmd) !kvm_pmd_huge(*pmd
Re: [RESEND PATCH v7 3/4] arm: dirty log write protect management support
On 06/04/2014 11:55 PM, Xiao Guangrong wrote: On 06/05/2014 05:11 AM, Mario Smarduch wrote: +spin_lock(kvm-mmu_lock); + +for (i = 0; i n / sizeof(long); i++) { +unsigned long mask; +gfn_t offset; + +if (!dirty_bitmap[i]) +continue; + +is_dirty = true; + +mask = xchg(dirty_bitmap[i], 0); +dirty_bitmap_buffer[i] = mask; + +offset = i * BITS_PER_LONG; +kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask); +} +if (is_dirty) +kvm_flush_remote_tlbs(kvm); You moved the flush into mmu-lock. Please do not :). See commit 198c74f43f0f5473f99967aead30ddc622804bc1 Thanks for reviewing, I revised to pick up your version. Functionally there should be no impact on ARM, the TLB flush function is different. - Mario -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RESEND PATCH v7 3/4] arm: dirty log write protect management support
kvm_vm_ioctl_get_dirty_log() is generic used by x86, ARM. x86 recent patch changed this function, this patch picks up those changes, re-tested everything works. Applies cleanly with other patches. This patch adds support for keeping track of VM dirty pages. As dirty page log is retrieved, the pages that have been written are write protected again for next write and log read. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |3 ++ arch/arm/kvm/arm.c |5 --- arch/arm/kvm/mmu.c | 79 +++ arch/x86/kvm/x86.c | 86 --- virt/kvm/kvm_main.c | 86 +++ 5 files changed, 168 insertions(+), 91 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 59565f5..b760f9c 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -232,5 +232,8 @@ u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask); #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index dfd63ac..f06fb21 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -780,11 +780,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } } -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) -{ - return -EINVAL; -} - static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev_addr) { diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index e5dff85..907344c 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -874,6 +874,85 @@ void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) spin_unlock(kvm-mmu_lock); } +/** + * stage2_wp_mask_range() - write protect memslot pages set in mask + * @pmd - pointer to page table + * @start_ipa - the start range of mask + * @addr - start_ipa or 
start range of adjusted mask if crossing PMD range + * @mask - mask of dirty pages + * + * Walk mask and write protect the associated dirty pages in the memory region. + * If mask crosses a PMD range adjust it to next page table and return. + */ +static void stage2_wp_mask_range(pmd_t *pmd, phys_addr_t start_ipa, + phys_addr_t *addr, unsigned long *mask) +{ + pte_t *pte; + bool crosses_pmd; + int i = __ffs(*mask); + + if (unlikely(*addr start_ipa)) + start_ipa = *addr - i * PAGE_SIZE; + pte = pte_offset_kernel(pmd, start_ipa); + for (*addr = start_ipa + i * PAGE_SIZE; *mask; + i = __ffs(*mask), *addr = start_ipa + i * PAGE_SIZE) { + crosses_pmd = !!((start_ipa PMD_MASK) ^ (*addr PMD_MASK)); + if (unlikely(crosses_pmd)) { + /* Adjust mask dirty bits relative to next page table */ + *mask = (PTRS_PER_PTE - pte_index(start_ipa)); + return; + } + if (!pte_none(pte[i])) + kvm_set_s2pte_readonly(pte[i]); + *mask = ~(1 i); + } +} + +/** + * kvm_mmu_write_protected_pt_masked() - write protect dirty pages set in mask + * @kvm:The KVM pointer + * @slot: The memory slot associated with mask + * @gfn_offset: The gfn offset in memory slot + * @mask: The mask of dirty pages at offset 'gnf_offset' in this memory + * slot to be write protected + * + * Called from dirty page logging read function to write protect bits set in + * mask to record future writes to these pages in dirty page log. This function + * uses simplified page table walk given mask can spawn no more then 2 PMD + * table range. + * 'kvm-mmu_lock' must be held to protect against concurrent modification + * of page tables (2nd stage fault, mmu modifiers, ...) 
+ * + */ +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + pud_t *pud; + pmd_t *pmd; + phys_addr_t start_ipa = (slot-base_gfn + gfn_offset) PAGE_SHIFT; + phys_addr_t end_ipa = start_ipa + BITS_PER_LONG * PAGE_SIZE; + phys_addr_t addr = start_ipa; + pgd_t *pgdp = kvm-arch.pgd, *pgd; + + do { + pgd = pgdp + pgd_index(addr); + if (pgd_present(*pgd)) { + pud = pud_offset(pgd, addr); + if (!pud_none(*pud) !pud_huge(*pud)) { + pmd = pmd_offset(pud, addr); + if (!pmd_none(*pmd) !kvm_pmd_huge(*pmd
Re: [RESEND PATCH v7 3/4] arm: dirty log write protect management support
On 06/05/2014 10:52 PM, Xiao Guangrong wrote: On 06/06/2014 03:09 AM, Mario Smarduch wrote: On 06/04/2014 11:55 PM, Xiao Guangrong wrote: On 06/05/2014 05:11 AM, Mario Smarduch wrote: + spin_lock(kvm-mmu_lock); + + for (i = 0; i n / sizeof(long); i++) { + unsigned long mask; + gfn_t offset; + + if (!dirty_bitmap[i]) + continue; + + is_dirty = true; + + mask = xchg(dirty_bitmap[i], 0); + dirty_bitmap_buffer[i] = mask; + + offset = i * BITS_PER_LONG; + kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask); + } + if (is_dirty) + kvm_flush_remote_tlbs(kvm); You moved the flush into mmu-lock. Please do not :). See commit 198c74f43f0f5473f99967aead30ddc622804bc1 Thanks for reviewing, I revised to pick up your version. Functionally there should be no impact on ARM, the TLB flush function is different. Yeah, i agree your point on ARM, but your patch moved the function from x86 to the common code, that means this function is reused between ARM and x86. No? Yes you pretty much summarized it. My point was more like I'm glad the change had no impact on ARM :) Thanks, - Mario -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v7 0/4] arm: dirty page logging support for ARMv7
On 06/08/2014 03:45 AM, Christoffer Dall wrote: On Tue, Jun 03, 2014 at 04:19:23PM -0700, Mario Smarduch wrote: This patch adds support for dirty page logging so far tested only on ARMv7. With dirty page logging, GICv2 vGIC and arch timer save/restore support, live migration is supported. Dirty page logging support - - initially write protects VM RAM memory regions - 2nd stage page tables - add support to read dirty page log and again write protect the dirty pages - second stage page table for next pass. - second stage huge page are disolved into page tables to keep track of dirty pages at page granularity. Tracking at huge page granularity limits migration to an almost idle system. There are couple approaches to handling huge pages: 1 - break up huge page into page table and write protect all pte's 2 - clear the PMD entry, create a page table install the faulted page entry and write protect it. not sure I fully understand. Is option 2 simply write-protecting all PMDs and splitting it at fault time? No that's 1 above. Option 2 is the optimized solution you describe in patch 4 review - clear the PMD and let stage2_set_pte allocate a page table and install the pte, then it's demand faulting on future access to that PMD range. This patch implements #2, in the future #1 may be implemented depending on more bench mark results. Option 1: may over commit and do unnecessary work, but on heavy loads appears to converge faster during live migration Option 2: Only write protects pages that are accessed, migration varies, takes longer then Option 1 but eventually catches up. - In the event migration is canceled, normal behavior is resumed huge pages are rebuilt over time. - Another alternative is use of reverse mappings where for each level 2nd stage tables (PTE, PMD, PUD) pointers to spte's are maintained (x86 impl.). Primary reverse mapping benefits are for mmu notifiers for large memory range invalidations. 
Reverse mappings also improve dirty page logging, instead of walking page tables, spete pointers are accessed directly via reverse map array. - Reverse mappings will be considered for future support once the current implementation is hardened. Is the following a list of your future work? I guess yes and no, with exception of lmbench I've ran these tests also couple other folks have tested with prior revisions. I'll run more (overnight, burn in tests) adding lmbench, but I'm hoping others will run tests to give this more run time, different loads and so on. o validate current dirty page logging support o VMID TLB Flushing, migrating multiple guests o GIC/arch-timer migration o migration under various loads, primarily page reclaim and validate current mmu-notifiers o Run benchmarks (lmbench for now) and test impact on performance, and optimize o Test virtio - since it writes into guest memory. Wait until pci is supported on ARM. So you're not testing with virtio now? Your command line below seems to suggest that in fact you are. /me confused. Yes so I've see no errors with virtio-mmio transport and virto-net-device, blk-device backends under moderate loads. But virtio inbound is purely user space in this case QEMU so I can't say with certainty that virtio is 100%. Sometime back I found problems with virtio-mmio when transport and backend are not fused together none of the performance options (UFO, TSO, Partial Checksum...) got applied, like they did for virti-net-pci. So to summarize I need to see how virtio tracks dirty pages for virtio-mmio, and virtio-pci in QEMU. I have fair idea where to look but have not done so yet. o Currently on ARM, KVM doesn't appear to write into Guest address space, need to mark those pages dirty too (???). not sure what you mean here, can you expand? 
For few architectures KVM writes into guest memory, one example is PV-EOI, will write into guest memory to disable/enable PV-EOI while injecting an interrupt - based one number of in flight interrupts. There is other code that does it too, but I'm not familiar with all the use cases. So if we do that on ARM the page(s) must marked dirty. - Move onto ARMv8 since 2nd stage mmu is shared between both architectures. But in addition to dirty page log additional support for GIC, arch timers, and emulated devices is required. Also working on emulated platform masks a lot of potential bugs, but does help to get majority of code working. Test Environment: --- NOTE: RUNNING on FAST Models will hardly ever fail and mask bugs, infact initially light loads were succeeding without dirty page logging support. --- - Will put all components on github, including test setup diagram - In short
Re: [PATCH v7 1/4] arm: add ARMv7 HYP API to flush VM TLBs without address param
On 06/08/2014 05:05 AM, Christoffer Dall wrote: On Tue, Jun 03, 2014 at 04:19:24PM -0700, Mario Smarduch wrote: Patch adds HYP interface for global VM TLB invalidation without address parameter. Added ARM version of kvm_flush_remote_tlbs(), made the generic implementation a weak symbol. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_asm.h |1 + arch/arm/kvm/interrupts.S | 11 +++ arch/arm/kvm/mmu.c | 14 ++ virt/kvm/kvm_main.c|2 +- 4 files changed, 27 insertions(+), 1 deletion(-) diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index 53b3c4a..21bc519 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -78,6 +78,7 @@ extern char __kvm_hyp_code_end[]; extern void __kvm_flush_vm_context(void); extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa); +extern void __kvm_tlb_flush_vmid(struct kvm *kvm); extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); #endif diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S index 0d68d40..bddc66b 100644 --- a/arch/arm/kvm/interrupts.S +++ b/arch/arm/kvm/interrupts.S @@ -66,6 +66,17 @@ ENTRY(__kvm_tlb_flush_vmid_ipa) bx lr ENDPROC(__kvm_tlb_flush_vmid_ipa) +/** + * void __kvm_tlb_flush_vmid(struct kvm *kvm) - Flush per-VMID TLBs + * + * Reuses __kvm_tlb_flush_vmid_ipa() for ARMv7, without passing address + * parameter + */ + +ENTRY(__kvm_tlb_flush_vmid) +b __kvm_tlb_flush_vmid_ipa +ENDPROC(__kvm_tlb_flush_vmid) + / * Flush TLBs and instruction caches of all CPUs inside the inner-shareable * domain, for all VMIDs diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 2ac9588..ef29540 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -56,6 +56,20 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa); } +/** + * kvm_flush_remote_tlbs() - flush all VM TLB entries + * @kvm: pointer to kvm structure. 
+ * + * Interface to HYP function to flush all VM TLB entries without address + * parameter. In HYP mode reuses __kvm_tlb_flush_vmid_ipa() function used by + * kvm_tlb_flush_vmid_ipa(). + */ +void kvm_flush_remote_tlbs(struct kvm *kvm) +{ +if (kvm) +kvm_call_hyp(__kvm_tlb_flush_vmid, kvm); +} + static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, int min, int max) { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index fa70c6e..ba25765 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -184,7 +184,7 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) return called; } -void kvm_flush_remote_tlbs(struct kvm *kvm) +void __weak kvm_flush_remote_tlbs(struct kvm *kvm) { long dirty_count = kvm-tlbs_dirty; -- 1.7.9.5 This doesn't build or link on aarch64 :( -Christoffer I'll recompile and retest the dirty page logging portion on ARMv8 and resolve these issues, early next week. In the meantime if it's ok with you, I'' move forward with the rest of the patches on ARMv7 to get through critical issues. Would that work? - Mario -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v7 2/4] arm: dirty page logging initial mem region write protect (w/no huge PUD support)
On 06/08/2014 05:05 AM, Christoffer Dall wrote: On Tue, Jun 03, 2014 at 04:19:25PM -0700, Mario Smarduch wrote: Patch adds memslot support for initial write protection and split up of huge pages. This patch series assumes that huge PUDs will not be used to map VM memory. This patch depends on the unmap_range() patch, it needs to be applied first. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |2 + arch/arm/include/asm/kvm_mmu.h| 20 ++ arch/arm/include/asm/pgtable-3level.h |1 + arch/arm/kvm/arm.c|6 ++ arch/arm/kvm/mmu.c| 114 + 5 files changed, 143 insertions(+) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 193ceaf..59565f5 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -231,4 +231,6 @@ int kvm_perf_teardown(void); u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); +void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); + #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 5cc0b0f..08ab5e8 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -114,6 +114,26 @@ static inline void kvm_set_s2pmd_writable(pmd_t *pmd) pmd_val(*pmd) |= L_PMD_S2_RDWR; } +static inline void kvm_set_s2pte_readonly(pte_t *pte) +{ +pte_val(*pte) = (pte_val(*pte) ~L_PTE_S2_RDWR) | L_PTE_S2_RDONLY; +} + +static inline bool kvm_s2pte_readonly(pte_t *pte) +{ +return (pte_val(*pte) L_PTE_S2_RDWR) == L_PTE_S2_RDONLY; +} + +static inline void kvm_set_s2pmd_readonly(pmd_t *pmd) +{ +pmd_val(*pmd) = (pmd_val(*pmd) ~L_PMD_S2_RDWR) | L_PMD_S2_RDONLY; +} + +static inline bool kvm_s2pmd_readonly(pmd_t *pmd) +{ +return (pmd_val(*pmd) L_PMD_S2_RDWR) == L_PMD_S2_RDONLY; +} + not crazy about the names, how about kvm_set_s2_pte_readonly etc.? So kvm_set_s2pte_writable(pte_t *pte) was there already just following that convention. 
the fact that these don't exist for arm64 makes me think it may break the build for arm64 as well... Yes will address it. /* Open coded p*d_addr_end that can deal with 64bit addresses */ #define kvm_pgd_addr_end(addr, end) \ ({ u64 __boundary = ((addr) + PGDIR_SIZE) PGDIR_MASK;\ diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index 85c60ad..d8bb40b 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -129,6 +129,7 @@ #define L_PTE_S2_RDONLY (_AT(pteval_t, 1) 6) /* HAP[1] */ #define L_PTE_S2_RDWR (_AT(pteval_t, 3) 6) /* HAP[2:1] */ +#define L_PMD_S2_RDONLY (_AT(pteval_t, 1) 6) /* HAP[1] */ #define L_PMD_S2_RDWR (_AT(pmdval_t, 3) 6) /* HAP[2:1] */ /* diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 3c82b37..dfd63ac 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -242,6 +242,12 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old, enum kvm_mr_change change) { +/* + * At this point memslot has been committed and the there is an + * allocated dirty_bitmap[] so marking of diryt pages works now on. s/diryt/dirty/ works now on ? 
Ok + */ +if ((change != KVM_MR_DELETE) (mem-flags KVM_MEM_LOG_DIRTY_PAGES)) +kvm_mmu_wp_memory_region(kvm, mem-slot); } void kvm_arch_flush_shadow_all(struct kvm *kvm) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index ef29540..e5dff85 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -760,6 +760,120 @@ static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap) return false; } + +/** + * stage2_wp_pte_range - write protect PTE range + * @pmd:pointer to pmd entry + * @addr: range start address + * @end:range end address + */ +static void stage2_wp_pte_range(pmd_t *pmd, phys_addr_t addr, phys_addr_t end) +{ +pte_t *pte; + +pte = pte_offset_kernel(pmd, addr); +do { +if (!pte_none(*pte)) { +if (!kvm_s2pte_readonly(pte)) +kvm_set_s2pte_readonly(pte); do you need the test before setting readonly? Probably not. Some memory regions have hardly any pages present and sometimes not dirty. Was thinking of couple enhancements not to flush if there are no dirty pages or few dirty pages then just flush by IPA. But currently not doing anything with this info, leave it for future. +} +} while (pte++, addr += PAGE_SIZE
Re: [PATCH v7 2/4] arm: dirty page logging initial mem region write protect (w/no huge PUD support)
On 06/09/2014 11:09 AM, Christoffer Dall wrote: On Mon, Jun 09, 2014 at 10:58:18AM -0700, Mario Smarduch wrote: On 06/08/2014 05:05 AM, Christoffer Dall wrote: On Tue, Jun 03, 2014 at 04:19:25PM -0700, Mario Smarduch wrote: Patch adds memslot support for initial write protection and split up of huge pages. This patch series assumes that huge PUDs will not be used to map VM memory. This patch depends on the unmap_range() patch, it needs to be applied first. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |2 + arch/arm/include/asm/kvm_mmu.h| 20 ++ arch/arm/include/asm/pgtable-3level.h |1 + arch/arm/kvm/arm.c|6 ++ arch/arm/kvm/mmu.c| 114 + 5 files changed, 143 insertions(+) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 193ceaf..59565f5 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -231,4 +231,6 @@ int kvm_perf_teardown(void); u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); +void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); + #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 5cc0b0f..08ab5e8 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -114,6 +114,26 @@ static inline void kvm_set_s2pmd_writable(pmd_t *pmd) pmd_val(*pmd) |= L_PMD_S2_RDWR; } +static inline void kvm_set_s2pte_readonly(pte_t *pte) +{ + pte_val(*pte) = (pte_val(*pte) ~L_PTE_S2_RDWR) | L_PTE_S2_RDONLY; +} + +static inline bool kvm_s2pte_readonly(pte_t *pte) +{ + return (pte_val(*pte) L_PTE_S2_RDWR) == L_PTE_S2_RDONLY; +} + +static inline void kvm_set_s2pmd_readonly(pmd_t *pmd) +{ + pmd_val(*pmd) = (pmd_val(*pmd) ~L_PMD_S2_RDWR) | L_PMD_S2_RDONLY; +} + +static inline bool kvm_s2pmd_readonly(pmd_t *pmd) +{ + return (pmd_val(*pmd) L_PMD_S2_RDWR) == L_PMD_S2_RDONLY; +} + not crazy about the names, how about 
kvm_set_s2_pte_readonly etc.? So kvm_set_s2pte_writable(pte_t *pte) was there already just following that convention. ah, ok, no problem then. the fact that these don't exist for arm64 makes me think it may break the build for arm64 as well... Yes will address it. /* Open coded p*d_addr_end that can deal with 64bit addresses */ #define kvm_pgd_addr_end(addr, end) \ ({u64 __boundary = ((addr) + PGDIR_SIZE) PGDIR_MASK; \ diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index 85c60ad..d8bb40b 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -129,6 +129,7 @@ #define L_PTE_S2_RDONLY (_AT(pteval_t, 1) 6) /* HAP[1] */ #define L_PTE_S2_RDWR (_AT(pteval_t, 3) 6) /* HAP[2:1] */ +#define L_PMD_S2_RDONLY (_AT(pteval_t, 1) 6) /* HAP[1] */ #define L_PMD_S2_RDWR (_AT(pmdval_t, 3) 6) /* HAP[2:1] */ /* diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 3c82b37..dfd63ac 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -242,6 +242,12 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old, enum kvm_mr_change change) { + /* + * At this point memslot has been committed and the there is an + * allocated dirty_bitmap[] so marking of diryt pages works now on. s/diryt/dirty/ works now on ? Ok Sorry I thought it was comment. This function is called after the memslots have been committed so we know dirty bit map has been allocated and marking the dirty bitmap will work as the pages are being write protected and we're getting faults. I don't understand what works now on means, so you need to clarify. 
+ */ + if ((change != KVM_MR_DELETE) (mem-flags KVM_MEM_LOG_DIRTY_PAGES)) + kvm_mmu_wp_memory_region(kvm, mem-slot); } void kvm_arch_flush_shadow_all(struct kvm *kvm) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index ef29540..e5dff85 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -760,6 +760,120 @@ static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap) return false; } + +/** + * stage2_wp_pte_range - write protect PTE range + * @pmd: pointer to pmd entry + * @addr: range start address + * @end: range end address + */ +static void stage2_wp_pte_range(pmd_t *pmd, phys_addr_t addr, phys_addr_t end) +{ + pte_t *pte; + + pte = pte_offset_kernel(pmd, addr); + do { + if (!pte_none(*pte)) { + if (!kvm_s2pte_readonly(pte
Re: [PATCH v7 1/4] arm: add ARMv7 HYP API to flush VM TLBs without address param
On 06/09/2014 10:49 AM, Christoffer Dall wrote: On Mon, Jun 09, 2014 at 10:06:29AM -0700, Mario Smarduch wrote: On 06/08/2014 05:05 AM, Christoffer Dall wrote: On Tue, Jun 03, 2014 at 04:19:24PM -0700, Mario Smarduch wrote: Patch adds HYP interface for global VM TLB invalidation without address parameter. Added ARM version of kvm_flush_remote_tlbs(), made the generic implementation a weak symbol. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_asm.h |1 + arch/arm/kvm/interrupts.S | 11 +++ arch/arm/kvm/mmu.c | 14 ++ virt/kvm/kvm_main.c|2 +- 4 files changed, 27 insertions(+), 1 deletion(-) diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index 53b3c4a..21bc519 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -78,6 +78,7 @@ extern char __kvm_hyp_code_end[]; extern void __kvm_flush_vm_context(void); extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa); +extern void __kvm_tlb_flush_vmid(struct kvm *kvm); extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); #endif diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S index 0d68d40..bddc66b 100644 --- a/arch/arm/kvm/interrupts.S +++ b/arch/arm/kvm/interrupts.S @@ -66,6 +66,17 @@ ENTRY(__kvm_tlb_flush_vmid_ipa) bx lr ENDPROC(__kvm_tlb_flush_vmid_ipa) +/** + * void __kvm_tlb_flush_vmid(struct kvm *kvm) - Flush per-VMID TLBs + * + * Reuses __kvm_tlb_flush_vmid_ipa() for ARMv7, without passing address + * parameter + */ + +ENTRY(__kvm_tlb_flush_vmid) + b __kvm_tlb_flush_vmid_ipa +ENDPROC(__kvm_tlb_flush_vmid) + / * Flush TLBs and instruction caches of all CPUs inside the inner-shareable * domain, for all VMIDs diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 2ac9588..ef29540 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -56,6 +56,20 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa); } +/** + * 
kvm_flush_remote_tlbs() - flush all VM TLB entries + * @kvm: pointer to kvm structure. + * + * Interface to HYP function to flush all VM TLB entries without address + * parameter. In HYP mode reuses __kvm_tlb_flush_vmid_ipa() function used by + * kvm_tlb_flush_vmid_ipa(). + */ +void kvm_flush_remote_tlbs(struct kvm *kvm) +{ + if (kvm) + kvm_call_hyp(__kvm_tlb_flush_vmid, kvm); +} + static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, int min, int max) { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index fa70c6e..ba25765 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -184,7 +184,7 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) return called; } -void kvm_flush_remote_tlbs(struct kvm *kvm) +void __weak kvm_flush_remote_tlbs(struct kvm *kvm) { long dirty_count = kvm-tlbs_dirty; -- 1.7.9.5 This doesn't build or link on aarch64 :( -Christoffer I'll recompile and retest the dirty page logging portion on ARMv8 and resolve these issues, early next week. In the meantime if it's ok with you, I'' move forward with the rest of the patches on ARMv7 to get through critical issues. Would that work? Your patches need to at least compile with other architectures, I suggest fixing that up front. You also eventually need to somehow test (or ask maintainers to test) a branch with your patches on the architectures that your code messes with. This later point can be addressed once we're close to a consensus on ARM. Ok got it. So far I just checked to make sure x86 compiles since one function is generic and declared __weak, both x86 and ARM share it now (Xiaos comment). -Christoffer -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RESEND PATCH v7 3/4] arm: dirty log write protect management support
On 06/08/2014 05:05 AM, Christoffer Dall wrote: On Fri, Jun 06, 2014 at 10:33:41AM -0700, Mario Smarduch wrote: kvm_vm_ioctl_get_dirty_log() is generic used by x86, ARM. x86 recent patch changed this function, this patch picks up those changes, re-tested everything works. Applies cleanly with other patches. This patch adds support for keeping track of VM dirty pages. As dirty page log is retrieved, the pages that have been written are write protected again for next write and log read. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |3 ++ arch/arm/kvm/arm.c |5 --- arch/arm/kvm/mmu.c | 79 +++ arch/x86/kvm/x86.c | 86 --- virt/kvm/kvm_main.c | 86 +++ 5 files changed, 168 insertions(+), 91 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 59565f5..b760f9c 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -232,5 +232,8 @@ u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, +struct kvm_memory_slot *slot, +gfn_t gfn_offset, unsigned long mask); Do all other architectures implement this function? arm64? Besides arm, x86 but the function is not generic. #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index dfd63ac..f06fb21 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -780,11 +780,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } } -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) -{ -return -EINVAL; -} - What about the other architectures implementing this function? Six architectures define this function. With this patch this function is generic in kvm_main.c used by x86. 
static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev_addr) { diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index e5dff85..907344c 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -874,6 +874,85 @@ void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) spin_unlock(kvm-mmu_lock); } +/** + * stage2_wp_mask_range() - write protect memslot pages set in mask + * @pmd - pointer to page table + * @start_ipa - the start range of mask + * @addr - start_ipa or start range of adjusted mask if crossing PMD range + * @mask - mask of dirty pages + * + * Walk mask and write protect the associated dirty pages in the memory region. + * If mask crosses a PMD range adjust it to next page table and return. + */ +static void stage2_wp_mask_range(pmd_t *pmd, phys_addr_t start_ipa, +phys_addr_t *addr, unsigned long *mask) +{ +pte_t *pte; +bool crosses_pmd; +int i = __ffs(*mask); + +if (unlikely(*addr start_ipa)) +start_ipa = *addr - i * PAGE_SIZE; huh? +pte = pte_offset_kernel(pmd, start_ipa); +for (*addr = start_ipa + i * PAGE_SIZE; *mask; +i = __ffs(*mask), *addr = start_ipa + i * PAGE_SIZE) { +crosses_pmd = !!((start_ipa PMD_MASK) ^ (*addr PMD_MASK)); +if (unlikely(crosses_pmd)) { +/* Adjust mask dirty bits relative to next page table */ +*mask = (PTRS_PER_PTE - pte_index(start_ipa)); +return; +} +if (!pte_none(pte[i])) +kvm_set_s2pte_readonly(pte[i]); +*mask = ~(1 i); This is *really* complicated, and *really* unintuitive and *really* hard to read! I feel this may very likely break, and is optimizing prematurely for some very special case. Can't you follow the usual scheme of traversing the levels one-by-one and just calculate the 'end' address based on the number of bits in your long, and just adjust the mask in the calling function each time you are about to call a lower-level function? Agreed I'll extend wp_range functions, it probably makes no sense to be optimizing at this phase. 
In fact, I think this could be trivially implemented as an extension to your existing wp_range functions. On ARM you are mostly going to consider 32 pages, on arm64 you are mostly going to consider 64 pages, just calculate that range in terms of IPAs and set that as the limit for calling stage2_wp_pgd_range (which should be factor'ed out into its function and called from kvm_mmu_wp_memory_region). +} +} + +/** + * kvm_mmu_write_protected_pt_masked() - write protect dirty pages set in mask + * @kvm:The KVM pointer + * @slot: The memory
Re: [RESEND PATCH v7 3/4] arm: dirty log write protect management support
On 06/10/2014 02:22 AM, Christoffer Dall wrote: On Mon, Jun 09, 2014 at 06:47:12PM -0700, Mario Smarduch wrote: On 06/08/2014 05:05 AM, Christoffer Dall wrote: On Fri, Jun 06, 2014 at 10:33:41AM -0700, Mario Smarduch wrote: kvm_vm_ioctl_get_dirty_log() is generic used by x86, ARM. x86 recent patch changed this function, this patch picks up those changes, re-tested everything works. Applies cleanly with other patches. This patch adds support for keeping track of VM dirty pages. As dirty page log is retrieved, the pages that have been written are write protected again for next write and log read. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |3 ++ arch/arm/kvm/arm.c |5 --- arch/arm/kvm/mmu.c | 79 +++ arch/x86/kvm/x86.c | 86 --- virt/kvm/kvm_main.c | 86 +++ 5 files changed, 168 insertions(+), 91 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 59565f5..b760f9c 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -232,5 +232,8 @@ u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask); Do all other architectures implement this function? arm64? Besides arm, x86 but the function is not generic. you're now calling this from generic code, so all architecture must implement it, and the prototype should proably be in include/linux/kvm_host.h, not in the arch-specific headers. Ah ok. 
#endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index dfd63ac..f06fb21 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -780,11 +780,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } } -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) -{ - return -EINVAL; -} - What about the other architectures implementing this function? Six architectures define this function. With this patch this function is generic in kvm_main.c used by x86. But you're not defining it as a weak symbol (and I don't suspect that you should unless other archs do this in a *very* different way), so you need to either remove it from the other archs, make it a weak symbol (I hope this is not the case) or do something else. Mistake on my part I just cut and paste Xiaos x86's recent upstream patch and didn't add weak definition. I looked at IA64, MIPS (two of them ), S390 somewhat similar but quite different implementations. They use a sync version, where the dirty bitmaps are maintained at arch level and then copied to memslot-dirty_bitmap. There is only commonality between x86 and ARM right now, x86 uses memslot-dirty_bitmap directly. Maybe this function should go back to architecture layer, it's unlikely it can become generic across all architectures. There is also the issue of kvm_flush_remote_tlbs(), that's also weak, the generic one is using IPIs. Since it's only used in mmu.c maybe make this one static. -Christoffer -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v7 4/4] arm: dirty page logging 2nd stage page fault handling support
On 06/08/2014 05:05 AM, Christoffer Dall wrote: On Tue, Jun 03, 2014 at 04:19:27PM -0700, Mario Smarduch wrote: This patch adds support for handling 2nd stage page faults during migration, it disables faulting in huge pages, and disolves huge pages to page tables. s/disolves/dissolves/g Will do. In case migration is canceled huge pages will be used again. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/kvm/mmu.c | 36 ++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 1c546c9..aca4fbf 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -966,6 +966,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_mmu_memory_cache *memcache = vcpu-arch.mmu_page_cache; struct vm_area_struct *vma; pfn_t pfn; +/* Get logging status, if dirty_bitmap is not NULL then logging is on */ +bool logging_active = !!memslot-dirty_bitmap; write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu)); if (fault_status == FSC_PERM !write_fault) { @@ -1019,10 +1021,16 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, spin_lock(kvm-mmu_lock); if (mmu_notifier_retry(kvm, mmu_seq)) goto out_unlock; -if (!hugetlb !force_pte) + +/* When logging don't spend cycles to check for huge pages */ drop the comment: either explain the entire clause (which would be too long) or don't explain anything. Ok. +if (!hugetlb !force_pte !logging_active) instead of having all this, can't you just change if (is_vm_hugetlb_page(vma)) to if (is_vm_hugetlb_page(vma) !logging_active) then you're also not mucking around with the gfn etc. I didn't want to modify this function too much, but if that's ok that simplifies things a lot. hugetlb = transparent_hugepage_adjust(pfn, fault_ipa); -if (hugetlb) { +/* + * Force all not present/perm faults to PTE handling, address both + * PMD and PTE faults + */ I don't understand this comment? In which case does this apply? 
The cases I see here - - huge page permission fault is forced into page table code while logging - pte permission/not present handled by page table code as before. +if (hugetlb !logging_active) { pmd_t new_pmd = pfn_pmd(pfn, PAGE_S2); new_pmd = pmd_mkhuge(new_pmd); if (writable) { @@ -1034,6 +1042,22 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } else { pte_t new_pte = pfn_pte(pfn, PAGE_S2); if (writable) { +/* + * If pmd is mapping a huge page then clear it and let + * stage2_set_pte() create a pte table. At the sametime + * you write protect the pte (PAGE_S2 pgprot_t). + */ +if (logging_active) { +pmd_t *pmd; +if (hugetlb) { +pfn += pte_index(fault_ipa); +gfn = fault_ipa PAGE_SHIFT; +new_pte = pfn_pte(pfn, PAGE_S2); +} +pmd = stage2_get_pmd(kvm, NULL, fault_ipa); +if (pmd kvm_pmd_huge(*pmd)) +clear_pmd_entry(kvm, pmd, fault_ipa); +} now instead of all this, you just need to check for kvm_pmd_huge() in stage2_set_pte() and if that's true, you clear it, and then then install your new pte. Yes this really simplifies things! kvm_set_s2pte_writable(new_pte); kvm_set_pfn_dirty(pfn); } @@ -1041,6 +1065,14 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, ret = stage2_set_pte(kvm, memcache, fault_ipa, new_pte, false); } +/* + * Log the dirty page in dirty_bitmap[], call regardless if logging is + * disabled or enabled both cases handled safely. + * TODO: for larger page size mark mulitple dirty page bits for each + * 4k page. + */ +if (writable) +mark_page_dirty(kvm, gfn); what if you just faulted in a page on a read which wasn't present before but it happens to belong to a writeable memslot, is that page then dirty? hmmm. A bug, must also check if it was a write fault not just that we're dealing with a writable region. This one could be pretty bad on performance, not to mention in accurate. It will be interesting to see new test results, glad you caught that. Thanks, Mario. 
out_unlock: spin_unlock(kvm-mmu_lock); -- 1.7.9.5 Thanks, -Christoffer -- To unsubscribe from this list: send the line unsubscribe kvm
Re: [PATCH v7 4/4] arm: dirty page logging 2nd stage page fault handling support
On 06/10/2014 11:58 PM, Christoffer Dall wrote: On Tue, Jun 10, 2014 at 11:23:17AM -0700, Mario Smarduch wrote: On 06/08/2014 05:05 AM, Christoffer Dall wrote: On Tue, Jun 03, 2014 at 04:19:27PM -0700, Mario Smarduch wrote: This patch adds support for handling 2nd stage page faults during migration, it disables faulting in huge pages, and disolves huge pages to page tables. s/disolves/dissolves/g Will do. In case migration is canceled huge pages will be used again. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/kvm/mmu.c | 36 ++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 1c546c9..aca4fbf 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -966,6 +966,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_mmu_memory_cache *memcache = vcpu-arch.mmu_page_cache; struct vm_area_struct *vma; pfn_t pfn; + /* Get logging status, if dirty_bitmap is not NULL then logging is on */ + bool logging_active = !!memslot-dirty_bitmap; write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu)); if (fault_status == FSC_PERM !write_fault) { @@ -1019,10 +1021,16 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, spin_lock(kvm-mmu_lock); if (mmu_notifier_retry(kvm, mmu_seq)) goto out_unlock; - if (!hugetlb !force_pte) + + /* When logging don't spend cycles to check for huge pages */ drop the comment: either explain the entire clause (which would be too long) or don't explain anything. Ok. + if (!hugetlb !force_pte !logging_active) instead of having all this, can't you just change if (is_vm_hugetlb_page(vma)) to if (is_vm_hugetlb_page(vma) !logging_active) then you're also not mucking around with the gfn etc. I didn't want to modify this function too much, but if that's ok that simplifies things a lot. Don't worry about the changes as much as the resulting code. 
If something requires a lot of refactoring, usually that can be handled by splitting up renames, factoring out functions, etc. into multiple smaller patches. hugetlb = transparent_hugepage_adjust(pfn, fault_ipa); - if (hugetlb) { + /* + * Force all not present/perm faults to PTE handling, address both + * PMD and PTE faults + */ I don't understand this comment? In which case does this apply? The cases I see here - - huge page permission fault is forced into page table code while logging - pte permission/not present handled by page table code as before. Hmm, the wording doesn't really work for me. I don't think this comment adds anything or is required, when getting this deep into the fault handler etc., one better understand what's going on. The most suitable place for a comment in this work is probably in stage2_set_pte() where you can now detect a kvm_pmd_huge(), when you add that, you may want to add a small comment that this only happens when logging dirty pages. + if (hugetlb !logging_active) { pmd_t new_pmd = pfn_pmd(pfn, PAGE_S2); new_pmd = pmd_mkhuge(new_pmd); if (writable) { @@ -1034,6 +1042,22 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } else { pte_t new_pte = pfn_pte(pfn, PAGE_S2); if (writable) { + /* + * If pmd is mapping a huge page then clear it and let + * stage2_set_pte() create a pte table. At the sametime + * you write protect the pte (PAGE_S2 pgprot_t). + */ + if (logging_active) { + pmd_t *pmd; + if (hugetlb) { + pfn += pte_index(fault_ipa); + gfn = fault_ipa PAGE_SHIFT; + new_pte = pfn_pte(pfn, PAGE_S2); + } + pmd = stage2_get_pmd(kvm, NULL, fault_ipa); + if (pmd kvm_pmd_huge(*pmd)) + clear_pmd_entry(kvm, pmd, fault_ipa); + } now instead of all this, you just need to check for kvm_pmd_huge() in stage2_set_pte() and if that's true, you clear it, and then then install your new pte. Yes this really simplifies things! 
kvm_set_s2pte_writable(new_pte); kvm_set_pfn_dirty(pfn); } @@ -1041,6 +1065,14 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, ret = stage2_set_pte(kvm, memcache, fault_ipa, new_pte, false); } + /* + * Log the dirty page in dirty_bitmap[], call regardless if logging is + * disabled or enabled both cases handled safely. + * TODO: for larger page size mark mulitple dirty page bits for each
Re: [RESEND PATCH v7 3/4] arm: dirty log write protect management support
Hi Paolo, for ARM dirty page logging we have a couple of functions that are generic. - kvm_vm_ioctl_get_dirty_log - is identical to x86 version - kvm_flush_remote_tlbs - ARM version does hardware broadcast it's different from the generic one in kvm_main.c How to proceed to make these generic? Please see below from Christoffer. Current patch moves kvm_vm_ioctl_get_dirty_log() into kvm_main.c and labels it and kvm_flush_remote_tlbs weak. Please advise. Thanks, - Mario So I don't see a lot of use of weak symbols in kvm_main.c (actually on kvmarm/next I don't see any), but we do want to share code when more than one architecture implements something in the exact same way, like it seems x86 and ARM is doing here for this particular function. I think the KVM scheme is usually to check for some define, like: #ifdef KVM_ARCH_HAVE_GET_DIRTY_LOG ret = kvm_arch_get_dirty_log(...); #else ret = kvm_get_dirty_log(...); #endif but Paolo may have a more informed opinion of how to deal with these. Thanks, -Christoffer -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RESEND PATCH v7 3/4] arm: dirty log write protect management support
On 06/11/2014 12:03 AM, Christoffer Dall wrote: There is also the issue of kvm_flush_remote_tlbs(), that's also weak, the generic one is using IPIs. Since it's only used in mmu.c maybe make this one static. So I don't see a lot of use of weak symbols in kvm_main.c (actually on kvmarm/next I don't see any), but we do want to share code when more than one architecture implements something in the exact same way, like it seems x86 and ARM is doing here for this particular function. I think the KVM scheme is usually to check for some define, like: #ifdef KVM_ARCH_HAVE_GET_DIRTY_LOG ret = kvm_arch_get_dirty_log(...); #else ret = kvm_get_dirty_log(...); #endif but Paolo may have a more informed oppinion of how to deal with these. Thanks, -Christoffer One approach I'm trying looking at the code in kvm_main(). This approach applies more to selecting features as opposed to selecting generic vs architecture specific functions. 1.- - add to 'virt/kvm/Kconfig' config HAVE_KVM_ARCH_TLB_FLUSH_ALL bool config HAVE_KVM_ARCH_DIRTY_LOG bool 2.-- For ARM and later ARM64 add to 'arch/arm[64]/kvm/Kconfig' config KVM bool Kernel-based Virtual Machine (KVM) support ... select HAVE_KVM_ARCH_TLB_FLUSH_ALL .. Not for HAVE_KVM_ARCH_DIRTY_LOG given it's shared with x86, but would need to do it for every other architecture that does not share it (except initially for arm64 since it will use the variant that returns -EINVAL until feature is supported) 3-- In kvm_main.c would have something like void kvm_flush_remote_tlbs(struct kvm *kvm) { #ifdef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL kvm_arch_flush_remote_tlbs(kvm); #else long dirty_count = kvm-tlbs_dirty; smp_mb(); if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) ++kvm-stat.remote_tlb_flush; cmpxchg(kvm-tlbs_dirty, dirty_count, 0); #endif } Then add void kvm_flush_remote_tlbs(struct kvm *kvm) definition to arm kvm_host.h. 
Define the function in this case mmu.c For the dirty log function int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { #ifdef CONFIG_HAVE_KVM_ARCH_DIRTY_LOG kvm_arch_vm_ioctl_get_dirty_log(kvm, log); #else int r; struct kvm_memory_slot *memslot; unsigned long n, i; unsigned long *dirty_bitmap; unsigned long *dirty_bitmap_buffer; bool is_dirty = false; ... But then you have to go into every architecture and define the kvm_arch_vm_...() variant. Is this the right way to go? Or is there a simpler way? Thanks, - Mario -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v8 3/4] arm: dirty log write protect management support
This patch adds support for keeping track of VM dirty pages. As dirty page log is retrieved, the pages that have been written are write protected again for next write and log read. For ARMv8 read of dirty log returns invalid operation. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |3 ++ arch/arm/kvm/arm.c | 83 +++ arch/arm/kvm/mmu.c | 22 +++ 3 files changed, 108 insertions(+) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 586c467..dbf3d45 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -233,5 +233,8 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); void kvm_tlb_flush_vmid(struct kvm *kvm); void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask); #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index e11c2dd..cb3c090 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -783,10 +783,93 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } } +#ifdef CONFIG_ARM +/** + * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot + * @kvm: kvm instance + * @log: slot id and address to which we copy the log + * + * We need to keep it in mind that VCPU threads can write to the bitmap + * concurrently. So, to avoid losing data, we keep the following order for + * each bit: + * + * 1. Take a snapshot of the bit and clear it if needed. + * 2. Write protect the corresponding page. + * 3. Flush TLB's if needed. + * 4. Copy the snapshot to the userspace. + * + * Between 2 and 3, the guest may write to the page using the remaining TLB + * entry. This is not a problem because the page will be reported dirty at + * step 4 using the snapshot taken before and step 3 ensures that successive + * writes will be logged for the next call. 
+ */ +int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, + struct kvm_dirty_log *log) +{ + int r; + struct kvm_memory_slot *memslot; + unsigned long n, i; + unsigned long *dirty_bitmap; + unsigned long *dirty_bitmap_buffer; + bool is_dirty = false; + + mutex_lock(kvm-slots_lock); + + r = -EINVAL; + if (log-slot = KVM_USER_MEM_SLOTS) + goto out; + + memslot = id_to_memslot(kvm-memslots, log-slot); + + dirty_bitmap = memslot-dirty_bitmap; + r = -ENOENT; + if (!dirty_bitmap) + goto out; + + n = kvm_dirty_bitmap_bytes(memslot); + + dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long); + memset(dirty_bitmap_buffer, 0, n); + + spin_lock(kvm-mmu_lock); + + for (i = 0; i n / sizeof(long); i++) { + unsigned long mask; + gfn_t offset; + + if (!dirty_bitmap[i]) + continue; + + is_dirty = true; + + mask = xchg(dirty_bitmap[i], 0); + dirty_bitmap_buffer[i] = mask; + + offset = i * BITS_PER_LONG; + kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask); + } + + spin_unlock(kvm-mmu_lock); + + lockdep_assert_held(kvm-slots_lock); + if (is_dirty) + kvm_tlb_flush_vmid(kvm); + + r = -EFAULT; + if (copy_to_user(log-dirty_bitmap, dirty_bitmap_buffer, n)) + goto out; + + r = 0; +out: + mutex_unlock(kvm-slots_lock); + return r; +} +#else int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { return -EINVAL; } +#endif static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev_addr) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 37edcbe..1caf511 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -888,6 +888,28 @@ void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) kvm_tlb_flush_vmid(kvm); spin_unlock(kvm-mmu_lock); } + +/** + * kvm_mmu_write_protected_pt_masked() - write protect dirty pages set in mask + * @kvm: The KVM pointer + * @slot: The memory slot associated with mask + * @gfn_offset:The gfn offset in memory slot + * @mask: The mask of dirty pages at offset 'gfn_offset' in this memory + * slot to be 
write protected + * + * Walks bits set in mask write protects the associated pte's. Caller must + * acquire kvm_mmu_lock. + */ +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + phys_addr_t base_gfn = slot-base_gfn + gfn_offset; + phys_addr_t start = (base_gfn
[PATCH v8 4/4] arm: dirty page logging 2nd stage page fault handling support
This patch adds support for handling 2nd stage page faults during migration, it disables faulting in huge pages, and dissolves huge pages to page tables. In case migration is canceled huge pages will be used again. For ARMv8 logging is hardcoded to false. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/kvm/mmu.c | 31 +-- 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 1caf511..d49df28 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -641,7 +641,8 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache } static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, - phys_addr_t addr, const pte_t *new_pte, bool iomap) + phys_addr_t addr, const pte_t *new_pte, bool iomap, + bool logging_active) { pmd_t *pmd; pte_t *pte, old_pte; @@ -656,6 +657,15 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, return 0; } + /* +* While dirty memory logging, clear PMD entry for huge page and split +* into smaller pages, to track dirty memory at page granularity. 
+*/ + if (logging_active kvm_pmd_huge(*pmd)) { + phys_addr_t ipa = pmd_pfn(*pmd) PAGE_SHIFT; + clear_pmd_entry(kvm, pmd, ipa); + } + /* Create stage-2 page mappings - Level 2 */ if (pmd_none(*pmd)) { if (!cache) @@ -708,7 +718,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, if (ret) goto out; spin_lock(kvm-mmu_lock); - ret = stage2_set_pte(kvm, cache, addr, pte, true); + ret = stage2_set_pte(kvm, cache, addr, pte, true, false); spin_unlock(kvm-mmu_lock); if (ret) goto out; @@ -925,6 +935,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_mmu_memory_cache *memcache = vcpu-arch.mmu_page_cache; struct vm_area_struct *vma; pfn_t pfn; + /* Get logging status, if dirty_bitmap is not NULL then logging is on */ +#ifdef CONFIG_ARM + bool logging_active = !!memslot-dirty_bitmap; +#else + bool logging_active = false; +#endif write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu)); if (fault_status == FSC_PERM !write_fault) { @@ -935,7 +951,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, /* Let's check if we will get back a huge page backed by hugetlbfs */ down_read(current-mm-mmap_sem); vma = find_vma_intersection(current-mm, hva, hva + 1); - if (is_vm_hugetlb_page(vma)) { + if (is_vm_hugetlb_page(vma) !logging_active) { hugetlb = true; gfn = (fault_ipa PMD_MASK) PAGE_SHIFT; } else { @@ -978,7 +994,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, spin_lock(kvm-mmu_lock); if (mmu_notifier_retry(kvm, mmu_seq)) goto out_unlock; - if (!hugetlb !force_pte) + if (!hugetlb !force_pte !logging_active) hugetlb = transparent_hugepage_adjust(pfn, fault_ipa); if (hugetlb) { @@ -997,9 +1013,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, kvm_set_pfn_dirty(pfn); } coherent_cache_guest_page(vcpu, hva, PAGE_SIZE); - ret = stage2_set_pte(kvm, memcache, fault_ipa, new_pte, false); + ret = stage2_set_pte(kvm, memcache, fault_ipa, new_pte, false, + 
logging_active); } + if (write_fault) + mark_page_dirty(kvm, gfn); out_unlock: spin_unlock(kvm-mmu_lock); @@ -1150,7 +1169,7 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data) { pte_t *pte = (pte_t *)data; - stage2_set_pte(kvm, NULL, gpa, pte, false); + stage2_set_pte(kvm, NULL, gpa, pte, false, false); } -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v8 2/4] arm: dirty page logging initial mem region write protect (w/no huge PUD support)
Patch adds support for initial write protection VM memlsot. This patch series assumes that huge PUDs will not be used in 2nd stage tables. For ARMv8 nothing happens here. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |1 + arch/arm/include/asm/kvm_mmu.h| 20 ++ arch/arm/include/asm/pgtable-3level.h |1 + arch/arm/kvm/arm.c|9 +++ arch/arm/kvm/mmu.c| 128 + 5 files changed, 159 insertions(+) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index ac3bb65..586c467 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -232,5 +232,6 @@ u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); void kvm_tlb_flush_vmid(struct kvm *kvm); +void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 5cc0b0f..08ab5e8 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -114,6 +114,26 @@ static inline void kvm_set_s2pmd_writable(pmd_t *pmd) pmd_val(*pmd) |= L_PMD_S2_RDWR; } +static inline void kvm_set_s2pte_readonly(pte_t *pte) +{ + pte_val(*pte) = (pte_val(*pte) ~L_PTE_S2_RDWR) | L_PTE_S2_RDONLY; +} + +static inline bool kvm_s2pte_readonly(pte_t *pte) +{ + return (pte_val(*pte) L_PTE_S2_RDWR) == L_PTE_S2_RDONLY; +} + +static inline void kvm_set_s2pmd_readonly(pmd_t *pmd) +{ + pmd_val(*pmd) = (pmd_val(*pmd) ~L_PMD_S2_RDWR) | L_PMD_S2_RDONLY; +} + +static inline bool kvm_s2pmd_readonly(pmd_t *pmd) +{ + return (pmd_val(*pmd) L_PMD_S2_RDWR) == L_PMD_S2_RDONLY; +} + /* Open coded p*d_addr_end that can deal with 64bit addresses */ #define kvm_pgd_addr_end(addr, end)\ ({ u64 __boundary = ((addr) + PGDIR_SIZE) PGDIR_MASK;\ diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index 85c60ad..d8bb40b 100644 --- 
a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -129,6 +129,7 @@ #define L_PTE_S2_RDONLY(_AT(pteval_t, 1) 6) /* HAP[1] */ #define L_PTE_S2_RDWR (_AT(pteval_t, 3) 6) /* HAP[2:1] */ +#define L_PMD_S2_RDONLY(_AT(pteval_t, 1) 6) /* HAP[1] */ #define L_PMD_S2_RDWR (_AT(pmdval_t, 3) 6) /* HAP[2:1] */ /* diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 3c82b37..e11c2dd 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -242,6 +242,15 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old, enum kvm_mr_change change) { +#ifdef CONFIG_ARM + /* +* At this point memslot has been committed and there is an +* allocated dirty_bitmap[], dirty pages will be be tracked while the +* memory slot is write protected. +*/ + if ((change != KVM_MR_DELETE) (mem-flags KVM_MEM_LOG_DIRTY_PAGES)) + kvm_mmu_wp_memory_region(kvm, mem-slot); +#endif } void kvm_arch_flush_shadow_all(struct kvm *kvm) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index e90b9e4..37edcbe 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -762,6 +762,134 @@ static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap) return false; } +#ifdef CONFIG_ARM +/** + * stage2_wp_pte_range - write protect PTE range + * @pmd: pointer to pmd entry + * @addr: range start address + * @end: range end address + */ +static void stage2_wp_pte_range(pmd_t *pmd, phys_addr_t addr, phys_addr_t end) +{ + pte_t *pte; + + pte = pte_offset_kernel(pmd, addr); + do { + if (!pte_none(*pte)) { + if (!kvm_s2pte_readonly(pte)) + kvm_set_s2pte_readonly(pte); + } + } while (pte++, addr += PAGE_SIZE, addr != end); +} + +/** + * stage2_wp_pmd_range - write protect PMD range + * @pud: pointer to pud entry + * @addr: range start address + * @end: range end address + */ +static void stage2_wp_pmd_range(pud_t *pud, phys_addr_t addr, phys_addr_t end) +{ + pmd_t *pmd; + phys_addr_t next; + + pmd = pmd_offset(pud, addr); + + do { + next = 
kvm_pmd_addr_end(addr, end); + if (!pmd_none(*pmd)) { + if (kvm_pmd_huge(*pmd)) { + if (!kvm_s2pmd_readonly(pmd)) + kvm_set_s2pmd_readonly(pmd); + } else + stage2_wp_pte_range(pmd, addr, next); + + } + } while (pmd
[PATCH v8 1/4] arm: add ARMv7 HYP API to flush VM TLBs without address param
Patch adds HYP interface for global VM TLB invalidation without address parameter. Moved VM TLB flushing back to architecture layer. This patch depends on the unmap_range() patch, it needs to be applied first. No changes to ARMv8. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_asm.h |1 + arch/arm/include/asm/kvm_host.h |2 ++ arch/arm/kvm/interrupts.S | 11 +++ arch/arm/kvm/mmu.c | 16 4 files changed, 30 insertions(+) diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index 53b3c4a..21bc519 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -78,6 +78,7 @@ extern char __kvm_hyp_code_end[]; extern void __kvm_flush_vm_context(void); extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa); +extern void __kvm_tlb_flush_vmid(struct kvm *kvm); extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); #endif diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 193ceaf..ac3bb65 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -231,4 +231,6 @@ int kvm_perf_teardown(void); u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); +void kvm_tlb_flush_vmid(struct kvm *kvm); + #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S index 0d68d40..a3717b7 100644 --- a/arch/arm/kvm/interrupts.S +++ b/arch/arm/kvm/interrupts.S @@ -66,6 +66,17 @@ ENTRY(__kvm_tlb_flush_vmid_ipa) bx lr ENDPROC(__kvm_tlb_flush_vmid_ipa) +/** + * void __kvm_tlb_flush_vmid(struct kvm *kvm) - Flush per-VMID TLBs + * @kvm: pointer to kvm structure + * + * Reuses __kvm_tlb_flush_vmid_ipa() for ARMv7, without passing address + * parameter + */ +ENTRY(__kvm_tlb_flush_vmid) + b __kvm_tlb_flush_vmid_ipa +ENDPROC(__kvm_tlb_flush_vmid) + / * Flush TLBs and instruction caches of all CPUs inside the inner-shareable * domain, for all VMIDs 
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 2ac9588..e90b9e4 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -56,6 +56,22 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa); } +#ifdef CONFIG_ARM +/** + * kvm_tlb_flush_vmid() - flush all VM TLB entries + * @kvm: pointer to kvm structure. + * + * Interface to HYP function to flush all VM TLB entries without address + * parameter. In HYP mode reuses __kvm_tlb_flush_vmid_ipa() function used by + * kvm_tlb_flush_vmid_ipa(). + */ +void kvm_tlb_flush_vmid(struct kvm *kvm) +{ + if (kvm) + kvm_call_hyp(__kvm_tlb_flush_vmid, kvm); +} +#endif + static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, int min, int max) { -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v8 0/4] arm: dirty page logging support for ARMv7
This patch adds support for dirty page logging so far tested only on ARMv7, and verified to compile on ARMv8. With dirty page logging, GICv2 vGIC and arch timer save/restore support, live migration is supported. Dirty page logging support - - initially write protects VM RAM memory regions - 2nd stage page tables - add support to read dirty page log and again write protect the dirty pages - second stage page table for next pass. - second stage huge pages are dissolved into page tables to keep track of dirty pages at page granularity. Tracking at huge page granularity limits migration to an almost idle system. - In the event migration is canceled, normal behavior is resumed huge pages are rebuilt over time. - At this time reverse mappings are not used for write protecting of 2nd stage tables. - Future work - Enable dirty memory logging to work on ARMv8 FastModels. Test Environment: --- NOTE: RUNNING on FAST Models will hardly ever fail and mask bugs, in fact initially light loads were succeeding without dirty page logging support. --- - Will put all components on github, including test setup diagram - In short summary o Two ARM Exynos 5440 development platforms - 4-way 1.7 GHz, with 8GB, 256GB storage, 1GBs Ethernet, with swap enabled o NFS Server running Ubuntu 13.04 - both ARM boards mount shared file system - Shared file system includes - QEMU, Guest Kernel, DTB, multiple Ext3 root file systems. 
o Component versions: qemu-1.7.5, vexpress-a15, host/guest kernel 3.15-rc1, o Use QEMU Ctrl+A+C and migrate -d tcp:IP:port command - Destination command syntax: can change smp to 4, machine model outdated, but has been tested on virt by others (need to upgrade) /mnt/migration/qemu-system-arm -enable-kvm -smp 2 -kernel \ /mnt/migration/zImage -dtb /mnt/migration/guest-a15.dtb -m 1792 \ -M vexpress-a15 -cpu cortex-a15 -nographic \ -append root=/dev/vda rw console=ttyAMA0 rootwait \ -drive if=none,file=/mnt/migration/guest1.root,id=vm1 \ -device virtio-blk-device,drive=vm1 \ -netdev type=tap,id=net0,ifname=tap0 \ -device virtio-net-device,netdev=net0,mac=52:54:00:12:34:58 \ -incoming tcp:0:4321 - Source command syntax same except '-incoming' o Test migration of multiple VMs use tap0, tap1, ..., and guest0.root, . has been tested as well. o On source run multiple copies of 'dirtyram.arm' - simple program to dirty pages periodically. ./dirtyarm.ram total mmap size dirty page size sleep time Example: ./dirtyram.arm 102580 812 30 - dirty 102580 pages - 812 pages every 30ms with an incrementing counter - run anywhere from one to as many copies as VM resources can support. If the dirty rate is too high migration will run indefinitely - run date output loop, check date is picked up smoothly - place guest/host into page reclaim/swap mode - by whatever means in this case run multiple copies of 'dirtyram.ram' on host - issue migrate command(s) on source - Top result is 409600, 8192, 5 o QEMU is instrumented to save RAM memory regions on source and destination after memory is migrated, but before guest started. Later files are checksummed on both ends for correctness, given VMs are small this works. o Guest kernel is instrumented to capture current cycle counter - last cycle and compare to qemu down time to test arch timer accuracy. 
o Network failover is at L3 due to interface limitations, ping continues working transparently o Also tested 'migrate_cancel' to test reassembly of huge pages (inserted low level instrumentation code). - Basic Network Test - Assuming one ethernet interface available Source host IP 192.168.10.101/24, VM tap0 192.168.2.1/24 and VM eth0 192.168.2.100/24 with default route 192.168.2.1 Destination host IP 192.168.10.100/24, VM same settings as above. Both VMs have identical MAC addresses. Initially NFS server route to 192.168.2.100 is via 192.168.10.101 - ssh 192.168.2.100 - start migration from source to destination - after migration ends - on NFS server switch routes. route add -host 192.168.2.100 gw 192.168.10.100 ssh should resume after route switch. ping as well should work seamlessly. Changes since v7: - Reworked write protection of dirty page mask - Moved generic code back to architecture layer, keep it there for time being, until a KVM framework for architecture functions to override generic ones is defined. - Fixed condition bug for marking pages dirty Mario Smarduch (4): add ARMv7 HYP API to flush VM TLBs without address param dirty page logging initial mem region write protect (w/no huge PUD support) dirty log write protect management support dirty page logging 2nd stage page fault handling
Re: [RESEND PATCH v7 3/4] arm: dirty log write protect management support
On 07/04/2014 09:29 AM, Paolo Bonzini wrote: Il 03/07/2014 17:04, Christoffer Dall ha scritto: Hmmm, I'm really not an expert in the 'established procedures' for what to put in config files etc., but here's my basic take: a) you wouldn't put a config option in Kconfig unless it's something that's actually configurable or some generic feature/subsystem that should only be enabled if hardware has certain capabilities or other config options enabled. b) this seems entirely an implementation issue and not depending on anything users should select. Actually I think Mario's idea is just fine. Non-user-accessible Kconfig symbols are used a lot to invoke an #ifdef elsewhere in the code; compare this with his proposal is a bit different but not too much. Sometimes #defines are used, sometimes Kconfig symbols, but the idea is the same. Paolo Hi Paolo, thanks for your feedback. I forgot to add that I tried define ARCH_HAVE_... approach but checkpatch rejected it and insisted on Kconfig. Thanks, - Mario -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RESEND PATCH v7 3/4] arm: dirty log write protect management support
Hi Christoffer, Just back from holiday - a short plan to resume work. - move VM tlb flush and kvm log functions to generic, per Paolo's comments use Kconfig approach - update other architectures make sure they compile - Keep it ARMv7 for now Get maintainers to test the branch. In parallel add dirty log support to ARMv8, to test I would add a QEMU monitor function to validate general operation. Your thoughts? Thanks, Mario On 07/03/2014 08:04 AM, Christoffer Dall wrote: On Tue, Jun 17, 2014 at 06:41:52PM -0700, Mario Smarduch wrote: On 06/11/2014 12:03 AM, Christoffer Dall wrote: There is also the issue of kvm_flush_remote_tlbs(), that's also weak, the generic one is using IPIs. Since it's only used in mmu.c maybe make this one static. So I don't see a lot of use of weak symbols in kvm_main.c (actually on kvmarm/next I don't see any), but we do want to share code when more than one architecture implements something in the exact same way, like it seems x86 and ARM is doing here for this particular function. I think the KVM scheme is usually to check for some define, like: #ifdef KVM_ARCH_HAVE_GET_DIRTY_LOG ret = kvm_arch_get_dirty_log(...); #else ret = kvm_get_dirty_log(...); #endif but Paolo may have a more informed opinion of how to deal with these. Thanks, -Christoffer One approach I'm trying looking at the code in kvm_main(). This approach applies more to selecting features as opposed to selecting generic vs architecture specific functions. 1.- - add to 'virt/kvm/Kconfig' config HAVE_KVM_ARCH_TLB_FLUSH_ALL bool config HAVE_KVM_ARCH_DIRTY_LOG bool 2.-- For ARM and later ARM64 add to 'arch/arm[64]/kvm/Kconfig' config KVM bool Kernel-based Virtual Machine (KVM) support ... select HAVE_KVM_ARCH_TLB_FLUSH_ALL .. 
Not for HAVE_KVM_ARCH_DIRTY_LOG given it's shared with x86, but would need to do it for every other architecture that does not share it (except initially for arm64 since it will use the variant that returns -EINVAL until feature is supported) 3-- In kvm_main.c would have something like void kvm_flush_remote_tlbs(struct kvm *kvm) { #ifdef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL kvm_arch_flush_remote_tlbs(kvm); #else long dirty_count = kvm-tlbs_dirty; smp_mb(); if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) ++kvm-stat.remote_tlb_flush; cmpxchg(kvm-tlbs_dirty, dirty_count, 0); #endif } Then add void kvm_flush_remote_tlbs(struct kvm *kvm) definition to arm kvm_host.h. Define the function in this case mmu.c For the dirty log function int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { #ifdef CONFIG_HAVE_KVM_ARCH_DIRTY_LOG kvm_arch_vm_ioctl_get_dirty_log(kvm, log); #else int r; struct kvm_memory_slot *memslot; unsigned long n, i; unsigned long *dirty_bitmap; unsigned long *dirty_bitmap_buffer; bool is_dirty = false; ... But then you have to go into every architecture and define the kvm_arch_vm_...() variant. Is this the right way to go? Or is there a simpler way? Hmmm, I'm really not an expert in the 'established procedures' for what to put in config files etc., but here's my basic take: a) you wouldn't put a config option in Kconfig unless it's comething that's actually configurable or some generic feature/subsystem that should only be enabled if hardware has certain capabilities or other config options enabled. b) this seems entirely an implementation issue and not depending on anything users should select. c) therefore, I think it's either a question of always having an arch-specific implementation that you probe for its return value or you have some sort of define in the header files for the arch/X/include/asm/kvm_host.h to control what you need. 
-Christoffer -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v9 1/4] arm: add ARMv7 HYP API to flush VM TLBs, change generic TLB flush to support arch flush
Patch adds HYP interface for global VM TLB invalidation without address parameter. Generic VM TLB flush calls ARMv7 arch defined TLB flush function. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_asm.h |1 + arch/arm/include/asm/kvm_host.h |1 + arch/arm/kvm/Kconfig|1 + arch/arm/kvm/interrupts.S | 12 arch/arm/kvm/mmu.c | 17 + virt/kvm/Kconfig|3 +++ virt/kvm/kvm_main.c |4 7 files changed, 39 insertions(+) diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index 53b3c4a..21bc519 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -78,6 +78,7 @@ extern char __kvm_hyp_code_end[]; extern void __kvm_flush_vm_context(void); extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa); +extern void __kvm_tlb_flush_vmid(struct kvm *kvm); extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); #endif diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 193ceaf..042206f 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -230,5 +230,6 @@ int kvm_perf_teardown(void); u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); +void kvm_arch_flush_remote_tlbs(struct kvm *); #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig index 466bd29..44d3b6f 100644 --- a/arch/arm/kvm/Kconfig +++ b/arch/arm/kvm/Kconfig @@ -22,6 +22,7 @@ config KVM select ANON_INODES select HAVE_KVM_CPU_RELAX_INTERCEPT select KVM_MMIO + select HAVE_KVM_ARCH_TLB_FLUSH_ALL select KVM_ARM_HOST depends on ARM_VIRT_EXT ARM_LPAE ---help--- diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S index 0d68d40..1258d46 100644 --- a/arch/arm/kvm/interrupts.S +++ b/arch/arm/kvm/interrupts.S @@ -66,6 +66,18 @@ ENTRY(__kvm_tlb_flush_vmid_ipa) bx lr ENDPROC(__kvm_tlb_flush_vmid_ipa) +/** + * void __kvm_tlb_flush_vmid(struct kvm *kvm) - Flush per-VMID 
TLBs + * + * Reuses __kvm_tlb_flush_vmid_ipa() for ARMv7, without passing address + * parameter + */ + +ENTRY(__kvm_tlb_flush_vmid) + b __kvm_tlb_flush_vmid_ipa +ENDPROC(__kvm_tlb_flush_vmid) + + / * Flush TLBs and instruction caches of all CPUs inside the inner-shareable * domain, for all VMIDs diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 2ac9588..35254c6 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -56,6 +56,23 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa); } +#ifdef CONFIG_ARM +/** + * kvm_arch_flush_remote_tlbs() - flush all VM TLB entries + * @kvm: pointer to kvm structure. + * + * Interface to HYP function to flush all VM TLB entries without address + * parameter. In HYP mode reuses __kvm_tlb_flush_vmid_ipa() function used by + * kvm_tlb_flush_vmid_ipa(). + */ +void kvm_arch_flush_remote_tlbs(struct kvm *kvm) +{ + if (kvm) + kvm_call_hyp(__kvm_tlb_flush_vmid, kvm); +} + +#endif + static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, int min, int max) { diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 13f2d19..f1efaa5 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -34,3 +34,6 @@ config HAVE_KVM_CPU_RELAX_INTERCEPT config KVM_VFIO bool + +config HAVE_KVM_ARCH_TLB_FLUSH_ALL + bool diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index fa70c6e..258f3d9 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -186,12 +186,16 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) void kvm_flush_remote_tlbs(struct kvm *kvm) { +#ifdef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL + kvm_arch_flush_remote_tlbs(kvm); +#else long dirty_count = kvm-tlbs_dirty; smp_mb(); if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) ++kvm-stat.remote_tlb_flush; cmpxchg(kvm-tlbs_dirty, dirty_count, 0); +#endif } EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs); -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the 
body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v9 0/4] arm: dirty page logging support for ARMv7
host IP 192.168.10.100/24, VM same settings as above. Both VMs have identical MAC addresses. Initially NFS server route to 192.168.2.100 is via 192.168.10.101 - ssh 192.168.2.100 - start migration from source to destination - after migration ends - on NFS server switch routes. route add -host 192.168.2.100 gw 192.168.10.100 ssh should resume after route switch. ping as well should work seamlessly. Mario Smarduch (4): add ARMv7 HYP API to flush VM TLBs, change generic TLB flush to support arch flush ARMv7 dirty page logging inital mem region write protect (w/no huge PUD support) dirty log write protect mgmt. Moved x86, armv7 to generic, set armv8 ia64 mips powerpc s390 arch specific ARMv7 dirty page logging 2nd stage page fault handling support arch/arm/include/asm/kvm_asm.h|1 + arch/arm/include/asm/kvm_host.h |2 + arch/arm/include/asm/kvm_mmu.h| 20 arch/arm/include/asm/pgtable-3level.h |1 + arch/arm/kvm/Kconfig |1 + arch/arm/kvm/arm.c| 17 ++- arch/arm/kvm/interrupts.S | 12 ++ arch/arm/kvm/mmu.c| 198 - arch/arm64/include/asm/kvm_host.h |2 + arch/arm64/kvm/Kconfig|1 + arch/ia64/include/asm/kvm_host.h |1 + arch/ia64/kvm/Kconfig |1 + arch/ia64/kvm/kvm-ia64.c |2 +- arch/mips/include/asm/kvm_host.h |2 +- arch/mips/kvm/Kconfig |1 + arch/mips/kvm/kvm_mips.c |2 +- arch/powerpc/include/asm/kvm_host.h |2 + arch/powerpc/kvm/Kconfig |1 + arch/powerpc/kvm/book3s.c |2 +- arch/powerpc/kvm/booke.c |2 +- arch/s390/include/asm/kvm_host.h |2 + arch/s390/kvm/Kconfig |1 + arch/s390/kvm/kvm-s390.c |2 +- arch/x86/kvm/x86.c| 86 -- include/linux/kvm_host.h |3 + virt/kvm/Kconfig |6 + virt/kvm/kvm_main.c | 94 27 files changed, 366 insertions(+), 99 deletions(-) -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v9 3/4] arm: dirty log write protect mgmt. Moved x86, armv7 to generic, set armv8 ia64 mips powerpc s390 arch specific
This patch adds support for keeping track of VM dirty pages. As dirty page log is retrieved, the pages that have been written are write protected again for next write and log read. The dirty log read function is generic for armv7 and x86, and arch specific for arm64, ia64, mips, powerpc, s390. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/kvm/arm.c |8 +++- arch/arm/kvm/mmu.c | 22 + arch/arm64/include/asm/kvm_host.h |2 + arch/arm64/kvm/Kconfig |1 + arch/ia64/include/asm/kvm_host.h|1 + arch/ia64/kvm/Kconfig |1 + arch/ia64/kvm/kvm-ia64.c|2 +- arch/mips/include/asm/kvm_host.h|2 +- arch/mips/kvm/Kconfig |1 + arch/mips/kvm/kvm_mips.c|2 +- arch/powerpc/include/asm/kvm_host.h |2 + arch/powerpc/kvm/Kconfig|1 + arch/powerpc/kvm/book3s.c |2 +- arch/powerpc/kvm/booke.c|2 +- arch/s390/include/asm/kvm_host.h|2 + arch/s390/kvm/Kconfig |1 + arch/s390/kvm/kvm-s390.c|2 +- arch/x86/kvm/x86.c | 86 - include/linux/kvm_host.h|3 ++ virt/kvm/Kconfig|3 ++ virt/kvm/kvm_main.c | 90 +++ 21 files changed, 143 insertions(+), 93 deletions(-) diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index e11c2dd..f7739a0 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -783,10 +783,16 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } } -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) +#ifdef CONFIG_ARM64 +/* + * For now features not supported on ARM64, the #ifdef is added to make that + * clear but not needed since ARM64 Kconfig selects function in generic code. 
+ */ +int kvm_arch_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { return -EINVAL; } +#endif static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev_addr) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 7bfc792..ca84331 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -889,6 +889,28 @@ void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) kvm_flush_remote_tlbs(kvm); spin_unlock(kvm-mmu_lock); } + +/** + * kvm_mmu_write_protected_pt_masked() - write protect dirty pages set in mask + * @kvm: The KVM pointer + * @slot: The memory slot associated with mask + * @gfn_offset:The gfn offset in memory slot + * @mask: The mask of dirty pages at offset 'gfn_offset' in this memory + * slot to be write protected + * + * Walks bits set in mask write protects the associated pte's. Caller must + * acquire kvm_mmu_lock. + */ +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + phys_addr_t base_gfn = slot-base_gfn + gfn_offset; + phys_addr_t start = (base_gfn + __ffs(mask)) PAGE_SHIFT; + phys_addr_t end = (base_gfn + __fls(mask) + 1) PAGE_SHIFT; + + stage2_wp_range(kvm, start, end); +} #endif static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 92242ce..b4a280b 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -200,4 +200,6 @@ static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr, hyp_stack_ptr, vector_ptr); } +int kvm_arch_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log); + #endif /* __ARM64_KVM_HOST_H__ */ diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 8ba85e9..9e21a8a 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -22,6 +22,7 @@ config KVM select PREEMPT_NOTIFIERS select ANON_INODES select 
HAVE_KVM_CPU_RELAX_INTERCEPT + select HAVE_KVM_ARCH_DIRTY_LOG select KVM_MMIO select KVM_ARM_HOST select KVM_ARM_VGIC diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h index db95f57..d79f520 100644 --- a/arch/ia64/include/asm/kvm_host.h +++ b/arch/ia64/include/asm/kvm_host.h @@ -594,6 +594,7 @@ void kvm_sal_emul(struct kvm_vcpu *vcpu); #define __KVM_HAVE_ARCH_VM_ALLOC 1 struct kvm *kvm_arch_alloc_vm(void); void kvm_arch_free_vm(struct kvm *kvm); +int kvm_arch_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log); #endif /* __ASSEMBLY__*/ diff --git a/arch/ia64/kvm/Kconfig b/arch/ia64/kvm/Kconfig index 990b864..32dd6c8 100644 --- a/arch/ia64/kvm/Kconfig +++ b/arch/ia64/kvm/Kconfig @@ -24,6 +24,7 @@ config KVM depends on BROKEN select PREEMPT_NOTIFIERS
[PATCH v9 2/4] arm: ARMv7 dirty page logging inital mem region write protect (w/no huge PUD support)
Patch adds support for initial write protection VM memlsot. This patch series assumes that huge PUDs will not be used in 2nd stage tables. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |1 + arch/arm/include/asm/kvm_mmu.h| 20 ++ arch/arm/include/asm/pgtable-3level.h |1 + arch/arm/kvm/arm.c|9 +++ arch/arm/kvm/mmu.c| 128 + 5 files changed, 159 insertions(+) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 042206f..6521a2d 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -231,5 +231,6 @@ int kvm_perf_teardown(void); u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); void kvm_arch_flush_remote_tlbs(struct kvm *); +void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 5cc0b0f..08ab5e8 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -114,6 +114,26 @@ static inline void kvm_set_s2pmd_writable(pmd_t *pmd) pmd_val(*pmd) |= L_PMD_S2_RDWR; } +static inline void kvm_set_s2pte_readonly(pte_t *pte) +{ + pte_val(*pte) = (pte_val(*pte) ~L_PTE_S2_RDWR) | L_PTE_S2_RDONLY; +} + +static inline bool kvm_s2pte_readonly(pte_t *pte) +{ + return (pte_val(*pte) L_PTE_S2_RDWR) == L_PTE_S2_RDONLY; +} + +static inline void kvm_set_s2pmd_readonly(pmd_t *pmd) +{ + pmd_val(*pmd) = (pmd_val(*pmd) ~L_PMD_S2_RDWR) | L_PMD_S2_RDONLY; +} + +static inline bool kvm_s2pmd_readonly(pmd_t *pmd) +{ + return (pmd_val(*pmd) L_PMD_S2_RDWR) == L_PMD_S2_RDONLY; +} + /* Open coded p*d_addr_end that can deal with 64bit addresses */ #define kvm_pgd_addr_end(addr, end)\ ({ u64 __boundary = ((addr) + PGDIR_SIZE) PGDIR_MASK;\ diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index 85c60ad..d8bb40b 100644 --- 
a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -129,6 +129,7 @@ #define L_PTE_S2_RDONLY(_AT(pteval_t, 1) 6) /* HAP[1] */ #define L_PTE_S2_RDWR (_AT(pteval_t, 3) 6) /* HAP[2:1] */ +#define L_PMD_S2_RDONLY(_AT(pteval_t, 1) 6) /* HAP[1] */ #define L_PMD_S2_RDWR (_AT(pmdval_t, 3) 6) /* HAP[2:1] */ /* diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 3c82b37..e11c2dd 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -242,6 +242,15 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old, enum kvm_mr_change change) { +#ifdef CONFIG_ARM + /* +* At this point memslot has been committed and there is an +* allocated dirty_bitmap[], dirty pages will be be tracked while the +* memory slot is write protected. +*/ + if ((change != KVM_MR_DELETE) (mem-flags KVM_MEM_LOG_DIRTY_PAGES)) + kvm_mmu_wp_memory_region(kvm, mem-slot); +#endif } void kvm_arch_flush_shadow_all(struct kvm *kvm) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 35254c6..7bfc792 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -763,6 +763,134 @@ static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap) return false; } +#ifdef CONFIG_ARM +/** + * stage2_wp_pte_range - write protect PTE range + * @pmd: pointer to pmd entry + * @addr: range start address + * @end: range end address + */ +static void stage2_wp_pte_range(pmd_t *pmd, phys_addr_t addr, phys_addr_t end) +{ + pte_t *pte; + + pte = pte_offset_kernel(pmd, addr); + do { + if (!pte_none(*pte)) { + if (!kvm_s2pte_readonly(pte)) + kvm_set_s2pte_readonly(pte); + } + } while (pte++, addr += PAGE_SIZE, addr != end); +} + +/** + * stage2_wp_pmd_range - write protect PMD range + * @pud: pointer to pud entry + * @addr: range start address + * @end: range end address + */ +static void stage2_wp_pmd_range(pud_t *pud, phys_addr_t addr, phys_addr_t end) +{ + pmd_t *pmd; + phys_addr_t next; + + pmd = pmd_offset(pud, addr); + + do { + next = 
kvm_pmd_addr_end(addr, end); + if (!pmd_none(*pmd)) { + if (kvm_pmd_huge(*pmd)) { + if (!kvm_s2pmd_readonly(pmd)) + kvm_set_s2pmd_readonly(pmd); + } else + stage2_wp_pte_range(pmd, addr, next); + + } + } while (pmd
[PATCH v9 4/4] arm: ARMv7 dirty page logging 2nd stage page fault handling support
This patch adds support for handling 2nd stage page faults during migration, it disables faulting in huge pages, and dissolves huge pages to page tables. In case migration is canceled huge pages will be used again. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/kvm/mmu.c | 31 +-- 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index ca84331..a17812a 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -642,7 +642,8 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache } static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, - phys_addr_t addr, const pte_t *new_pte, bool iomap) + phys_addr_t addr, const pte_t *new_pte, bool iomap, + bool logging_active) { pmd_t *pmd; pte_t *pte, old_pte; @@ -657,6 +658,15 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, return 0; } + /* +* While dirty memory logging, clear PMD entry for huge page and split +* into smaller pages, to track dirty memory at page granularity. 
+*/ + if (logging_active kvm_pmd_huge(*pmd)) { + phys_addr_t ipa = pmd_pfn(*pmd) PAGE_SHIFT; + clear_pmd_entry(kvm, pmd, ipa); + } + /* Create stage-2 page mappings - Level 2 */ if (pmd_none(*pmd)) { if (!cache) @@ -709,7 +719,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, if (ret) goto out; spin_lock(kvm-mmu_lock); - ret = stage2_set_pte(kvm, cache, addr, pte, true); + ret = stage2_set_pte(kvm, cache, addr, pte, true, false); spin_unlock(kvm-mmu_lock); if (ret) goto out; @@ -926,6 +936,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_mmu_memory_cache *memcache = vcpu-arch.mmu_page_cache; struct vm_area_struct *vma; pfn_t pfn; + /* Get logging status, if dirty_bitmap is not NULL then logging is on */ + #ifdef CONFIG_ARM + bool logging_active = !!memslot-dirty_bitmap; + #else + bool logging_active = false; + #endif write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu)); if (fault_status == FSC_PERM !write_fault) { @@ -936,7 +952,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, /* Let's check if we will get back a huge page backed by hugetlbfs */ down_read(current-mm-mmap_sem); vma = find_vma_intersection(current-mm, hva, hva + 1); - if (is_vm_hugetlb_page(vma)) { + if (is_vm_hugetlb_page(vma) !logging_active) { hugetlb = true; gfn = (fault_ipa PMD_MASK) PAGE_SHIFT; } else { @@ -979,7 +995,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, spin_lock(kvm-mmu_lock); if (mmu_notifier_retry(kvm, mmu_seq)) goto out_unlock; - if (!hugetlb !force_pte) + if (!hugetlb !force_pte !logging_active) hugetlb = transparent_hugepage_adjust(pfn, fault_ipa); if (hugetlb) { @@ -998,9 +1014,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, kvm_set_pfn_dirty(pfn); } coherent_cache_guest_page(vcpu, hva, PAGE_SIZE); - ret = stage2_set_pte(kvm, memcache, fault_ipa, new_pte, false); + ret = stage2_set_pte(kvm, memcache, fault_ipa, new_pte, false, 
+ logging_active); } + if (write_fault) + mark_page_dirty(kvm, gfn); out_unlock: spin_unlock(kvm-mmu_lock); @@ -1151,7 +1170,7 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data) { pte_t *pte = (pte_t *)data; - stage2_set_pte(kvm, NULL, gpa, pte, false); + stage2_set_pte(kvm, NULL, gpa, pte, false, false); } -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v9 1/4] arm: add ARMv7 HYP API to flush VM TLBs, change generic TLB flush to support arch flush
On 07/24/2014 11:12 PM, Alexander Graf wrote: On 25.07.14 02:56, Mario Smarduch wrote: Patch adds HYP interface for global VM TLB invalidation without address parameter. Generic VM TLB flush calls ARMv7 arch defined TLB flush function. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_asm.h |1 + arch/arm/include/asm/kvm_host.h |1 + arch/arm/kvm/Kconfig|1 + arch/arm/kvm/interrupts.S | 12 arch/arm/kvm/mmu.c | 17 + virt/kvm/Kconfig|3 +++ virt/kvm/kvm_main.c |4 7 files changed, 39 insertions(+) diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index 53b3c4a..21bc519 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -78,6 +78,7 @@ extern char __kvm_hyp_code_end[]; extern void __kvm_flush_vm_context(void); extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa); +extern void __kvm_tlb_flush_vmid(struct kvm *kvm); extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); #endif diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 193ceaf..042206f 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -230,5 +230,6 @@ int kvm_perf_teardown(void); u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); +void kvm_arch_flush_remote_tlbs(struct kvm *); #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig index 466bd29..44d3b6f 100644 --- a/arch/arm/kvm/Kconfig +++ b/arch/arm/kvm/Kconfig @@ -22,6 +22,7 @@ config KVM select ANON_INODES select HAVE_KVM_CPU_RELAX_INTERCEPT select KVM_MMIO +select HAVE_KVM_ARCH_TLB_FLUSH_ALL select KVM_ARM_HOST depends on ARM_VIRT_EXT ARM_LPAE ---help--- diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S index 0d68d40..1258d46 100644 --- a/arch/arm/kvm/interrupts.S +++ b/arch/arm/kvm/interrupts.S @@ -66,6 +66,18 @@ ENTRY(__kvm_tlb_flush_vmid_ipa) bxlr 
ENDPROC(__kvm_tlb_flush_vmid_ipa) +/** + * void __kvm_tlb_flush_vmid(struct kvm *kvm) - Flush per-VMID TLBs + * + * Reuses __kvm_tlb_flush_vmid_ipa() for ARMv7, without passing address + * parameter + */ + +ENTRY(__kvm_tlb_flush_vmid) +b__kvm_tlb_flush_vmid_ipa +ENDPROC(__kvm_tlb_flush_vmid) + + / * Flush TLBs and instruction caches of all CPUs inside the inner-shareable * domain, for all VMIDs diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 2ac9588..35254c6 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -56,6 +56,23 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa); } +#ifdef CONFIG_ARM Why the ifdef? We're in ARM code here, no? For the time being to compile ARM64. +/** + * kvm_arch_flush_remote_tlbs() - flush all VM TLB entries + * @kvm: pointer to kvm structure. + * + * Interface to HYP function to flush all VM TLB entries without address + * parameter. In HYP mode reuses __kvm_tlb_flush_vmid_ipa() function used by + * kvm_tlb_flush_vmid_ipa(). + */ +void kvm_arch_flush_remote_tlbs(struct kvm *kvm) +{ +if (kvm) +kvm_call_hyp(__kvm_tlb_flush_vmid, kvm); I don't see why we should ever call this function with kvm==NULL. Yes that true, I copied a generic arm/arm64 mmu function. But it's use here guarantees kvm != NULL. Alex Thanks, Mario -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v9 2/4] arm: ARMv7 dirty page logging inital mem region write protect (w/no huge PUD support)
On 07/24/2014 11:16 PM, Alexander Graf wrote: On 25.07.14 02:56, Mario Smarduch wrote: Patch adds support for initial write protection VM memlsot. This patch series assumes that huge PUDs will not be used in 2nd stage tables. Is this a valid assumption? Right now it's unclear if PUDs will be used to back guest memory, assuming so required quite a bit of additional code. After discussing on mailing list it was recommended to treat this as BUG_ON case for now. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |1 + arch/arm/include/asm/kvm_mmu.h| 20 ++ arch/arm/include/asm/pgtable-3level.h |1 + arch/arm/kvm/arm.c|9 +++ arch/arm/kvm/mmu.c| 128 + 5 files changed, 159 insertions(+) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 042206f..6521a2d 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -231,5 +231,6 @@ int kvm_perf_teardown(void); u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); void kvm_arch_flush_remote_tlbs(struct kvm *); +void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 5cc0b0f..08ab5e8 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -114,6 +114,26 @@ static inline void kvm_set_s2pmd_writable(pmd_t *pmd) pmd_val(*pmd) |= L_PMD_S2_RDWR; } +static inline void kvm_set_s2pte_readonly(pte_t *pte) +{ +pte_val(*pte) = (pte_val(*pte) ~L_PTE_S2_RDWR) | L_PTE_S2_RDONLY; +} + +static inline bool kvm_s2pte_readonly(pte_t *pte) +{ +return (pte_val(*pte) L_PTE_S2_RDWR) == L_PTE_S2_RDONLY; +} + +static inline void kvm_set_s2pmd_readonly(pmd_t *pmd) +{ +pmd_val(*pmd) = (pmd_val(*pmd) ~L_PMD_S2_RDWR) | L_PMD_S2_RDONLY; +} + +static inline bool kvm_s2pmd_readonly(pmd_t *pmd) +{ +return (pmd_val(*pmd) L_PMD_S2_RDWR) == L_PMD_S2_RDONLY; 
+} + /* Open coded p*d_addr_end that can deal with 64bit addresses */ #define kvm_pgd_addr_end(addr, end)\ ({u64 __boundary = ((addr) + PGDIR_SIZE) PGDIR_MASK;\ diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index 85c60ad..d8bb40b 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -129,6 +129,7 @@ #define L_PTE_S2_RDONLY(_AT(pteval_t, 1) 6) /* HAP[1] */ #define L_PTE_S2_RDWR(_AT(pteval_t, 3) 6) /* HAP[2:1] */ +#define L_PMD_S2_RDONLY(_AT(pteval_t, 1) 6) /* HAP[1] */ #define L_PMD_S2_RDWR(_AT(pmdval_t, 3) 6) /* HAP[2:1] */ /* diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 3c82b37..e11c2dd 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -242,6 +242,15 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old, enum kvm_mr_change change) { +#ifdef CONFIG_ARM Same question on CONFIG_ARM here. Is this the define used to distinguish between 32bit and 64bit? Yes let ARM64 compile. Eventually we'll come back to ARM64 soon, and these will go. Alex -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v9 1/4] arm: add ARMv7 HYP API to flush VM TLBs ... - looking for comments
- place guest/host into page reclaim/swap mode - by whatever means in this case run multiple copies of 'dirtyram.ram' on host - issue migrate command(s) on source - Top result is 409600, 8192, 5 o QEMU is instrumented to save RAM memory regions on source and destination after memory is migrated, but before guest started. Later files are checksummed on both ends for correctness, given VMs are small this works. o Guest kernel is instrumented to capture current cycle counter - last cycle and compare to qemu down time to test arch timer accuracy. o Network failover is at L3 due to interface limitations, ping continues working transparently o Also tested 'migrate_cancel' to test reassemble of huge pages (inserted low level instrumentation code). - Basic Network Test - Assuming one ethernet interface available Source host IP 192.168.10.101/24, VM tap0 192.168.2.1/24 and VM eth0 192.168.2.100/24 with default route 192.168.2.1 Destination host IP 192.168.10.100/24, VM same settings as above. Both VMs have identical MAC addresses. Initially NFS server route to 192.168.2.100 is via 192.168.10.101 - ssh 192.168.2.100 - start migration from source to destination - after migration ends - on NFS server switch routes. route add -host 192.168.2.100 gw 192.168.10.100 ssh should resume after route switch. ping as well should work seamlessly. Mario Smarduch (4): add ARMv7 HYP API to flush VM TLBs, change generic TLB flush to support arch flush ARMv7 dirty page logging inital mem region write protect (w/no huge PUD support) dirty log write protect mgmt. 
Moved x86, armv7 to generic, set armv8 ia64 mips powerpc s390 arch specific ARMv7 dirty page logging 2nd stage page fault handling support arch/arm/include/asm/kvm_asm.h|1 + arch/arm/include/asm/kvm_host.h |2 + arch/arm/include/asm/kvm_mmu.h| 20 arch/arm/include/asm/pgtable-3level.h |1 + arch/arm/kvm/Kconfig |1 + arch/arm/kvm/arm.c| 17 ++- arch/arm/kvm/interrupts.S | 12 ++ arch/arm/kvm/mmu.c| 198 - arch/arm64/include/asm/kvm_host.h |2 + arch/arm64/kvm/Kconfig|1 + arch/ia64/include/asm/kvm_host.h |1 + arch/ia64/kvm/Kconfig |1 + arch/ia64/kvm/kvm-ia64.c |2 +- arch/mips/include/asm/kvm_host.h |2 +- arch/mips/kvm/Kconfig |1 + arch/mips/kvm/kvm_mips.c |2 +- arch/powerpc/include/asm/kvm_host.h |2 + arch/powerpc/kvm/Kconfig |1 + arch/powerpc/kvm/book3s.c |2 +- arch/powerpc/kvm/booke.c |2 +- arch/s390/include/asm/kvm_host.h |2 + arch/s390/kvm/Kconfig |1 + arch/s390/kvm/kvm-s390.c |2 +- arch/x86/kvm/x86.c| 86 -- include/linux/kvm_host.h |3 + virt/kvm/Kconfig |6 + virt/kvm/kvm_main.c | 94 27 files changed, 366 insertions(+), 99 deletions(-) -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v9 1/4] arm: add ARMv7 HYP API to flush VM TLBs, change generic TLB flush to support arch flush
On 08/11/2014 12:12 PM, Christoffer Dall wrote: On Thu, Jul 24, 2014 at 05:56:05PM -0700, Mario Smarduch wrote: Patch adds HYP interface for global VM TLB invalidation without address parameter. Generic VM TLB flush calls ARMv7 arch defined TLB flush function. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_asm.h |1 + arch/arm/include/asm/kvm_host.h |1 + arch/arm/kvm/Kconfig|1 + arch/arm/kvm/interrupts.S | 12 arch/arm/kvm/mmu.c | 17 + virt/kvm/Kconfig|3 +++ virt/kvm/kvm_main.c |4 7 files changed, 39 insertions(+) diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index 53b3c4a..21bc519 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -78,6 +78,7 @@ extern char __kvm_hyp_code_end[]; extern void __kvm_flush_vm_context(void); extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa); +extern void __kvm_tlb_flush_vmid(struct kvm *kvm); extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); #endif diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 193ceaf..042206f 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -230,5 +230,6 @@ int kvm_perf_teardown(void); u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); +void kvm_arch_flush_remote_tlbs(struct kvm *); #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig index 466bd29..44d3b6f 100644 --- a/arch/arm/kvm/Kconfig +++ b/arch/arm/kvm/Kconfig @@ -22,6 +22,7 @@ config KVM select ANON_INODES select HAVE_KVM_CPU_RELAX_INTERCEPT select KVM_MMIO +select HAVE_KVM_ARCH_TLB_FLUSH_ALL select KVM_ARM_HOST depends on ARM_VIRT_EXT ARM_LPAE ---help--- diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S index 0d68d40..1258d46 100644 --- a/arch/arm/kvm/interrupts.S +++ b/arch/arm/kvm/interrupts.S @@ -66,6 +66,18 @@ 
ENTRY(__kvm_tlb_flush_vmid_ipa) bx lr ENDPROC(__kvm_tlb_flush_vmid_ipa) +/** + * void __kvm_tlb_flush_vmid(struct kvm *kvm) - Flush per-VMID TLBs + * + * Reuses __kvm_tlb_flush_vmid_ipa() for ARMv7, without passing address + * parameter + */ + +ENTRY(__kvm_tlb_flush_vmid) +b __kvm_tlb_flush_vmid_ipa +ENDPROC(__kvm_tlb_flush_vmid) + + / * Flush TLBs and instruction caches of all CPUs inside the inner-shareable * domain, for all VMIDs diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 2ac9588..35254c6 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -56,6 +56,23 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa); } +#ifdef CONFIG_ARM I assume this is here because of arm vs. arm64, use static inlines in the header files to differentiate instead. Yes that's right, will move it. +/** + * kvm_arch_flush_remote_tlbs() - flush all VM TLB entries + * @kvm: pointer to kvm structure. + * + * Interface to HYP function to flush all VM TLB entries without address + * parameter. In HYP mode reuses __kvm_tlb_flush_vmid_ipa() function used by + * kvm_tlb_flush_vmid_ipa(). remove the last sentence from here, it's repetitive. Ok. 
+ */ +void kvm_arch_flush_remote_tlbs(struct kvm *kvm) +{ +if (kvm) +kvm_call_hyp(__kvm_tlb_flush_vmid, kvm); +} + +#endif + static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, int min, int max) { diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 13f2d19..f1efaa5 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -34,3 +34,6 @@ config HAVE_KVM_CPU_RELAX_INTERCEPT config KVM_VFIO bool + +config HAVE_KVM_ARCH_TLB_FLUSH_ALL + bool diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index fa70c6e..258f3d9 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -186,12 +186,16 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) void kvm_flush_remote_tlbs(struct kvm *kvm) { +#ifdef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL +kvm_arch_flush_remote_tlbs(kvm); +#else long dirty_count = kvm-tlbs_dirty; smp_mb(); if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) ++kvm-stat.remote_tlb_flush; cmpxchg(kvm-tlbs_dirty, dirty_count, 0); +#endif I would split this into two patches, one trivial one for the KVM generic solution, and one to add the arm-specific part. That will make your commit text and title much nicer to read too. Yes makes sense easier to review generic and arch layers. Thanks
Re: [PATCH v9 2/4] arm: ARMv7 dirty page logging initial mem region write protect (w/no huge PUD support)
On 08/11/2014 12:12 PM, Christoffer Dall wrote: Remove the parenthesis from the subject line. Hmmm have to check this don't see it my patch file. On Thu, Jul 24, 2014 at 05:56:06PM -0700, Mario Smarduch wrote: Patch adds support for initial write protection VM memlsot. This patch series ^^^ stray whitespace of Need to watch out for these adds delays to review cycle. assumes that huge PUDs will not be used in 2nd stage tables. may be worth mentioning that this is always valid on ARMv7. Yep definitely. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |1 + arch/arm/include/asm/kvm_mmu.h| 20 ++ arch/arm/include/asm/pgtable-3level.h |1 + arch/arm/kvm/arm.c|9 +++ arch/arm/kvm/mmu.c| 128 + 5 files changed, 159 insertions(+) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 042206f..6521a2d 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -231,5 +231,6 @@ int kvm_perf_teardown(void); u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); void kvm_arch_flush_remote_tlbs(struct kvm *); +void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 5cc0b0f..08ab5e8 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -114,6 +114,26 @@ static inline void kvm_set_s2pmd_writable(pmd_t *pmd) pmd_val(*pmd) |= L_PMD_S2_RDWR; } +static inline void kvm_set_s2pte_readonly(pte_t *pte) +{ +pte_val(*pte) = (pte_val(*pte) ~L_PTE_S2_RDWR) | L_PTE_S2_RDONLY; +} + +static inline bool kvm_s2pte_readonly(pte_t *pte) +{ +return (pte_val(*pte) L_PTE_S2_RDWR) == L_PTE_S2_RDONLY; +} + +static inline void kvm_set_s2pmd_readonly(pmd_t *pmd) +{ +pmd_val(*pmd) = (pmd_val(*pmd) ~L_PMD_S2_RDWR) | L_PMD_S2_RDONLY; +} + +static inline bool kvm_s2pmd_readonly(pmd_t *pmd) +{ +return 
(pmd_val(*pmd) L_PMD_S2_RDWR) == L_PMD_S2_RDONLY; +} + /* Open coded p*d_addr_end that can deal with 64bit addresses */ #define kvm_pgd_addr_end(addr, end) \ ({ u64 __boundary = ((addr) + PGDIR_SIZE) PGDIR_MASK;\ diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index 85c60ad..d8bb40b 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -129,6 +129,7 @@ #define L_PTE_S2_RDONLY (_AT(pteval_t, 1) 6) /* HAP[1] */ #define L_PTE_S2_RDWR (_AT(pteval_t, 3) 6) /* HAP[2:1] */ +#define L_PMD_S2_RDONLY (_AT(pteval_t, 1) 6) /* HAP[1] */ #define L_PMD_S2_RDWR (_AT(pmdval_t, 3) 6) /* HAP[2:1] */ /* diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 3c82b37..e11c2dd 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -242,6 +242,15 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old, enum kvm_mr_change change) { +#ifdef CONFIG_ARM +/* + * At this point memslot has been committed and there is an + * allocated dirty_bitmap[], dirty pages will be be tracked while the + * memory slot is write protected. 
+ */ +if ((change != KVM_MR_DELETE) (mem-flags KVM_MEM_LOG_DIRTY_PAGES)) +kvm_mmu_wp_memory_region(kvm, mem-slot); +#endif } void kvm_arch_flush_shadow_all(struct kvm *kvm) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 35254c6..7bfc792 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -763,6 +763,134 @@ static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap) return false; } +#ifdef CONFIG_ARM +/** + * stage2_wp_pte_range - write protect PTE range + * @pmd:pointer to pmd entry + * @addr: range start address + * @end:range end address + */ +static void stage2_wp_pte_range(pmd_t *pmd, phys_addr_t addr, phys_addr_t end) +{ +pte_t *pte; + +pte = pte_offset_kernel(pmd, addr); +do { +if (!pte_none(*pte)) { +if (!kvm_s2pte_readonly(pte)) +kvm_set_s2pte_readonly(pte); +} +} while (pte++, addr += PAGE_SIZE, addr != end); +} + +/** + * stage2_wp_pmd_range - write protect PMD range + * @pud:pointer to pud entry + * @addr: range start address + * @end:range end address + */ +static void stage2_wp_pmd_range(pud_t *pud, phys_addr_t addr, phys_addr_t end
Re: [PATCH v9 3/4] arm: dirty log write protect mgmt. Moved x86, armv7 to generic, set armv8 ia64 mips powerpc s390 arch specific
On 08/11/2014 12:13 PM, Christoffer Dall wrote: On Thu, Jul 24, 2014 at 05:56:07PM -0700, Mario Smarduch wrote: This patch adds support for keeping track of VM dirty pages. As dirty page log is retrieved, the pages that have been written are write protected again for next write and log read. The dirty log read function is generic for armv7 and x86, and arch specific for arm64, ia64, mips, powerpc, s390. So I would also split up this patch. One that only modifies the existing functionality, but does not introduce any new functionality for ARM. Put this first patch in the beginning of the patch series with the other prepatory patch, so that you get something like this: [PATCH 1/X] KVM: Add architecture-specific TLB flush implementations [PATCH 2/X] KVM: Add generic implementation of kvm_vm_ioctl_get_dirty_log [PATCH 3/X] arm: KVM: Add ARMv7 API to flush TLBs [PATCH 4/X] arm: KVM: Add initial dirty page locking infrastructure ... Yes definitely, thanks for the advice makes the patch series easier to review. That will make it easier to get the patches accepted and for us to review... 
Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/kvm/arm.c |8 +++- arch/arm/kvm/mmu.c | 22 + arch/arm64/include/asm/kvm_host.h |2 + arch/arm64/kvm/Kconfig |1 + arch/ia64/include/asm/kvm_host.h|1 + arch/ia64/kvm/Kconfig |1 + arch/ia64/kvm/kvm-ia64.c|2 +- arch/mips/include/asm/kvm_host.h|2 +- arch/mips/kvm/Kconfig |1 + arch/mips/kvm/kvm_mips.c|2 +- arch/powerpc/include/asm/kvm_host.h |2 + arch/powerpc/kvm/Kconfig|1 + arch/powerpc/kvm/book3s.c |2 +- arch/powerpc/kvm/booke.c|2 +- arch/s390/include/asm/kvm_host.h|2 + arch/s390/kvm/Kconfig |1 + arch/s390/kvm/kvm-s390.c|2 +- arch/x86/kvm/x86.c | 86 - include/linux/kvm_host.h|3 ++ virt/kvm/Kconfig|3 ++ virt/kvm/kvm_main.c | 90 +++ 21 files changed, 143 insertions(+), 93 deletions(-) diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index e11c2dd..f7739a0 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -783,10 +783,16 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } } -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) +#ifdef CONFIG_ARM64 +/* + * For now features not supported on ARM64, the #ifdef is added to make that + * clear but not needed since ARM64 Kconfig selects function in generic code. + */ I don't think this comment is needed, but if you really want it, it should be something like: /* * ARM64 does not support dirty logging and therefore selects * CONFIG_HAVE_KVM_ARCH_DIRTY_LOG. Provide a -EINVAL stub. */ I think it could go since I'm doing arm64 now. 
+int kvm_arch_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { return -EINVAL; } +#endif static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev_addr) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 7bfc792..ca84331 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -889,6 +889,28 @@ void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) kvm_flush_remote_tlbs(kvm); spin_unlock(kvm-mmu_lock); } + +/** + * kvm_mmu_write_protected_pt_masked() - write protect dirty pages set in mask + * @kvm:The KVM pointer + * @slot: The memory slot associated with mask + * @gfn_offset: The gfn offset in memory slot + * @mask: The mask of dirty pages at offset 'gfn_offset' in this memory + * slot to be write protected + * + * Walks bits set in mask write protects the associated pte's. Caller must + * acquire kvm_mmu_lock. + */ +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, +struct kvm_memory_slot *slot, +gfn_t gfn_offset, unsigned long mask) +{ +phys_addr_t base_gfn = slot-base_gfn + gfn_offset; +phys_addr_t start = (base_gfn + __ffs(mask)) PAGE_SHIFT; +phys_addr_t end = (base_gfn + __fls(mask) + 1) PAGE_SHIFT; __fls(x) + 1 is the same as fls(x) For me the __fls(x) + 1 is easier to see the covered range. Unless it really breaks the convention I'd prefer to keep the '+1'. Either way no problem. + +stage2_wp_range(kvm, start, end); +} #endif static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 92242ce..b4a280b 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -200,4 +200,6 @@ static
Re: [PATCH v9 4/4] arm: ARMv7 dirty page logging 2nd stage page fault handling support
On 08/11/2014 12:13 PM, Christoffer Dall wrote: On Thu, Jul 24, 2014 at 05:56:08PM -0700, Mario Smarduch wrote: This patch adds support for handling 2nd stage page faults during migration, it disables faulting in huge pages, and dissolves huge pages to page tables. In case migration is canceled huge pages will be used again. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/kvm/mmu.c | 31 +-- 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index ca84331..a17812a 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -642,7 +642,8 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache } static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, - phys_addr_t addr, const pte_t *new_pte, bool iomap) + phys_addr_t addr, const pte_t *new_pte, bool iomap, + bool logging_active) { pmd_t *pmd; pte_t *pte, old_pte; @@ -657,6 +658,15 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, return 0; } +/* + * While dirty memory logging, clear PMD entry for huge page and split + * into smaller pages, to track dirty memory at page granularity. + */ +if (logging_active kvm_pmd_huge(*pmd)) { +phys_addr_t ipa = pmd_pfn(*pmd) PAGE_SHIFT; +clear_pmd_entry(kvm, pmd, ipa); clear_pmd_entry has a VM_BUG_ON(kvm_pmd_huge(*pmd)) so that is definitely not the right thing to call. I don't see that in 3.15rc1/rc4 - static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr) { if (kvm_pmd_huge(*pmd)) { pmd_clear(pmd); kvm_tlb_flush_vmid_ipa(kvm, addr); } else { [] } I thought the purpose of this function was to clear PMD entry. Also ran hundreds of tests no problems. Hmmm confused. 
+} + /* Create stage-2 page mappings - Level 2 */ if (pmd_none(*pmd)) { if (!cache) @@ -709,7 +719,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, if (ret) goto out; spin_lock(kvm-mmu_lock); -ret = stage2_set_pte(kvm, cache, addr, pte, true); +ret = stage2_set_pte(kvm, cache, addr, pte, true, false); spin_unlock(kvm-mmu_lock); if (ret) goto out; @@ -926,6 +936,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_mmu_memory_cache *memcache = vcpu-arch.mmu_page_cache; struct vm_area_struct *vma; pfn_t pfn; +/* Get logging status, if dirty_bitmap is not NULL then logging is on */ +#ifdef CONFIG_ARM +bool logging_active = !!memslot-dirty_bitmap; +#else +bool logging_active = false; +#endif can you make this an inline in the header files for now please? Yes definitely. write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu)); if (fault_status == FSC_PERM !write_fault) { @@ -936,7 +952,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, /* Let's check if we will get back a huge page backed by hugetlbfs */ down_read(current-mm-mmap_sem); vma = find_vma_intersection(current-mm, hva, hva + 1); -if (is_vm_hugetlb_page(vma)) { +if (is_vm_hugetlb_page(vma) !logging_active) { hugetlb = true; gfn = (fault_ipa PMD_MASK) PAGE_SHIFT; } else { @@ -979,7 +995,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, spin_lock(kvm-mmu_lock); if (mmu_notifier_retry(kvm, mmu_seq)) goto out_unlock; -if (!hugetlb !force_pte) +if (!hugetlb !force_pte !logging_active) hugetlb = transparent_hugepage_adjust(pfn, fault_ipa); if (hugetlb) { @@ -998,9 +1014,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, kvm_set_pfn_dirty(pfn); } coherent_cache_guest_page(vcpu, hva, PAGE_SIZE); -ret = stage2_set_pte(kvm, memcache, fault_ipa, new_pte, false); +ret = stage2_set_pte(kvm, memcache, fault_ipa, new_pte, false, +logging_active); } +if (write_fault) +mark_page_dirty(kvm, 
gfn); out_unlock: spin_unlock(kvm-mmu_lock); @@ -1151,7 +1170,7 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data) { pte_t *pte = (pte_t *)data; -stage2_set_pte(kvm, NULL, gpa, pte, false); +stage2_set_pte(kvm, NULL, gpa, pte, false, false); why is logging never active if we are called from MMU notifiers? mmu notifiers update sptes, but I don't see how these updates can
Re: [PATCH v9 2/4] arm: ARMv7 dirty page logging initial mem region write protect (w/no huge PUD support)
On 08/11/2014 12:12 PM, Christoffer Dall wrote: Remove the parenthesis from the subject line. On Thu, Jul 24, 2014 at 05:56:06PM -0700, Mario Smarduch wrote: Patch adds support for initial write protection VM memlsot. This patch series ^^^ stray whitespace of assumes that huge PUDs will not be used in 2nd stage tables. may be worth mentioning that this is always valid on ARMv7. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |1 + arch/arm/include/asm/kvm_mmu.h| 20 ++ arch/arm/include/asm/pgtable-3level.h |1 + arch/arm/kvm/arm.c|9 +++ arch/arm/kvm/mmu.c| 128 + 5 files changed, 159 insertions(+) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 042206f..6521a2d 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -231,5 +231,6 @@ int kvm_perf_teardown(void); u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); void kvm_arch_flush_remote_tlbs(struct kvm *); +void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 5cc0b0f..08ab5e8 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -114,6 +114,26 @@ static inline void kvm_set_s2pmd_writable(pmd_t *pmd) pmd_val(*pmd) |= L_PMD_S2_RDWR; } +static inline void kvm_set_s2pte_readonly(pte_t *pte) +{ +pte_val(*pte) = (pte_val(*pte) ~L_PTE_S2_RDWR) | L_PTE_S2_RDONLY; +} + +static inline bool kvm_s2pte_readonly(pte_t *pte) +{ +return (pte_val(*pte) L_PTE_S2_RDWR) == L_PTE_S2_RDONLY; +} + +static inline void kvm_set_s2pmd_readonly(pmd_t *pmd) +{ +pmd_val(*pmd) = (pmd_val(*pmd) ~L_PMD_S2_RDWR) | L_PMD_S2_RDONLY; +} + +static inline bool kvm_s2pmd_readonly(pmd_t *pmd) +{ +return (pmd_val(*pmd) L_PMD_S2_RDWR) == L_PMD_S2_RDONLY; +} + /* Open coded p*d_addr_end that can deal with 64bit addresses */ #define 
kvm_pgd_addr_end(addr, end) \ ({ u64 __boundary = ((addr) + PGDIR_SIZE) PGDIR_MASK;\ diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index 85c60ad..d8bb40b 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -129,6 +129,7 @@ #define L_PTE_S2_RDONLY (_AT(pteval_t, 1) 6) /* HAP[1] */ #define L_PTE_S2_RDWR (_AT(pteval_t, 3) 6) /* HAP[2:1] */ +#define L_PMD_S2_RDONLY (_AT(pteval_t, 1) 6) /* HAP[1] */ #define L_PMD_S2_RDWR (_AT(pmdval_t, 3) 6) /* HAP[2:1] */ /* diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 3c82b37..e11c2dd 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -242,6 +242,15 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old, enum kvm_mr_change change) { +#ifdef CONFIG_ARM +/* + * At this point memslot has been committed and there is an + * allocated dirty_bitmap[], dirty pages will be be tracked while the + * memory slot is write protected. 
+ */ +if ((change != KVM_MR_DELETE) (mem-flags KVM_MEM_LOG_DIRTY_PAGES)) +kvm_mmu_wp_memory_region(kvm, mem-slot); +#endif } void kvm_arch_flush_shadow_all(struct kvm *kvm) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 35254c6..7bfc792 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -763,6 +763,134 @@ static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap) return false; } +#ifdef CONFIG_ARM +/** + * stage2_wp_pte_range - write protect PTE range + * @pmd:pointer to pmd entry + * @addr: range start address + * @end:range end address + */ +static void stage2_wp_pte_range(pmd_t *pmd, phys_addr_t addr, phys_addr_t end) +{ +pte_t *pte; + +pte = pte_offset_kernel(pmd, addr); +do { +if (!pte_none(*pte)) { +if (!kvm_s2pte_readonly(pte)) +kvm_set_s2pte_readonly(pte); +} +} while (pte++, addr += PAGE_SIZE, addr != end); +} + +/** + * stage2_wp_pmd_range - write protect PMD range + * @pud:pointer to pud entry + * @addr: range start address + * @end:range end address + */ +static void stage2_wp_pmd_range(pud_t *pud, phys_addr_t addr, phys_addr_t end) +{ +pmd_t *pmd; +phys_addr_t next; + +pmd = pmd_offset(pud, addr); + +do { +next