[PATCH 1/2] add irq priodrop support
This is the same Interrupt Priority Drop/Deactivation patch emailed some time back (except for 3.10-rc4) used by the initial device pass-through support. When enabled all IRQs on host write to distributor EOIR and DIR reg to dr-prioritize/de-activate an interrupt. For device that's passed through only the EOIR is written to drop the priority, the Guest deactivates it when it handles its EOI. This supports exitless EOI that's agnostic to bus type (i.e. PCI) The patch has been tested for all configurations: Host: No Prio Drop Guest: No Prio Drop Host: Prio DROP Guest: No Prio Drop Host: Prio Drop Guest: Prio Drop - Mario Signed-off-by: Mario Smarduch mario.smard...@huawei.com --- arch/arm/kvm/Kconfig|8 +++ drivers/irqchip/irq-gic.c | 145 ++- include/linux/irqchip/arm-gic.h |6 ++ 3 files changed, 156 insertions(+), 3 deletions(-) diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig index 370e1a8..c0c9f3c 100644 --- a/arch/arm/kvm/Kconfig +++ b/arch/arm/kvm/Kconfig @@ -59,6 +59,14 @@ config KVM_ARM_VGIC ---help--- Adds support for a hardware assisted, in-kernel GIC emulation. +config KVM_ARM_INT_PRIO_DROP +bool KVM support for Interrupt pass-through +depends on KVM_ARM_VGIC OF +default n +---help--- + Seperates interrupt priority drop and deactivation to enable device + pass-through to Guests. 
+ config KVM_ARM_TIMER bool KVM support for Architected Timers depends on KVM_ARM_VGIC ARM_ARCH_TIMER diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c index 1760ceb..9fb4ef3 100644 --- a/drivers/irqchip/irq-gic.c +++ b/drivers/irqchip/irq-gic.c @@ -41,10 +41,13 @@ #include linux/slab.h #include linux/irqchip/chained_irq.h #include linux/irqchip/arm-gic.h +#include linux/irqflags.h +#include linux/bitops.h #include asm/irq.h #include asm/exception.h #include asm/smp_plat.h +#include asm/virt.h #include irqchip.h @@ -99,6 +102,20 @@ struct irq_chip gic_arch_extn = { static struct gic_chip_data gic_data[MAX_GIC_NR] __read_mostly; +#ifdef CONFIG_KVM_ARM_INT_PRIO_DROP +/* + * Priority drop/deactivation bit map, 1st 16 bits used for SGIs, this bit map + * is shared by several guests. If bit is set only execute EOI which drops + * current priority but not deactivation. + */ +static u32 gic_irq_prio_drop[DIV_ROUND_UP(1020, 32)] __read_mostly; +static void gic_eoi_irq_priodrop(struct irq_data *); +#endif + +static void gic_enable_gicc(void __iomem *); +static void gic_eoi_sgi(u32, void __iomem *); +static void gic_priodrop_remap_eoi(struct irq_chip *); + #ifdef CONFIG_GIC_NON_BANKED static void __iomem *gic_get_percpu_base(union gic_base *base) { @@ -296,7 +313,7 @@ static asmlinkage void __exception_irq_entry gic_handle_irq(struct pt_regs *regs continue; } if (irqnr 16) { - writel_relaxed(irqstat, cpu_base + GIC_CPU_EOI); + gic_eoi_sgi(irqstat, cpu_base); #ifdef CONFIG_SMP handle_IPI(irqnr, regs); #endif @@ -450,7 +467,7 @@ static void __cpuinit gic_cpu_init(struct gic_chip_data *gic) writel_relaxed(0xa0a0a0a0, dist_base + GIC_DIST_PRI + i * 4 / 4); writel_relaxed(0xf0, base + GIC_CPU_PRIMASK); - writel_relaxed(1, base + GIC_CPU_CTRL); + gic_enable_gicc(base); } #ifdef CONFIG_CPU_PM @@ -585,7 +602,7 @@ static void gic_cpu_restore(unsigned int gic_nr) writel_relaxed(0xa0a0a0a0, dist_base + GIC_DIST_PRI + i * 4); writel_relaxed(0xf0, cpu_base + 
GIC_CPU_PRIMASK); - writel_relaxed(1, cpu_base + GIC_CPU_CTRL); + gic_enable_gicc(cpu_base); } static int gic_notifier(struct notifier_block *self, unsigned long cmd, void *v) @@ -666,6 +683,7 @@ void gic_raise_softirq(const struct cpumask *mask, unsigned int irq) static int gic_irq_domain_map(struct irq_domain *d, unsigned int irq, irq_hw_number_t hw) { + gic_priodrop_remap_eoi(gic_chip); if (hw 32) { irq_set_percpu_devid(irq); irq_set_chip_and_handler(irq, gic_chip, @@ -857,4 +875,125 @@ IRQCHIP_DECLARE(cortex_a9_gic, arm,cortex-a9-gic, gic_of_init); IRQCHIP_DECLARE(msm_8660_qgic, qcom,msm-8660-qgic, gic_of_init); IRQCHIP_DECLARE(msm_qgic2, qcom,msm-qgic2, gic_of_init); +#ifdef CONFIG_KVM_ARM_INT_PRIO_DROP +/* If HYP mode enabled and PRIO DROP set EOIR function to handle PRIO DROP */ +static inline void gic_priodrop_remap_eoi(struct irq_chip *chip) +{ + if (is_hyp_mode_available()) + chip-irq_eoi = gic_eoi_irq_priodrop; +} + +/* If HYP mode set enable interrupt priority drop/deactivation, and mark + * SGIs to deactive through writes to GCICC_DIR. For Guest only enable normal + * mode. + */ +static void gic_enable_gicc(void __iomem *gicc_base
[PATCH 2/2] add initial kvm dev passthrough support
This is the initial device pass through support. At this time host == guest only is supported. Basic Operation: - QEMU parameters: -device kvm-device-assign,host=device name for example - kvm-device-assign,host='arm-sp804'. Essentially any device that does PIO should be supported. - Host DTS contains the node for device to be passed through The host driver is unbound or not compiled in. - For Guest the intent is to add a DTS node that QEMU can parse and find the guest attributes (Mem. resource, IRQs) For now these values default to host. This is a future work item to get this working on board other then vexpress. - The physical interrupt is always passed through to CPU where the target vCPU executes or will execute. Current approach - pins vCPUs to physical CPUs, when Guest updates CPU affinity is updated in KVM vgic dist code. Future work item for IRQ affinity allow vCPU to float and on schedule in handle IRQ affinity. For high IRQ rates (i.e. wireless NEs) static binding may be used. For some other device (env. mgmt IPMI)where latency is not important dynamic may be used, it should be upto the user. - To support flexible affinity a mask is introduced (QEMU param0 (although not used here yet) o vCPU affinity - vCPU -- CPU binding, the IRQ physical CPU binding follows vCPU binding dynamically. - Obviously DMA is not supported - early DMA may be supported through a 1:1 mapping but it's unsafe and so far we don't know of any hardware that's not behind SMMU. This option may be useful in some embedded/wireless environments, where the guest may want to swap, secure isolation may not be an issue or device like look aside crypto engine is not behind IOMMU. - IOMMU/VFIO support is key and next item for us to work on. Especially for ETSI NFV VFIO is key since 4G/IMS NE pull packets of wire and switch them directly in user space. 
The patch has been tested on fast models in couple ways: - UP Guest with sp804 timer only - works consistently - SMP Guest with sp804 timer works consistently. Writes to '/proc/irq/sp804 irq/smp_affinity' confirm dynamic CPU affinity. - IRQ rates (maybe not that important give its emulated env) reached excess of 500. There is a QEMU piece very simple for now that I will email later, in case someone would like to test. - Mario Signed-off-by: Mario Smarduch mario.smard...@huawei.com --- arch/arm/include/asm/kvm_host.h | 14 +++ arch/arm/include/asm/kvm_vgic.h | 10 +++ arch/arm/kvm/Makefile |1 + arch/arm/kvm/arm.c | 60 + arch/arm/kvm/assign-dev.c | 189 +++ arch/arm/kvm/vgic.c | 106 ++ include/linux/irqchip/arm-gic.h |1 + include/uapi/linux/kvm.h| 33 +++ 8 files changed, 414 insertions(+) create mode 100644 arch/arm/kvm/assign-dev.c diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 57cb786..c6ad3a3 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -67,6 +67,10 @@ struct kvm_arch { /* Interrupt controller */ struct vgic_distvgic; + + /* Device Passthrough Fields */ + struct list_headassigned_dev_head; + struct mutexdev_pasthru_lock; }; #define KVM_NR_MEM_OBJS 40 @@ -146,6 +150,13 @@ struct kvm_vcpu_stat { u32 halt_wakeup; }; +struct kvm_arm_assigned_dev_kernel { + struct list_head list; + struct kvm_arm_assigned_device dev; + irqreturn_t (*irq_handler)(int, void *); + void *irq_arg; +}; + struct kvm_vcpu_init; int kvm_vcpu_set_target(struct kvm_vcpu *vcpu, const struct kvm_vcpu_init *init); @@ -156,6 +167,9 @@ int kvm_arm_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg); int kvm_arm_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg); u64 kvm_call_hyp(void *hypfn, ...); void force_vm_exit(const cpumask_t *mask); +int kvm_arm_get_device_resources(struct kvm *, + struct kvm_arm_get_device_resources *); +int kvm_arm_assign_device(struct kvm *, struct kvm_arm_assigned_device *); 
#define KVM_ARCH_WANT_MMU_NOTIFIER struct kvm; diff --git a/arch/arm/include/asm/kvm_vgic.h b/arch/arm/include/asm/kvm_vgic.h index 343744e..c4370ae 100644 --- a/arch/arm/include/asm/kvm_vgic.h +++ b/arch/arm/include/asm/kvm_vgic.h @@ -107,6 +107,16 @@ struct vgic_dist { /* Bitmap indicating which CPU has something pending */ unsigned long irq_pending_on_cpu; + + /* Device passthrough fields */ + /* Host irq to guest irq mapping */ + u8 guest_irq[VGIC_NR_SHARED_IRQS]; + + /* Pending passthruogh irq */ + struct vgic_bitmap pasthru_spi_pending; + + /* At least one passthrough IRQ pending for some vCPU */ + u32 pasthru_pending; #endif }; diff
Dev Passthrough QEMU patch
This patch is for testing only and goes along with other two patches for priodrop and dev passthrough, it should apply against 1.4.5. diff --git a/cpus.c b/cpus.c index c15ff6c..0c19214 100644 --- a/cpus.c +++ b/cpus.c @@ -737,6 +737,26 @@ static void *qemu_kvm_cpu_thread_fn(void *arg) CPUState *cpu = ENV_GET_CPU(env); int r; +/* For now just do a 1:1 vCPU binding as they come online for device + * pass through + */ +cpu_set_t cpuset; +int ret, i; +unsigned long cpu_index = kvm_arch_vcpu_id(cpu); + +CPU_ZERO(cpuset); +CPU_SET(cpu_index, cpuset); +ret = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), cpuset); +if(ret != 0) { + printf(pthread_setaffinity_np failed to setaffinity to CPU 0\n); +exit(-1); +} + +CPU_ZERO(cpuset); +pthread_getaffinity_np(pthread_self(), sizeof(cpu_set_t), cpuset); +if(CPU_ISSET(cpu_index,cpuset)) +printf(Binding: vCPU %ld -- CPU %d\n, cpu_index, i); + qemu_mutex_lock(qemu_global_mutex); qemu_thread_get_self(cpu-thread); cpu-thread_id = qemu_get_thread_id(); diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h index caca979..46c2c59 100644 --- a/linux-headers/linux/kvm.h +++ b/linux-headers/linux/kvm.h @@ -904,6 +904,8 @@ struct kvm_s390_ucas_mapping { #define KVM_PPC_GET_HTAB_FD _IOW(KVMIO, 0xaa, struct kvm_get_htab_fd) /* Available with KVM_CAP_ARM_SET_DEVICE_ADDR */ #define KVM_ARM_SET_DEVICE_ADDR _IOW(KVMIO, 0xab, struct kvm_arm_device_addr) +#define KVM_ARM_GET_DEVICE_RESOURCES _IOW(KVMIO, 0xe1, struct kvm_arm_get_device_resources) +#define KVM_ARM_ASSIGN_DEVICE_IOW(KVMIO, 0xe2, struct kvm_arm_assigned_device) /* * ioctls for vcpu fds @@ -1013,6 +1015,7 @@ struct kvm_assigned_irq { }; }; + struct kvm_assigned_msix_nr { __u32 assigned_dev_id; __u16 entry_nr; @@ -1027,4 +1030,33 @@ struct kvm_assigned_msix_entry { __u16 padding[3]; }; + +/* MAX 6 MMIO resources per device */ +#define MAX_RES_PER_DEVICE 6 +struct kvm_arm_get_device_resources { +chardevname[128]; +__u32 resource_cnt; +struct { +__u64 hpa; +__u32 
size; +__u32 attr; + charhost_name[64]; +} host_resources[MAX_RES_PER_DEVICE]; + struct { + __u32 hwirq; + __u32 attr; + charhost_name[64]; + } hostirq; +}; + +struct kvm_guest_device_resources { +__u64 gpa[MAX_RES_PER_DEVICE]; +__u32 girq; +}; + +struct kvm_arm_assigned_device { +struct kvm_arm_get_device_resources dev_res; +struct kvm_guest_device_resources guest_res; +}; + #endif /* __LINUX_KVM_H */ diff --git a/target-arm/Makefile.objs b/target-arm/Makefile.objs index d89b57c..9aee84e 100644 --- a/target-arm/Makefile.objs +++ b/target-arm/Makefile.objs @@ -1,5 +1,5 @@ obj-y += arm-semi.o obj-$(CONFIG_SOFTMMU) += machine.o -obj-$(CONFIG_KVM) += kvm.o +obj-$(CONFIG_KVM) += kvm.o device-assign.o obj-y += translate.o op_helper.o helper.o cpu.o obj-y += neon_helper.o iwmmxt_helper.o diff --git a/target-arm/device-assign.c b/target-arm/device-assign.c new file mode 100644 index 000..e4d0e97 --- /dev/null +++ b/target-arm/device-assign.c @@ -0,0 +1,118 @@ + +#include hw/sysbus.h +#include qemu-common.h +#include hw/qdev.h +#include hw/ptimer.h +#include kvm_arm.h +#include qemu/error-report.h + +#define IORESOURCE_TYPE_BITS0x1f00 /* Resource type */ +#define IORESOURCE_IO 0x0100 /* PCI/ISA I/O ports */ +#define IORESOURCE_MEM 0x0200 +#define IORESOURCE_REG 0x0300 /* Register offsets */ +#define IORESOURCE_IRQ 0x0400 +#define IORESOURCE_DMA 0x0800 + +#define IORESOURCE_PREFETCH 0x2000 /* No side effects */ +#define IORESOURCE_READONLY 0x4000 +#define IORESOURCE_CACHEABLE0x8000 + +typedef struct { +SysBusDevice busdev; +char *devname; +uint64_t hpa, gpa; +uint32_t dev_size; +uint32_t hirq,girq; +} AssignedDevice; + +static Property device_assign_properties[] = { +DEFINE_PROP_STRING(host, AssignedDevice, devname), +DEFINE_PROP_UINT64(hpa, AssignedDevice, hpa, 0), +DEFINE_PROP_UINT64(gpa, AssignedDevice, gpa, 0), +DEFINE_PROP_UINT32(size, AssignedDevice, dev_size, 0), +DEFINE_PROP_UINT32(hostirq, AssignedDevice, hirq, 0), +DEFINE_PROP_UINT32(guestirq, AssignedDevice, 
girq, 0), +DEFINE_PROP_END_OF_LIST(), +}; + +static int assign_device(AssignedDevice *dev) +{ +int ret,i; +struct kvm_arm_get_device_resources dev_res; +struct kvm_arm_assigned_device dev_assigned; +char *p, c='-'; + +memset(dev_res,0,sizeof(dev_res)); +memset(dev_assigned,0,sizeof(dev_assigned)); + +if((p = strstr(dev-devname, (char *)c)) != (char *) NULL) + *p = ','; +
Re: [PATCH 2/2] add initial kvm dev passthrough support
On 6/11/2013 10:28 AM, Alexander Graf wrote: Is there any particular reason you're not going down that path for your ARM implementation? We see this as a good starting point to build on, we need baseline numbers for performance, latency, interrupt throughput on real hardware ASAP to build competency for NFV, which has demanding Dev. Passthrough requirements. Over time we plan contributing to SMMU and VFIO as well (we're looking into this now). FYI NFV is an initiative wireless/fixed network operators are working towards - to virtualize Core, likely Radio Access and even Home Network equipment, this is an epic undertaking (i.e. Network Function Virtualization). So far VMware has taken the lead (mostly x86). On the embedded PPC side we've been discussing vfio and how it fits into a device tree, non-PCI world for a while. If you like, we can dive into more detail on that, either via email or via phone. I'll email you offline, I'd like to know more what you've done on this and see where we can align/leverage the effort. - Mario Alex -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/2] add initial kvm dev passthrough support
I know Antonios very well. Yes our intent is definitely to use VFIO. - Mario On 6/11/2013 4:52 PM, Alex Williamson wrote: On Tue, 2013-06-11 at 16:13 +0200, Mario Smarduch wrote: On 6/11/2013 10:28 AM, Alexander Graf wrote: Is there any particular reason you're not going down that path for your ARM implementation? We see this as a good starting point to build on, we need baseline numbers for performance, latency, interrupt throughput on real hardware ASAP to build competency for NFV, which has demanding Dev. Passthrough requirements. Over time we plan contributing to SMMU and VFIO as well (we're looking into this now). FYI NFV is an initiative wireless/fixed network operators are working towards - to virtualize Core, likely Radia Access and even Home Network equipment, this is a epic undertaking (i.e. Network Function Virtualization). So far VMware has taken the lead (mostly x86). On the embedded PPC side we've been discussing vfio and how it fits into a device tree, non-PCI world for a while. If you like, we can dive into more detail on that, either via email or via phone. I'll email you offline, I'd like to know more what you've done on this and see where we can align/leverage the effort. Yes, please let's use VFIO rather than continue to use or invent new device assignment interfaces for KVM. Antonios Motakis (cc'd) already contacted me about VFIO for ARM. IIRC, his initial impression was that the IOMMU backend was almost entirely reusable for ARM (a couple PCI assumptions implicit in the IOMMU API to handle) and my hope was that ARM and PPC could work together on a common VFIO device tree backend. Thanks, Alex -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/2] add irq priodrop support
Hi Grant, appreciate the strong feedback, I agree with all the coding observations will make the changes. I have few inline responses. +static u32 gic_irq_prio_drop[DIV_ROUND_UP(1020, 32)] __read_mostly; I believe it is possible to have more than one GIC in a system. This map assumes only one. The prio_drop map should probably be part of gic_chip_data so that it is per-instance. Also, as discussed below, the code should be using DECLARE_BITMAP() Agree. gic_priodrop_remap_eoi() is used exactly once. You should instead put the body of it inline like so: if (IS_ENABLED(CONFIG_KVM_ARM_INT_PRIO_DROP) is_hyp_mode_available()) chip-irq_eoi = gic_eoi_irq_priodrop; Yes much cleaner. However, this block is problematic. For each map call it modifies the /global/ gic_chip. It's not a per-interrupt thing, but rather changes the callback for all gic interrupts, on *any* gic in the system. Is this really what you want? If it is, then I would expect the callback to be modified once sometime around gic_init_bases() time. Yes need to move it up, now its being set for each IRQ domain mapping call. If it is not, and what you really want is per-irq behaviour, then what you need to do is have a separate gic_priodrop_chip that can be used on a per-irq basis instead of the gic_chip. Prio drop/deactivate is per CPU and all IRQs are affected including SGIs. It's possible to run mixed CPU modes, but this patch enables all CPUs for device passthrough, similar to hyp mode enable. Another way would be the reverse - set all non-passthrough irqs to gic_priodrop_chip and the passed through IRQ to gic_chip. I think keeping it in one function and just setting a bit to enable/disable is cleaner. 
if (hw 32) { irq_set_percpu_devid(irq); irq_set_chip_and_handler(irq, gic_chip, @@ -857,4 +875,125 @@ IRQCHIP_DECLARE(cortex_a9_gic, arm,cortex-a9-gic, gic_of_init); IRQCHIP_DECLARE(msm_8660_qgic, qcom,msm-8660-qgic, gic_of_init); IRQCHIP_DECLARE(msm_qgic2, qcom,msm-qgic2, gic_of_init); +#ifdef CONFIG_KVM_ARM_INT_PRIO_DROP +/* If HYP mode enabled and PRIO DROP set EOIR function to handle PRIO DROP */ +static inline void gic_priodrop_remap_eoi(struct irq_chip *chip) +{ +if (is_hyp_mode_available()) +chip-irq_eoi = gic_eoi_irq_priodrop; +} + +/* If HYP mode set enable interrupt priority drop/deactivation, and mark + * SGIs to deactive through writes to GCICC_DIR. For Guest only enable normal + * mode. + */ Nit: Read Documentation/kernel-doc-nano-HOWTO.txt. It's a good idea to stick to that format when writing function documenation. Also, convention is for multiline comments to have an empty /* line before the first line of text. Will do. +} + +void gic_spi_clr_priodrop(int irq) +{ +struct irq_data *d = irq_get_irq_data(irq); +if (likely(irq = 32 irq 1019)) { 1019 ... +clear_bit(irq % 32, (void *) gic_irq_prio_drop[irq/32]); +writel_relaxed(irq, gic_cpu_base(d) + GIC_CPU_DIR); +} +} + +int gic_spi_get_priodrop(int irq) +{ +if (likely(irq = 32 irq = 1019)) ... = 1019 Looks like some off-by-one errors going on here. Also, the rest of the gic code uses 1020, not 1019 as the upper limit. What is the reason for being difference in this code block? Hmmm a mistake. ___ linux-arm-kernel mailing list linux-arm-ker...@lists.infradead.org http://lists.infradead.org/mailman/listinfo/linux-arm-kernel -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/2] add initial kvm dev passthrough support
Resending, initial email from my exchange client got rejected due to HTML content On 6/12/2013 8:45 AM, Mario Smarduch wrote: Hi Antonios, thanks for your feedback, initially we’ll work with static binding gain performance data given latency/throughput is key, later add dynamic binding (as well as re-optimize affinity code). And as you already know move towards VFIO, which is a longer term effort. +struct kvm_arm_assigned_dev_kernel { + struct list_head list; + struct kvm_arm_assigned_device dev; + irqreturn_t (*irq_handler)(int, void *); + void *irq_arg; +}; + Instead of irq_arg, isn't something such as target_vcpu more clear? MS Agree. diff --git a/arch/arm/kvm/vgic.c b/arch/arm/kvm/vgic.c index 17c5ac7..f4cb804 100644 --- a/arch/arm/kvm/vgic.c +++ b/arch/arm/kvm/vgic.c @@ -449,6 +449,41 @@ static u32 vgic_get_target_reg(struct kvm *kvm, int irq) return val; } +/* Follow the IRQ vCPU affinity so passthrough device interrupts are injected + * on physical CPU they execute. + */ +static void vgic_set_passthru_affinity(struct kvm *kvm, int irq, u32 target) +{ + struct list_head *dev_list_ptr = kvm-arch.assigned_dev_head; + struct list_head *ptr; + struct kvm_arm_assigned_dev_kernel *assigned_dev; + struct vgic_dist *dist = kvm-arch.vgic; + char *buf; + int cpu, hwirq; + + mutex_lock(kvm-arch.dev_pasthru_lock); + list_for_each(ptr, dev_list_ptr) { + assigned_dev = list_entry(ptr, + struct kvm_arm_assigned_dev_kernel, list); + if (assigned_dev-dev.guest_res.girq == irq) { + if (assigned_dev-irq_arg) + free_irq(irq, assigned_dev-irq_arg); + cpu = kvm-vcpus[target]-cpu; + hwirq = assigned_dev-dev.dev_res.hostirq.hwirq; + irq_set_affinity(hwirq, cpumask_of(cpu)); + assigned_dev-irq_arg = kvm-vcpus[target]; + buf = assigned_dev-dev.dev_res.hostirq.host_name; + sprintf(buf, %s-KVM Pass-through, + assigned_dev-dev.dev_res.devname); + gic_spi_set_priodrop(hwirq); + dist-guest_irq[hwirq - VGIC_NR_PRIVATE_IRQS] = irq; + request_irq(hwirq, assigned_dev-irq_handler, 0, buf, + 
assigned_dev-irq_arg); + } + } + mutex_unlock(kvm-arch.dev_pasthru_lock); +} + Maybe vgic_set_pasthru_affinity is not an ideal name for the function, since you do more than that here. After looking at your code I think things will be much easier if you decouple the host irq affinity bits from here. After that there is not much stopping from affinity following the CPU a vCPU will execute. I would rename this to something to reflect that you enable priodrop for this IRQ here, for example only vgic_set_passthrough could suffice (I'm don't like the pasthru abbreviation a lot). Then the affinity bits can be put in a different function. MJS Agree naming could be better. In arch/arm/kvm/arm.c kvm_arch_vcpu_load() you can follow up whenever a vcpu is moved to a different cpu. However in practice I don't know if the additional complexity of having the irq affinity follow the vcpu significantly improves irq latency. MJS This should save a costly IPI if for example Phys IRQ is taken on CPU 0 and target vCPU on CPU 1. I agree kvm_arch_vcpu_load() is a good place if you let vCPUs float. vigic_set_passthrough_affinity can be optimized more to eliminate the free_irq(), requesnt_irq(). For now it’s a simple implementation we’re assuming static binding, start gathering performance/latency data. Will change the name as you suggest. -- *Antonios Motakis*, Virtual Open Systems* */Open Source KVM Virtualization Development /www.virtualopensystems.com http://www.virtualopensystems.com -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/2] armv7 initial device passthrough support
Updated Device Passthrough Patch. - optimized IRQ-CPU-vCPU binding, irq is installed once - added dynamic IRQ affinity on schedule in - added documentation and few other coding recommendations. Per earlier discussion VFIO is our target but we like something earlier to work with to tackle performance latency issue (some ARM related) for device passthrough while we migrate towards VFIO. - Mario Signed-off-by: Mario Smarduch mario.smard...@huawei.com --- arch/arm/include/asm/kvm_host.h | 31 + arch/arm/include/asm/kvm_vgic.h | 10 ++ arch/arm/kvm/Makefile |1 + arch/arm/kvm/arm.c | 80 + arch/arm/kvm/assign-dev.c | 248 +++ arch/arm/kvm/vgic.c | 134 + include/linux/irqchip/arm-gic.h |1 + include/uapi/linux/kvm.h| 33 ++ 8 files changed, 538 insertions(+) create mode 100644 arch/arm/kvm/assign-dev.c diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 57cb786..c85c3a0 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -67,6 +67,10 @@ struct kvm_arch { /* Interrupt controller */ struct vgic_distvgic; + + /* Device Passthrough Fields */ + struct list_headassigned_dev_head; + struct mutexdev_passthrough_lock; }; #define KVM_NR_MEM_OBJS 40 @@ -146,6 +150,13 @@ struct kvm_vcpu_stat { u32 halt_wakeup; }; +struct kvm_arm_assigned_dev_kernel { + struct list_head list; + struct kvm_arm_assigned_device dev; + irqreturn_t (*irq_handler)(int, void *); + unsigned long vcpuid_irq_arg; +}; + struct kvm_vcpu_init; int kvm_vcpu_set_target(struct kvm_vcpu *vcpu, const struct kvm_vcpu_init *init); @@ -157,6 +168,26 @@ int kvm_arm_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg); u64 kvm_call_hyp(void *hypfn, ...); void force_vm_exit(const cpumask_t *mask); +#ifdef CONFIG_KVM_ARM_INT_PRIO_DROP +int kvm_arm_get_device_resources(struct kvm *, + struct kvm_arm_get_device_resources *); +int kvm_arm_assign_device(struct kvm *, struct kvm_arm_assigned_device *); +void kvm_arm_setdev_irq_affinity(struct kvm_vcpu *vcpu, int 
cpu); +#else +static inline int kvm_arm_get_device_resources(struct kvm *k, struct kvm_arm_get_device_resources *r) +{ + return -1; +} +static inline int kvm_arm_assign_device(struct kvm *k, struct kvm_arm_assigned_device *d) +{ + return -1; +} + +static inline void kvm_arm_setdev_irq_affinity(struct kvm_vcpu *vcpu, int cpu) +{ +} +#endif + #define KVM_ARCH_WANT_MMU_NOTIFIER struct kvm; int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); diff --git a/arch/arm/include/asm/kvm_vgic.h b/arch/arm/include/asm/kvm_vgic.h index 343744e..fb6afd2 100644 --- a/arch/arm/include/asm/kvm_vgic.h +++ b/arch/arm/include/asm/kvm_vgic.h @@ -107,6 +107,16 @@ struct vgic_dist { /* Bitmap indicating which CPU has something pending */ unsigned long irq_pending_on_cpu; + + /* Device passthrough fields */ + /* Host irq to guest irq mapping */ + u8 guest_irq[VGIC_NR_SHARED_IRQS]; + + /* Pending passthruogh irq */ + struct vgic_bitmap passthrough_spi_pending; + + /* At least one passthrough IRQ pending for some vCPU */ + u32 passthrough_pending; #endif }; diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile index 53c5ed8..823fc38 100644 --- a/arch/arm/kvm/Makefile +++ b/arch/arm/kvm/Makefile @@ -21,3 +21,4 @@ obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o obj-y += coproc.o coproc_a15.o mmio.o psci.o perf.o obj-$(CONFIG_KVM_ARM_VGIC) += vgic.o obj-$(CONFIG_KVM_ARM_TIMER) += arch_timer.o +obj-$(CONFIG_KVM_ARM_INT_PRIO_DROP) += assign-dev.o diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 37d216d..ba54c64 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -26,6 +26,8 @@ #include linux/mman.h #include linux/sched.h #include linux/kvm.h +#include linux/interrupt.h +#include linux/ioport.h #include trace/events/kvm.h #define CREATE_TRACE_POINTS @@ -43,6 +45,7 @@ #include asm/kvm_emulate.h #include asm/kvm_coproc.h #include asm/kvm_psci.h +#include asm/kvm_host.h #ifdef REQUIRES_VIRT __asm__(.arch_extension virt); @@ -139,6 +142,11 @@ int 
kvm_arch_init_vm(struct kvm *kvm, unsigned long type) /* Mark the initial VMID generation invalid */ kvm-arch.vmid_gen = 0; + /* +* Initialize Dev Passthrough Fields +*/ + INIT_LIST_HEAD(kvm-arch.assigned_dev_head); + mutex_init(kvm-arch.dev_passthrough_lock); return ret; out_free_stage2_pgd: @@ -169,6 +177,40 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) void kvm_arch_destroy_vm(struct kvm *kvm) { int i; + struct list_head
Re: [PATCH 2/2] armv7 initial device passthrough support
On 6/15/2013 5:47 PM, Paolo Bonzini wrote: Il 13/06/2013 11:19, Mario Smarduch ha scritto: Updated Device Passthrough Patch. - optimized IRQ-CPU-vCPU binding, irq is installed once - added dynamic IRQ affinity on schedule in - added documentation and few other coding recommendations. Per earlier discussion VFIO is our target but we like something earlier to work with to tackle performance latency issue (some ARM related) for device passthrough while we migrate towards VFIO. I don't think this is acceptable upstream, unfortunately. KVM device assignment is deprecated and we should not add more users. That's fine we'll work our way towards dev-tree VFIO reusing what we can working with the community. At this point we're more concerned with numbers and best practices as opposed to mechanism this part will be time consuming. VFIO will be more background for us. What are the latency issues you have? Our focus now is on IRQ latency and throughput. Right now it appears lowest latency is 2x + exit/enter + IRQ injection overhead. We can't tolerate additional IPIs or deferred IRQ injection approaches. We're looking for numbers closer to what IBMs ELI managed. Also high res timers which ARM Virt. Ext supports very well. Exitless interrupts which ARM handles very well too. There are some future hw ARM interrupt enhancements coming up which may help a lot as well. There are many other latency/perf. reqs for NFV related to RT, essentially Guest must run near native. In the end it may turn out this may need to be outside of main tree we'll see. - Mario Paolo - Mario -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/2] armv7 initial device passthrough support
On 6/24/2013 10:01 PM, Christoffer Dall wrote: There are many other latency/perf. reqs for NFV related to RT, essentially Guest must run near native. In the end it may turn out this may need to be outside of main tree we'll see. It doesn't sound like this will be the end result. Everything that you try to do in your patch set can be accomplished using VFIO and a more generic infrastructure for virtual IRQ integration with KVM and user space. I mentioned in previous email we will pursue VFIO, but even at that VFIO is a starting point for NFV. We should avoid creating an environment with important functionality outside of the main tree, if at all possible. Of course that would be ideal but with NFV it may be more involved. This is similar to Linux and TEM adaption around 04/05. We wanted to adapt Linux but it lacked required features that's when CGL specifications came into play to provide guidance a lot of features (TIPC, OpenIMPI, preempt_rt, AEM) lived outside mainline, supported by OS vendors delivering CGL compliant distro, while others decided to stick with IT, penetrating some applications like HLR. With NFV a likely scenario may evolve, TEMs need to start demonstrating to operators fixed and wireless virtualization use cases. The only significant difference is that unlike CGL for Linux, KVM has no real representation and understanding of NFV reqs (as opposed to proprietary vendors). I can't speak for all TEMs but it's likely they will go off on their own to demo/proto-type and worry about Open Source acceptance later. -Christoffer -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/2] armv7 initial device passthrough support
On 6/25/2013 12:27 AM, Stuart Yoder wrote: We should avoid creating an environment with important functionality outside of the main tree, if at all possible. Also, as we architect that generic infrastructure we need to keep in mind that there are important use cases for doing I/O in user space that are not KVM guests-- just normal applications that need direct device access. Yes that's a good point especially data plane NE, also LTE has these use cases at the radio side. Stuart -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: RFC: vfio interface for platform devices (v2)
I'm having trouble understanding how this works where the Guest Device Model != Host. How do you inform the guest where the device is mapped in its physical address space, and handle GPA faults? - Mario On 7/3/2013 11:40 PM, Yoder Stuart-B08248 wrote: Version 2 -VFIO_GROUP_GET_DEVICE_FD-- specified that the path is a sysfs path -VFIO_DEVICE_GET_INFO-- defined 2 flags instead of 1 -deleted VFIO_DEVICE_GET_DEVTREE_INFO ioctl -VFIO_DEVICE_GET_REGION_INFO-- updated as per AlexW's suggestion, defined 5 new flags and associated structs -VFIO_DEVICE_GET_IRQ_INFO-- updated as per AlexW's suggestion, defined 1 new flag and associated struct -removed redundant example -- VFIO for Platform Devices The existing kernel interface for vfio-pci is pretty close to what is needed for platform devices: -mechanism to create a container -add groups/devices to a container -set the IOMMU model -map DMA regions -get an fd for a specific device, which allows user space to determine info about device regions (e.g. registers) and interrupt info -support for mmapping device regions -mechanism to set how interrupts are signaled Many platform device are simple and consist of a single register region and a single interrupt. For these types of devices the existing vfio interfaces should be sufficient. However, platform devices can get complicated-- logically represented as a device tree hierarchy of nodes. For devices with multiple regions and interrupts, new mechanisms are needed in vfio to correlate the regions/interrupts with the device tree structure that drivers use to determine the meaning of device resources. In some cases there are relationships between device, and devices reference other devices using phandle links. The kernel won't expose relationships between devices, but just exposes mappable register regions and interrupts. The changes needed for vfio are around some of the device tree related info that needs to be available with the device fd. 1. 
VFIO_GROUP_GET_DEVICE_FD User space knows by out-of-band means which device it is accessing and will call VFIO_GROUP_GET_DEVICE_FD passing a specific sysfs path to get the device information: fd = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, /sys/bus/platform/devices/ffe21.usb)); 2. VFIO_DEVICE_GET_INFO The number of regions corresponds to the regions defined in reg and ranges in the device tree. Two new flags are added to struct vfio_device_info: #define VFIO_DEVICE_FLAGS_PLATFORM (1 ?) /* A platform bus device */ #define VFIO_DEVICE_FLAGS_DEVTREE (1 ?) /* device tree info available */ It is possible that there could be platform bus devices that are not in the device tree, so we use 2 flags to allow for that. If just VFIO_DEVICE_FLAGS_PLATFORM is set, it means that there are regions and IRQs but no device tree info available. If just VFIO_DEVICE_FLAGS_DEVTREE is set, it means there is device tree info available. 3. VFIO_DEVICE_GET_REGION_INFO For platform devices with multiple regions, information is needed to correlate the regions with the device tree structure that drivers use to determine the meaning of device resources. The VFIO_DEVICE_GET_REGION_INFO is extended to provide device tree information. The following information is needed: -the device tree path to the node corresponding to the region -whether it corresponds to a reg or ranges property -there could be multiple sub-regions per reg or ranges and the sub-index within the reg/ranges is needed There are 5 new flags added to vfio_region_info : struct vfio_region_info { __u32 argsz; __u32 flags; #define VFIO_REGION_INFO_FLAG_CACHEABLE (1 ?) #define VFIO_DEVTREE_REGION_INFO_FLAG_REG (1 ?) #define VFIO_DEVTREE_REGION_INFO_FLAG_RANGE (1 ?) #define VFIO_DEVTREE_REGION_INFO_FLAG_INDEX (1 ?) #define VFIO_DEVTREE_REGION_INFO_FLAG_PATH (1 ?) 
__u32 index; /* Region index */ __u32 resv; /* Reserved for alignment */ __u64 size; /* Region size (bytes) */ __u64 offset; /* Region offset from start of device fd */ }; VFIO_REGION_INFO_FLAG_CACHEABLE -if set indicates that the region must be mapped as cacheable VFIO_DEVTREE_REGION_INFO_FLAG_REG -if set indicates that the region corresponds to a reg property in the device tree representation of the device VFIO_DEVTREE_REGION_INFO_FLAG_RANGE -if set indicates that the region corresponds to a ranges property in the device tree representation of the device VFIO_DEVTREE_REGION_INFO_FLAG_INDEX -if set indicates that there is a
huge 2nd stage pages and live migration
Hello I've been working on live migration for ARM-KVM, and noticed a problem completing migration with huge 2nd stage tables. After write protecting the VM, for write fault 512 page bits are set in dirty_bitmap[] to take into account future writes to huge page. The pmd is write protected again when QEMU reads the dirty log, and the cycle repeats. With this not even an idle 32MB VM completes live migration. If QEMU uses THPs, and 2nd stage tables use pte's, then there is no problem, live migration is quick. I'm assuming QEMU and Guest huge pages with 2nd stage page table pte's should work fine too. I'm wondering how this has been solved (for any architecture)? - Mario -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 0/3] migration dirty bitmap support ARMv7
The patch set supports migration dirty bitmap support implementation for arm-kvm. Splitting of pmd's to pte's as suggested is implemented on demand when migration is started. I tested it on 4-way SMP ARMv7, with SMP guests. 2GB VMs with dirty shared memory segments up to 1.8 GB and relatively fast update rates 16Mb/5mS. Next course of action would be rmap support which scales much better on bigger systems. Although one thing that confused me, x86 migrations were sometimes 10 to 15 times slower, I think it must be something wrong with my configuration. Mario Smarduch (3): headers for migration dirtybitmap support initial write protect of VM address space and on dirty log read hooks to interface with QEMU for initial write protect, dirty log read arch/arm/include/asm/kvm_host.h |9 +++ arch/arm/kvm/arm.c | 62 ++- arch/arm/kvm/mmu.c | 158 ++- 3 files changed, 226 insertions(+), 3 deletions(-) -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/3] migration dirtybitmap support ARMv7
- support QEMU interface for initial VM Write Protect - QEMU Dirty bit map log retrieval Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/kvm/arm.c | 62 +++- 1 file changed, 61 insertions(+), 1 deletion(-) diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index bd18bb8..9076e3d 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -241,6 +241,8 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old, enum kvm_mr_change change) { + if ((change != KVM_MR_DELETE) (mem-flags KVM_MEM_LOG_DIRTY_PAGES)) + kvm_mmu_slot_remove_write_access(kvm, mem-slot); } void kvm_arch_flush_shadow_all(struct kvm *kvm) @@ -773,9 +775,67 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } } +/* + * Walks the memslot dirty bitmap, write protects dirty pages for next rount, + * and stores the dirty bitmap fo QEMU retrieval. + * + */ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { - return -EINVAL; + int r; + struct kvm_memory_slot *memslot; + unsigned long n, i; + unsigned long *dirty_bitmap; + unsigned long *dirty_bitmap_buffer; + bool is_dirty = false; + gfn_t offset; + + mutex_lock(kvm-slots_lock); + r = -EINVAL; + + if (log-slot = KVM_USER_MEM_SLOTS) + goto out; + + memslot = id_to_memslot(kvm-memslots, log-slot); + dirty_bitmap = memslot-dirty_bitmap; + + r = -ENOENT; + if (!dirty_bitmap) + goto out; + + n = kvm_dirty_bitmap_bytes(memslot); + dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long); + memset(dirty_bitmap_buffer, 0, n); + + spin_lock(kvm-mmu_lock); + for (i = 0; i n / sizeof(long); i++) { + unsigned long mask; + + if (!dirty_bitmap[i]) + continue; + + is_dirty = true; + offset = i * BITS_PER_LONG; + kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, + dirty_bitmap[i]); + mask = dirty_bitmap[i]; + dirty_bitmap_buffer[i] = mask; + dirty_bitmap[i] = 0; + } + + if (is_dirty) + kvm_tlb_flush_vmid(kvm); + + spin_unlock(kvm-mmu_lock); + r = -EFAULT; + + if (copy_to_user(log-dirty_bitmap, 
dirty_bitmap_buffer, n)) + goto out; + + r = 0; +out: + mutex_unlock(kvm-slots_lock); + return r; } static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/3] migration dirtybitmap support ARMv7
- Support write protection of entire VM address space - Split pmds section in migration mode - Write protect dirty pages on Dirty log read Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/kvm/mmu.c | 158 +++- 1 file changed, 156 insertions(+), 2 deletions(-) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 7789857..502e776 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -56,6 +56,13 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa); } +void kvm_tlb_flush_vmid(struct kvm *kvm) +{ + phys_addr_t x; + /* based on function description 2nd argument is irrelevent */ + kvm_tlb_flush_vmid_ipa(kvm, x); +} + static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, int min, int max) { @@ -639,6 +646,143 @@ static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap) return false; } +/* + * Called when QEMU retrieves the dirty log and write protects dirty pages + * for next QEMU call to retrieve the dirty logn + */ +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + phys_addr_t ipa; + pgd_t *pgdp = kvm-arch.pgd, *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte, new_pte; + + while (mask) { + ipa = (slot-base_gfn + gfn_offset + __ffs(mask)) PAGE_SHIFT; + pgd = pgdp + pgd_index(ipa); + if (!pgd_present(*pgd)) + goto update_mask; + pud = pud_offset(pgd, ipa); + if (!pud_present(*pud)) + goto update_mask; + pmd = pmd_offset(pud, ipa); + if (!pmd_present(*pmd)) + goto update_mask; + pte = pte_offset_kernel(pmd, ipa); + if (!pte_present(*pte)) + goto update_mask; + if ((*pte L_PTE_S2_RDWR) == L_PTE_S2_RDONLY) + goto update_mask; + new_pte = pfn_pte(pte_pfn(*pte), PAGE_S2); + *pte = new_pte; +update_mask: + mask = mask - 1; + } +} + +/* + * In migration splits PMDs into PTEs to keep track of dirty pages. Without + * spliting light execution prevents migration. 
+ */ +bool split_pmd(struct kvm *kvm, pmd_t *pmd, u64 addr) +{ + struct page *page; + pfn_t pfn = pmd_pfn(*pmd); + pte_t *pte, new_pte; + int i; + + page = alloc_page(GFP_KERNEL); + if (page == NULL) + return false; + + pte = page_address(page); + for (i = 0; i PMD_SIZE/PAGE_SIZE; i++) { + new_pte = pfn_pte(pfn+i, PAGE_S2); + pte[i] = new_pte; + } + kvm_clean_pte(pte); + pmd_populate_kernel(NULL, pmd, pte); + + /* + * flush the whole TLB for VM relying on hardware broadcast + */ + kvm_tlb_flush_vmid(kvm); + get_page(virt_to_page(pte)); + return true; +} + +/* + * Called from QEMU when migration dirty logging is started. Write the protect + * current set. Future faults writes are tracked through WP of when dirty log + * log. + */ + +void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte, new_pte; + pgd_t *pgdp = kvm-arch.pgd; + struct kvm_memory_slot *memslot = id_to_memslot(kvm-memslots, slot); + u64 start = memslot-base_gfn PAGE_SHIFT; + u64 end = (memslot-base_gfn + memslot-npages) PAGE_SHIFT; + u64 addr = start, addr1; + + spin_lock(kvm-mmu_lock); + kvm-arch.migration_in_progress = true; + while (addr end) { + if (need_resched() || spin_needbreak(kvm-mmu_lock)) { + kvm_tlb_flush_vmid(kvm); + cond_resched_lock(kvm-mmu_lock); + } + + pgd = pgdp + pgd_index(addr); + if (!pgd_present(*pgd)) { + addr = pgd_addr_end(addr, end); + continue; + } + + pud = pud_offset(pgd, addr); + if (pud_huge(*pud) || !pud_present(*pud)) { + addr = pud_addr_end(addr, end); + continue; + } + + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) { + addr = pmd_addr_end(addr, end); + continue; + } + + if (kvm_pmd_huge(*pmd)) { + if (!split_pmd(kvm, pmd, addr)) { + kvm-arch.migration_in_progress = false; + return; + } + addr = pmd_addr_end(addr
[PATCH 1/3] migration dirtybitmap support ARMv7
Headers for migration, prototypes Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |9 + 1 file changed, 9 insertions(+) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 098f7dd..9b71f13 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -67,6 +67,7 @@ struct kvm_arch { /* Interrupt controller */ struct vgic_distvgic; + int migration_in_progress; }; #define KVM_NR_MEM_OBJS 40 @@ -228,4 +229,12 @@ int kvm_perf_teardown(void); u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); +void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); + +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask); + +void kvm_tlb_flush_vmid(struct kvm *kvm); + #endif /* __ARM_KVM_HOST_H__ */ -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 0/3] migration dirty bitmap support ARMv7
Hi Marc, Thanks for the feedback, very valuable going forward. I'll clean the patch up and repost with much more documentation. One key error you pointed out, and I overlooked but knew about it, is to error out from get dirty log if whole VM can't be WPed, at that point QEMU will abort. - Mario -Original Message- From: Marc Zyngier [mailto:marc.zyng...@arm.com] Sent: Tuesday, April 15, 2014 2:13 AM To: Mario Smarduch Cc: kvm...@lists.cs.columbia.edu; christoffer.d...@linaro.org; 이정석; 정성진; kvm@vger.kernel.org Subject: Re: [PATCH 0/3] migration dirty bitmap support ARMv7 Mario, On 15/04/14 02:24, Mario Smarduch wrote: The patch set supports migration dirty bitmap support implementation for arm-kvm. Spliting of pmd's to pte's as suggested is implemented on demand when migration is started. I tested it on 4-way SMP ARMv7, with SMP guests. 2GB VMs with dirty shared memory segments upto 1.8 GB and relatively fast update rates 16Mb/5mS. Next course of action would be rmap support which scales much better on bigger systems. Although one think that confused me, x86 migrations were sometimes 10 to 15 times slower, I think it must be something wrong with my configuration. Mario Smarduch (3): headers for migration dirtybitmap support initial write protect of VM address space and on dirty log read hooks to interface with QEMU for initial write protect, dirty log read arch/arm/include/asm/kvm_host.h |9 +++ arch/arm/kvm/arm.c | 62 ++- arch/arm/kvm/mmu.c | 158 ++- 3 files changed, 226 insertions(+), 3 deletions(-) Overall, I think this patch series requires some work, the most glaring issue being the total lack of documentation (even by my own standards). Also, it is fairly obvious that ARMv8 should be done at the same time (as all that code is shared, and will avoid API abuse...), and that a fair amount of the code is actually generic across architecture, and should be shared. I'm very much looking forward to the next iteration of this series. Thanks, M. -- Jazz is not dead. 
It just smells funny...
Re: [PATCH 2/3] migration dirtybitmap support ARMv7
On 04/15/2014 01:58 AM, Marc Zyngier wrote: Why do you nuke the whole TLBs for this VM? I assume you're going to repeatedly call this for all the huge pages, aren't you? Can you delay this flush to do it only once? +get_page(virt_to_page(pte)); +return true; +} + +/* + * Called from QEMU when migration dirty logging is started. Write the protect + * current set. Future faults writes are tracked through WP of when dirty log + * log. Same as above. + */ + +void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) +{ +pgd_t *pgd; +pud_t *pud; +pmd_t *pmd; +pte_t *pte, new_pte; +pgd_t *pgdp = kvm-arch.pgd; +struct kvm_memory_slot *memslot = id_to_memslot(kvm-memslots, slot); +u64 start = memslot-base_gfn PAGE_SHIFT; +u64 end = (memslot-base_gfn + memslot-npages) PAGE_SHIFT; +u64 addr = start, addr1; + +spin_lock(kvm-mmu_lock); +kvm-arch.migration_in_progress = true; +while (addr end) { +if (need_resched() || spin_needbreak(kvm-mmu_lock)) { +kvm_tlb_flush_vmid(kvm); Looks like you're extremely flush happy. If you're holding the lock, why do you need all the extra flushes in the previous function? Reduced it to one flush, upon termination of the write protect loop. + +if (kvm_pmd_huge(*pmd)) { +if (!split_pmd(kvm, pmd, addr)) { +kvm-arch.migration_in_progress = false; +return; Bang, you're dead. Yes added the unlock, also added return code in get dirty log function to abort migration. pte_t new_pte = pfn_pte(pfn, PAGE_S2); if (writable) { +if (migration_active hugetlb) { +/* get back pfn from fault_ipa */ +pfn += (fault_ipa PAGE_SHIFT) +((1 (PMD_SHIFT - PAGE_SHIFT))-1); +new_pte = pfn_pte(pfn, PAGE_S2); Please explain this. Next patch series will update this, there was another problem of handling pmd huge pages and directing them to pte handling. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 3/3] migration dirtybitmap support ARMv7
Hi Eric, Mark - what repository should I use to pick up Eric patches? For kvm_vm_ioctl_get_dirty_log() not sure what to make generic it appears generic enough and it does what it needs to do? Thanks, Mario On 04/15/2014 02:06 AM, Marc Zyngier wrote: On 15/04/14 02:24, Mario Smarduch wrote: - support QEMU interface for initial VM Write Protect - QEMU Dirty bit map log retrieval Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/kvm/arm.c | 62 +++- 1 file changed, 61 insertions(+), 1 deletion(-) diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index bd18bb8..9076e3d 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -241,6 +241,8 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old, enum kvm_mr_change change) { +if ((change != KVM_MR_DELETE) (mem-flags KVM_MEM_LOG_DIRTY_PAGES)) +kvm_mmu_slot_remove_write_access(kvm, mem-slot); } There is a patch by Eric Auger doing the same thing. Please use it as a dependency. void kvm_arch_flush_shadow_all(struct kvm *kvm) @@ -773,9 +775,67 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } } +/* + * Walks the memslot dirty bitmap, write protects dirty pages for next rount, + * and stores the dirty bitmap fo QEMU retrieval. 
+ * + */ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { -return -EINVAL; +int r; +struct kvm_memory_slot *memslot; +unsigned long n, i; +unsigned long *dirty_bitmap; +unsigned long *dirty_bitmap_buffer; +bool is_dirty = false; +gfn_t offset; + +mutex_lock(kvm-slots_lock); +r = -EINVAL; + +if (log-slot = KVM_USER_MEM_SLOTS) +goto out; + +memslot = id_to_memslot(kvm-memslots, log-slot); +dirty_bitmap = memslot-dirty_bitmap; + +r = -ENOENT; +if (!dirty_bitmap) +goto out; + +n = kvm_dirty_bitmap_bytes(memslot); +dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long); +memset(dirty_bitmap_buffer, 0, n); + +spin_lock(kvm-mmu_lock); +for (i = 0; i n / sizeof(long); i++) { +unsigned long mask; + +if (!dirty_bitmap[i]) +continue; + +is_dirty = true; +offset = i * BITS_PER_LONG; +kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, +dirty_bitmap[i]); +mask = dirty_bitmap[i]; +dirty_bitmap_buffer[i] = mask; +dirty_bitmap[i] = 0; +} + +if (is_dirty) +kvm_tlb_flush_vmid(kvm); + +spin_unlock(kvm-mmu_lock); +r = -EFAULT; + +if (copy_to_user(log-dirty_bitmap, dirty_bitmap_buffer, n)) +goto out; + +r = 0; +out: +mutex_unlock(kvm-slots_lock); +return r; } This is a direct copy of the x86 code. Please make it generic. static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, Thanks, M. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 0/5] live migration dirty bitmap support for ARMv7
Revised iteration after initial comments. Still just for ARMv7. I looked at the ARMv8 code and yes it practically appears to reuse most of fault handling in ARMv7, I wasn't aware so much code was in common. But before then want to make sure it's reliable on real hardware. This patch adds support for ARMv7 Live Migration, primarily dirty bit map management is added. The patch follows the normal migration flow managed by user space, first write protecting the entire address space and later keeping track of dirty pages. In the process of initial write protection, and first time faults huge pages are broken up into small pages to support migration on loaded systems. Mario Smarduch (5): add ARMv7 HYP API to flush VM TLBs without address param live migration support for initial write protect of VM to manage dirty pages live migration support for VM dirty log management add 2nd stage page fault handling during live migration add kvm_arch glogal live migration variable arch/arm/include/asm/kvm_asm.h |1 + arch/arm/include/asm/kvm_host.h |7 ++ arch/arm/kvm/arm.c | 75 - arch/arm/kvm/interrupts.S |5 + arch/arm/kvm/mmu.c | 220 ++- 5 files changed, 305 insertions(+), 3 deletions(-) -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/5] live migration support for initial write protect of VM
Add support for initial write protection of guest VM, to later manage dirty pages. Reduced TLB flushing to one flush after memory region is write protected. This is based on Erics patch, which applied cleanly. The only patch I found in the archives was the memory region delete, but still in arm.c. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |1 + arch/arm/kvm/arm.c |4 ++ arch/arm/kvm/mmu.c | 125 +++ 3 files changed, 130 insertions(+) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 315e3f3..7ac1fdc 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -229,5 +229,6 @@ u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); void kvm_tlb_flush_vm(struct kvm *kvm); +void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 9a4bc10..7714cc6 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -249,6 +249,10 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, unmap_stage2_range(kvm, gpa, size); spin_unlock(kvm-mmu_lock); } + + /* Request has been issued to migrate the guest, 1st write protect VM */ + if ((change != KVM_MR_DELETE) (mem-flags KVM_MEM_LOG_DIRTY_PAGES)) + kvm_mmu_slot_remove_write_access(kvm, mem-slot); } void kvm_arch_flush_shadow_all(struct kvm *kvm) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index d7a1846..b85ab56 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -648,6 +648,131 @@ static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap) return false; } +/** + * split_pmd - splits huge pages into small pages, required to keep a dirty + * log of small memory granules, otherwise huge pages would need to be + * migrated. Practically an idle system has problems migrating with + * huge pages. 
Called during WP of entire VM address space, done + * initially when migration thread isses the KVM_MEM_LOG_DIRTY_PAGES ioctl. + * mmu_lock lock must be acquired by caller + * + * @kvm:The KVM pointer + * @pmd:pmd to 2nd stage huge page + * @addr: ` Guest Physical Address + */ +static bool split_pmd(struct kvm *kvm, pmd_t *pmd, u64 addr) +{ + struct page *page; + pfn_t pfn = pmd_pfn(*pmd); + pte_t *pte, new_pte; + int i; + + page = alloc_page(GFP_KERNEL); + if (page == NULL) + return false; + + pte = page_address(page); + /* first break up the huge page into small page pte's */ + for (i = 0; i PTRS_PER_PMD; i++) { + new_pte = pfn_pte(pfn+i, PAGE_S2); + pte[i] = new_pte; + } + kvm_clean_pte(pte); + /* now set the pmd to pte table */ + pmd_populate_kernel(NULL, pmd, pte); + + get_page(virt_to_page(pte)); + return true; +} + + +/** + * kvm_mmu_slot_remove_access - write protects entire VM address space. + * Called at start of migration when KVM_MEM_LOG_DIRTY_PAGES ioctl is + * issued. After this function returns all pages - minus the ones faulted + * in when mmu_lock is released, but those pages will be marked in dirty log + * and are not forgotten. + * + * Initial VM write protect sweep is required to keep track of dirty pages for + * subsequent memory region dirty log retrieval. 
+ * - mmu_lock is held during - protect against concurent faults, mmu notifier + *invalidate/unmap/update user pte, or direct device write to guest memory + * + * @kvm:The KVM pointer + * @slot: The memory slot the dirty log is retrieved for + */ +void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte, new_pte; + pgd_t *pgdp = kvm-arch.pgd; + struct kvm_memory_slot *memslot = id_to_memslot(kvm-memslots, slot); + u64 start = memslot-base_gfn PAGE_SHIFT; + u64 end = (memslot-base_gfn + memslot-npages) PAGE_SHIFT; + u64 addr = start; + + spin_lock(kvm-mmu_lock); + kvm-arch.migration_in_progress = 1; + while (addr end) { + /* Relieve contention for mmu_lock. there is no need to flush +* TLBs here. TLB updates will be picked up on TLB refills or +* flush of VM TLBs. The important things is after you terminate +* loop all pmds have been split, write protected and visible +*/ + if (need_resched() || spin_needbreak(kvm-mmu_lock)) + cond_resched_lock(kvm-mmu_lock); + + pgd = pgdp + pgd_index(addr); + if (!pgd_present(*pgd)) { + addr = pgd_addr_end(addr, end
[PATCH 1/5] add ARMv7 HYP API to flush VM TLBs without address param
Add HYP API to invalidate all VM TLBs without passing address parameter, that kvm_tlb_flush_vmid_ipa() uses. Hopefully this is a valid way to do it. Tests show nothing is broken. The address parameter is confusing since whole VM is being invalidated. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_asm.h |1 + arch/arm/include/asm/kvm_host.h |2 ++ arch/arm/kvm/interrupts.S |5 + arch/arm/kvm/mmu.c |9 + 4 files changed, 17 insertions(+) diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index 661da11..090398d 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -76,6 +76,7 @@ extern char __kvm_hyp_code_end[]; extern void __kvm_flush_vm_context(void); extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa); +extern void __kvm_tlb_flush_vm(struct kvm *kvm); extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); #endif diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 098f7dd..315e3f3 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -228,4 +228,6 @@ int kvm_perf_teardown(void); u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); +void kvm_tlb_flush_vm(struct kvm *kvm); + #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S index 0d68d40..f81c228 100644 --- a/arch/arm/kvm/interrupts.S +++ b/arch/arm/kvm/interrupts.S @@ -45,8 +45,13 @@ __kvm_hyp_code_start: * * As v7 does not support flushing per IPA, just nuke the whole TLB * instead, ignoring the ipa value. + * + * void __kvm_tlb_flush_vm(struct kvm *kvm) - alias on ARMv7 to flush all VM + * TLBs, with no need to pass IPA. Eliminate confusing code which flushes + * whole VM but still requires an IPA which is unused. 
*/ ENTRY(__kvm_tlb_flush_vmid_ipa) +ENTRY(__kvm_tlb_flush_vm) push{r2, r3} dsb ishst diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index e8580e2..d7a1846 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -56,6 +56,15 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa); } +/* Flushes entire VMs TLBs, for ARMv7 reuses __kvm_tlb_flush_vmid_ipa + * interface without the misleading address argument + */ +void kvm_tlb_flush_vm(struct kvm *kvm) +{ + if (kvm) + kvm_call_hyp(__kvm_tlb_flush_vm, kvm); +} + static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, int min, int max) { -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/5] live migration support for VM dirty log management
Add support for dirty bitmap management. Wanted to make it generic but function does a couple things different then the x86 version. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |3 ++ arch/arm/kvm/arm.c | 71 ++- arch/arm/kvm/mmu.c | 53 + 3 files changed, 126 insertions(+), 1 deletion(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 7ac1fdc..16ed4e4 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -230,5 +230,8 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); void kvm_tlb_flush_vm(struct kvm *kvm); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask); #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 7714cc6..7882343 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -785,9 +785,78 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } } + +/** + * kvm_mmu_slot_remove_access - retrieves the log of dirty pages for a memslot. + * It's itteratively during migration to retrieve pages written since + * last call. In the process write protects ptes that are dirty for next + * time, holds the mmu_lock while write protecting dirty pages. + * + * @kvm:The KVM pointer + * @log:Bitmap of dirty pages return. 
+ */ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { - return -EINVAL; + int r; + struct kvm_memory_slot *memslot; + unsigned long n, i; + unsigned long *dirty_bitmap; + unsigned long *dirty_bitmap_buffer; + bool is_dirty = false; + gfn_t offset; + + mutex_lock(kvm-slots_lock); + r = -EINVAL; + + /* Return with error code will cause migration to abort, this happens +* when initial write protection of VM to manage dirty pages fails +*/ + if (kvm-arch.migration_in_progress == -1) + goto out; + + if (log-slot = KVM_USER_MEM_SLOTS) + goto out; + + memslot = id_to_memslot(kvm-memslots, log-slot); + dirty_bitmap = memslot-dirty_bitmap; + + r = -ENOENT; + if (!dirty_bitmap) + goto out; + + n = kvm_dirty_bitmap_bytes(memslot); + dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long); + memset(dirty_bitmap_buffer, 0, n); + + spin_lock(kvm-mmu_lock); + for (i = 0; i n / sizeof(long); i++) { + unsigned long mask; + + if (!dirty_bitmap[i]) + continue; + + is_dirty = true; + offset = i * BITS_PER_LONG; + kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, + dirty_bitmap[i]); + mask = dirty_bitmap[i]; + dirty_bitmap_buffer[i] = mask; + dirty_bitmap[i] = 0; + } + + if (is_dirty) + kvm_tlb_flush_vm(kvm); + + spin_unlock(kvm-mmu_lock); + r = -EFAULT; + + if (copy_to_user(log-dirty_bitmap, dirty_bitmap_buffer, n)) + goto out; + + r = 0; +out: + mutex_unlock(kvm-slots_lock); + return r; } static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index b85ab56..47bec1c 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -773,6 +773,59 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) spin_unlock(kvm-mmu_lock); } + +/** + * kvm_mmu_write_protected_pt_masked - after migration thread write protects + * the entire VM address space itterative calls are made to get diry pages + * as the VM pages are being migrated. 
New dirty pages may be subset + * of initial WPed VM or new writes faulted in. Here write protect new + * dirty pages again in preparation of next dirty log read. This function is + * called as a result KVM_GET_DIRTY_LOG ioctl, to determine what pages + * need to be migrated. + * 'kvm-mmu_lock' must be held to protect against concurrent modification + * of page tables (2nd stage fault, mmu modifiers, device writes) + * + * @kvm:The KVM pointer + * @slot: The memory slot the dirty log is retrieved for + * @gfn_offset: The gfn offset in memory slot + * @mask: The mask of dirty pages at offset 'gnf_offset in this memory + * slot to be writ protect + */ + +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + phys_addr_t ipa; + pgd_t *pgdp = kvm-arch.pgd, *pgd; + pud_t *pud; + pmd_t *pmd
[PATCH 4/5] add 2nd stage page fault handling during live migration
Additional logic to handle second stage page faults during migration. Primarily page faults are prevented from creating huge pages. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/kvm/mmu.c | 33 +++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 47bec1c..ebec33c 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -839,6 +839,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_mmu_memory_cache *memcache = vcpu-arch.mmu_page_cache; struct vm_area_struct *vma; pfn_t pfn; + bool migration_active; write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu)); if (fault_status == FSC_PERM !write_fault) { @@ -890,12 +891,22 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, return -EFAULT; spin_lock(kvm-mmu_lock); + /* place inside lock to prevent race condition when whole VM is being +* write proteced initially, prevent pmd update if it's split up. +*/ + migration_active = vcpu-kvm-arch.migration_in_progress; + if (mmu_notifier_retry(kvm, mmu_seq)) goto out_unlock; - if (!hugetlb !force_pte) + + /* During migration don't rebuild huge pages */ + if (!hugetlb !force_pte !migration_active) hugetlb = transparent_hugepage_adjust(pfn, fault_ipa); - if (hugetlb) { + /* Steer away from installing PMDs if migrating, migration failed, +* or this an initial page fault. Migrating huge pages is too slow. +*/ + if (!migration_active hugetlb) { pmd_t new_pmd = pfn_pmd(pfn, PAGE_S2); new_pmd = pmd_mkhuge(new_pmd); if (writable) { @@ -907,6 +918,22 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } else { pte_t new_pte = pfn_pte(pfn, PAGE_S2); if (writable) { + /* First convert huge page pfn to normal 4k page pfn, +* while migration is in progress. 
+* Second in migration mode and rare case where +* splitting of huge pages fails check if pmd is +* mapping a huge page if it is then clear it so +* stage2_set_pte() can map in a small page. +*/ + if (migration_active hugetlb) { + pmd_t *pmd; + pfn += (fault_ipa PAGE_SHIFT) + (PTRS_PER_PMD-1); + new_pte = pfn_pte(pfn, PAGE_S2); + pmd = stage2_get_pmd(kvm, NULL, fault_ipa); + if (pmd kvm_pmd_huge(*pmd)) + clear_pmd_entry(kvm, pmd, fault_ipa); + } kvm_set_s2pte_writable(new_pte); kvm_set_pfn_dirty(pfn); } @@ -914,6 +941,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, ret = stage2_set_pte(kvm, memcache, fault_ipa, new_pte, false); } + if (writable) + mark_page_dirty(kvm, gfn); out_unlock: spin_unlock(kvm-mmu_lock); -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 5/5] add kvm_arch global live migration variable
This should be in an earlier patch, omitted by mistake. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 16ed4e4..d77c425 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -67,6 +67,7 @@ struct kvm_arch { /* Interrupt controller */ struct vgic_dist vgic; + int migration_in_progress; }; #define KVM_NR_MEM_OBJS 40 -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 3/5] live migration support for VM dirty log management
MZ So let's play the difference game with x86: int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log kvm_vm_ioctl_get_dirty_log() is identical now to x86 version moved it to kvm_main.c, to make it generic, it's declared weak. Do I go into x86 and remove that function? Or leave it to x86 folks to do it? -Original Message- + * + } + + if (is_dirty) + kvm_tlb_flush_vm(kvm); MZ This can be easily abstracted to be a kvm_flush_remote_tlbs on x86, and a HW broadcast on ARM. Kvm_tlb_flush_vm() is replaced with kvm_flush_remote_tlbs() I made that function weak and declared a ARM version, in arm mmu.c The current version sends IPIs to vCPU running the guest, ARMv7+ does not need that. Is that ok? + /* walk set bits in the mask and write protect corresponding pages */ + while (mask) { + ipa = (slot-base_gfn + gfn_offset + __ffs(mask)) PAGE_SHIFT; + pgd = pgdp + pgd_index(ipa); + if (!pgd_present(*pgd)) + goto update_mask; MZ I think something is wrong in your logic. If there is no PGD, it means a whole 1GB isn't present. Yet you're just clearing one bit from the mask and doing it again. As you're only looking at BITS_PER_LONG MZ contiguous pages at a time, it is likely that the same thing will happen for the other pages, and you're just wasting precious CPU cycles here. Yes this is grossly inefficient, I updated it to walk ptes only, after first determining if it straddles a pmd. Should mostly be pte walks with maybe one PMD walk but unlikely. + new_pte = pfn_pte(pte_pfn(*pte), PAGE_S2); + *pte = new_pte; MZ I'd like to see these two lines in a separate function (something like stage2_mark_pte_ro)... Yes ok. (emailed from outlook client) -- Jazz is not dead. It just smells funny. N떑꿩�r툤y鉉싕b쾊Ф푤v�^�)頻{.n�+돴ㅎh㎍썳變}찠꼿쟺�j:+v돣�쳭喩zZ+�+zf"톒쉱�~넮녬i鎬z�췿ⅱ�?솳鈺��)刪f
RE: [PATCH 4/5] add 2nd stage page fault handling during live migration
Hi Marc, + if (migration_active && hugetlb) { + pmd_t *pmd; + pfn += (fault_ipa >> PAGE_SHIFT) + (PTRS_PER_PMD-1); MZ> Shouldn't that be pfn += pte_index(fault_addr);? I'll change it — much cleaner. } + if (writable) MZ> Shouldn't that be done only when migration is active? The convention in other architectures is to call it any time a page is dirty; the function checks whether the dirty map is allocated, and if not it simply returns. + mark_page_dirty(kvm, gfn); out_unlock: spin_unlock(&kvm->mmu_lock); -- Jazz is not dead. It just smells funny.
[PATCH v3 0/4] live migration dirty bitmap support for ARMv7
Hi, this is the third iteration of live migration support, for the time being on ARMv7. The patches depend on Eric Auger's patch for memory regions. Changes since v2: - move initial VM write protect to memory region architecture prepare function (needed to make dirty logging function generic) - added stage2_mark_pte_ro() - to mark ptes ro - Marc's comment - optimized initial VM memory region write protect to do fewer table lookups - applied Marc's comment for walking dirty bitmap mask - added pud_addr_end() for stage2 tables, to make the walk 4-level - added kvm_flush_remote_tlbs() to use ARM TLB invalidation, made the generic one weak - Marc's comment, for generic dirty bitmap log function - optimized walking dirty bitmap mask to skip upper tables - Marc's comment - deleted x86,arm kvm_vm_ioctl_get_dirty_log(), moved to kvm_main.c and tagged the function weak - Marc's comment - changed Data Abort handler pte index handling - Marc's comment Mario Smarduch (4): add ARMv7 HYP API to flush VM TLBs without address param live migration support for initial write protect of VM to track dirty pages live migration support for VM dirty log management add 2nd stage page fault handling during live migration arch/arm/include/asm/kvm_asm.h |1 + arch/arm/include/asm/kvm_host.h | 13 ++ arch/arm/kvm/arm.c |8 +- arch/arm/kvm/interrupts.S |5 + arch/arm/kvm/mmu.c | 303 ++- arch/x86/kvm/x86.c | 78 -- virt/kvm/kvm_main.c | 87 ++- 7 files changed, 409 insertions(+), 86 deletions(-) -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3 1/4] add ARMv7 HYP API to flush VM TLBs without address param
Add HYP interface for global VM TLB invalidation without address parameter. - Added ARM version of kvm_flush_remote_tlbs() Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_asm.h |1 + arch/arm/include/asm/kvm_host.h |2 ++ arch/arm/kvm/interrupts.S |5 + arch/arm/kvm/mmu.c | 10 ++ 4 files changed, 18 insertions(+) diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index 661da11..0eeaca1 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -76,6 +76,7 @@ extern char __kvm_hyp_code_end[]; extern void __kvm_flush_vm_context(void); extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa); +extern void __kvm_tlb_flush_vmid(struct kvm *kvm); extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); #endif diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 098f7dd..1e739f9 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -228,4 +228,6 @@ int kvm_perf_teardown(void); u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); +void kvm_tlb_flush_vmid(struct kvm *kvm); + #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S index 0d68d40..8620280 100644 --- a/arch/arm/kvm/interrupts.S +++ b/arch/arm/kvm/interrupts.S @@ -45,8 +45,12 @@ __kvm_hyp_code_start: * * As v7 does not support flushing per IPA, just nuke the whole TLB * instead, ignoring the ipa value. + * + * void __kvm_tlb_flush_vm(struct kvm *kvm) - alias on ARMv7 for global VM TLB + * flush with no address parameters. 
*/ ENTRY(__kvm_tlb_flush_vmid_ipa) +ENTRY(__kvm_tlb_flush_vmid) push{r2, r3} dsb ishst @@ -65,6 +69,7 @@ ENTRY(__kvm_tlb_flush_vmid_ipa) pop {r2, r3} bx lr ENDPROC(__kvm_tlb_flush_vmid_ipa) +ENDPROC(__kvm_tlb_flush_vmid) / * Flush TLBs and instruction caches of all CPUs inside the inner-shareable diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index e8580e2..7ab77f3 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -56,6 +56,16 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa); } +/* Function reuses __kvm_tlb_flush_vmid_ipa() HYP interface without additional + * address argument to flush entire VM TLBs. + */ +void kvm_flush_remote_tlbs(struct kvm *kvm) +{ + if (kvm) + kvm_call_hyp(__kvm_tlb_flush_vmid, kvm); +} + + static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, int min, int max) { -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3 2/4] live migration support for initial write protect of VM
Support for live migration initial write protect. - moved write protect to architecture memory region prepare function. This way you can fail, abort migration without keep track of migration status. - Above also allows to generalize read dirty log function with x86 - Added stage2_mark_pte_ro() - optimized initial write protect, skip upper table lookups - added stage2pmd_addr_end() to do generic 4 level table walk - changed kvm_flush_remote_tlbs() to weak function Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |8 ++ arch/arm/kvm/arm.c |3 + arch/arm/kvm/mmu.c | 163 +++ virt/kvm/kvm_main.c |5 +- 4 files changed, 178 insertions(+), 1 deletion(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 1e739f9..9f827c8 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -67,6 +67,12 @@ struct kvm_arch { /* Interrupt controller */ struct vgic_distvgic; + + /* Marks start of migration, used to handle 2nd stage page faults +* during migration, prevent installing huge pages and split huge pages +* to small pages. 
+*/ + int migration_in_progress; }; #define KVM_NR_MEM_OBJS 40 @@ -230,4 +236,6 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); void kvm_tlb_flush_vmid(struct kvm *kvm); +int kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); + #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 9a4bc10..b916478 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -233,6 +233,9 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, enum kvm_mr_change change) { + /* Request for migration issued by user, write protect memory slot */ + if ((change != KVM_MR_DELETE) (mem-flags KVM_MEM_LOG_DIRTY_PAGES)) + return kvm_mmu_slot_remove_write_access(kvm, mem-slot); return 0; } diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 7ab77f3..4d029a6 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -31,6 +31,11 @@ #include trace.h +#define stage2pud_addr_end(addr, end) \ +({ u64 __boundary = ((addr) + PUD_SIZE) PUD_MASK;\ + (__boundary - 1 (end) - 1) ? __boundary : (end); \ +}) + extern char __hyp_idmap_text_start[], __hyp_idmap_text_end[]; static pgd_t *boot_hyp_pgd; @@ -569,6 +574,15 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, return 0; } +/* Write protect page */ +static void stage2_mark_pte_ro(pte_t *pte) +{ + pte_t new_pte; + + new_pte = pfn_pte(pte_pfn(*pte), PAGE_S2); + *pte = new_pte; +} + /** * kvm_phys_addr_ioremap - map a device range to guest IPA * @@ -649,6 +663,155 @@ static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap) return false; } +/** + * split_pmd - splits huge pages to small pages, required to keep a dirty log of + * smaller memory granules, otherwise huge pages would need to be + * migrated. Practically an idle system has problems migrating with + * huge pages. 
Called during WP of entire VM address space, done + * initially when migration thread isses the KVM_MEM_LOG_DIRTY_PAGES + * ioctl. + * The mmu_lock is held during splitting. + * + * @kvm:The KVM pointer + * @pmd:Pmd to 2nd stage huge page + * @addr: ` Guest Physical Address + */ +int split_pmd(struct kvm *kvm, pmd_t *pmd, u64 addr) +{ + struct page *page; + pfn_t pfn = pmd_pfn(*pmd); + pte_t *pte; + int i; + + page = alloc_page(GFP_KERNEL); + if (page == NULL) + return -ENOMEM; + + pte = page_address(page); + /* cycle through ptes first, use pmd pfn */ + for (i = 0; i PTRS_PER_PMD; i++) { + pte[i] = pfn_pte(pfn+i, 0); + stage2_mark_pte_ro(pte[i]); + } + kvm_clean_pte(pte); + /* After page table setup set pmd */ + pmd_populate_kernel(NULL, pmd, pte); + + /* get reference on pte page */ + get_page(virt_to_page(pte)); + return 0; +} + +/** + * kvm_mmu_slot_remove_access - write protects entire VM address space. + * Called at start of migration when KVM_MEM_LOG_DIRTY_PAGES ioctl is + * issued. After this function returns all pages (minus the ones faulted + * in when mmu_lock is released) must be write protected to keep track of + * dirty pages to migrate on subsequent dirty log retrieval. + * mmu_lock is held during write protecting, released on contention. + * + * @kvm:The KVM pointer + * @slot: The memory slot the dirty log is retrieved for + */ +int
[PATCH v3 4/4] add 2nd stage page fault handling during live migration
- added pte_index() to add to pmd pfn Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/kvm/mmu.c | 31 +-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 52d4dd6..61ee812 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -924,6 +924,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_mmu_memory_cache *memcache = vcpu-arch.mmu_page_cache; struct vm_area_struct *vma; pfn_t pfn; + bool migration_active; write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu)); if (fault_status == FSC_PERM !write_fault) { @@ -975,12 +976,21 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, return -EFAULT; spin_lock(kvm-mmu_lock); + /* place inside lock to prevent race condition when whole VM is being +* write proteced. Prevent race of huge page install when migration is +* active. +*/ + migration_active = vcpu-kvm-arch.migration_in_progress; + if (mmu_notifier_retry(kvm, mmu_seq)) goto out_unlock; - if (!hugetlb !force_pte) + + /* During migration don't rebuild huge pages */ + if (!hugetlb !force_pte !migration_active) hugetlb = transparent_hugepage_adjust(pfn, fault_ipa); - if (hugetlb) { + /* During migration don't install new huge pages */ + if (hugetlb !migration_active) { pmd_t new_pmd = pfn_pmd(pfn, PAGE_S2); new_pmd = pmd_mkhuge(new_pmd); if (writable) { @@ -992,6 +1002,21 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } else { pte_t new_pte = pfn_pte(pfn, PAGE_S2); if (writable) { + /* First convert huge page pfn to normal 4k page pfn, +* while migration is in progress. +* Second in migration mode and rare case where +* splitting of huge pages fails check if pmd is +* mapping a huge page if it is then clear it so +* stage2_set_pte() can map in a small page. 
+*/ + if (migration_active hugetlb) { + pmd_t *pmd; + pfn += pte_index(fault_ipa); + new_pte = pfn_pte(pfn, PAGE_S2); + pmd = stage2_get_pmd(kvm, NULL, fault_ipa); + if (pmd kvm_pmd_huge(*pmd)) + clear_pmd_entry(kvm, pmd, fault_ipa); + } kvm_set_s2pte_writable(new_pte); kvm_set_pfn_dirty(pfn); } @@ -999,6 +1024,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, ret = stage2_set_pte(kvm, memcache, fault_ipa, new_pte, false); } + if (writable) + mark_page_dirty(kvm, gfn); out_unlock: spin_unlock(kvm-mmu_lock); -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3 3/4] live migration support for VM dirty log management
- made kvm_vm_ioctl_get_dirty_log() generic moved to kvm_main.c, deleted arm,x86 versions - optimized kvm_mmu_write_protected_pt_masked() to skip upper table lookups Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |3 ++ arch/arm/kvm/arm.c |5 -- arch/arm/kvm/mmu.c | 99 +++ arch/x86/kvm/x86.c | 78 -- virt/kvm/kvm_main.c | 82 5 files changed, 184 insertions(+), 83 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 9f827c8..c5c27d8 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -237,5 +237,8 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); void kvm_tlb_flush_vmid(struct kvm *kvm); int kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask); #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index b916478..6ca3e84 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -784,11 +784,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } } -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) -{ - return -EINVAL; -} - static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev_addr) { diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 4d029a6..52d4dd6 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -812,6 +812,105 @@ int kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) return 0; } +/** + * kvm_mmu_write_protected_pt_masked - after migration thread write protects + * the entire VM address space itterative call are made to get diry pags + * as the VM pages are being migrated. New dirty pages may be subset + * of initial WPed VM or new writes faulted in. Here write protect new + * dirty pages again in preparation of next dirty log read. 
This function is + * called as a result KVM_GET_DIRTY_LOG ioctl, to determine what pages + * need to be migrated. + * 'kvm-mmu_lock' must be held to protect against concurrent modification + * of page tables (2nd stage fault, mmu modifiers, ...) + * + * @kvm:The KVM pointer + * @slot: The memory slot the dirty log is retrieved for + * @gfn_offset: The gfn offset in memory slot + * @mask: The mask of dirty pages at offset 'gnf_offset in this memory + * slot to be writ protect + */ +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + phys_addr_t ipa, next, offset_ipa; + pgd_t *pgdp = kvm-arch.pgd, *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + gfn_t gfnofst = slot-base_gfn + gfn_offset; + bool crosses_pmd; + + ipa = (gfnofst + __ffs(mask)) PAGE_SHIFT; + offset_ipa = gfnofst PAGE_SHIFT; + next = (gfnofst + (BITS_PER_LONG - 1)) PAGE_SHIFT; + + /* check if mask width crosses 2nd level page table range, and +* possibly 3rd, 4th. If not skip upper table lookups. Unlikely +* to be true machine memory regions tend to start on atleast PMD +* boundary and mask is a power of 2. +*/ + crosses_pmd = ((offset_ipa PMD_MASK) ^ (next PMD_MASK)) ? true : + false; + /* If pgd, pud, pmd not present and you cross pmd range check next +* index. Unlikely that pgd and pud would be not present. Between +* dirty page marking and now page tables may have been altered. 
+*/ + pgd = pgdp + pgd_index(ipa); + if (unlikely(crosses_pmd !pgd_present(*pgd))) { + pgd = pgdp + pgd_index(next); + if (!pgd_present(*pgd)) + return; + } + + pud = pud_offset(pgd, ipa); + if (unlikely(crosses_pmd !pud_present(*pud))) { + pud = pud_offset(pgd, next); + if (!pud_present(*pud)) + return; + } + + pmd = pmd_offset(pud, ipa); + if (unlikely(crosses_pmd !pmd_present(*pmd))) { + pmd = pmd_offset(pud, next); + if (!pmd_present(*pmd)) + return; + } + + for (;;) { + pte = pte_offset_kernel(pmd, ipa); + if (!pte_present(*pte)) + goto next_ipa; + + if ((*pte L_PTE_S2_RDWR) == L_PTE_S2_RDONLY) + goto next_ipa; + + stage2_mark_pte_ro(pte); + +next_ipa: + mask = mask - 1; + if (!mask
Re: [PATCH v3 2/4] live migration support for initial write protect of VM
On 04/24/2014 09:39 AM, Steve Capper wrote: On Wed, Apr 23, 2014 at 12:18:07AM +0100, Mario Smarduch wrote: Support for live migration initial write protect. - moved write protect to architecture memory region prepare function. This way you can fail, abort migration without keep track of migration status. - Above also allows to generalize read dirty log function with x86 - Added stage2_mark_pte_ro() - optimized initial write protect, skip upper table lookups - added stage2pmd_addr_end() to do generic 4 level table walk - changed kvm_flush_remote_tlbs() to weak function Hello Mario, I've taken a quick look at this and have a few suggestions below. (I'm not a KVM expert, but took a look at the memory manipulation). Hi Steve, your suggestions are very helpful, my response inline. Thanks. Mario Future versions of this series could probably benefit from being sent to lakml too? Cheers, -- Steve Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |8 ++ arch/arm/kvm/arm.c |3 + arch/arm/kvm/mmu.c | 163 +++ virt/kvm/kvm_main.c |5 +- 4 files changed, 178 insertions(+), 1 deletion(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 1e739f9..9f827c8 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -67,6 +67,12 @@ struct kvm_arch { /* Interrupt controller */ struct vgic_distvgic; + + /* Marks start of migration, used to handle 2nd stage page faults +* during migration, prevent installing huge pages and split huge pages +* to small pages. 
+*/ + int migration_in_progress; }; #define KVM_NR_MEM_OBJS 40 @@ -230,4 +236,6 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); void kvm_tlb_flush_vmid(struct kvm *kvm); +int kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); + #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 9a4bc10..b916478 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -233,6 +233,9 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, enum kvm_mr_change change) { + /* Request for migration issued by user, write protect memory slot */ + if ((change != KVM_MR_DELETE) (mem-flags KVM_MEM_LOG_DIRTY_PAGES)) + return kvm_mmu_slot_remove_write_access(kvm, mem-slot); return 0; } diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 7ab77f3..4d029a6 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -31,6 +31,11 @@ #include trace.h +#define stage2pud_addr_end(addr, end) \ +({ u64 __boundary = ((addr) + PUD_SIZE) PUD_MASK;\ + (__boundary - 1 (end) - 1) ? __boundary : (end); \ +}) A matter of personal preference: can this be a static inline function instead? That way you could avoid ambiguity with the parameter types. (not an issue here, but this has bitten me in the past). Yes good point, will change. + extern char __hyp_idmap_text_start[], __hyp_idmap_text_end[]; static pgd_t *boot_hyp_pgd; @@ -569,6 +574,15 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, return 0; } +/* Write protect page */ +static void stage2_mark_pte_ro(pte_t *pte) +{ + pte_t new_pte; + + new_pte = pfn_pte(pte_pfn(*pte), PAGE_S2); + *pte = new_pte; +} This isn't making the pte read only. It's nuking all the flags from the pte and replacing them with factory settings. (In this case the PAGE_S2 pgprot). If we had other attributes that we later wish to retain this could be easily overlooked. Perhaps a new name for the function? 
Yes that's pretty bad, I'll clear the write protect bit only. + /** * kvm_phys_addr_ioremap - map a device range to guest IPA * @@ -649,6 +663,155 @@ static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap) return false; } +/** + * split_pmd - splits huge pages to small pages, required to keep a dirty log of + * smaller memory granules, otherwise huge pages would need to be + * migrated. Practically an idle system has problems migrating with + * huge pages. Called during WP of entire VM address space, done + * initially when migration thread isses the KVM_MEM_LOG_DIRTY_PAGES + * ioctl. + * The mmu_lock is held during splitting. + * + * @kvm:The KVM pointer + * @pmd:Pmd to 2nd stage huge page + * @addr: ` Guest Physical Address Nitpick: typo ` Yes overlooked it, will delete. + */ +int split_pmd(struct kvm *kvm, pmd_t *pmd, u64 addr) Maybe worth renaming
[PATCH v4 2/5] live migration support for initial write protect of VM
Patch adds support for live migration initial split up of huge pages in memory slot and write protection of all pages in memory slot. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |8 ++ arch/arm/include/asm/kvm_mmu.h | 11 ++ arch/arm/kvm/arm.c |3 + arch/arm/kvm/mmu.c | 215 +++ virt/kvm/kvm_main.c |5 +- 5 files changed, 241 insertions(+), 1 deletion(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 1e739f9..9f827c8 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -67,6 +67,12 @@ struct kvm_arch { /* Interrupt controller */ struct vgic_distvgic; + + /* Marks start of migration, used to handle 2nd stage page faults +* during migration, prevent installing huge pages and split huge pages +* to small pages. +*/ + int migration_in_progress; }; #define KVM_NR_MEM_OBJS 40 @@ -230,4 +236,6 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); void kvm_tlb_flush_vmid(struct kvm *kvm); +int kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); + #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index a91c863..342ae81 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -111,6 +111,17 @@ static inline void kvm_set_s2pte_writable(pte_t *pte) pte_val(*pte) |= L_PTE_S2_RDWR; } +static inline void kvm_set_s2pte_readonly(pte_t *pte) +{ + pte_val(*pte) = ~(L_PTE_S2_RDONLY ^ L_PTE_S2_RDWR); +} + +static inline bool kvm_s2pte_readonly(pte_t *pte) +{ + return (pte_val(*pte) L_PTE_S2_RDWR) == L_PTE_S2_RDONLY; +} + + static inline void kvm_set_s2pmd_writable(pmd_t *pmd) { pmd_val(*pmd) |= L_PMD_S2_RDWR; diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 9a4bc10..b916478 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -233,6 +233,9 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, enum 
kvm_mr_change change) { + /* Request for migration issued by user, write protect memory slot */ + if ((change != KVM_MR_DELETE) (mem-flags KVM_MEM_LOG_DIRTY_PAGES)) + return kvm_mmu_slot_remove_write_access(kvm, mem-slot); return 0; } diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 7ab77f3..15bbca2 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -44,6 +44,41 @@ static phys_addr_t hyp_idmap_vector; #define kvm_pmd_huge(_x) (pmd_huge(_x) || pmd_trans_huge(_x)) +/* Used for 2nd stage and identity mappings. For stage 2 mappings + * instead of unsigned long, u64 is use which won't overflow on ARMv7 for + * IPAs above 4GB. For ARMv8 use default functions. + */ + +static phys_addr_t kvm_pgd_addr_end(phys_addr_t addr, phys_addr_t end) +{ +#if BITS_PER_LONG == 32 + u64 __boundary = ((addr) + PGDIR_SIZE) PGDIR_MASK; + return __boundary - 1 end - 1 ? __boundary : end; +#else + return pgd_addr_end(addr, end); +#endif +} + +static phys_addr_t kvm_pud_addr_end(phys_addr_t addr, phys_addr_t end) +{ +#if BITS_PER_LONG == 32 + u64 __boundary = ((addr) + PUD_SIZE) PUD_MASK; + return __boundary - 1 end - 1 ? __boundary : end; +#else + return pud_addr_end(addr, end); +#endif +} + +static phys_addr_t kvm_pmd_addr_end(phys_addr_t addr, phys_addr_t end) +{ +#if BITS_PER_LONG == 32 + u64 __boundary = ((addr) + PMD_SIZE) PMD_MASK; + return __boundary - 1 end - 1 ? __boundary : end; +#else + return pmd_addr_end(addr, end); +#endif +} + static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) { /* @@ -649,6 +684,186 @@ static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap) return false; } +/** + * kvm_split_pmd - splits huge pages to small pages, required to keep a dirty + * log of smaller memory granules, otherwise huge pages would need to be + * migrated. Practically an idle system has problems migrating with + * huge pages. 
Called during WP of entire VM address space, done + * initially when migration thread isses the KVM_MEM_LOG_DIRTY_PAGES + * ioctl. + * The mmu_lock is held during splitting. + * + * @kvm:The KVM pointer + * @pmd:Pmd to 2nd stage huge page + * @addr: Guest Physical Address + */ +static int kvm_split_pmd(struct kvm *kvm, pmd_t *pmd, u64 addr) +{ + struct page *page; + pfn_t pfn = pmd_pfn(*pmd); + pte_t *pte; + int i; + + page = alloc_page(GFP_KERNEL); + if (page == NULL) + return -ENOMEM; + + pte = page_address(page); + /* cycle through ptes first, use pmd pfn */ + for (i = 0; i
[PATCH v4 3/5] live migration support for VM dirty log management
This patch adds support for keeping track of VM dirty pages, by updating per memslot dirty bitmap and write protecting the page again. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |3 ++ arch/arm/kvm/arm.c |5 -- arch/arm/kvm/mmu.c | 101 +++ arch/x86/kvm/x86.c | 78 -- virt/kvm/kvm_main.c | 84 5 files changed, 188 insertions(+), 83 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 9f827c8..c5c27d8 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -237,5 +237,8 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); void kvm_tlb_flush_vmid(struct kvm *kvm); int kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask); #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index b916478..6ca3e84 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -784,11 +784,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } } -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) -{ - return -EINVAL; -} - static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev_addr) { diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 15bbca2..3442594 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -864,6 +864,107 @@ out: return ret; } + +/** + * kvm_mmu_write_protected_pt_masked - after migration thread write protects + * the entire VM address space itterative call are made to get diry pags + * as the VM pages are being migrated. New dirty pages may be subset + * of initial WPed VM or new writes faulted in. Here write protect new + * dirty pages again in preparation of next dirty log read. This function is + * called as a result KVM_GET_DIRTY_LOG ioctl, to determine what pages + * need to be migrated. 
+ * 'kvm-mmu_lock' must be held to protect against concurrent modification + * of page tables (2nd stage fault, mmu modifiers, ...) + * + * @kvm:The KVM pointer + * @slot: The memory slot the dirty log is retrieved for + * @gfn_offset: The gfn offset in memory slot + * @mask: The mask of dirty pages at offset 'gnf_offset in this memory + * slot to be writ protect + */ + +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + phys_addr_t ipa, next, offset_ipa; + pgd_t *pgdp = kvm-arch.pgd, *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + gfn_t gfnofst = slot-base_gfn + gfn_offset; + bool crosses_pmd; + + ipa = (gfnofst + __ffs(mask)) PAGE_SHIFT; + offset_ipa = gfnofst PAGE_SHIFT; + next = (gfnofst + (BITS_PER_LONG - 1)) PAGE_SHIFT; + + /* check if mask width crosses 2nd level page table range, and +* possibly 3rd, 4th. If not skip upper table lookups. Unlikely +* to be true machine memory regions tend to start on atleast PMD +* boundary and mask is a power of 2. +*/ + crosses_pmd = ((offset_ipa PMD_MASK) ^ (next PMD_MASK)) ? true : + false; + + /* If pgd, pud, pmd not present and you cross pmd range check next +* index. Unlikely that pgd and pud would be not present. Between +* dirty page marking and now page tables may have been altered. 
+*/ + pgd = pgdp + pgd_index(ipa); + if (unlikely(crosses_pmd !pgd_present(*pgd))) { + pgd = pgdp + pgd_index(next); + if (!pgd_present(*pgd)) + return; + } + + pud = pud_offset(pgd, ipa); + if (unlikely(crosses_pmd !pud_present(*pud))) { + pud = pud_offset(pgd, next); + if (!pud_present(*pud)) + return; + } + + pmd = pmd_offset(pud, ipa); + if (unlikely(crosses_pmd !pmd_present(*pmd))) { + pmd = pmd_offset(pud, next); + if (!pmd_present(*pmd)) + return; + } + + for (;;) { + pte = pte_offset_kernel(pmd, ipa); + if (!pte_present(*pte)) + goto next_ipa; + + if (kvm_s2pte_readonly(pte)) + goto next_ipa; + kvm_set_s2pte_readonly(pte); +next_ipa: + mask = mask - 1; + if (!mask) + break; + + /* find next page */ + ipa = (gfnofst
[PATCH v4 0/5] live migration dirty bitmap support for ARMv7
Hi, this is the fourth iteration of live migration support, for the time being tested on ARMv7. The patches depend on Eric Auger's patch for memory regions. - Tested on two 4-way A15 systems, 2-way/4-way SMP guest up to 2GB memory - Various dirty data rates tested - 2GB/1s ... 2048 pgs/5ms - validated source/destination memory image integrity - Issue: time skips a few seconds on dest., timekeeper offset from last cycle appears too big, need to investigate further. Changes since v3: - changed pte updates to reset write bit instead of setting default value for existing pte's - Steve's comment - In addition to PUD add 2nd stage above-4GB range functions - Steve's suggestion - Restructured initial memory slot write protect function for PGD, PUD, PMD table walking - Steve's suggestion - Renamed variable types to resemble their use - Steve's suggestions - Added a couple of pte helpers for 2nd stage tables - Steve's suggestion - Updated unmap_range() that handles 2nd stage tables and identity mappings to handle 2nd stage addresses above 4GB. Left ARMv8 unchanged. 
Changes since v2: - move initial VM write protect to memory region architecture prepare function (needed to make dirty logging function generic) - added stage2_mark_pte_ro() - to mark ptes ro - Marc's comment - optimized initial VM memory region write protect to do fewer table lookups - applied Marc's comment for walking dirty bitmap mask - added pud_addr_end() for stage2 tables, to make the walk 4-level - added kvm_flush_remote_tlbs() to use ARM TLB invalidation, made the generic one weak, Marc's comment to for generic dirty bitmap log function - optimized walking dirty bit map mask to skip upper tables - Marc's comment - deleted x86,arm kvm_vm_ioctl_get_dirty_log(), moved to kvm_main.c tagged the function weak - Marc's comment - changed Data Abort handler pte index handling - Marc's comment Mario Smarduch (5): add ARMv7 HYP API to flush VM TLBs without address param live migration support for initial write protect of VM live migration support for VM dirty log management add 2nd stage page fault handling during live migration change update_range to handle 4GB 2nd stage range for ARMv7 arch/arm/include/asm/kvm_asm.h |1 + arch/arm/include/asm/kvm_host.h | 13 ++ arch/arm/include/asm/kvm_mmu.h | 11 ++ arch/arm/kvm/arm.c |8 +- arch/arm/kvm/interrupts.S |5 + arch/arm/kvm/mmu.c | 377 +-- arch/x86/kvm/x86.c | 78 virt/kvm/kvm_main.c | 89 - 8 files changed, 488 insertions(+), 94 deletions(-) -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v4 4/5] add 2nd stage page fault handling during live migration
This patch add support for handling 2nd stage page faults during migration, it disables faulting in huge pages, and splits up existing huge pages. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/kvm/mmu.c | 31 +-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 3442594..88f5503 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -978,6 +978,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_mmu_memory_cache *memcache = vcpu-arch.mmu_page_cache; struct vm_area_struct *vma; pfn_t pfn; + bool migration_active; write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu)); if (fault_status == FSC_PERM !write_fault) { @@ -1029,12 +1030,21 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, return -EFAULT; spin_lock(kvm-mmu_lock); + /* place inside lock to prevent race condition when whole VM is being +* write proteced. Prevent race of huge page install when migration is +* active. +*/ + migration_active = vcpu-kvm-arch.migration_in_progress; + if (mmu_notifier_retry(kvm, mmu_seq)) goto out_unlock; - if (!hugetlb !force_pte) + + /* During migration don't rebuild huge pages */ + if (!hugetlb !force_pte !migration_active) hugetlb = transparent_hugepage_adjust(pfn, fault_ipa); - if (hugetlb) { + /* During migration don't install new huge pages */ + if (hugetlb !migration_active) { pmd_t new_pmd = pfn_pmd(pfn, PAGE_S2); new_pmd = pmd_mkhuge(new_pmd); if (writable) { @@ -1046,6 +1056,21 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } else { pte_t new_pte = pfn_pte(pfn, PAGE_S2); if (writable) { + /* First convert huge page pfn to normal 4k page pfn, +* while migration is in progress. +* Second in migration mode and rare case where +* splitting of huge pages fails check if pmd is +* mapping a huge page if it is then clear it so +* stage2_set_pte() can map in a small page. 
+*/ + if (migration_active hugetlb) { + pmd_t *pmd; + pfn += pte_index(fault_ipa); + new_pte = pfn_pte(pfn, PAGE_S2); + pmd = stage2_get_pmd(kvm, NULL, fault_ipa); + if (pmd kvm_pmd_huge(*pmd)) + clear_pmd_entry(kvm, pmd, fault_ipa); + } kvm_set_s2pte_writable(new_pte); kvm_set_pfn_dirty(pfn); } @@ -1053,6 +1078,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, ret = stage2_set_pte(kvm, memcache, fault_ipa, new_pte, false); } + if (writable) + mark_page_dirty(kvm, gfn); out_unlock: spin_unlock(kvm-mmu_lock); -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v4 5/5] change update_range to handle 4GB 2nd stage range for ARMv7
This patch adds support for unmapping 2nd stage page tables for addresses 4GB on ARMv7. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/kvm/mmu.c | 20 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 88f5503..afbf8ba 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -176,21 +176,25 @@ static void clear_pte_entry(struct kvm *kvm, pte_t *pte, phys_addr_t addr) } } +/* Function shared between identity and 2nd stage mappings. For 2nd stage + * the IPA may be 4GB on ARMv7, and page table range functions + * will fail. kvm_xxx_addr_end() is used to handle both cases. + */ static void unmap_range(struct kvm *kvm, pgd_t *pgdp, - unsigned long long start, u64 size) + phys_addr_t start, u64 size) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *pte; - unsigned long long addr = start, end = start + size; - u64 next; + phys_addr_t addr = start, end = start + size; + phys_addr_t next; while (addr end) { pgd = pgdp + pgd_index(addr); pud = pud_offset(pgd, addr); if (pud_none(*pud)) { - addr = pud_addr_end(addr, end); + addr = kvm_pud_addr_end(addr, end); continue; } @@ -200,13 +204,13 @@ static void unmap_range(struct kvm *kvm, pgd_t *pgdp, * move on. 
*/ clear_pud_entry(kvm, pud, addr); - addr = pud_addr_end(addr, end); + addr = kvm_pud_addr_end(addr, end); continue; } pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) { - addr = pmd_addr_end(addr, end); + addr = kvm_pmd_addr_end(addr, end); continue; } @@ -221,10 +225,10 @@ static void unmap_range(struct kvm *kvm, pgd_t *pgdp, */ if (kvm_pmd_huge(*pmd) || page_empty(pte)) { clear_pmd_entry(kvm, pmd, addr); - next = pmd_addr_end(addr, end); + next = kvm_pmd_addr_end(addr, end); if (page_empty(pmd) !page_empty(pud)) { clear_pud_entry(kvm, pud, addr); - next = pud_addr_end(addr, end); + next = kvm_pud_addr_end(addr, end); } } -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v4 5/5] change update_range to handle 4GB 2nd stage range for ARMv7
Hi Gavin, thanks, didn't catch that, I'll remove these calls. - Mario On 05/05/2014 04:34 PM, Gavin Guo wrote: Hi Mario, On Tue, Apr 29, 2014 at 9:06 AM, Mario Smarduch m.smard...@samsung.com wrote: This patch adds support for unmapping 2nd stage page tables for addresses 4GB on ARMv7. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/kvm/mmu.c | 20 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 88f5503..afbf8ba 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -176,21 +176,25 @@ static void clear_pte_entry(struct kvm *kvm, pte_t *pte, phys_addr_t addr) } } +/* Function shared between identity and 2nd stage mappings. For 2nd stage + * the IPA may be 4GB on ARMv7, and page table range functions + * will fail. kvm_xxx_addr_end() is used to handle both cases. + */ static void unmap_range(struct kvm *kvm, pgd_t *pgdp, - unsigned long long start, u64 size) + phys_addr_t start, u64 size) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *pte; - unsigned long long addr = start, end = start + size; - u64 next; + phys_addr_t addr = start, end = start + size; + phys_addr_t next; while (addr end) { pgd = pgdp + pgd_index(addr); pud = pud_offset(pgd, addr); if (pud_none(*pud)) { - addr = pud_addr_end(addr, end); + addr = kvm_pud_addr_end(addr, end); continue; } @@ -200,13 +204,13 @@ static void unmap_range(struct kvm *kvm, pgd_t *pgdp, * move on. 
*/ clear_pud_entry(kvm, pud, addr); - addr = pud_addr_end(addr, end); + addr = kvm_pud_addr_end(addr, end); continue; } pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) { - addr = pmd_addr_end(addr, end); + addr = kvm_pmd_addr_end(addr, end); continue; } @@ -221,10 +225,10 @@ static void unmap_range(struct kvm *kvm, pgd_t *pgdp, */ if (kvm_pmd_huge(*pmd) || page_empty(pte)) { clear_pmd_entry(kvm, pmd, addr); - next = pmd_addr_end(addr, end); + next = kvm_pmd_addr_end(addr, end); if (page_empty(pmd) !page_empty(pud)) { clear_pud_entry(kvm, pud, addr); - next = pud_addr_end(addr, end); + next = kvm_pud_addr_end(addr, end); } } -- 1.7.9.5 It seems that your adding kvm_pmd_addr_end(addr, end) already exists in the following patch and may need to remove these parts from your patch. commit a3c8bd31af260a17d626514f636849ee1cd1f63e Author: Marc Zyngier marc.zyng...@arm.com Date: Tue Feb 18 14:29:03 2014 + ARM: KVM: introduce kvm_p*d_addr_end The use of p*d_addr_end with stage-2 translation is slightly dodgy, as the IPA is 40bits, while all the p*d_addr_end helpers are taking an unsigned long (arm64 is fine with that as unligned long is 64bit). The fix is to introduce 64bit clean versions of the same helpers, and use them in the stage-2 page table code. Signed-off-by: Marc Zyngier marc.zyng...@arm.com Acked-by: Catalin Marinas catalin.mari...@arm.com Reviewed-by: Christoffer Dall christoffer.d...@linaro.org Gavin -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 1/4] add ARMv7 HYP API to flush VM TLBs without address param
Patch adds HYP interface for global VM TLB invalidation without address parameter. - Added ARM version of kvm_flush_remote_tlbs() Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_asm.h |1 + arch/arm/include/asm/kvm_host.h |2 ++ arch/arm/kvm/interrupts.S |5 + arch/arm/kvm/mmu.c | 10 ++ 4 files changed, 18 insertions(+) diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index 53b3c4a..21bc519 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -78,6 +78,7 @@ extern char __kvm_hyp_code_end[]; extern void __kvm_flush_vm_context(void); extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa); +extern void __kvm_tlb_flush_vmid(struct kvm *kvm); extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); #endif diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 193ceaf..ac3bb65 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -231,4 +231,6 @@ int kvm_perf_teardown(void); u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); +void kvm_tlb_flush_vmid(struct kvm *kvm); + #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S index 0d68d40..8620280 100644 --- a/arch/arm/kvm/interrupts.S +++ b/arch/arm/kvm/interrupts.S @@ -45,8 +45,12 @@ __kvm_hyp_code_start: * * As v7 does not support flushing per IPA, just nuke the whole TLB * instead, ignoring the ipa value. + * + * void __kvm_tlb_flush_vm(struct kvm *kvm) - alias on ARMv7 for global VM TLB + * flush with no address parameters. 
*/ ENTRY(__kvm_tlb_flush_vmid_ipa) +ENTRY(__kvm_tlb_flush_vmid) push{r2, r3} dsb ishst @@ -65,6 +69,7 @@ ENTRY(__kvm_tlb_flush_vmid_ipa) pop {r2, r3} bx lr ENDPROC(__kvm_tlb_flush_vmid_ipa) +ENDPROC(__kvm_tlb_flush_vmid) / * Flush TLBs and instruction caches of all CPUs inside the inner-shareable diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 80bb1e6..95c172a 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -56,6 +56,16 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa); } +/* Function reuses __kvm_tlb_flush_vmid_ipa() HYP interface without additional + * address argument to flush entire VM TLBs. + */ +void kvm_flush_remote_tlbs(struct kvm *kvm) +{ + if (kvm) + kvm_call_hyp(__kvm_tlb_flush_vmid, kvm); +} + + static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, int min, int max) { -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 4/4] add 2nd stage page fault handling during live migration
This patch adds support for handling 2nd stage page faults during migration, it disables faulting in huge pages, and splits up existing huge pages. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/kvm/mmu.c | 30 -- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 1458b6e..b0633dc 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -1034,6 +1034,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_mmu_memory_cache *memcache = vcpu-arch.mmu_page_cache; struct vm_area_struct *vma; pfn_t pfn; + bool migration_active; write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu)); if (fault_status == FSC_PERM !write_fault) { @@ -1085,12 +1086,22 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, return -EFAULT; spin_lock(kvm-mmu_lock); + + /* place inside lock to prevent race condition when whole VM is being +* write proteced. Prevent race of huge page install when migration is +* active. +*/ + migration_active = vcpu-kvm-arch.migration_in_progress; + if (mmu_notifier_retry(kvm, mmu_seq)) goto out_unlock; - if (!hugetlb !force_pte) + + /* During migration no need rebuild huge pages */ + if (!hugetlb !force_pte !migration_active) hugetlb = transparent_hugepage_adjust(pfn, fault_ipa); - if (hugetlb) { + /* During migration don't install new huge pages */ + if (hugetlb !migration_active) { pmd_t new_pmd = pfn_pmd(pfn, PAGE_S2); new_pmd = pmd_mkhuge(new_pmd); if (writable) { @@ -1102,6 +1113,19 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } else { pte_t new_pte = pfn_pte(pfn, PAGE_S2); if (writable) { + /* First convert huge page pfn to small page pfn, +* while migration is in progress. +* Second if pmd is mapping a huge page then +* clear pmd so stage2_set_pte() can split the pmd. 
+*/ + if (migration_active hugetlb) { + pmd_t *pmd; + pfn += pte_index(fault_ipa); + new_pte = pfn_pte(pfn, PAGE_S2); + pmd = stage2_get_pmd(kvm, NULL, fault_ipa); + if (pmd kvm_pmd_huge(*pmd)) + clear_pmd_entry(kvm, pmd, fault_ipa); + } kvm_set_s2pte_writable(new_pte); kvm_set_pfn_dirty(pfn); } @@ -1109,6 +1133,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, ret = stage2_set_pte(kvm, memcache, fault_ipa, new_pte, false); } + if (writable) + mark_page_dirty(kvm, gfn); out_unlock: spin_unlock(kvm-mmu_lock); -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v5 1/4] add ARMv7 HYP API to flush VM TLBs without address param
On 05/14/2014 09:47 AM, Christoffer Dall wrote: On Wed, May 07, 2014 at 05:40:13PM -0700, Mario Smarduch wrote: Patch adds HYP interface for global VM TLB invalidation without address parameter. - Added ARM version of kvm_flush_remote_tlbs() Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_asm.h |1 + arch/arm/include/asm/kvm_host.h |2 ++ arch/arm/kvm/interrupts.S |5 + arch/arm/kvm/mmu.c | 10 ++ 4 files changed, 18 insertions(+) diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index 53b3c4a..21bc519 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -78,6 +78,7 @@ extern char __kvm_hyp_code_end[]; extern void __kvm_flush_vm_context(void); extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa); +extern void __kvm_tlb_flush_vmid(struct kvm *kvm); extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); #endif diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 193ceaf..ac3bb65 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -231,4 +231,6 @@ int kvm_perf_teardown(void); u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); +void kvm_tlb_flush_vmid(struct kvm *kvm); + #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S index 0d68d40..8620280 100644 --- a/arch/arm/kvm/interrupts.S +++ b/arch/arm/kvm/interrupts.S @@ -45,8 +45,12 @@ __kvm_hyp_code_start: * * As v7 does not support flushing per IPA, just nuke the whole TLB * instead, ignoring the ipa value. + * + * void __kvm_tlb_flush_vm(struct kvm *kvm) - alias on ARMv7 for global VM TLB + * flush with no address parameters. 
*/ ENTRY(__kvm_tlb_flush_vmid_ipa) +ENTRY(__kvm_tlb_flush_vmid) push {r2, r3} dsb ishst @@ -65,6 +69,7 @@ ENTRY(__kvm_tlb_flush_vmid_ipa) pop {r2, r3} bx lr ENDPROC(__kvm_tlb_flush_vmid_ipa) +ENDPROC(__kvm_tlb_flush_vmid) yikes, can you please make this a separate function that calls the other one? Done separate function, got the idea from entry-common.s ENTRY(ret_to_user), ENTRY(ret_to_user_from_irq) and others. / * Flush TLBs and instruction caches of all CPUs inside the inner-shareable diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 80bb1e6..95c172a 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -56,6 +56,16 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa); } +/* Function reuses __kvm_tlb_flush_vmid_ipa() HYP interface without additional + * address argument to flush entire VM TLBs. + */ This is not proper kernel commenting formatting, please see Documentation/CodingStyle. For new exported functions in the KVM/ARM code, please add kdocs style documentation to the functions. Done. +void kvm_flush_remote_tlbs(struct kvm *kvm) This doesn't build?: I reworked the patch series to build successfully after applying each patch. This patch was missing a weak declaration of the function in virt/kvm/kvm_main.c. I simplified some related code for PMD splitting reusing current mmu.c code, instead of reinventing. I'll email new patch series tomorrow, you might not want to waste your time on 2-4. Thanks. - Mario arch/arm/kvm/mmu.o: In function `kvm_flush_remote_tlbs': mmu.c:(.text+0xc7c): multiple definition of `kvm_flush_remote_tlbs' +{ + if (kvm) + kvm_call_hyp(__kvm_tlb_flush_vmid, kvm); +} + + static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, int min, int max) { -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v6 1/4] add ARMv7 HYP API to flush VM TLBs without address param
Patch adds HYP interface for global VM TLB invalidation without address parameter. Added ARM version of kvm_flush_remote_tlbs(), made generic one weak. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_asm.h |1 + arch/arm/kvm/interrupts.S | 11 +++ arch/arm/kvm/mmu.c | 15 +++ virt/kvm/kvm_main.c|2 +- 4 files changed, 28 insertions(+), 1 deletion(-) diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index 53b3c4a..21bc519 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -78,6 +78,7 @@ extern char __kvm_hyp_code_end[]; extern void __kvm_flush_vm_context(void); extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa); +extern void __kvm_tlb_flush_vmid(struct kvm *kvm); extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); #endif diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S index 0d68d40..bddc66b 100644 --- a/arch/arm/kvm/interrupts.S +++ b/arch/arm/kvm/interrupts.S @@ -66,6 +66,17 @@ ENTRY(__kvm_tlb_flush_vmid_ipa) bx lr ENDPROC(__kvm_tlb_flush_vmid_ipa) +/** + * void __kvm_tlb_flush_vmid(struct kvm *kvm) - Flush per-VMID TLBs + * + * Reuses __kvm_tlb_flush_vmid_ipa() for ARMv7, without passing address + * parameter + */ + +ENTRY(__kvm_tlb_flush_vmid) + b __kvm_tlb_flush_vmid_ipa +ENDPROC(__kvm_tlb_flush_vmid) + / * Flush TLBs and instruction caches of all CPUs inside the inner-shareable * domain, for all VMIDs diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 80bb1e6..eea3f0a 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -56,6 +56,21 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa); } +/** + * kvm_flush_remote_tlbs() - flush all VM TLB entries + * + * Interface to HYP function to flush all VM TLB entries without address + * parameter. In HYP mode reuses __kvm_tlb_flush_vmid_ipa() function used by + * kvm_tlb_flush_vmid_ipa(). 
+ * + * @kvm: pointer to kvm structure. + */ +void kvm_flush_remote_tlbs(struct kvm *kvm) +{ + if (kvm) + kvm_call_hyp(__kvm_tlb_flush_vmid, kvm); +} + static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, int min, int max) { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index fa70c6e..ba25765 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -184,7 +184,7 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) return called; } -void kvm_flush_remote_tlbs(struct kvm *kvm) +void __weak kvm_flush_remote_tlbs(struct kvm *kvm) { long dirty_count = kvm-tlbs_dirty; -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v6 0/4] live migration dirty bitmap support for ARMv7
This is the v6 patch set of live migration support for ARMv7. - Tested on two 4-way A15 hardware, QEMU 2-way/4-way SMP guest up to 2GB - Various dirty data rates tested - 2GB/1s ... 2048 pgs/5ms - validated source/destination memory image integrity Changes since v1: - add unlock of VM mmu_lock to prevent a deadlock - moved migration active inside mmu_lock acquire for visibility in 2nd stage data abort handler - Added comments Changes since v2: - move initial VM write protect to memory region architecture prepare function (needed to make dirty logging function generic) - added stage2_mark_pte_ro() - to mark ptes ro - Marc's comment - optimized initial VM memory region write protect to do fewer table lookups - applied Marc's comment for walking dirty bitmap mask - added pud_addr_end() for stage2 tables, to make the walk 4-level - added kvm_flush_remote_tlbs() to use ARM TLB invalidation, made the generic one weak, Marc's comment for generic dirty bitmap log function - optimized walking dirty bit map mask to skip upper tables - Marc's comment - deleted x86,arm kvm_vm_ioctl_get_dirty_log(), moved to kvm_main.c tagged the function weak - Marc's comment - changed Data Abort handler pte index handling - Marc's comment Changes since v3: - changed pte updates to reset write bit instead of setting default value for existing pte's - Steve's comment - In addition to PUD add 2nd stage above-4GB range functions - Steve's suggestion - Restructured initial memory slot write protect function for PGD, PUD, PMD table walking - Steve's suggestion - Renamed variable types to resemble their use - Steve's suggestions - Added a couple of pte helpers for 2nd stage tables - Steve's suggestion - Updated unmap_range() that handles 2nd stage tables and identity mappings to handle 2nd stage addresses above 4GB. Left ARMv8 unchanged. 
Changes since v4: - rebased to 3.15.0-rc1 - 'next' to pick up p*addr_end patches - Gavin's comment - Update PUD address end function to support 4-level page table walk - Eliminated 5th patch of the series that fixed unmap_range(), since it was fixed by Marc's patches. Changes since v5: - Created separate entry point for VMID TLB flush with no param - Christoffer's comment - Update documentation for kvm_flush_remote_tlbs() - Christoffer's comment - Simplified splitting of huge pages - initial WP and 2nd stage DABT handler clear the huge page PMD, and use current code to fault in small pages. Removed kvm_split_pmd(). Mario Smarduch (4): add ARMv7 HYP API to flush VM TLBs without address param live migration support for initial write protect of VM live migration support for VM dirty log management add 2nd stage page fault handling during live migration arch/arm/include/asm/kvm_asm.h |1 + arch/arm/include/asm/kvm_host.h | 11 ++ arch/arm/include/asm/kvm_mmu.h | 10 ++ arch/arm/kvm/arm.c |8 +- arch/arm/kvm/interrupts.S | 11 ++ arch/arm/kvm/mmu.c | 292 ++- arch/x86/kvm/x86.c | 86 virt/kvm/kvm_main.c | 84 ++- 8 files changed, 409 insertions(+), 94 deletions(-) -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v6 4/4] add 2nd stage page fault handling during live migration
This patch adds support for handling 2nd stage page faults during migration, it disables faulting in huge pages, and splits up existing huge pages. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/kvm/mmu.c | 36 ++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index b939312..10e7bf6 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -1002,6 +1002,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_mmu_memory_cache *memcache = vcpu-arch.mmu_page_cache; struct vm_area_struct *vma; pfn_t pfn; + bool migration_active; write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu)); if (fault_status == FSC_PERM !write_fault) { @@ -1053,12 +1054,23 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, return -EFAULT; spin_lock(kvm-mmu_lock); + + /* +* Place inside lock to prevent race condition when whole VM is being +* write proteced. Prevent race of huge page install when migration is +* active. +*/ + migration_active = vcpu-kvm-arch.migration_in_progress; + if (mmu_notifier_retry(kvm, mmu_seq)) goto out_unlock; - if (!hugetlb !force_pte) + + /* When migrating don't spend cycles coalescing huge pages */ + if (!hugetlb !force_pte !migration_active) hugetlb = transparent_hugepage_adjust(pfn, fault_ipa); - if (hugetlb) { + /* During migration don't install huge pages */ + if (hugetlb !migration_active) { pmd_t new_pmd = pfn_pmd(pfn, PAGE_S2); new_pmd = pmd_mkhuge(new_pmd); if (writable) { @@ -1069,6 +1081,23 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, new_pmd); } else { pte_t new_pte = pfn_pte(pfn, PAGE_S2); + + /* +* If pmd is mapping a huge page then split it up into +* small pages, when doing live migration. 
+*/ + if (migration_active) { + pmd_t *pmd; + if (hugetlb) { + pfn += pte_index(fault_ipa); + gfn = fault_ipa PAGE_SHIFT; + } + new_pte = pfn_pte(pfn, PAGE_S2); + pmd = stage2_get_pmd(kvm, NULL, fault_ipa); + if (pmd kvm_pmd_huge(*pmd)) + clear_pmd_entry(kvm, pmd, fault_ipa); + } + if (writable) { kvm_set_s2pte_writable(new_pte); kvm_set_pfn_dirty(pfn); @@ -1077,6 +1106,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, ret = stage2_set_pte(kvm, memcache, fault_ipa, new_pte, false); } + /* Assuming 4k pages, set one bit/page in memslot dirty_bitmap[] */ + if (writable) + mark_page_dirty(kvm, gfn); out_unlock: spin_unlock(kvm-mmu_lock); -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v6 2/4] live migration support for initial write protect of VM
Patch adds memslot support for initial write protection and split up of huge pages Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |8 +++ arch/arm/include/asm/kvm_mmu.h | 10 +++ arch/arm/kvm/arm.c |3 + arch/arm/kvm/mmu.c | 143 +++ 4 files changed, 164 insertions(+) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 193ceaf..0e55b17 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -67,6 +67,12 @@ struct kvm_arch { /* Interrupt controller */ struct vgic_distvgic; + /* +* Marks start of migration, used to handle 2nd stage page faults +* during migration, prevent installing huge pages and split huge pages +* to small pages. +*/ + int migration_in_progress; }; #define KVM_NR_MEM_OBJS 40 @@ -231,4 +237,6 @@ int kvm_perf_teardown(void); u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); +int kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); + #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 5c7aa3c..7f9d9d3 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -114,6 +114,16 @@ static inline void kvm_set_s2pmd_writable(pmd_t *pmd) pmd_val(*pmd) |= L_PMD_S2_RDWR; } +static inline void kvm_set_s2pte_readonly(pte_t *pte) +{ + pte_val(*pte) = ~(L_PTE_S2_RDONLY ^ L_PTE_S2_RDWR); +} + +static inline bool kvm_s2pte_readonly(pte_t *pte) +{ + return (pte_val(*pte) L_PTE_S2_RDWR) == L_PTE_S2_RDONLY; +} + /* Open coded p*d_addr_end that can deal with 64bit addresses */ #define kvm_pgd_addr_end(addr, end)\ ({ u64 __boundary = ((addr) + PGDIR_SIZE) PGDIR_MASK;\ diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 3c82b37..1055266 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -234,6 +234,9 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region 
*mem, enum kvm_mr_change change) { + /* Request for migration issued by user, write protect memory slot */ + if ((change != KVM_MR_DELETE) (mem-flags KVM_MEM_LOG_DIRTY_PAGES)) + return kvm_mmu_slot_remove_write_access(kvm, mem-slot); return 0; } diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index eea3f0a..b71ad27 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -748,6 +748,149 @@ static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap) return false; } + +/* + * Walks PMD page table range and write protects it. Called with + * 'kvm-mmu_lock' * held + */ +static void stage2_wp_pmd_range(phys_addr_t addr, phys_addr_t end, pmd_t *pmd) +{ + pte_t *pte; + + while (addr end) { + pte = pte_offset_kernel(pmd, addr); + addr += PAGE_SIZE; + if (!pte_present(*pte)) + continue; + /* skip write protected pages */ + if (kvm_s2pte_readonly(pte)) + continue; + kvm_set_s2pte_readonly(pte); + } +} + +/* + * Walks PUD page table range to write protects it , if necessary spluts up + * huge pages to small pages. Called with 'kvm-mmu_lock' held. + */ +static void stage2_wp_pud_range(struct kvm *kvm, phys_addr_t addr, + phys_addr_t end, pud_t *pud) +{ + pmd_t *pmd; + phys_addr_t pmd_end; + + while (addr end) { + /* If needed give up CPU during PUD page table walk */ + if (need_resched() || spin_needbreak(kvm-mmu_lock)) + cond_resched_lock(kvm-mmu_lock); + + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) { + addr = kvm_pmd_addr_end(addr, end); + continue; + } + + if (kvm_pmd_huge(*pmd)) { + /* +* Clear pmd entry DABT handler will install smaller +* pages. +*/ + clear_pmd_entry(kvm, pmd, addr); + addr = kvm_pmd_addr_end(addr, end); + continue; + } + + pmd_end = kvm_pmd_addr_end(addr, end); + stage2_wp_pmd_range(addr, pmd_end, pmd); + addr = pmd_end; + } +} + +/* + * Walks PGD page table range to write protect it. Called with 'kvm-mmu_lock' + * held. 
+ */ +static int stage2_wp_pgd_range(struct kvm *kvm, phys_addr_t addr, + phys_addr_t end, pgd_t *pgd) +{ + phys_addr_t pud_end
[PATCH v6 3/4] live migration support for VM dirty log management
This patch adds support for keeping track of VM dirty pages, by updating per memslot dirty bitmap and write protecting the page again. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |3 ++ arch/arm/kvm/arm.c |5 -- arch/arm/kvm/mmu.c | 98 +++ arch/x86/kvm/x86.c | 86 -- virt/kvm/kvm_main.c | 82 5 files changed, 183 insertions(+), 91 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 0e55b17..4fef77d 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -238,5 +238,8 @@ u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); int kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask); #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 1055266..0b847b5 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -777,11 +777,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } } -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) -{ - return -EINVAL; -} - static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev_addr) { diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index b71ad27..b939312 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -891,6 +891,104 @@ out: return ret; } + +/** + * kvm_mmu_write_protected_pt_masked - walk mask relative start of memslot and + * write protect again for next dirty log read. + * + * After migration thread write protects entire VM iterative calls are made + * to get diry page log. The log is returned and dirty pages are write + * protected again. This function is called as a result KVM_GET_DIRTY_LOG + * ioctl. 
+ * 'kvm-mmu_lock' must be held to protect against concurrent modification + * of page tables (2nd stage fault, mmu modifiers, ...) + * + * @kvm:The KVM pointer + * @slot: The memory slot the dirty log is retrieved for + * @gfn_offset: The gfn offset in memory slot + * @mask: The mask of dirty pages at offset 'gnf_offset in this memory + * slot to be writ protect + */ + +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + phys_addr_t ipa, next, offset_ipa; + pgd_t *pgdp = kvm-arch.pgd, *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + gfn_t gfnofst = slot-base_gfn + gfn_offset; + bool crosses_pmd; + + ipa = (gfnofst + __ffs(mask)) PAGE_SHIFT; + offset_ipa = gfnofst PAGE_SHIFT; + next = (gfnofst + (BITS_PER_LONG - 1)) PAGE_SHIFT; + + /* check if mask width crosses 2nd level page table range, and +* possibly 3rd, 4th. If not skip upper table lookups. Unlikely +* to be true. +*/ + crosses_pmd = ((offset_ipa PMD_MASK) ^ (next PMD_MASK)) ? true : + false; + + /* If pgd, pud, pmd not present and you cross pmd range check next +* index. 
+*/ + pgd = pgdp + pgd_index(ipa); + if (unlikely(crosses_pmd !pgd_present(*pgd))) { + pgd = pgdp + pgd_index(next); + if (!pgd_present(*pgd)) + return; + } + + pud = pud_offset(pgd, ipa); + if (unlikely(crosses_pmd !pud_present(*pud))) { + pud = pud_offset(pgd, next); + if (!pud_present(*pud)) + return; + } + + pmd = pmd_offset(pud, ipa); + if (unlikely(crosses_pmd !pmd_present(*pmd))) { + pmd = pmd_offset(pud, next); + if (!pmd_present(*pmd)) + return; + } + + for (;;) { + pte = pte_offset_kernel(pmd, ipa); + if (!pte_present(*pte)) + goto next_ipa; + + if (kvm_s2pte_readonly(pte)) + goto next_ipa; + kvm_set_s2pte_readonly(pte); +next_ipa: + mask = mask - 1; + if (!mask) + break; + + /* find next page */ + ipa = (gfnofst + __ffs(mask)) PAGE_SHIFT; + + /* skip upper page table lookups */ + if (!crosses_pmd) + continue; + + pgd = pgdp + pgd_index(ipa); + if (unlikely(!pgd_present(*pgd))) + goto next_ipa; + pud = pud_offset(pgd, ipa
Re: [PATCH v5 2/4] live migration support for initial write protect of VM
On 05/15/2014 11:53 AM, Christoffer Dall wrote: [I know you sent out a newer version but I already reviewed some of this patch on the plane today but couldn't send it out before I got home. Anyway, here it is:] On Wed, May 07, 2014 at 05:40:14PM -0700, Mario Smarduch wrote: Patch adds support for live migration initial split up of huge pages in memory slot and write protection of all pages in memory slot. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |7 ++ arch/arm/include/asm/kvm_mmu.h | 16 +++- arch/arm/kvm/arm.c |3 + arch/arm/kvm/mmu.c | 179 +++ virt/kvm/kvm_main.c |6 +- 5 files changed, 209 insertions(+), 2 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index ac3bb65..91744c3 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -67,6 +67,11 @@ struct kvm_arch { /* Interrupt controller */ struct vgic_distvgic; +/* Marks start of migration, used to handle 2nd stage page faults + * during migration, prevent installing huge pages and split huge pages + * to small pages. + */ commenting style this is a bit verbose for a field in a struct, perhaps moving the longer version to where you set this? Will do. 
+int migration_in_progress; }; #define KVM_NR_MEM_OBJS 40 @@ -233,4 +238,6 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); void kvm_tlb_flush_vmid(struct kvm *kvm); +int kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); + #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 5c7aa3c..b339fa9 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -114,13 +114,27 @@ static inline void kvm_set_s2pmd_writable(pmd_t *pmd) pmd_val(*pmd) |= L_PMD_S2_RDWR; } +static inline void kvm_set_s2pte_readonly(pte_t *pte) +{ +pte_val(*pte) = ~(L_PTE_S2_RDONLY ^ L_PTE_S2_RDWR); This relies on the pte already having been set as RDONLY or RDWR, if you are creating a new pte and calling this function it could be easy to miss that distinction, I would prefer: pte_val(*pte) = L_PTE_S2_RDWR; pte_val(*pte) |= L_PTE_S2_RDONLY; Currently it's called only on set, or live pte's, I'll change it so it's applicate to all cases. +} + +static inline bool kvm_s2pte_readonly(pte_t *pte) +{ +return (pte_val(*pte) L_PTE_S2_RDWR) == L_PTE_S2_RDONLY; +} + /* Open coded p*d_addr_end that can deal with 64bit addresses */ #define kvm_pgd_addr_end(addr, end) \ ({ u64 __boundary = ((addr) + PGDIR_SIZE) PGDIR_MASK;\ (__boundary - 1 (end) - 1)? __boundary: (end);\ }) -#define kvm_pud_addr_end(addr,end) (end) +/* For - 4-level table walk return PUD range end if end 1GB */ not sure you need this comment, the scheme is very common all over the kernel. Yes. +#define kvm_pud_addr_end(addr, end) \ +({ u64 __boundary = ((addr) + PUD_SIZE) PUD_MASK;\ +(__boundary - 1 (end) - 1) ? __boundary : (end); \ +}) why do we need this? We should only ever have 3 levels of page tables, right? I removed in v6 patch. 
#define kvm_pmd_addr_end(addr, end) \ ({ u64 __boundary = ((addr) + PMD_SIZE) PMD_MASK;\ diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 3c82b37..1055266 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -234,6 +234,9 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, enum kvm_mr_change change) { +/* Request for migration issued by user, write protect memory slot */ Does this necessarily only happen when there's a request for migration? Isn't it just a log call that could be used for other things (potentially)? From QEMU view migration thread calls KVM memory listener kvm_log_global_start and that kicks off dirty log tracking for each memslot. There are other operations like region add (kvm_region_add) that starts kvm_log_start for that memslot, or other odd case if you add a region that overlaps regions you may start logging the whole region. But in either case it appears you're migrating already. But no I don't see any other feature that triggers this. +if ((change != KVM_MR_DELETE) (mem-flags KVM_MEM_LOG_DIRTY_PAGES)) +return kvm_mmu_slot_remove_write_access(kvm, mem-slot); return 0; } diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 95c172a..85145d8 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -743,6 +743,185
Re: [PATCH v6 0/4] live migration dirty bitmap support for ARMv7
Will do that, I'm sure there will be another iteration :). On 05/15/2014 11:51 AM, Christoffer Dall wrote: On Thu, May 15, 2014 at 11:27:27AM -0700, Mario Smarduch wrote: This is v6 patcheset of live mgiration support for ARMv7. migration This is an extremely terse cover letter. It would have been nice with a few sentences of which existing features this leverages, which support was missing, what the preferred approach is, etc. Also, links to a wiki page or just a few notes on how you did the testing below with which user space tools etc. would also have been great. - Tested on two 4-way A15 hardware, QEMU 2-way/4-way SMP guest upto 2GB - Various dirty data rates tested - 2GB/1s ... 2048 pgs/5ms - validated source/destination memory image integrity Changes since v1: - add unlock of VM mmu_lock to prevent a deadlock - moved migratiion active inside mmu_lock acquire for visibility in 2nd stage data abort handler - Added comments Changes since v2: - move initial VM write protect to memory region architecture prepare function (needed to make dirty logging function generic) - added stage2_mark_pte_ro() - to mark ptes ro - Marc's comment - optimized initial VM memory region write protect to do fewer table lookups - applied Marc's comment for walking dirty bitmap mask - added pud_addr_end() for stage2 tables, to make the walk 4-level - added kvm_flush_remote_tlbs() to use ARM TLB invalidation, made the generic one weak, Marc's comment to for generic dirty bitmap log function - optimized walking dirty bit map mask to skip upper tables - Marc's comment - deleted x86,arm kvm_vm_ioctl_get_dirty_log(), moved to kvm_main.c tagged the function weak - Marc's comment - changed Data Abort handler pte index handling - Marc's comment Changes since v3: - changed pte updates to reset write bit instead of setting default value for existing pte's - Steve's comment - In addition to PUD add 2nd stage 4GB range functions - Steves suggestion - Restructured initial memory slot write protect 
function for PGD, PUD, PMD table walking - Steves suggestion - Renamed variable types to resemble their use - Steves suggestions - Added couple pte helpers for 2nd stage tables - Steves suggestion - Updated unmap_range() that handles 2nd stage tables and identity mappings to handle 2nd stage addresses 4GB. Left ARMv8 unchanged. Changes since v4: - rebased to 3.15.0-rc1 - 'next' to pickup p*addr_end patches - Gavins comment - Update PUD address end function to support 4-level page table walk - Elimiated 5th patch of the series that fixed unmap_range(), since it was fixed by Marcs patches. Changes since v5: - Created seperate entry point for VMID TLB flush with no param - Christoffers comment - Update documentation for kvm_flush_remote_tlbs() - Christoffers comment - Simplified splitting of huge pages - inittial WP and 2nd stage DABT handler clear the huge page PMD, and use current code to fault in small pages. Removed kvm_split_pmd(). Mario Smarduch (4): add ARMv7 HYP API to flush VM TLBs without address param live migration support for initial write protect of VM live migration support for VM dirty log management add 2nd stage page fault handling during live migration arch/arm/include/asm/kvm_asm.h |1 + arch/arm/include/asm/kvm_host.h | 11 ++ arch/arm/include/asm/kvm_mmu.h | 10 ++ arch/arm/kvm/arm.c |8 +- arch/arm/kvm/interrupts.S | 11 ++ arch/arm/kvm/mmu.c | 292 ++- arch/x86/kvm/x86.c | 86 virt/kvm/kvm_main.c | 84 ++- 8 files changed, 409 insertions(+), 94 deletions(-) -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v5 2/4] live migration support for initial write protect of VM
Hi Christoffer, few more comments struct vgic_distvgic; + /* Marks start of migration, used to handle 2nd stage page faults +* during migration, prevent installing huge pages and split huge pages +* to small pages. +*/ commenting style this is a bit verbose for a field in a struct, perhaps moving the longer version to where you set this? Will do. + int migration_in_progress; }; I think this flag could be removed all together. Migration can be stopped at any time (started too), through user request or other events. When that happens (like migrate_cancel) migrate cleanup bh runs and eventually calls KVM memory listener kvm_log_global_start() (cancel handler) that stops logging, clears KVM_MEM_LOG_DIRTY_PAGES, and region ops ioctl, clears dirty_bitmap. In either case dirty_bitmap for memslot is set or unset during migration to track dirty pages, following that field seems to be a better way to keep track of migration. This again is QEMU view but it appears all these policies are driven from user space. +/* kvm_split_pmd - splits huge pages to small pages, required to keep a dirty + * log of smaller memory granules, otherwise huge pages would need to be + * migrated. Practically an idle system has problems migrating with This seems abrupt. Why can't we just represent a 2M huge page as 512 4K bits and write protect the huge pages, if you take a write fault on a 2M page, then split it then. That's one alternative the one I put into v6 is clear the PMD and force user_mem_abort() to fault in 4k pages, and mark the dirty_bitmap[] for that page, reuse the current code. Have not checked the impact on performance, it takes few seconds longer to converge for the tests I'm running. I was thinking about this and if PMD attributes need to be passed onto the PTEs then it appears what you recommend is required. But during run time I don't see how 2nd stage attributes can change, could the guest do anything to change them (SH, Memattr)? 
Performance may also be other reason but that always depends on the load, clearing a PMD seems easier and reuses current code. Probably several load tests/benchmarks can help here. Also noticed hw PMD/PTE attributes differ a little which is not significant now, but moving forward different page size and any new revisions to fields may require additional maintenance. I'll be out next week and back 26'th, I'll create a link with details on test environment and tests. The cover letter will will go through general overview only. Thanks, Mario If your use case is HA, then you will be doing this a lot, and you don't want to hurt performance of your main live system more than necessary. + * huge pages. Called during WP of entire VM address space, done -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v6 2/4] live migration support for initial write protect of VM
Hi Christoffer, I was out traveling last week + holiday. You had lots of comments in last version (incl. below), reworking to submit a new series. Un-clutter from basic issues, and update current logic. In next couple days I'll submit new series. Also looking into a wiki to document test env (but may windup with a github link). Thanks, Mario On 05/27/2014 12:58 PM, Christoffer Dall wrote: On Thu, May 15, 2014 at 11:27:29AM -0700, Mario Smarduch wrote: Patch adds memslot support for initial write protection and split up of huge pages I lost track of where we are with these patches, but I see a lot of issues in this patch that I believe I already commented on (but I may not have had time to comment before you sent out v6). In any case, I'm going to wait with reviewing things carefully until you send out a v7, but for v7: - Please document the rationale and design behind what you're doing in the commit text of each patch. Each of these patches are quite large, but the commit messages are barely two lines. I suggest you take a look at 'git log arch/arm/kvm' for example to get a feel for what I'm looking for. - There is nothing specific in the interface to KVM discussing migration or live migration, it is only used as an example for features in trying to stay generic. Please use similar generic concepts in the kernel to make things coherent. 'git grep migration arch/x86/kvm' also tells you that x86 gets away with full support for live migration without referring to migration except as examples of how features might be useful. Thanks for the work, looking forward to seeing a new revision. -Christoffer -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v6 3/4] live migration support for VM dirty log management
On 05/27/2014 01:12 PM, Christoffer Dall wrote: On Thu, May 15, 2014 at 11:27:30AM -0700, Mario Smarduch wrote: This patch adds support for keeping track of VM dirty pages, by updating per memslot dirty bitmap and write protecting the page again. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |3 ++ arch/arm/kvm/arm.c |5 -- arch/arm/kvm/mmu.c | 98 +++ arch/x86/kvm/x86.c | 86 -- virt/kvm/kvm_main.c | 82 5 files changed, 183 insertions(+), 91 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 0e55b17..4fef77d 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -238,5 +238,8 @@ u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); int kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, +struct kvm_memory_slot *slot, +gfn_t gfn_offset, unsigned long mask); #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 1055266..0b847b5 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -777,11 +777,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } } -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) -{ -return -EINVAL; -} - static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev_addr) { diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index b71ad27..b939312 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -891,6 +891,104 @@ out: return ret; } + +/** + * kvm_mmu_write_protected_pt_masked - walk mask relative start of memslot and + * write protect again for next dirty log read. + * + * After migration thread write protects entire VM iterative calls are made + * to get diry page log. The log is returned and dirty pages are write + * protected again. 
This function is called as a result KVM_GET_DIRTY_LOG + * ioctl. + * 'kvm-mmu_lock' must be held to protect against concurrent modification + * of page tables (2nd stage fault, mmu modifiers, ...) + * + * @kvm:The KVM pointer + * @slot: The memory slot the dirty log is retrieved for + * @gfn_offset: The gfn offset in memory slot + * @mask: The mask of dirty pages at offset 'gnf_offset in this memory + * slot to be writ protect + */ + +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, +struct kvm_memory_slot *slot, +gfn_t gfn_offset, unsigned long mask) +{ +phys_addr_t ipa, next, offset_ipa; +pgd_t *pgdp = kvm-arch.pgd, *pgd; +pud_t *pud; +pmd_t *pmd; +pte_t *pte; +gfn_t gfnofst = slot-base_gfn + gfn_offset; +bool crosses_pmd; + +ipa = (gfnofst + __ffs(mask)) PAGE_SHIFT; +offset_ipa = gfnofst PAGE_SHIFT; +next = (gfnofst + (BITS_PER_LONG - 1)) PAGE_SHIFT; + +/* check if mask width crosses 2nd level page table range, and + * possibly 3rd, 4th. If not skip upper table lookups. Unlikely + * to be true. + */ +crosses_pmd = ((offset_ipa PMD_MASK) ^ (next PMD_MASK)) ? true : +false; you can just assign the value, no need for the tertiary operator, a bool will always be true or false. (Marc wanted to make this explicit elsewhere in the code, an uses the 'val = !!(expression)' syntax). Ah ok. + +/* If pgd, pud, pmd not present and you cross pmd range check next + * index. 
+ */ +pgd = pgdp + pgd_index(ipa); +if (unlikely(crosses_pmd !pgd_present(*pgd))) { +pgd = pgdp + pgd_index(next); +if (!pgd_present(*pgd)) +return; +} + +pud = pud_offset(pgd, ipa); +if (unlikely(crosses_pmd !pud_present(*pud))) { +pud = pud_offset(pgd, next); +if (!pud_present(*pud)) +return; +} + +pmd = pmd_offset(pud, ipa); +if (unlikely(crosses_pmd !pmd_present(*pmd))) { +pmd = pmd_offset(pud, next); +if (!pmd_present(*pmd)) +return; +} + +for (;;) { +pte = pte_offset_kernel(pmd, ipa); +if (!pte_present(*pte)) +goto next_ipa; + +if (kvm_s2pte_readonly(pte)) +goto next_ipa; +kvm_set_s2pte_readonly(pte); +next_ipa: +mask = mask - 1; +if (!mask) +break; + +/* find next page */ +ipa = (gfnofst + __ffs(mask)) PAGE_SHIFT
Re: [PATCH v6 4/4] add 2nd stage page fault handling during live migration
On 05/27/2014 01:19 PM, Christoffer Dall wrote: On Thu, May 15, 2014 at 11:27:31AM -0700, Mario Smarduch wrote: This patch adds support for handling 2nd stage page faults during migration, it disables faulting in huge pages, and splits up existing huge pages. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/kvm/mmu.c | 36 ++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index b939312..10e7bf6 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -1002,6 +1002,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_mmu_memory_cache *memcache = vcpu-arch.mmu_page_cache; struct vm_area_struct *vma; pfn_t pfn; +bool migration_active; write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu)); if (fault_status == FSC_PERM !write_fault) { @@ -1053,12 +1054,23 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, return -EFAULT; spin_lock(kvm-mmu_lock); + +/* + * Place inside lock to prevent race condition when whole VM is being + * write proteced. Prevent race of huge page install when migration is + * active. + */ +migration_active = vcpu-kvm-arch.migration_in_progress; + if (mmu_notifier_retry(kvm, mmu_seq)) goto out_unlock; -if (!hugetlb !force_pte) + +/* When migrating don't spend cycles coalescing huge pages */ +if (!hugetlb !force_pte !migration_active) hugetlb = transparent_hugepage_adjust(pfn, fault_ipa); -if (hugetlb) { +/* During migration don't install huge pages */ again, all this is not about migration per se, it's about when logging dirty pages, (which may be commonly used for migration). Yes that's true , I'll update but until recently (new RFC on qemu list) where dirty logging is used for getting VM RSS or hot memory regions, I don't see any other use case. 
+if (hugetlb !migration_active) { pmd_t new_pmd = pfn_pmd(pfn, PAGE_S2); new_pmd = pmd_mkhuge(new_pmd); if (writable) { @@ -1069,6 +1081,23 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, new_pmd); } else { pte_t new_pte = pfn_pte(pfn, PAGE_S2); + +/* + * If pmd is mapping a huge page then split it up into + * small pages, when doing live migration. + */ +if (migration_active) { +pmd_t *pmd; +if (hugetlb) { +pfn += pte_index(fault_ipa); +gfn = fault_ipa PAGE_SHIFT; +} how can you have hugetlb when we entered this else-clause conditional on having !hugetlb? - if(hugetlb !migration_active) forces all page faults to enter here while in migration. Huge page entries are cleared and stage2_set_pte() splits the huge page, and installs the pte for the fault_ipa. I placed that there since it flows with installing a pte as well as splitting a huge page. But your comment on performance split up huge page vs. deferred page faulting should move it out of here. +new_pte = pfn_pte(pfn, PAGE_S2); +pmd = stage2_get_pmd(kvm, NULL, fault_ipa); +if (pmd kvm_pmd_huge(*pmd)) +clear_pmd_entry(kvm, pmd, fault_ipa); If we have a huge pmd entry, how did we take a fault on there? Would that be if a different CPU inserted a huge page entry since we got here, is this what you're trying to handle? I'm confused. I thing this related to the above. +} + if (writable) { kvm_set_s2pte_writable(new_pte); kvm_set_pfn_dirty(pfn); @@ -1077,6 +1106,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, ret = stage2_set_pte(kvm, memcache, fault_ipa, new_pte, false); } +/* Assuming 4k pages, set one bit/page in memslot dirty_bitmap[] */ Assuming? this makes me nervous. The point is probably that it's harmless if we're not logging dirty pages, because then nobody reads teh data structure, and if we are logging, then we are mapping everything using 4K pages? 
It's probably clearer code-wise to condition this on whether or not we are logging dirty page, and the branch is also likely to be much faster than the function call to mark_page_dirty. I'm not sure I get the point. The call is always safe, you either have old copy or new copy of memory slot with dirty_bitmap set or not set. The log read is done while holding kvm slots_lock. Is the comment related to performance, not supporting multiple page sizes, or it's unsafe to call
Re: [PATCH v6 3/4] live migration support for VM dirty log management
On 05/28/2014 02:08 AM, Christoffer Dall wrote: On Tue, May 27, 2014 at 02:55:21PM -0700, Mario Smarduch wrote: On 05/27/2014 01:12 PM, Christoffer Dall wrote: On Thu, May 15, 2014 at 11:27:30AM -0700, Mario Smarduch wrote: [...] + + /* If pgd, pud, pmd not present and you cross pmd range check next + * index. + */ + pgd = pgdp + pgd_index(ipa); + if (unlikely(crosses_pmd !pgd_present(*pgd))) { + pgd = pgdp + pgd_index(next); + if (!pgd_present(*pgd)) + return; + } + + pud = pud_offset(pgd, ipa); + if (unlikely(crosses_pmd !pud_present(*pud))) { + pud = pud_offset(pgd, next); + if (!pud_present(*pud)) + return; + } + + pmd = pmd_offset(pud, ipa); + if (unlikely(crosses_pmd !pmd_present(*pmd))) { + pmd = pmd_offset(pud, next); + if (!pmd_present(*pmd)) + return; + } + + for (;;) { + pte = pte_offset_kernel(pmd, ipa); + if (!pte_present(*pte)) + goto next_ipa; + + if (kvm_s2pte_readonly(pte)) + goto next_ipa; + kvm_set_s2pte_readonly(pte); +next_ipa: + mask = mask - 1; + if (!mask) + break; + + /* find next page */ + ipa = (gfnofst + __ffs(mask)) PAGE_SHIFT; + + /* skip upper page table lookups */ + if (!crosses_pmd) + continue; + + pgd = pgdp + pgd_index(ipa); + if (unlikely(!pgd_present(*pgd))) + goto next_ipa; + pud = pud_offset(pgd, ipa); + if (unlikely(!pud_present(*pud))) + goto next_ipa; + pmd = pmd_offset(pud, ipa); + if (unlikely(!pmd_present(*pmd))) + goto next_ipa; + } So I think the reason this is done separately on x86 is that they have an rmap structure for their gfn mappings so that they can quickly lookup ptes based on a gfn and write-protect it without having to walk the stage-2 page tables. Yes, they also use rmapps for mmu notifiers, invalidations on huge VMs and large ranges resulted in excessive times. Unless you want to introduce this on ARM, I think you will be much Eventually yes but that would also require reworking mmu notifiers. I had two step approach in mind. 
Initially get the dirty page marking to work, TLB flushing, GIC/arch-timer migration, validate migration under various stress loads (page reclaim) with mmu notifiers, test several VMs and migration times. Then get rmapp (or something similar) working - eventually for huge VMs it's needed. In short two phases. better off just having a single (properly written) iterating write-protect function, that takes a start and end IPA and a bitmap for which pages to actually write-protect, which can then handle the generic case (either NULL or all-ones bitmap) or a specific case, which just traverses the IPA range given as input. Such a function should follow the model of page table walk functions discussed previously (separate functions: wp_pgd_enties(), wp_pud_entries(), wp_pmd_entries(), wp_pte_entries()). However, you may want to verify my assumption above with the x86 people and look at sharing the rmap logic between architectures. In any case, this code is very difficult to read and understand, and it doesn't look at all like the other code we have to walk page tables. I understand you are trying to optimize for performance (by skipping some intermediate page table level lookups), but you never declare that goal anywhere in the code or in the commit message. Marc's comment noticed I was walking a small range (128k), using upper table iterations that covered 1G, 2MB ranges. As you mention the code tries to optimize upper table lookups. Yes the function is too bulky, but I'm not sure how to remove the upper table checks since page tables may change between the time pages are marked dirty and the log is retrieved. And if a memory slot is very dirty walking upper tables will impact performance. I'll think some more on this function. I think you should aim at the simplest possible implementation that functionally works, first. Let's verify that this thing works, have clean working code that implementation-wise is as minimal as possible. 
Then we can run perf on that and see if our migrations are very slow, where we are actually spending time, and only then optimize it. The solution to this specific problem for the time being appears quite clear to me: Follow the exact same scheme as for unmap_range (the one I sent out here: https://lists.cs.columbia.edu/pipermail/kvmarm/2014-May/009592.html, the diff is hard to read, so I recommend you apply the patch and look at the resulting code). Have a similar scheme, call it wp_ipa_range() or something like that, and use that for now. Ok I'll reuse that code. I'll need
Re: [PATCH v6 4/4] add 2nd stage page fault handling during live migration
memslot dirty_bitmap during and after write protect. -Christoffer Regarding huge pud that's causing some design problems, should huge PUD pages be considered at all? Thanks, Mario ___ kvmarm mailing list kvm...@lists.cs.columbia.edu https://lists.cs.columbia.edu/mailman/listinfo/kvmarm -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v6 4/4] add 2nd stage page fault handling during live migration
Little bit more details on this question - For 2nd stage 3-level tables PUD blocks don't exist - although it appears you can have a PGD block but I don't see any support for that. But should the code still work as if PUDs (4-level table) are used and check for pud_huge()? Looking at ARMv8 there are several block formats, I don't know which one will be use for 2nd stage (4KB, 16,...) but one of them supports 4-level table (have not looked at this in detail, could be wrong here). Should pud_huge() be supported for future compatibility? This impacts logging - - Some decisions are needed either clear the PUD entry and force them to pages or mark dirty bit map for each 4k page in the PUD Block range, IA64 appears to that in mark_pages_dirty(). - If you assume pud_huge() then you probably have to support the logic for PUD Block descriptor even though it's not used in 3-level table at this time. I think until PUD Blocks are actually used it's maybe better to ignore them. - Mario On 05/28/2014 11:42 AM, Mario Smarduch wrote: emslot dirty_bitmap during and after write protect. -Christoffer Regarding huge pud that's causing some design problems, should huge PUD pages be considered at all? Thanks, Mario ___ kvmarm mailing list kvm...@lists.cs.columbia.edu https://lists.cs.columbia.edu/mailman/listinfo/kvmarm ___ kvmarm mailing list kvm...@lists.cs.columbia.edu https://lists.cs.columbia.edu/mailman/listinfo/kvmarm -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v6 4/4] add 2nd stage page fault handling during live migration
So this needs to be cleared up given this is key to logging. Cases this code handles during migration - 1. huge page fault described above - write protect fault so you break up the huge page. 2. All other faults - first time access, pte write protect you again wind up in stage2_set_pte(). Am I missing something here? no, I forgot about the fact that we can take the permission fault now. Hmm, ok, so either we need to use the original approach of always splitting up huge pages or we need to just follow the regular huge page path here and just mark all 512 4K pages dirty in the log, or handle it in stage2_set_pte(). I would say go with the most simple approach for now (which may be going back to splitting all pmd_huge() into regular pte's), and we can take a more careful look in the next patch iteration. Looking at the overall memslot update architecture and various fail scenarios - user_mem_abort() appears to be the most optimal and reliable place. First Write Protect huge pages after memslots are committed and deal with rest in user_mem_abort(). Still need some feedback on the pud_huge() before revising for next iteration? - Mario -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v6 4/4] add 2nd stage page fault handling during live migration
On 05/29/2014 10:57 AM, Christoffer Dall wrote: On Thu, May 29, 2014 at 10:08:07AM -0700, Mario Smarduch wrote: So this needs to be cleared up given this is key to logging. Cases this code handles during migration - 1. huge page fault described above - write protect fault so you breakup the huge page. 2. All other faults - first time access, pte write protect you again wind up in stage2_set_pte(). Am I missing something here? no, I forgot about the fact that we can take the permission fault now. Hmm, ok, so either we need to use the original approach of always splitting up huge pages or we need to just follow the regular huge page path here and just mark all 512 4K pages dirty in the log, or handle it in stage2_set_pte(). I would say go with the most simple appraoch for now (which may be going back to splitting all pmd_huge() into regular pte's), and we can take a more careful look in the next patch iteration. Looking at the overall memslot update architecture and various fail scenarios - user_mem_abort() appears to be the most optimal and reliable place. First Write Protect huge pages after memslots are committed and deal with rest in user_mem_abort(). Still need some feedback on the pud_huge() before revising for next iteration? Just assume it's not used for now, and that you don't have to consider it, and make that assumption clear in the commit message, so it doesn't block this work. I have a feeling we need to go through a few iterations here, so let's get that rolling. Thanks. Ok thanks I'm on it now. - Mario -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v5 2/4] live migration support for initial write protect of VM
+static inline void kvm_set_s2pte_readonly(pte_t *pte) +{ +pte_val(*pte) &= ~(L_PTE_S2_RDONLY ^ L_PTE_S2_RDWR); This relies on the pte already having been set as RDONLY or RDWR, if you are creating a new pte and calling this function it could be easy to miss that distinction, I would prefer: pte_val(*pte) &= ~L_PTE_S2_RDWR; pte_val(*pte) |= L_PTE_S2_RDONLY; Confused on this comment, this appears to just add the read-only permission. But will leave other permission bits intact, and clears out the rest of the pte? - Mario -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v7 4/4] arm: dirty page logging 2nd stage page fault handling support
This patch adds support for handling 2nd stage page faults during migration, it disables faulting in huge pages, and disolves huge pages to page tables. In case migration is canceled huge pages will be used again. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/kvm/mmu.c | 36 ++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 1c546c9..aca4fbf 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -966,6 +966,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_mmu_memory_cache *memcache = vcpu-arch.mmu_page_cache; struct vm_area_struct *vma; pfn_t pfn; + /* Get logging status, if dirty_bitmap is not NULL then logging is on */ + bool logging_active = !!memslot-dirty_bitmap; write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu)); if (fault_status == FSC_PERM !write_fault) { @@ -1019,10 +1021,16 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, spin_lock(kvm-mmu_lock); if (mmu_notifier_retry(kvm, mmu_seq)) goto out_unlock; - if (!hugetlb !force_pte) + + /* When logging don't spend cycles to check for huge pages */ + if (!hugetlb !force_pte !logging_active) hugetlb = transparent_hugepage_adjust(pfn, fault_ipa); - if (hugetlb) { + /* +* Force all not present/perm faults to PTE handling, address both +* PMD and PTE faults +*/ + if (hugetlb !logging_active) { pmd_t new_pmd = pfn_pmd(pfn, PAGE_S2); new_pmd = pmd_mkhuge(new_pmd); if (writable) { @@ -1034,6 +1042,22 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } else { pte_t new_pte = pfn_pte(pfn, PAGE_S2); if (writable) { + /* +* If pmd is mapping a huge page then clear it and let +* stage2_set_pte() create a pte table. At the sametime +* you write protect the pte (PAGE_S2 pgprot_t). 
+*/ + if (logging_active) { + pmd_t *pmd; + if (hugetlb) { + pfn += pte_index(fault_ipa); + gfn = fault_ipa PAGE_SHIFT; + new_pte = pfn_pte(pfn, PAGE_S2); + } + pmd = stage2_get_pmd(kvm, NULL, fault_ipa); + if (pmd kvm_pmd_huge(*pmd)) + clear_pmd_entry(kvm, pmd, fault_ipa); + } kvm_set_s2pte_writable(new_pte); kvm_set_pfn_dirty(pfn); } @@ -1041,6 +1065,14 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, ret = stage2_set_pte(kvm, memcache, fault_ipa, new_pte, false); } + /* +* Log the dirty page in dirty_bitmap[], call regardless if logging is +* disabled or enabled both cases handled safely. +* TODO: for larger page size mark mulitple dirty page bits for each +* 4k page. +*/ + if (writable) + mark_page_dirty(kvm, gfn); out_unlock: spin_unlock(kvm-mmu_lock); -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v7 0/4] arm: dirty page logging support for ARMv7
is 409600, 8192, 5 o QEMU is instrumented to save RAM memory regions on source and destination after memory is migrated, but before guest started. Later files are checksummed on both ends for correctness, given VMs are small this works. o Guest kernel is instrumented to capture current cycle counter - last cycle and compare to qemu down time to test arch timer accuracy. o Network failover is at L3 due to interface limitations, ping continues working transparently o Also tested 'migrate_cancel' to test reassemble of huge pages (inserted low level instrumentation code). Changes since v6: - primarily reworked initial write protect, and write protect of dirty pages on logging request - Only code logic change, disolve huge pages to page tables in page fault handler - Made many many changes based on Christoffers comments. Mario Smarduch (4): add ARMv7 HYP API to flush VM TLBs without address param dirty page logging inital mem region write protect (w/no huge PUD support) dirty log write protect management sppport dirt page logging 2nd stage page fault handling support arch/arm/include/asm/kvm_asm.h|1 + arch/arm/include/asm/kvm_host.h |5 + arch/arm/include/asm/kvm_mmu.h| 20 +++ arch/arm/include/asm/pgtable-3level.h |1 + arch/arm/kvm/arm.c| 11 +- arch/arm/kvm/interrupts.S | 11 ++ arch/arm/kvm/mmu.c| 243 - arch/x86/kvm/x86.c| 86 virt/kvm/kvm_main.c | 83 ++- 9 files changed, 367 insertions(+), 94 deletions(-) -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v7 1/4] arm: add ARMv7 HYP API to flush VM TLBs without address param
Patch adds HYP interface for global VM TLB invalidation without address parameter. Added ARM version of kvm_flush_remote_tlbs(), made the generic implementation a weak symbol. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_asm.h |1 + arch/arm/kvm/interrupts.S | 11 +++ arch/arm/kvm/mmu.c | 14 ++ virt/kvm/kvm_main.c|2 +- 4 files changed, 27 insertions(+), 1 deletion(-) diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index 53b3c4a..21bc519 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -78,6 +78,7 @@ extern char __kvm_hyp_code_end[]; extern void __kvm_flush_vm_context(void); extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa); +extern void __kvm_tlb_flush_vmid(struct kvm *kvm); extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); #endif diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S index 0d68d40..bddc66b 100644 --- a/arch/arm/kvm/interrupts.S +++ b/arch/arm/kvm/interrupts.S @@ -66,6 +66,17 @@ ENTRY(__kvm_tlb_flush_vmid_ipa) bx lr ENDPROC(__kvm_tlb_flush_vmid_ipa) +/** + * void __kvm_tlb_flush_vmid(struct kvm *kvm) - Flush per-VMID TLBs + * + * Reuses __kvm_tlb_flush_vmid_ipa() for ARMv7, without passing address + * parameter + */ + +ENTRY(__kvm_tlb_flush_vmid) + b __kvm_tlb_flush_vmid_ipa +ENDPROC(__kvm_tlb_flush_vmid) + / * Flush TLBs and instruction caches of all CPUs inside the inner-shareable * domain, for all VMIDs diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 2ac9588..ef29540 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -56,6 +56,20 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa); } +/** + * kvm_flush_remote_tlbs() - flush all VM TLB entries + * @kvm: pointer to kvm structure. + * + * Interface to HYP function to flush all VM TLB entries without address + * parameter. 
In HYP mode reuses __kvm_tlb_flush_vmid_ipa() function used by + * kvm_tlb_flush_vmid_ipa(). + */ +void kvm_flush_remote_tlbs(struct kvm *kvm) +{ + if (kvm) + kvm_call_hyp(__kvm_tlb_flush_vmid, kvm); +} + static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, int min, int max) { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index fa70c6e..ba25765 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -184,7 +184,7 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) return called; } -void kvm_flush_remote_tlbs(struct kvm *kvm) +void __weak kvm_flush_remote_tlbs(struct kvm *kvm) { long dirty_count = kvm-tlbs_dirty; -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v7 2/4] arm: dirty page logging inital mem region write protect (w/no huge PUD support)
Patch adds memslot support for initial write protection and split up of huge pages. This patch series assumes that huge PUDs will not be used to map VM memory. This patch depends on the unmap_range() patch, it needs to be applied first. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |2 + arch/arm/include/asm/kvm_mmu.h| 20 ++ arch/arm/include/asm/pgtable-3level.h |1 + arch/arm/kvm/arm.c|6 ++ arch/arm/kvm/mmu.c| 114 + 5 files changed, 143 insertions(+) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 193ceaf..59565f5 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -231,4 +231,6 @@ int kvm_perf_teardown(void); u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); +void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); + #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 5cc0b0f..08ab5e8 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -114,6 +114,26 @@ static inline void kvm_set_s2pmd_writable(pmd_t *pmd) pmd_val(*pmd) |= L_PMD_S2_RDWR; } +static inline void kvm_set_s2pte_readonly(pte_t *pte) +{ + pte_val(*pte) = (pte_val(*pte) ~L_PTE_S2_RDWR) | L_PTE_S2_RDONLY; +} + +static inline bool kvm_s2pte_readonly(pte_t *pte) +{ + return (pte_val(*pte) L_PTE_S2_RDWR) == L_PTE_S2_RDONLY; +} + +static inline void kvm_set_s2pmd_readonly(pmd_t *pmd) +{ + pmd_val(*pmd) = (pmd_val(*pmd) ~L_PMD_S2_RDWR) | L_PMD_S2_RDONLY; +} + +static inline bool kvm_s2pmd_readonly(pmd_t *pmd) +{ + return (pmd_val(*pmd) L_PMD_S2_RDWR) == L_PMD_S2_RDONLY; +} + /* Open coded p*d_addr_end that can deal with 64bit addresses */ #define kvm_pgd_addr_end(addr, end)\ ({ u64 __boundary = ((addr) + PGDIR_SIZE) PGDIR_MASK;\ diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index 
85c60ad..d8bb40b 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -129,6 +129,7 @@ #define L_PTE_S2_RDONLY(_AT(pteval_t, 1) 6) /* HAP[1] */ #define L_PTE_S2_RDWR (_AT(pteval_t, 3) 6) /* HAP[2:1] */ +#define L_PMD_S2_RDONLY(_AT(pteval_t, 1) 6) /* HAP[1] */ #define L_PMD_S2_RDWR (_AT(pmdval_t, 3) 6) /* HAP[2:1] */ /* diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 3c82b37..dfd63ac 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -242,6 +242,12 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old, enum kvm_mr_change change) { + /* +* At this point memslot has been committed and the there is an +* allocated dirty_bitmap[] so marking of diryt pages works now on. +*/ + if ((change != KVM_MR_DELETE) (mem-flags KVM_MEM_LOG_DIRTY_PAGES)) + kvm_mmu_wp_memory_region(kvm, mem-slot); } void kvm_arch_flush_shadow_all(struct kvm *kvm) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index ef29540..e5dff85 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -760,6 +760,120 @@ static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap) return false; } + +/** + * stage2_wp_pte_range - write protect PTE range + * @pmd: pointer to pmd entry + * @addr: range start address + * @end: range end address + */ +static void stage2_wp_pte_range(pmd_t *pmd, phys_addr_t addr, phys_addr_t end) +{ + pte_t *pte; + + pte = pte_offset_kernel(pmd, addr); + do { + if (!pte_none(*pte)) { + if (!kvm_s2pte_readonly(pte)) + kvm_set_s2pte_readonly(pte); + } + } while (pte++, addr += PAGE_SIZE, addr != end); +} + +/** + * stage2_wp_pmd_range - write protect PMD range + * @pud: pointer to pud entry + * @addr: range start address + * @end: range end address + */ +static void stage2_wp_pmd_range(pud_t *pud, phys_addr_t addr, phys_addr_t end) +{ + pmd_t *pmd; + phys_addr_t next; + + pmd = pmd_offset(pud, addr); + + do { + next = kvm_pmd_addr_end(addr, end); + if (!pmd_none(*pmd)) { + 
if (kvm_pmd_huge(*pmd)) { + /* +* Write Protect the PMD, give user_mem_abort() +* a choice to clear and fault on demand or +* break up the huge page
[PATCH v7 3/4] arm: dirty log write protect management support
This patch adds support for keeping track of VM dirty pages. As dirty page log is retrieved, the pages that have been written are write protected again for next write and log read. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |3 ++ arch/arm/kvm/arm.c |5 --- arch/arm/kvm/mmu.c | 79 +++ arch/x86/kvm/x86.c | 86 --- virt/kvm/kvm_main.c | 81 5 files changed, 163 insertions(+), 91 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 59565f5..b760f9c 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -232,5 +232,8 @@ u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask); #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index dfd63ac..f06fb21 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -780,11 +780,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } } -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) -{ - return -EINVAL; -} - static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev_addr) { diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index e5dff85..1c546c9 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -874,6 +874,85 @@ void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) spin_unlock(kvm-mmu_lock); } +/** + * stage2_wp_mask_range() - write protect memslot pages set in mask + * @pmd - pointer to page table + * @start_ipa - the start range of mask + * @addr - start_ipa or start range of adjusted mask if crossing PMD range + * @mask - mask of dirty pages + * + * Walk mask and write protect the associated dirty pages in the memory region. 
+ * If mask crosses a PMD range adjust it to next page table and return. + */ +static void stage2_wp_mask_range(pmd_t *pmd, phys_addr_t start_ipa, + phys_addr_t *addr, unsigned long *mask) +{ + pte_t *pte; + bool crosses_pmd; + int i; + + for (i = __ffs(*mask), *addr = start_ipa + i * PAGE_SIZE; + *mask; + i = __ffs(*mask), *addr = start_ipa + i * PAGE_SIZE) { + crosses_pmd = !!((start_ipa PMD_MASK) ^ (*addr PMD_MASK)); + if (unlikely(crosses_pmd)) { + /* Adjust mask dirty bits relative to next page table */ + *mask = (PTRS_PER_PTE - pte_index(start_ipa)); + return; + } + + pte = pte_offset_kernel(pmd, *addr); + if (!pte_none(*pte)) + kvm_set_s2pte_readonly(pte); + *mask = ~(1 i); + } +} + +/** + * kvm_mmu_write_protected_pt_masked() - write protect dirty pages set in mask + * @kvm:The KVM pointer + * @slot: The memory slot associated with mask + * @gfn_offset: The gfn offset in memory slot + * @mask: The mask of dirty pages at offset 'gnf_offset' in this memory + * slot to be write protected + * + * Called from dirty page logging read function to write protect bits set in + * mask to record future writes to these pages in dirty page log. This function + * uses simplified page table walk knowing that mask spawns range of two PMDs. + * + * 'kvm-mmu_lock' must be held to protect against concurrent modification + * of page tables (2nd stage fault, mmu modifiers, ...) 
+ * + */ +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + pud_t *pud; + pmd_t *pmd; + phys_addr_t start_ipa = (slot-base_gfn + gfn_offset) PAGE_SHIFT; + phys_addr_t end_ipa = start_ipa + BITS_PER_LONG * PAGE_SIZE; + phys_addr_t addr = start_ipa; + pgd_t *pgdp = kvm-arch.pgd, *pgd; + + do { + pgd = pgdp + pgd_index(addr); + if (pgd_present(*pgd)) { + pud = pud_offset(pgd, addr); + if (!pud_none(*pud) !pud_huge(*pud)) { + pmd = pmd_offset(pud, addr); + if (!pmd_none(*pmd) !kvm_pmd_huge(*pmd)) + stage2_wp_mask_range(pmd, start_ipa, + addr, mask); + else + addr += PMD_SIZE; + } else
[RESEND PATCH v7 3/4] arm: dirty log write protect management support
Resending patch, noticed I forgot to adjust start_ipa properly in stage2_wp_mask_range() and then noticed that pte's can be indexed directly. The patch applies cleanly after 2/4 and 4/4 applies cleanly after this patch. This patch adds support for keeping track of VM dirty pages. As dirty page log is retrieved, the pages that have been written are write protected again for next write and log read. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |3 ++ arch/arm/kvm/arm.c |5 --- arch/arm/kvm/mmu.c | 79 +++ arch/x86/kvm/x86.c | 86 --- virt/kvm/kvm_main.c | 81 5 files changed, 163 insertions(+), 91 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 59565f5..b760f9c 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -232,5 +232,8 @@ u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask); #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index dfd63ac..f06fb21 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -780,11 +780,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } } -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) -{ - return -EINVAL; -} - static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev_addr) { diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index e5dff85..5ede813 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -874,6 +874,85 @@ void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) spin_unlock(kvm-mmu_lock); } +/** + * stage2_wp_mask_range() - write protect memslot pages set in mask + * @pmd - pointer to page table + * @start_ipa - the start range of mask + * 
@addr - start_ipa or start range of adjusted mask if crossing PMD range + * @mask - mask of dirty pages + * + * Walk mask and write protect the associated dirty pages in the memory region. + * If mask crosses a PMD range adjust it to next page table and return. + */ +static void stage2_wp_mask_range(pmd_t *pmd, phys_addr_t start_ipa, + phys_addr_t *addr, unsigned long *mask) +{ + pte_t *pte; + bool crosses_pmd; + int i = __ffs(*mask); + + if (unlikely(*addr start_ipa)) + start_ipa = *addr - i * PAGE_SIZE; + pte = pte_offset_kernel(pmd, start_ipa); + for (*addr = start_ipa + i * PAGE_SIZE; *mask; + i = __ffs(*mask), *addr = start_ipa + i * PAGE_SIZE) { + crosses_pmd = !!((start_ipa PMD_MASK) ^ (*addr PMD_MASK)); + if (unlikely(crosses_pmd)) { + /* Adjust mask dirty bits relative to next page table */ + *mask = (PTRS_PER_PTE - pte_index(start_ipa)); + return; + } + if (!pte_none(pte[i])) + kvm_set_s2pte_readonly(pte[i]); + *mask = ~(1 i); + } +} + +/** + * kvm_mmu_write_protected_pt_masked() - write protect dirty pages set in mask + * @kvm:The KVM pointer + * @slot: The memory slot associated with mask + * @gfn_offset: The gfn offset in memory slot + * @mask: The mask of dirty pages at offset 'gnf_offset' in this memory + * slot to be write protected + * + * Called from dirty page logging read function to write protect bits set in + * mask to record future writes to these pages in dirty page log. This function + * uses simplified page table walk given mask can spawn no more then 2 PMD + * table range. + * 'kvm-mmu_lock' must be held to protect against concurrent modification + * of page tables (2nd stage fault, mmu modifiers, ...) 
+ * + */ +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + pud_t *pud; + pmd_t *pmd; + phys_addr_t start_ipa = (slot-base_gfn + gfn_offset) PAGE_SHIFT; + phys_addr_t end_ipa = start_ipa + BITS_PER_LONG * PAGE_SIZE; + phys_addr_t addr = start_ipa; + pgd_t *pgdp = kvm-arch.pgd, *pgd; + + do { + pgd = pgdp + pgd_index(addr); + if (pgd_present(*pgd)) { + pud = pud_offset(pgd, addr); + if (!pud_none(*pud) !pud_huge(*pud)) { + pmd = pmd_offset(pud, addr); + if (!pmd_none(*pmd) !kvm_pmd_huge(*pmd
Re: [RESEND PATCH v7 3/4] arm: dirty log write protect management support
On 06/04/2014 11:55 PM, Xiao Guangrong wrote: On 06/05/2014 05:11 AM, Mario Smarduch wrote: +spin_lock(kvm-mmu_lock); + +for (i = 0; i n / sizeof(long); i++) { +unsigned long mask; +gfn_t offset; + +if (!dirty_bitmap[i]) +continue; + +is_dirty = true; + +mask = xchg(dirty_bitmap[i], 0); +dirty_bitmap_buffer[i] = mask; + +offset = i * BITS_PER_LONG; +kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask); +} +if (is_dirty) +kvm_flush_remote_tlbs(kvm); You moved the flush into mmu-lock. Please do not :). See commit 198c74f43f0f5473f99967aead30ddc622804bc1 Thanks for reviewing, I revised to pick up your version. Functionally there should be no impact on ARM, the TLB flush function is different. - Mario -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RESEND PATCH v7 3/4] arm: dirty log write protect management support
kvm_vm_ioctl_get_dirty_log() is generic used by x86, ARM. x86 recent patch changed this function, this patch picks up those changes, re-tested everything works. Applies cleanly with other patches. This patch adds support for keeping track of VM dirty pages. As dirty page log is retrieved, the pages that have been written are write protected again for next write and log read. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |3 ++ arch/arm/kvm/arm.c |5 --- arch/arm/kvm/mmu.c | 79 +++ arch/x86/kvm/x86.c | 86 --- virt/kvm/kvm_main.c | 86 +++ 5 files changed, 168 insertions(+), 91 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 59565f5..b760f9c 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -232,5 +232,8 @@ u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask); #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index dfd63ac..f06fb21 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -780,11 +780,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } } -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) -{ - return -EINVAL; -} - static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev_addr) { diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index e5dff85..907344c 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -874,6 +874,85 @@ void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) spin_unlock(kvm-mmu_lock); } +/** + * stage2_wp_mask_range() - write protect memslot pages set in mask + * @pmd - pointer to page table + * @start_ipa - the start range of mask + * @addr - start_ipa or 
start range of adjusted mask if crossing PMD range + * @mask - mask of dirty pages + * + * Walk mask and write protect the associated dirty pages in the memory region. + * If mask crosses a PMD range adjust it to next page table and return. + */ +static void stage2_wp_mask_range(pmd_t *pmd, phys_addr_t start_ipa, + phys_addr_t *addr, unsigned long *mask) +{ + pte_t *pte; + bool crosses_pmd; + int i = __ffs(*mask); + + if (unlikely(*addr start_ipa)) + start_ipa = *addr - i * PAGE_SIZE; + pte = pte_offset_kernel(pmd, start_ipa); + for (*addr = start_ipa + i * PAGE_SIZE; *mask; + i = __ffs(*mask), *addr = start_ipa + i * PAGE_SIZE) { + crosses_pmd = !!((start_ipa PMD_MASK) ^ (*addr PMD_MASK)); + if (unlikely(crosses_pmd)) { + /* Adjust mask dirty bits relative to next page table */ + *mask = (PTRS_PER_PTE - pte_index(start_ipa)); + return; + } + if (!pte_none(pte[i])) + kvm_set_s2pte_readonly(pte[i]); + *mask = ~(1 i); + } +} + +/** + * kvm_mmu_write_protected_pt_masked() - write protect dirty pages set in mask + * @kvm:The KVM pointer + * @slot: The memory slot associated with mask + * @gfn_offset: The gfn offset in memory slot + * @mask: The mask of dirty pages at offset 'gnf_offset' in this memory + * slot to be write protected + * + * Called from dirty page logging read function to write protect bits set in + * mask to record future writes to these pages in dirty page log. This function + * uses simplified page table walk given mask can spawn no more then 2 PMD + * table range. + * 'kvm-mmu_lock' must be held to protect against concurrent modification + * of page tables (2nd stage fault, mmu modifiers, ...) 
+ * + */ +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + pud_t *pud; + pmd_t *pmd; + phys_addr_t start_ipa = (slot-base_gfn + gfn_offset) PAGE_SHIFT; + phys_addr_t end_ipa = start_ipa + BITS_PER_LONG * PAGE_SIZE; + phys_addr_t addr = start_ipa; + pgd_t *pgdp = kvm-arch.pgd, *pgd; + + do { + pgd = pgdp + pgd_index(addr); + if (pgd_present(*pgd)) { + pud = pud_offset(pgd, addr); + if (!pud_none(*pud) !pud_huge(*pud)) { + pmd = pmd_offset(pud, addr); + if (!pmd_none(*pmd) !kvm_pmd_huge(*pmd
Re: [RESEND PATCH v7 3/4] arm: dirty log write protect management support
On 06/05/2014 10:52 PM, Xiao Guangrong wrote: On 06/06/2014 03:09 AM, Mario Smarduch wrote: On 06/04/2014 11:55 PM, Xiao Guangrong wrote: On 06/05/2014 05:11 AM, Mario Smarduch wrote: + spin_lock(kvm-mmu_lock); + + for (i = 0; i n / sizeof(long); i++) { + unsigned long mask; + gfn_t offset; + + if (!dirty_bitmap[i]) + continue; + + is_dirty = true; + + mask = xchg(dirty_bitmap[i], 0); + dirty_bitmap_buffer[i] = mask; + + offset = i * BITS_PER_LONG; + kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask); + } + if (is_dirty) + kvm_flush_remote_tlbs(kvm); You moved the flush into mmu-lock. Please do not :). See commit 198c74f43f0f5473f99967aead30ddc622804bc1 Thanks for reviewing, I revised to pick up your version. Functionally there should be no impact on ARM, the TLB flush function is different. Yeah, i agree your point on ARM, but your patch moved the function from x86 to the common code, that means this function is reused between ARM and x86. No? Yes you pretty much summarized it. My point was more like I'm glad the change had no impact on ARM :) Thanks, - Mario -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v7 0/4] arm: dirty page logging support for ARMv7
On 06/08/2014 03:45 AM, Christoffer Dall wrote: On Tue, Jun 03, 2014 at 04:19:23PM -0700, Mario Smarduch wrote: This patch adds support for dirty page logging so far tested only on ARMv7. With dirty page logging, GICv2 vGIC and arch timer save/restore support, live migration is supported. Dirty page logging support - - initially write protects VM RAM memory regions - 2nd stage page tables - add support to read dirty page log and again write protect the dirty pages - second stage page table for next pass. - second stage huge page are disolved into page tables to keep track of dirty pages at page granularity. Tracking at huge page granularity limits migration to an almost idle system. There are couple approaches to handling huge pages: 1 - break up huge page into page table and write protect all pte's 2 - clear the PMD entry, create a page table install the faulted page entry and write protect it. not sure I fully understand. Is option 2 simply write-protecting all PMDs and splitting it at fault time? No that's 1 above. Option 2 is the optimized solution you describe in patch 4 review - clear the PMD and let stage2_set_pte allocate a page table and install the pte, then it's demand faulting on future access to that PMD range. This patch implements #2, in the future #1 may be implemented depending on more bench mark results. Option 1: may over commit and do unnecessary work, but on heavy loads appears to converge faster during live migration Option 2: Only write protects pages that are accessed, migration varies, takes longer then Option 1 but eventually catches up. - In the event migration is canceled, normal behavior is resumed huge pages are rebuilt over time. - Another alternative is use of reverse mappings where for each level 2nd stage tables (PTE, PMD, PUD) pointers to spte's are maintained (x86 impl.). Primary reverse mapping benefits are for mmu notifiers for large memory range invalidations. 
Reverse mappings also improve dirty page logging, instead of walking page tables, spete pointers are accessed directly via reverse map array. - Reverse mappings will be considered for future support once the current implementation is hardened. Is the following a list of your future work? I guess yes and no, with exception of lmbench I've ran these tests also couple other folks have tested with prior revisions. I'll run more (overnight, burn in tests) adding lmbench, but I'm hoping others will run tests to give this more run time, different loads and so on. o validate current dirty page logging support o VMID TLB Flushing, migrating multiple guests o GIC/arch-timer migration o migration under various loads, primarily page reclaim and validate current mmu-notifiers o Run benchmarks (lmbench for now) and test impact on performance, and optimize o Test virtio - since it writes into guest memory. Wait until pci is supported on ARM. So you're not testing with virtio now? Your command line below seems to suggest that in fact you are. /me confused. Yes so I've see no errors with virtio-mmio transport and virto-net-device, blk-device backends under moderate loads. But virtio inbound is purely user space in this case QEMU so I can't say with certainty that virtio is 100%. Sometime back I found problems with virtio-mmio when transport and backend are not fused together none of the performance options (UFO, TSO, Partial Checksum...) got applied, like they did for virti-net-pci. So to summarize I need to see how virtio tracks dirty pages for virtio-mmio, and virtio-pci in QEMU. I have fair idea where to look but have not done so yet. o Currently on ARM, KVM doesn't appear to write into Guest address space, need to mark those pages dirty too (???). not sure what you mean here, can you expand? 
For few architectures KVM writes into guest memory, one example is PV-EOI, will write into guest memory to disable/enable PV-EOI while injecting an interrupt - based one number of in flight interrupts. There is other code that does it too, but I'm not familiar with all the use cases. So if we do that on ARM the page(s) must marked dirty. - Move onto ARMv8 since 2nd stage mmu is shared between both architectures. But in addition to dirty page log additional support for GIC, arch timers, and emulated devices is required. Also working on emulated platform masks a lot of potential bugs, but does help to get majority of code working. Test Environment: --- NOTE: RUNNING on FAST Models will hardly ever fail and mask bugs, infact initially light loads were succeeding without dirty page logging support. --- - Will put all components on github, including test setup diagram - In short
Re: [PATCH v7 1/4] arm: add ARMv7 HYP API to flush VM TLBs without address param
On 06/08/2014 05:05 AM, Christoffer Dall wrote: On Tue, Jun 03, 2014 at 04:19:24PM -0700, Mario Smarduch wrote: Patch adds HYP interface for global VM TLB invalidation without address parameter. Added ARM version of kvm_flush_remote_tlbs(), made the generic implementation a weak symbol. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_asm.h |1 + arch/arm/kvm/interrupts.S | 11 +++ arch/arm/kvm/mmu.c | 14 ++ virt/kvm/kvm_main.c|2 +- 4 files changed, 27 insertions(+), 1 deletion(-) diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index 53b3c4a..21bc519 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -78,6 +78,7 @@ extern char __kvm_hyp_code_end[]; extern void __kvm_flush_vm_context(void); extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa); +extern void __kvm_tlb_flush_vmid(struct kvm *kvm); extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); #endif diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S index 0d68d40..bddc66b 100644 --- a/arch/arm/kvm/interrupts.S +++ b/arch/arm/kvm/interrupts.S @@ -66,6 +66,17 @@ ENTRY(__kvm_tlb_flush_vmid_ipa) bx lr ENDPROC(__kvm_tlb_flush_vmid_ipa) +/** + * void __kvm_tlb_flush_vmid(struct kvm *kvm) - Flush per-VMID TLBs + * + * Reuses __kvm_tlb_flush_vmid_ipa() for ARMv7, without passing address + * parameter + */ + +ENTRY(__kvm_tlb_flush_vmid) +b __kvm_tlb_flush_vmid_ipa +ENDPROC(__kvm_tlb_flush_vmid) + / * Flush TLBs and instruction caches of all CPUs inside the inner-shareable * domain, for all VMIDs diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 2ac9588..ef29540 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -56,6 +56,20 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa); } +/** + * kvm_flush_remote_tlbs() - flush all VM TLB entries + * @kvm: pointer to kvm structure. 
+ * + * Interface to HYP function to flush all VM TLB entries without address + * parameter. In HYP mode reuses __kvm_tlb_flush_vmid_ipa() function used by + * kvm_tlb_flush_vmid_ipa(). + */ +void kvm_flush_remote_tlbs(struct kvm *kvm) +{ +if (kvm) +kvm_call_hyp(__kvm_tlb_flush_vmid, kvm); +} + static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, int min, int max) { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index fa70c6e..ba25765 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -184,7 +184,7 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) return called; } -void kvm_flush_remote_tlbs(struct kvm *kvm) +void __weak kvm_flush_remote_tlbs(struct kvm *kvm) { long dirty_count = kvm-tlbs_dirty; -- 1.7.9.5 This doesn't build or link on aarch64 :( -Christoffer I'll recompile and retest the dirty page logging portion on ARMv8 and resolve these issues, early next week. In the meantime if it's ok with you, I'' move forward with the rest of the patches on ARMv7 to get through critical issues. Would that work? - Mario -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v7 2/4] arm: dirty page logging initial mem region write protect (w/no huge PUD support)
On 06/08/2014 05:05 AM, Christoffer Dall wrote: On Tue, Jun 03, 2014 at 04:19:25PM -0700, Mario Smarduch wrote: Patch adds memslot support for initial write protection and split up of huge pages. This patch series assumes that huge PUDs will not be used to map VM memory. This patch depends on the unmap_range() patch, it needs to be applied first. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |2 + arch/arm/include/asm/kvm_mmu.h| 20 ++ arch/arm/include/asm/pgtable-3level.h |1 + arch/arm/kvm/arm.c|6 ++ arch/arm/kvm/mmu.c| 114 + 5 files changed, 143 insertions(+) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 193ceaf..59565f5 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -231,4 +231,6 @@ int kvm_perf_teardown(void); u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); +void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); + #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 5cc0b0f..08ab5e8 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -114,6 +114,26 @@ static inline void kvm_set_s2pmd_writable(pmd_t *pmd) pmd_val(*pmd) |= L_PMD_S2_RDWR; } +static inline void kvm_set_s2pte_readonly(pte_t *pte) +{ +pte_val(*pte) = (pte_val(*pte) ~L_PTE_S2_RDWR) | L_PTE_S2_RDONLY; +} + +static inline bool kvm_s2pte_readonly(pte_t *pte) +{ +return (pte_val(*pte) L_PTE_S2_RDWR) == L_PTE_S2_RDONLY; +} + +static inline void kvm_set_s2pmd_readonly(pmd_t *pmd) +{ +pmd_val(*pmd) = (pmd_val(*pmd) ~L_PMD_S2_RDWR) | L_PMD_S2_RDONLY; +} + +static inline bool kvm_s2pmd_readonly(pmd_t *pmd) +{ +return (pmd_val(*pmd) L_PMD_S2_RDWR) == L_PMD_S2_RDONLY; +} + not crazy about the names, how about kvm_set_s2_pte_readonly etc.? So kvm_set_s2pte_writable(pte_t *pte) was there already just following that convention. 
the fact that these don't exist for arm64 makes me think it may break the build for arm64 as well... Yes will address it. /* Open coded p*d_addr_end that can deal with 64bit addresses */ #define kvm_pgd_addr_end(addr, end) \ ({ u64 __boundary = ((addr) + PGDIR_SIZE) PGDIR_MASK;\ diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index 85c60ad..d8bb40b 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -129,6 +129,7 @@ #define L_PTE_S2_RDONLY (_AT(pteval_t, 1) 6) /* HAP[1] */ #define L_PTE_S2_RDWR (_AT(pteval_t, 3) 6) /* HAP[2:1] */ +#define L_PMD_S2_RDONLY (_AT(pteval_t, 1) 6) /* HAP[1] */ #define L_PMD_S2_RDWR (_AT(pmdval_t, 3) 6) /* HAP[2:1] */ /* diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 3c82b37..dfd63ac 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -242,6 +242,12 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old, enum kvm_mr_change change) { +/* + * At this point memslot has been committed and the there is an + * allocated dirty_bitmap[] so marking of diryt pages works now on. s/diryt/dirty/ works now on ? 
Ok + */ +if ((change != KVM_MR_DELETE) (mem-flags KVM_MEM_LOG_DIRTY_PAGES)) +kvm_mmu_wp_memory_region(kvm, mem-slot); } void kvm_arch_flush_shadow_all(struct kvm *kvm) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index ef29540..e5dff85 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -760,6 +760,120 @@ static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap) return false; } + +/** + * stage2_wp_pte_range - write protect PTE range + * @pmd:pointer to pmd entry + * @addr: range start address + * @end:range end address + */ +static void stage2_wp_pte_range(pmd_t *pmd, phys_addr_t addr, phys_addr_t end) +{ +pte_t *pte; + +pte = pte_offset_kernel(pmd, addr); +do { +if (!pte_none(*pte)) { +if (!kvm_s2pte_readonly(pte)) +kvm_set_s2pte_readonly(pte); do you need the test before setting readonly? Probably not. Some memory regions have hardly any pages present and sometimes not dirty. Was thinking of couple enhancements not to flush if there are no dirty pages or few dirty pages then just flush by IPA. But currently not doing anything with this info, leave it for future. +} +} while (pte++, addr += PAGE_SIZE
Re: [PATCH v7 2/4] arm: dirty page logging initial mem region write protect (w/no huge PUD support)
On 06/09/2014 11:09 AM, Christoffer Dall wrote: On Mon, Jun 09, 2014 at 10:58:18AM -0700, Mario Smarduch wrote: On 06/08/2014 05:05 AM, Christoffer Dall wrote: On Tue, Jun 03, 2014 at 04:19:25PM -0700, Mario Smarduch wrote: Patch adds memslot support for initial write protection and split up of huge pages. This patch series assumes that huge PUDs will not be used to map VM memory. This patch depends on the unmap_range() patch, it needs to be applied first. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |2 + arch/arm/include/asm/kvm_mmu.h| 20 ++ arch/arm/include/asm/pgtable-3level.h |1 + arch/arm/kvm/arm.c|6 ++ arch/arm/kvm/mmu.c| 114 + 5 files changed, 143 insertions(+) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 193ceaf..59565f5 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -231,4 +231,6 @@ int kvm_perf_teardown(void); u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); +void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); + #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 5cc0b0f..08ab5e8 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -114,6 +114,26 @@ static inline void kvm_set_s2pmd_writable(pmd_t *pmd) pmd_val(*pmd) |= L_PMD_S2_RDWR; } +static inline void kvm_set_s2pte_readonly(pte_t *pte) +{ + pte_val(*pte) = (pte_val(*pte) ~L_PTE_S2_RDWR) | L_PTE_S2_RDONLY; +} + +static inline bool kvm_s2pte_readonly(pte_t *pte) +{ + return (pte_val(*pte) L_PTE_S2_RDWR) == L_PTE_S2_RDONLY; +} + +static inline void kvm_set_s2pmd_readonly(pmd_t *pmd) +{ + pmd_val(*pmd) = (pmd_val(*pmd) ~L_PMD_S2_RDWR) | L_PMD_S2_RDONLY; +} + +static inline bool kvm_s2pmd_readonly(pmd_t *pmd) +{ + return (pmd_val(*pmd) L_PMD_S2_RDWR) == L_PMD_S2_RDONLY; +} + not crazy about the names, how about 
kvm_set_s2_pte_readonly etc.? So kvm_set_s2pte_writable(pte_t *pte) was there already just following that convention. ah, ok, no problem then. the fact that these don't exist for arm64 makes me think it may break the build for arm64 as well... Yes will address it. /* Open coded p*d_addr_end that can deal with 64bit addresses */ #define kvm_pgd_addr_end(addr, end) \ ({u64 __boundary = ((addr) + PGDIR_SIZE) PGDIR_MASK; \ diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index 85c60ad..d8bb40b 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -129,6 +129,7 @@ #define L_PTE_S2_RDONLY (_AT(pteval_t, 1) 6) /* HAP[1] */ #define L_PTE_S2_RDWR (_AT(pteval_t, 3) 6) /* HAP[2:1] */ +#define L_PMD_S2_RDONLY (_AT(pteval_t, 1) 6) /* HAP[1] */ #define L_PMD_S2_RDWR (_AT(pmdval_t, 3) 6) /* HAP[2:1] */ /* diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 3c82b37..dfd63ac 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -242,6 +242,12 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old, enum kvm_mr_change change) { + /* + * At this point memslot has been committed and the there is an + * allocated dirty_bitmap[] so marking of diryt pages works now on. s/diryt/dirty/ works now on ? Ok Sorry I thought it was comment. This function is called after the memslots have been committed so we know dirty bit map has been allocated and marking the dirty bitmap will work as the pages are being write protected and we're getting faults. I don't understand what works now on means, so you need to clarify. 
+ */ + if ((change != KVM_MR_DELETE) (mem-flags KVM_MEM_LOG_DIRTY_PAGES)) + kvm_mmu_wp_memory_region(kvm, mem-slot); } void kvm_arch_flush_shadow_all(struct kvm *kvm) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index ef29540..e5dff85 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -760,6 +760,120 @@ static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap) return false; } + +/** + * stage2_wp_pte_range - write protect PTE range + * @pmd: pointer to pmd entry + * @addr: range start address + * @end: range end address + */ +static void stage2_wp_pte_range(pmd_t *pmd, phys_addr_t addr, phys_addr_t end) +{ + pte_t *pte; + + pte = pte_offset_kernel(pmd, addr); + do { + if (!pte_none(*pte)) { + if (!kvm_s2pte_readonly(pte
Re: [PATCH v7 1/4] arm: add ARMv7 HYP API to flush VM TLBs without address param
On 06/09/2014 10:49 AM, Christoffer Dall wrote: On Mon, Jun 09, 2014 at 10:06:29AM -0700, Mario Smarduch wrote: On 06/08/2014 05:05 AM, Christoffer Dall wrote: On Tue, Jun 03, 2014 at 04:19:24PM -0700, Mario Smarduch wrote: Patch adds HYP interface for global VM TLB invalidation without address parameter. Added ARM version of kvm_flush_remote_tlbs(), made the generic implementation a weak symbol. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_asm.h |1 + arch/arm/kvm/interrupts.S | 11 +++ arch/arm/kvm/mmu.c | 14 ++ virt/kvm/kvm_main.c|2 +- 4 files changed, 27 insertions(+), 1 deletion(-) diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index 53b3c4a..21bc519 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -78,6 +78,7 @@ extern char __kvm_hyp_code_end[]; extern void __kvm_flush_vm_context(void); extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa); +extern void __kvm_tlb_flush_vmid(struct kvm *kvm); extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); #endif diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S index 0d68d40..bddc66b 100644 --- a/arch/arm/kvm/interrupts.S +++ b/arch/arm/kvm/interrupts.S @@ -66,6 +66,17 @@ ENTRY(__kvm_tlb_flush_vmid_ipa) bx lr ENDPROC(__kvm_tlb_flush_vmid_ipa) +/** + * void __kvm_tlb_flush_vmid(struct kvm *kvm) - Flush per-VMID TLBs + * + * Reuses __kvm_tlb_flush_vmid_ipa() for ARMv7, without passing address + * parameter + */ + +ENTRY(__kvm_tlb_flush_vmid) + b __kvm_tlb_flush_vmid_ipa +ENDPROC(__kvm_tlb_flush_vmid) + / * Flush TLBs and instruction caches of all CPUs inside the inner-shareable * domain, for all VMIDs diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 2ac9588..ef29540 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -56,6 +56,20 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa); } +/** + * 
kvm_flush_remote_tlbs() - flush all VM TLB entries + * @kvm: pointer to kvm structure. + * + * Interface to HYP function to flush all VM TLB entries without address + * parameter. In HYP mode reuses __kvm_tlb_flush_vmid_ipa() function used by + * kvm_tlb_flush_vmid_ipa(). + */ +void kvm_flush_remote_tlbs(struct kvm *kvm) +{ + if (kvm) + kvm_call_hyp(__kvm_tlb_flush_vmid, kvm); +} + static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, int min, int max) { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index fa70c6e..ba25765 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -184,7 +184,7 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) return called; } -void kvm_flush_remote_tlbs(struct kvm *kvm) +void __weak kvm_flush_remote_tlbs(struct kvm *kvm) { long dirty_count = kvm-tlbs_dirty; -- 1.7.9.5 This doesn't build or link on aarch64 :( -Christoffer I'll recompile and retest the dirty page logging portion on ARMv8 and resolve these issues, early next week. In the meantime if it's ok with you, I'' move forward with the rest of the patches on ARMv7 to get through critical issues. Would that work? Your patches need to at least compile with other architectures, I suggest fixing that up front. You also eventually need to somehow test (or ask maintainers to test) a branch with your patches on the architectures that your code messes with. This later point can be addressed once we're close to a consensus on ARM. Ok got it. So far I just checked to make sure x86 compiles since one function is generic and declared __weak, both x86 and ARM share it now (Xiaos comment). -Christoffer -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RESEND PATCH v7 3/4] arm: dirty log write protect management support
On 06/08/2014 05:05 AM, Christoffer Dall wrote: On Fri, Jun 06, 2014 at 10:33:41AM -0700, Mario Smarduch wrote: kvm_vm_ioctl_get_dirty_log() is generic used by x86, ARM. x86 recent patch changed this function, this patch picks up those changes, re-tested everything works. Applies cleanly with other patches. This patch adds support for keeping track of VM dirty pages. As dirty page log is retrieved, the pages that have been written are write protected again for next write and log read. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |3 ++ arch/arm/kvm/arm.c |5 --- arch/arm/kvm/mmu.c | 79 +++ arch/x86/kvm/x86.c | 86 --- virt/kvm/kvm_main.c | 86 +++ 5 files changed, 168 insertions(+), 91 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 59565f5..b760f9c 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -232,5 +232,8 @@ u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, +struct kvm_memory_slot *slot, +gfn_t gfn_offset, unsigned long mask); Do all other architectures implement this function? arm64? Besides arm, x86 but the function is not generic. #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index dfd63ac..f06fb21 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -780,11 +780,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } } -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) -{ -return -EINVAL; -} - What about the other architectures implementing this function? Six architectures define this function. With this patch this function is generic in kvm_main.c used by x86. 
static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev_addr) { diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index e5dff85..907344c 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -874,6 +874,85 @@ void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) spin_unlock(kvm-mmu_lock); } +/** + * stage2_wp_mask_range() - write protect memslot pages set in mask + * @pmd - pointer to page table + * @start_ipa - the start range of mask + * @addr - start_ipa or start range of adjusted mask if crossing PMD range + * @mask - mask of dirty pages + * + * Walk mask and write protect the associated dirty pages in the memory region. + * If mask crosses a PMD range adjust it to next page table and return. + */ +static void stage2_wp_mask_range(pmd_t *pmd, phys_addr_t start_ipa, +phys_addr_t *addr, unsigned long *mask) +{ +pte_t *pte; +bool crosses_pmd; +int i = __ffs(*mask); + +if (unlikely(*addr start_ipa)) +start_ipa = *addr - i * PAGE_SIZE; huh? +pte = pte_offset_kernel(pmd, start_ipa); +for (*addr = start_ipa + i * PAGE_SIZE; *mask; +i = __ffs(*mask), *addr = start_ipa + i * PAGE_SIZE) { +crosses_pmd = !!((start_ipa PMD_MASK) ^ (*addr PMD_MASK)); +if (unlikely(crosses_pmd)) { +/* Adjust mask dirty bits relative to next page table */ +*mask = (PTRS_PER_PTE - pte_index(start_ipa)); +return; +} +if (!pte_none(pte[i])) +kvm_set_s2pte_readonly(pte[i]); +*mask = ~(1 i); This is *really* complicated, and *really* unintuitive and *really* hard to read! I feel this may very likely break, and is optimizing prematurely for some very special case. Can't you follow the usual scheme of traversing the levels one-by-one and just calculate the 'end' address based on the number of bits in your long, and just adjust the mask in the calling function each time you are about to call a lower-level function? Agreed I'll extend wp_range functions, it probably makes no sense to be optimizing at this phase. 
In fact, I think this could be trivially implemented as an extension to your existing wp_range functions. On ARM you are mostly going to consider 32 pages, on arm64 you are mostly going to consider 64 pages, just calculate that range in terms of IPAs and set that as the limit for calling stage2_wp_pgd_range (which should be factor'ed out into its function and called from kvm_mmu_wp_memory_region). +} +} + +/** + * kvm_mmu_write_protected_pt_masked() - write protect dirty pages set in mask + * @kvm:The KVM pointer + * @slot: The memory
Re: [RESEND PATCH v7 3/4] arm: dirty log write protect management support
On 06/10/2014 02:22 AM, Christoffer Dall wrote: On Mon, Jun 09, 2014 at 06:47:12PM -0700, Mario Smarduch wrote: On 06/08/2014 05:05 AM, Christoffer Dall wrote: On Fri, Jun 06, 2014 at 10:33:41AM -0700, Mario Smarduch wrote: kvm_vm_ioctl_get_dirty_log() is generic used by x86, ARM. x86 recent patch changed this function, this patch picks up those changes, re-tested everything works. Applies cleanly with other patches. This patch adds support for keeping track of VM dirty pages. As dirty page log is retrieved, the pages that have been written are write protected again for next write and log read. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |3 ++ arch/arm/kvm/arm.c |5 --- arch/arm/kvm/mmu.c | 79 +++ arch/x86/kvm/x86.c | 86 --- virt/kvm/kvm_main.c | 86 +++ 5 files changed, 168 insertions(+), 91 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 59565f5..b760f9c 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -232,5 +232,8 @@ u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask); Do all other architectures implement this function? arm64? Besides arm, x86 but the function is not generic. you're now calling this from generic code, so all architecture must implement it, and the prototype should proably be in include/linux/kvm_host.h, not in the arch-specific headers. Ah ok. 
#endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index dfd63ac..f06fb21 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -780,11 +780,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } } -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) -{ - return -EINVAL; -} - What about the other architectures implementing this function? Six architectures define this function. With this patch this function is generic in kvm_main.c used by x86. But you're not defining it as a weak symbol (and I don't suspect that you should unless other archs do this in a *very* different way), so you need to either remove it from the other archs, make it a weak symbol (I hope this is not the case) or do something else. Mistake on my part I just cut and paste Xiaos x86's recent upstream patch and didn't add weak definition. I looked at IA64, MIPS (two of them ), S390 somewhat similar but quite different implementations. They use a sync version, where the dirty bitmaps are maintained at arch level and then copied to memslot-dirty_bitmap. There is only commonality between x86 and ARM right now, x86 uses memslot-dirty_bitmap directly. Maybe this function should go back to architecture layer, it's unlikely it can become generic across all architectures. There is also the issue of kvm_flush_remote_tlbs(), that's also weak, the generic one is using IPIs. Since it's only used in mmu.c maybe make this one static. -Christoffer -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v7 4/4] arm: dirty page logging 2nd stage page fault handling support
On 06/08/2014 05:05 AM, Christoffer Dall wrote: On Tue, Jun 03, 2014 at 04:19:27PM -0700, Mario Smarduch wrote: This patch adds support for handling 2nd stage page faults during migration, it disables faulting in huge pages, and disolves huge pages to page tables. s/disolves/dissolves/g Will do. In case migration is canceled huge pages will be used again. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/kvm/mmu.c | 36 ++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 1c546c9..aca4fbf 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -966,6 +966,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_mmu_memory_cache *memcache = vcpu-arch.mmu_page_cache; struct vm_area_struct *vma; pfn_t pfn; +/* Get logging status, if dirty_bitmap is not NULL then logging is on */ +bool logging_active = !!memslot-dirty_bitmap; write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu)); if (fault_status == FSC_PERM !write_fault) { @@ -1019,10 +1021,16 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, spin_lock(kvm-mmu_lock); if (mmu_notifier_retry(kvm, mmu_seq)) goto out_unlock; -if (!hugetlb !force_pte) + +/* When logging don't spend cycles to check for huge pages */ drop the comment: either explain the entire clause (which would be too long) or don't explain anything. Ok. +if (!hugetlb !force_pte !logging_active) instead of having all this, can't you just change if (is_vm_hugetlb_page(vma)) to if (is_vm_hugetlb_page(vma) !logging_active) then you're also not mucking around with the gfn etc. I didn't want to modify this function too much, but if that's ok that simplifies things a lot. hugetlb = transparent_hugepage_adjust(pfn, fault_ipa); -if (hugetlb) { +/* + * Force all not present/perm faults to PTE handling, address both + * PMD and PTE faults + */ I don't understand this comment? In which case does this apply? 
The cases I see here - - huge page permission fault is forced into page table code while logging - pte permission/not present handled by page table code as before. +if (hugetlb !logging_active) { pmd_t new_pmd = pfn_pmd(pfn, PAGE_S2); new_pmd = pmd_mkhuge(new_pmd); if (writable) { @@ -1034,6 +1042,22 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } else { pte_t new_pte = pfn_pte(pfn, PAGE_S2); if (writable) { +/* + * If pmd is mapping a huge page then clear it and let + * stage2_set_pte() create a pte table. At the sametime + * you write protect the pte (PAGE_S2 pgprot_t). + */ +if (logging_active) { +pmd_t *pmd; +if (hugetlb) { +pfn += pte_index(fault_ipa); +gfn = fault_ipa PAGE_SHIFT; +new_pte = pfn_pte(pfn, PAGE_S2); +} +pmd = stage2_get_pmd(kvm, NULL, fault_ipa); +if (pmd kvm_pmd_huge(*pmd)) +clear_pmd_entry(kvm, pmd, fault_ipa); +} now instead of all this, you just need to check for kvm_pmd_huge() in stage2_set_pte() and if that's true, you clear it, and then then install your new pte. Yes this really simplifies things! kvm_set_s2pte_writable(new_pte); kvm_set_pfn_dirty(pfn); } @@ -1041,6 +1065,14 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, ret = stage2_set_pte(kvm, memcache, fault_ipa, new_pte, false); } +/* + * Log the dirty page in dirty_bitmap[], call regardless if logging is + * disabled or enabled both cases handled safely. + * TODO: for larger page size mark mulitple dirty page bits for each + * 4k page. + */ +if (writable) +mark_page_dirty(kvm, gfn); what if you just faulted in a page on a read which wasn't present before but it happens to belong to a writeable memslot, is that page then dirty? hmmm. A bug, must also check if it was a write fault not just that we're dealing with a writable region. This one could be pretty bad on performance, not to mention in accurate. It will be interesting to see new test results, glad you caught that. Thanks, Mario. 
out_unlock: spin_unlock(kvm-mmu_lock); -- 1.7.9.5 Thanks, -Christoffer -- To unsubscribe from this list: send the line unsubscribe kvm
Re: [PATCH v7 4/4] arm: dirty page logging 2nd stage page fault handling support
On 06/10/2014 11:58 PM, Christoffer Dall wrote: On Tue, Jun 10, 2014 at 11:23:17AM -0700, Mario Smarduch wrote: On 06/08/2014 05:05 AM, Christoffer Dall wrote: On Tue, Jun 03, 2014 at 04:19:27PM -0700, Mario Smarduch wrote: This patch adds support for handling 2nd stage page faults during migration, it disables faulting in huge pages, and disolves huge pages to page tables. s/disolves/dissolves/g Will do. In case migration is canceled huge pages will be used again. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/kvm/mmu.c | 36 ++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 1c546c9..aca4fbf 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -966,6 +966,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_mmu_memory_cache *memcache = vcpu-arch.mmu_page_cache; struct vm_area_struct *vma; pfn_t pfn; + /* Get logging status, if dirty_bitmap is not NULL then logging is on */ + bool logging_active = !!memslot-dirty_bitmap; write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu)); if (fault_status == FSC_PERM !write_fault) { @@ -1019,10 +1021,16 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, spin_lock(kvm-mmu_lock); if (mmu_notifier_retry(kvm, mmu_seq)) goto out_unlock; - if (!hugetlb !force_pte) + + /* When logging don't spend cycles to check for huge pages */ drop the comment: either explain the entire clause (which would be too long) or don't explain anything. Ok. + if (!hugetlb !force_pte !logging_active) instead of having all this, can't you just change if (is_vm_hugetlb_page(vma)) to if (is_vm_hugetlb_page(vma) !logging_active) then you're also not mucking around with the gfn etc. I didn't want to modify this function too much, but if that's ok that simplifies things a lot. Don't worry about the changes as much as the resulting code. 
If something requires a lot of refactoring, usually that can be handled by splitting up renames, factoring out functions, etc. into multiple smaller patches. hugetlb = transparent_hugepage_adjust(pfn, fault_ipa); - if (hugetlb) { + /* + * Force all not present/perm faults to PTE handling, address both + * PMD and PTE faults + */ I don't understand this comment? In which case does this apply? The cases I see here - - huge page permission fault is forced into page table code while logging - pte permission/not present handled by page table code as before. Hmm, the wording doesn't really work for me. I don't think this comment adds anything or is required, when getting this deep into the fault handler etc., one better understand what's going on. The most suitable place for a comment in this work is probably in stage2_set_pte() where you can now detect a kvm_pmd_huge(), when you add that, you may want to add a small comment that this only happens when logging dirty pages. + if (hugetlb !logging_active) { pmd_t new_pmd = pfn_pmd(pfn, PAGE_S2); new_pmd = pmd_mkhuge(new_pmd); if (writable) { @@ -1034,6 +1042,22 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } else { pte_t new_pte = pfn_pte(pfn, PAGE_S2); if (writable) { + /* + * If pmd is mapping a huge page then clear it and let + * stage2_set_pte() create a pte table. At the sametime + * you write protect the pte (PAGE_S2 pgprot_t). + */ + if (logging_active) { + pmd_t *pmd; + if (hugetlb) { + pfn += pte_index(fault_ipa); + gfn = fault_ipa PAGE_SHIFT; + new_pte = pfn_pte(pfn, PAGE_S2); + } + pmd = stage2_get_pmd(kvm, NULL, fault_ipa); + if (pmd kvm_pmd_huge(*pmd)) + clear_pmd_entry(kvm, pmd, fault_ipa); + } now instead of all this, you just need to check for kvm_pmd_huge() in stage2_set_pte() and if that's true, you clear it, and then then install your new pte. Yes this really simplifies things! 
kvm_set_s2pte_writable(new_pte); kvm_set_pfn_dirty(pfn); } @@ -1041,6 +1065,14 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, ret = stage2_set_pte(kvm, memcache, fault_ipa, new_pte, false); } + /* + * Log the dirty page in dirty_bitmap[], call regardless if logging is + * disabled or enabled both cases handled safely. + * TODO: for larger page size mark mulitple dirty page bits for each
Re: [RESEND PATCH v7 3/4] arm: dirty log write protect management support
Hi Paolo, for ARM dirty page logging we have a couple of functions that are generic. - kvm_vm_ioctl_get_dirty_log - is identical to x86 version - kvm_flush_remote_tlbs - ARM version does hardware broadcast it's different from the generic one in kvm_main.c How to proceed to make these generic? Please see below from Christoffer. Current patch moves kvm_vm_ioctl_get_dirty_log() into kvm_main.c and labels it and kvm_flush_remote_tlbs weak. Please advise. Thanks, - Mario So I don't see a lot of use of weak symbols in kvm_main.c (actually on kvmarm/next I don't see any), but we do want to share code when more than one architecture implements something in the exact same way, like it seems x86 and ARM is doing here for this particular function. I think the KVM scheme is usually to check for some define, like: #ifdef KVM_ARCH_HAVE_GET_DIRTY_LOG ret = kvm_arch_get_dirty_log(...); #else ret = kvm_get_dirty_log(...); #endif but Paolo may have a more informed opinion of how to deal with these. Thanks, -Christoffer -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RESEND PATCH v7 3/4] arm: dirty log write protect management support
On 06/11/2014 12:03 AM, Christoffer Dall wrote: There is also the issue of kvm_flush_remote_tlbs(), that's also weak, the generic one is using IPIs. Since it's only used in mmu.c maybe make this one static. So I don't see a lot of use of weak symbols in kvm_main.c (actually on kvmarm/next I don't see any), but we do want to share code when more than one architecture implements something in the exact same way, like it seems x86 and ARM is doing here for this particular function. I think the KVM scheme is usually to check for some define, like: #ifdef KVM_ARCH_HAVE_GET_DIRTY_LOG ret = kvm_arch_get_dirty_log(...); #else ret = kvm_get_dirty_log(...); #endif but Paolo may have a more informed oppinion of how to deal with these. Thanks, -Christoffer One approach I'm trying looking at the code in kvm_main(). This approach applies more to selecting features as opposed to selecting generic vs architecture specific functions. 1.- - add to 'virt/kvm/Kconfig' config HAVE_KVM_ARCH_TLB_FLUSH_ALL bool config HAVE_KVM_ARCH_DIRTY_LOG bool 2.-- For ARM and later ARM64 add to 'arch/arm[64]/kvm/Kconfig' config KVM bool Kernel-based Virtual Machine (KVM) support ... select HAVE_KVM_ARCH_TLB_FLUSH_ALL .. Not for HAVE_KVM_ARCH_DIRTY_LOG given it's shared with x86, but would need to do it for every other architecture that does not share it (except initially for arm64 since it will use the variant that returns -EINVAL until feature is supported) 3-- In kvm_main.c would have something like void kvm_flush_remote_tlbs(struct kvm *kvm) { #ifdef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL kvm_arch_flush_remote_tlbs(kvm); #else long dirty_count = kvm-tlbs_dirty; smp_mb(); if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) ++kvm-stat.remote_tlb_flush; cmpxchg(kvm-tlbs_dirty, dirty_count, 0); #endif } Then add void kvm_flush_remote_tlbs(struct kvm *kvm) definition to arm kvm_host.h. 
Define the function in this case mmu.c For the dirty log function int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { #ifdef CONFIG_HAVE_KVM_ARCH_DIRTY_LOG kvm_arch_vm_ioctl_get_dirty_log(kvm, log); #else int r; struct kvm_memory_slot *memslot; unsigned long n, i; unsigned long *dirty_bitmap; unsigned long *dirty_bitmap_buffer; bool is_dirty = false; ... But then you have to go into every architecture and define the kvm_arch_vm_...() variant. Is this the right way to go? Or is there a simpler way? Thanks, - Mario -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v8 3/4] arm: dirty log write protect management support
This patch adds support for keeping track of VM dirty pages. As dirty page log is retrieved, the pages that have been written are write protected again for next write and log read. For ARMv8 read of dirty log returns invalid operation. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |3 ++ arch/arm/kvm/arm.c | 83 +++ arch/arm/kvm/mmu.c | 22 +++ 3 files changed, 108 insertions(+) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 586c467..dbf3d45 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -233,5 +233,8 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); void kvm_tlb_flush_vmid(struct kvm *kvm); void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask); #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index e11c2dd..cb3c090 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -783,10 +783,93 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } } +#ifdef CONFIG_ARM +/** + * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot + * @kvm: kvm instance + * @log: slot id and address to which we copy the log + * + * We need to keep it in mind that VCPU threads can write to the bitmap + * concurrently. So, to avoid losing data, we keep the following order for + * each bit: + * + * 1. Take a snapshot of the bit and clear it if needed. + * 2. Write protect the corresponding page. + * 3. Flush TLB's if needed. + * 4. Copy the snapshot to the userspace. + * + * Between 2 and 3, the guest may write to the page using the remaining TLB + * entry. This is not a problem because the page will be reported dirty at + * step 4 using the snapshot taken before and step 3 ensures that successive + * writes will be logged for the next call. 
+ */ +int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, + struct kvm_dirty_log *log) +{ + int r; + struct kvm_memory_slot *memslot; + unsigned long n, i; + unsigned long *dirty_bitmap; + unsigned long *dirty_bitmap_buffer; + bool is_dirty = false; + + mutex_lock(kvm-slots_lock); + + r = -EINVAL; + if (log-slot = KVM_USER_MEM_SLOTS) + goto out; + + memslot = id_to_memslot(kvm-memslots, log-slot); + + dirty_bitmap = memslot-dirty_bitmap; + r = -ENOENT; + if (!dirty_bitmap) + goto out; + + n = kvm_dirty_bitmap_bytes(memslot); + + dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long); + memset(dirty_bitmap_buffer, 0, n); + + spin_lock(kvm-mmu_lock); + + for (i = 0; i n / sizeof(long); i++) { + unsigned long mask; + gfn_t offset; + + if (!dirty_bitmap[i]) + continue; + + is_dirty = true; + + mask = xchg(dirty_bitmap[i], 0); + dirty_bitmap_buffer[i] = mask; + + offset = i * BITS_PER_LONG; + kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask); + } + + spin_unlock(kvm-mmu_lock); + + lockdep_assert_held(kvm-slots_lock); + if (is_dirty) + kvm_tlb_flush_vmid(kvm); + + r = -EFAULT; + if (copy_to_user(log-dirty_bitmap, dirty_bitmap_buffer, n)) + goto out; + + r = 0; +out: + mutex_unlock(kvm-slots_lock); + return r; +} +#else int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { return -EINVAL; } +#endif static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev_addr) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 37edcbe..1caf511 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -888,6 +888,28 @@ void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) kvm_tlb_flush_vmid(kvm); spin_unlock(kvm-mmu_lock); } + +/** + * kvm_mmu_write_protected_pt_masked() - write protect dirty pages set in mask + * @kvm: The KVM pointer + * @slot: The memory slot associated with mask + * @gfn_offset:The gfn offset in memory slot + * @mask: The mask of dirty pages at offset 'gfn_offset' in this memory + * slot to be 
write protected + * + * Walks bits set in mask write protects the associated pte's. Caller must + * acquire kvm_mmu_lock. + */ +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + phys_addr_t base_gfn = slot-base_gfn + gfn_offset; + phys_addr_t start = (base_gfn
[PATCH v8 4/4] arm: dirty page logging 2nd stage page fault handling support
This patch adds support for handling 2nd stage page faults during migration, it disables faulting in huge pages, and dissolves huge pages to page tables. In case migration is canceled huge pages will be used again. For ARMv8 logging is hardcoded to false. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/kvm/mmu.c | 31 +-- 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 1caf511..d49df28 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -641,7 +641,8 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache } static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, - phys_addr_t addr, const pte_t *new_pte, bool iomap) + phys_addr_t addr, const pte_t *new_pte, bool iomap, + bool logging_active) { pmd_t *pmd; pte_t *pte, old_pte; @@ -656,6 +657,15 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, return 0; } + /* +* While dirty memory logging, clear PMD entry for huge page and split +* into smaller pages, to track dirty memory at page granularity. 
+*/ + if (logging_active kvm_pmd_huge(*pmd)) { + phys_addr_t ipa = pmd_pfn(*pmd) PAGE_SHIFT; + clear_pmd_entry(kvm, pmd, ipa); + } + /* Create stage-2 page mappings - Level 2 */ if (pmd_none(*pmd)) { if (!cache) @@ -708,7 +718,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, if (ret) goto out; spin_lock(kvm-mmu_lock); - ret = stage2_set_pte(kvm, cache, addr, pte, true); + ret = stage2_set_pte(kvm, cache, addr, pte, true, false); spin_unlock(kvm-mmu_lock); if (ret) goto out; @@ -925,6 +935,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_mmu_memory_cache *memcache = vcpu-arch.mmu_page_cache; struct vm_area_struct *vma; pfn_t pfn; + /* Get logging status, if dirty_bitmap is not NULL then logging is on */ +#ifdef CONFIG_ARM + bool logging_active = !!memslot-dirty_bitmap; +#else + bool logging_active = false; +#endif write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu)); if (fault_status == FSC_PERM !write_fault) { @@ -935,7 +951,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, /* Let's check if we will get back a huge page backed by hugetlbfs */ down_read(current-mm-mmap_sem); vma = find_vma_intersection(current-mm, hva, hva + 1); - if (is_vm_hugetlb_page(vma)) { + if (is_vm_hugetlb_page(vma) !logging_active) { hugetlb = true; gfn = (fault_ipa PMD_MASK) PAGE_SHIFT; } else { @@ -978,7 +994,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, spin_lock(kvm-mmu_lock); if (mmu_notifier_retry(kvm, mmu_seq)) goto out_unlock; - if (!hugetlb !force_pte) + if (!hugetlb !force_pte !logging_active) hugetlb = transparent_hugepage_adjust(pfn, fault_ipa); if (hugetlb) { @@ -997,9 +1013,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, kvm_set_pfn_dirty(pfn); } coherent_cache_guest_page(vcpu, hva, PAGE_SIZE); - ret = stage2_set_pte(kvm, memcache, fault_ipa, new_pte, false); + ret = stage2_set_pte(kvm, memcache, fault_ipa, new_pte, false, + 
logging_active); } + if (write_fault) + mark_page_dirty(kvm, gfn); out_unlock: spin_unlock(kvm-mmu_lock); @@ -1150,7 +1169,7 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data) { pte_t *pte = (pte_t *)data; - stage2_set_pte(kvm, NULL, gpa, pte, false); + stage2_set_pte(kvm, NULL, gpa, pte, false, false); } -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v8 2/4] arm: dirty page logging initial mem region write protect (w/no huge PUD support)
Patch adds support for initial write protection VM memlsot. This patch series assumes that huge PUDs will not be used in 2nd stage tables. For ARMv8 nothing happens here. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |1 + arch/arm/include/asm/kvm_mmu.h| 20 ++ arch/arm/include/asm/pgtable-3level.h |1 + arch/arm/kvm/arm.c|9 +++ arch/arm/kvm/mmu.c| 128 + 5 files changed, 159 insertions(+) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index ac3bb65..586c467 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -232,5 +232,6 @@ u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); void kvm_tlb_flush_vmid(struct kvm *kvm); +void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 5cc0b0f..08ab5e8 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -114,6 +114,26 @@ static inline void kvm_set_s2pmd_writable(pmd_t *pmd) pmd_val(*pmd) |= L_PMD_S2_RDWR; } +static inline void kvm_set_s2pte_readonly(pte_t *pte) +{ + pte_val(*pte) = (pte_val(*pte) ~L_PTE_S2_RDWR) | L_PTE_S2_RDONLY; +} + +static inline bool kvm_s2pte_readonly(pte_t *pte) +{ + return (pte_val(*pte) L_PTE_S2_RDWR) == L_PTE_S2_RDONLY; +} + +static inline void kvm_set_s2pmd_readonly(pmd_t *pmd) +{ + pmd_val(*pmd) = (pmd_val(*pmd) ~L_PMD_S2_RDWR) | L_PMD_S2_RDONLY; +} + +static inline bool kvm_s2pmd_readonly(pmd_t *pmd) +{ + return (pmd_val(*pmd) L_PMD_S2_RDWR) == L_PMD_S2_RDONLY; +} + /* Open coded p*d_addr_end that can deal with 64bit addresses */ #define kvm_pgd_addr_end(addr, end)\ ({ u64 __boundary = ((addr) + PGDIR_SIZE) PGDIR_MASK;\ diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index 85c60ad..d8bb40b 100644 --- 
a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -129,6 +129,7 @@ #define L_PTE_S2_RDONLY(_AT(pteval_t, 1) 6) /* HAP[1] */ #define L_PTE_S2_RDWR (_AT(pteval_t, 3) 6) /* HAP[2:1] */ +#define L_PMD_S2_RDONLY(_AT(pteval_t, 1) 6) /* HAP[1] */ #define L_PMD_S2_RDWR (_AT(pmdval_t, 3) 6) /* HAP[2:1] */ /* diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 3c82b37..e11c2dd 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -242,6 +242,15 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old, enum kvm_mr_change change) { +#ifdef CONFIG_ARM + /* +* At this point memslot has been committed and there is an +* allocated dirty_bitmap[], dirty pages will be be tracked while the +* memory slot is write protected. +*/ + if ((change != KVM_MR_DELETE) (mem-flags KVM_MEM_LOG_DIRTY_PAGES)) + kvm_mmu_wp_memory_region(kvm, mem-slot); +#endif } void kvm_arch_flush_shadow_all(struct kvm *kvm) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index e90b9e4..37edcbe 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -762,6 +762,134 @@ static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap) return false; } +#ifdef CONFIG_ARM +/** + * stage2_wp_pte_range - write protect PTE range + * @pmd: pointer to pmd entry + * @addr: range start address + * @end: range end address + */ +static void stage2_wp_pte_range(pmd_t *pmd, phys_addr_t addr, phys_addr_t end) +{ + pte_t *pte; + + pte = pte_offset_kernel(pmd, addr); + do { + if (!pte_none(*pte)) { + if (!kvm_s2pte_readonly(pte)) + kvm_set_s2pte_readonly(pte); + } + } while (pte++, addr += PAGE_SIZE, addr != end); +} + +/** + * stage2_wp_pmd_range - write protect PMD range + * @pud: pointer to pud entry + * @addr: range start address + * @end: range end address + */ +static void stage2_wp_pmd_range(pud_t *pud, phys_addr_t addr, phys_addr_t end) +{ + pmd_t *pmd; + phys_addr_t next; + + pmd = pmd_offset(pud, addr); + + do { + next = 
kvm_pmd_addr_end(addr, end); + if (!pmd_none(*pmd)) { + if (kvm_pmd_huge(*pmd)) { + if (!kvm_s2pmd_readonly(pmd)) + kvm_set_s2pmd_readonly(pmd); + } else + stage2_wp_pte_range(pmd, addr, next); + + } + } while (pmd
[PATCH v8 1/4] arm: add ARMv7 HYP API to flush VM TLBs without address param
Patch adds HYP interface for global VM TLB invalidation without address parameter. Moved VM TLB flushing back to architecture layer. This patch depends on the unmap_range() patch, it needs to be applied first. No changes to ARMv8. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_asm.h |1 + arch/arm/include/asm/kvm_host.h |2 ++ arch/arm/kvm/interrupts.S | 11 +++ arch/arm/kvm/mmu.c | 16 4 files changed, 30 insertions(+) diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index 53b3c4a..21bc519 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -78,6 +78,7 @@ extern char __kvm_hyp_code_end[]; extern void __kvm_flush_vm_context(void); extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa); +extern void __kvm_tlb_flush_vmid(struct kvm *kvm); extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); #endif diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 193ceaf..ac3bb65 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -231,4 +231,6 @@ int kvm_perf_teardown(void); u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); +void kvm_tlb_flush_vmid(struct kvm *kvm); + #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S index 0d68d40..a3717b7 100644 --- a/arch/arm/kvm/interrupts.S +++ b/arch/arm/kvm/interrupts.S @@ -66,6 +66,17 @@ ENTRY(__kvm_tlb_flush_vmid_ipa) bx lr ENDPROC(__kvm_tlb_flush_vmid_ipa) +/** + * void __kvm_tlb_flush_vmid(struct kvm *kvm) - Flush per-VMID TLBs + * @kvm: pointer to kvm structure + * + * Reuses __kvm_tlb_flush_vmid_ipa() for ARMv7, without passing address + * parameter + */ +ENTRY(__kvm_tlb_flush_vmid) + b __kvm_tlb_flush_vmid_ipa +ENDPROC(__kvm_tlb_flush_vmid) + / * Flush TLBs and instruction caches of all CPUs inside the inner-shareable * domain, for all VMIDs 
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 2ac9588..e90b9e4 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -56,6 +56,22 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa); } +#ifdef CONFIG_ARM +/** + * kvm_tlb_flush_vmid() - flush all VM TLB entries + * @kvm: pointer to kvm structure. + * + * Interface to HYP function to flush all VM TLB entries without address + * parameter. In HYP mode reuses __kvm_tlb_flush_vmid_ipa() function used by + * kvm_tlb_flush_vmid_ipa(). + */ +void kvm_tlb_flush_vmid(struct kvm *kvm) +{ + if (kvm) + kvm_call_hyp(__kvm_tlb_flush_vmid, kvm); +} +#endif + static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, int min, int max) { -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v8 0/4] arm: dirty page logging support for ARMv7
This patch adds support for dirty page logging so far tested only on ARMv7, and verified to compile on ARMv8. With dirty page logging, GICv2 vGIC and arch timer save/restore support, live migration is supported. Dirty page logging support - - initially write protects VM RAM memory regions - 2nd stage page tables - add support to read dirty page log and again write protect the dirty pages - second stage page table for next pass. - second stage huge pages are dissolved into page tables to keep track of dirty pages at page granularity. Tracking at huge page granularity limits migration to an almost idle system. - In the event migration is canceled, normal behavior is resumed huge pages are rebuilt over time. - At this time reverse mappings are not used for write protecting of 2nd stage tables. - Future work - Enable dirty memory logging to work on ARMv8 FastModels. Test Environment: --- NOTE: RUNNING on FAST Models will hardly ever fail and mask bugs, in fact initially light loads were succeeding without dirty page logging support. --- - Will put all components on github, including test setup diagram - In short summary o Two ARM Exynos 5440 development platforms - 4-way 1.7 GHz, with 8GB, 256GB storage, 1GBs Ethernet, with swap enabled o NFS Server running Ubuntu 13.04 - both ARM boards mount shared file system - Shared file system includes - QEMU, Guest Kernel, DTB, multiple Ext3 root file systems. 
o Component versions: qemu-1.7.5, vexpress-a15, host/guest kernel 3.15-rc1, o Use QEMU Ctrl+A+C and migrate -d tcp:IP:port command - Destination command syntax: can change smp to 4, machine model outdated, but has been tested on virt by others (need to upgrade) /mnt/migration/qemu-system-arm -enable-kvm -smp 2 -kernel \ /mnt/migration/zImage -dtb /mnt/migration/guest-a15.dtb -m 1792 \ -M vexpress-a15 -cpu cortex-a15 -nographic \ -append root=/dev/vda rw console=ttyAMA0 rootwait \ -drive if=none,file=/mnt/migration/guest1.root,id=vm1 \ -device virtio-blk-device,drive=vm1 \ -netdev type=tap,id=net0,ifname=tap0 \ -device virtio-net-device,netdev=net0,mac=52:54:00:12:34:58 \ -incoming tcp:0:4321 - Source command syntax same except '-incoming' o Test migration of multiple VMs use tap0, tap1, ..., and guest0.root, . has been tested as well. o On source run multiple copies of 'dirtyram.arm' - simple program to dirty pages periodically. ./dirtyarm.ram total mmap size dirty page size sleep time Example: ./dirtyram.arm 102580 812 30 - dirty 102580 pages - 812 pages every 30ms with an incrementing counter - run anywhere from one to as many copies as VM resources can support. If the dirty rate is too high migration will run indefinitely - run date output loop, check date is picked up smoothly - place guest/host into page reclaim/swap mode - by whatever means in this case run multiple copies of 'dirtyram.ram' on host - issue migrate command(s) on source - Top result is 409600, 8192, 5 o QEMU is instrumented to save RAM memory regions on source and destination after memory is migrated, but before guest started. Later files are checksummed on both ends for correctness, given VMs are small this works. o Guest kernel is instrumented to capture current cycle counter - last cycle and compare to qemu down time to test arch timer accuracy. 
o Network failover is at L3 due to interface limitations, ping continues working transparently o Also tested 'migrate_cancel' to test reassembly of huge pages (inserted low level instrumentation code). - Basic Network Test - Assuming one ethernet interface available Source host IP 192.168.10.101/24, VM tap0 192.168.2.1/24 and VM eth0 192.168.2.100/24 with default route 192.168.2.1 Destination host IP 192.168.10.100/24, VM same settings as above. Both VMs have identical MAC addresses. Initially NFS server route to 192.168.2.100 is via 192.168.10.101 - ssh 192.168.2.100 - start migration from source to destination - after migration ends - on NFS server switch routes. route add -host 192.168.2.100 gw 192.168.10.100 ssh should resume after route switch. ping as well should work seamlessly. Changes since v7: - Reworked write protection of dirty page mask - Moved generic code back to architecture layer, keep it there for time being, until a KVM framework for architecture functions to override generic ones is defined. - Fixed condition bug for marking pages dirty Mario Smarduch (4): add ARMv7 HYP API to flush VM TLBs without address param dirty page logging initial mem region write protect (w/no huge PUD support) dirty log write protect management support dirty page logging 2nd stage page fault handling
Re: [RESEND PATCH v7 3/4] arm: dirty log write protect management support
On 07/04/2014 09:29 AM, Paolo Bonzini wrote: Il 03/07/2014 17:04, Christoffer Dall ha scritto: Hmmm, I'm really not an expert in the 'established procedures' for what to put in config files etc., but here's my basic take: a) you wouldn't put a config option in Kconfig unless it's something that's actually configurable or some generic feature/subsystem that should only be enabled if hardware has certain capabilities or other config options enabled. b) this seems entirely an implementation issue and not depending on anything users should select. Actually I think Mario's idea is just fine. Non-user-accessible Kconfig symbols are used a lot to invoke an #ifdef elsewhere in the code; compare this with his proposal is a bit different but not too much. Sometimes #defines are used, sometimes Kconfig symbols, but the idea is the same. Paolo Hi Paolo, thanks for your feedback. I forgot to add that I tried define ARCH_HAVE_... approach but checkpatch rejected it and insisted on Kconfig. Thanks, - Mario -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RESEND PATCH v7 3/4] arm: dirty log write protect management support
Hi Christoffer, Just back from holiday - a short plan to resume work. - move VM tlb flush and kvm log functions to generic, per Paolo's comments use Kconfig approach - update other architectures make sure they compile - Keep it ARMv7 for now Get maintainers to test the branch. In parallel add dirty log support to ARMv8, to test I would add a QEMU monitor function to validate general operation. Your thoughts? Thanks, Mario On 07/03/2014 08:04 AM, Christoffer Dall wrote: On Tue, Jun 17, 2014 at 06:41:52PM -0700, Mario Smarduch wrote: On 06/11/2014 12:03 AM, Christoffer Dall wrote: There is also the issue of kvm_flush_remote_tlbs(), that's also weak, the generic one is using IPIs. Since it's only used in mmu.c maybe make this one static. So I don't see a lot of use of weak symbols in kvm_main.c (actually on kvmarm/next I don't see any), but we do want to share code when more than one architecture implements something in the exact same way, like it seems x86 and ARM is doing here for this particular function. I think the KVM scheme is usually to check for some define, like: #ifdef KVM_ARCH_HAVE_GET_DIRTY_LOG ret = kvm_arch_get_dirty_log(...); #else ret = kvm_get_dirty_log(...); #endif but Paolo may have a more informed opinion of how to deal with these. Thanks, -Christoffer One approach I'm trying looking at the code in kvm_main(). This approach applies more to selecting features as opposed to selecting generic vs architecture specific functions. 1.- - add to 'virt/kvm/Kconfig' config HAVE_KVM_ARCH_TLB_FLUSH_ALL bool config HAVE_KVM_ARCH_DIRTY_LOG bool 2.-- For ARM and later ARM64 add to 'arch/arm[64]/kvm/Kconfig' config KVM bool Kernel-based Virtual Machine (KVM) support ... select HAVE_KVM_ARCH_TLB_FLUSH_ALL .. 
Not for HAVE_KVM_ARCH_DIRTY_LOG given it's shared with x86, but would need to do it for every other architecture that does not share it (except initially for arm64 since it will use the variant that returns -EINVAL until feature is supported) 3-- In kvm_main.c would have something like void kvm_flush_remote_tlbs(struct kvm *kvm) { #ifdef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL kvm_arch_flush_remote_tlbs(kvm); #else long dirty_count = kvm-tlbs_dirty; smp_mb(); if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) ++kvm-stat.remote_tlb_flush; cmpxchg(kvm-tlbs_dirty, dirty_count, 0); #endif } Then add void kvm_flush_remote_tlbs(struct kvm *kvm) definition to arm kvm_host.h. Define the function in this case mmu.c For the dirty log function int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { #ifdef CONFIG_HAVE_KVM_ARCH_DIRTY_LOG kvm_arch_vm_ioctl_get_dirty_log(kvm, log); #else int r; struct kvm_memory_slot *memslot; unsigned long n, i; unsigned long *dirty_bitmap; unsigned long *dirty_bitmap_buffer; bool is_dirty = false; ... But then you have to go into every architecture and define the kvm_arch_vm_...() variant. Is this the right way to go? Or is there a simpler way? Hmmm, I'm really not an expert in the 'established procedures' for what to put in config files etc., but here's my basic take: a) you wouldn't put a config option in Kconfig unless it's comething that's actually configurable or some generic feature/subsystem that should only be enabled if hardware has certain capabilities or other config options enabled. b) this seems entirely an implementation issue and not depending on anything users should select. c) therefore, I think it's either a question of always having an arch-specific implementation that you probe for its return value or you have some sort of define in the header files for the arch/X/include/asm/kvm_host.h to control what you need. 
-Christoffer -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v9 1/4] arm: add ARMv7 HYP API to flush VM TLBs, change generic TLB flush to support arch flush
Patch adds HYP interface for global VM TLB invalidation without address parameter. Generic VM TLB flush calls ARMv7 arch defined TLB flush function. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_asm.h |1 + arch/arm/include/asm/kvm_host.h |1 + arch/arm/kvm/Kconfig|1 + arch/arm/kvm/interrupts.S | 12 arch/arm/kvm/mmu.c | 17 + virt/kvm/Kconfig|3 +++ virt/kvm/kvm_main.c |4 7 files changed, 39 insertions(+) diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index 53b3c4a..21bc519 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -78,6 +78,7 @@ extern char __kvm_hyp_code_end[]; extern void __kvm_flush_vm_context(void); extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa); +extern void __kvm_tlb_flush_vmid(struct kvm *kvm); extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); #endif diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 193ceaf..042206f 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -230,5 +230,6 @@ int kvm_perf_teardown(void); u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); +void kvm_arch_flush_remote_tlbs(struct kvm *); #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig index 466bd29..44d3b6f 100644 --- a/arch/arm/kvm/Kconfig +++ b/arch/arm/kvm/Kconfig @@ -22,6 +22,7 @@ config KVM select ANON_INODES select HAVE_KVM_CPU_RELAX_INTERCEPT select KVM_MMIO + select HAVE_KVM_ARCH_TLB_FLUSH_ALL select KVM_ARM_HOST depends on ARM_VIRT_EXT ARM_LPAE ---help--- diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S index 0d68d40..1258d46 100644 --- a/arch/arm/kvm/interrupts.S +++ b/arch/arm/kvm/interrupts.S @@ -66,6 +66,18 @@ ENTRY(__kvm_tlb_flush_vmid_ipa) bx lr ENDPROC(__kvm_tlb_flush_vmid_ipa) +/** + * void __kvm_tlb_flush_vmid(struct kvm *kvm) - Flush per-VMID 
TLBs + * + * Reuses __kvm_tlb_flush_vmid_ipa() for ARMv7, without passing address + * parameter + */ + +ENTRY(__kvm_tlb_flush_vmid) + b __kvm_tlb_flush_vmid_ipa +ENDPROC(__kvm_tlb_flush_vmid) + + / * Flush TLBs and instruction caches of all CPUs inside the inner-shareable * domain, for all VMIDs diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 2ac9588..35254c6 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -56,6 +56,23 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa); } +#ifdef CONFIG_ARM +/** + * kvm_arch_flush_remote_tlbs() - flush all VM TLB entries + * @kvm: pointer to kvm structure. + * + * Interface to HYP function to flush all VM TLB entries without address + * parameter. In HYP mode reuses __kvm_tlb_flush_vmid_ipa() function used by + * kvm_tlb_flush_vmid_ipa(). + */ +void kvm_arch_flush_remote_tlbs(struct kvm *kvm) +{ + if (kvm) + kvm_call_hyp(__kvm_tlb_flush_vmid, kvm); +} + +#endif + static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, int min, int max) { diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 13f2d19..f1efaa5 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -34,3 +34,6 @@ config HAVE_KVM_CPU_RELAX_INTERCEPT config KVM_VFIO bool + +config HAVE_KVM_ARCH_TLB_FLUSH_ALL + bool diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index fa70c6e..258f3d9 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -186,12 +186,16 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) void kvm_flush_remote_tlbs(struct kvm *kvm) { +#ifdef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL + kvm_arch_flush_remote_tlbs(kvm); +#else long dirty_count = kvm-tlbs_dirty; smp_mb(); if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) ++kvm-stat.remote_tlb_flush; cmpxchg(kvm-tlbs_dirty, dirty_count, 0); +#endif } EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs); -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the 
body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v9 0/4] arm: dirty page logging support for ARMv7
host IP 192.168.10.100/24, VM same settings as above. Both VMs have identical MAC addresses. Initially NFS server route to 192.168.2.100 is via 192.168.10.101 - ssh 192.168.2.100 - start migration from source to destination - after migration ends - on NFS server switch routes. route add -host 192.168.2.100 gw 192.168.10.100 ssh should resume after route switch. ping as well should work seamlessly. Mario Smarduch (4): add ARMv7 HYP API to flush VM TLBs, change generic TLB flush to support arch flush ARMv7 dirty page logging inital mem region write protect (w/no huge PUD support) dirty log write protect mgmt. Moved x86, armv7 to generic, set armv8 ia64 mips powerpc s390 arch specific ARMv7 dirty page logging 2nd stage page fault handling support arch/arm/include/asm/kvm_asm.h|1 + arch/arm/include/asm/kvm_host.h |2 + arch/arm/include/asm/kvm_mmu.h| 20 arch/arm/include/asm/pgtable-3level.h |1 + arch/arm/kvm/Kconfig |1 + arch/arm/kvm/arm.c| 17 ++- arch/arm/kvm/interrupts.S | 12 ++ arch/arm/kvm/mmu.c| 198 - arch/arm64/include/asm/kvm_host.h |2 + arch/arm64/kvm/Kconfig|1 + arch/ia64/include/asm/kvm_host.h |1 + arch/ia64/kvm/Kconfig |1 + arch/ia64/kvm/kvm-ia64.c |2 +- arch/mips/include/asm/kvm_host.h |2 +- arch/mips/kvm/Kconfig |1 + arch/mips/kvm/kvm_mips.c |2 +- arch/powerpc/include/asm/kvm_host.h |2 + arch/powerpc/kvm/Kconfig |1 + arch/powerpc/kvm/book3s.c |2 +- arch/powerpc/kvm/booke.c |2 +- arch/s390/include/asm/kvm_host.h |2 + arch/s390/kvm/Kconfig |1 + arch/s390/kvm/kvm-s390.c |2 +- arch/x86/kvm/x86.c| 86 -- include/linux/kvm_host.h |3 + virt/kvm/Kconfig |6 + virt/kvm/kvm_main.c | 94 27 files changed, 366 insertions(+), 99 deletions(-) -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v9 3/4] arm: dirty log write protect mgmt. Moved x86, armv7 to generic, set armv8 ia64 mips powerpc s390 arch specific
This patch adds support for keeping track of VM dirty pages. As dirty page log is retrieved, the pages that have been written are write protected again for next write and log read. The dirty log read function is generic for armv7 and x86, and arch specific for arm64, ia64, mips, powerpc, s390. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/kvm/arm.c |8 +++- arch/arm/kvm/mmu.c | 22 + arch/arm64/include/asm/kvm_host.h |2 + arch/arm64/kvm/Kconfig |1 + arch/ia64/include/asm/kvm_host.h|1 + arch/ia64/kvm/Kconfig |1 + arch/ia64/kvm/kvm-ia64.c|2 +- arch/mips/include/asm/kvm_host.h|2 +- arch/mips/kvm/Kconfig |1 + arch/mips/kvm/kvm_mips.c|2 +- arch/powerpc/include/asm/kvm_host.h |2 + arch/powerpc/kvm/Kconfig|1 + arch/powerpc/kvm/book3s.c |2 +- arch/powerpc/kvm/booke.c|2 +- arch/s390/include/asm/kvm_host.h|2 + arch/s390/kvm/Kconfig |1 + arch/s390/kvm/kvm-s390.c|2 +- arch/x86/kvm/x86.c | 86 - include/linux/kvm_host.h|3 ++ virt/kvm/Kconfig|3 ++ virt/kvm/kvm_main.c | 90 +++ 21 files changed, 143 insertions(+), 93 deletions(-) diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index e11c2dd..f7739a0 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -783,10 +783,16 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } } -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) +#ifdef CONFIG_ARM64 +/* + * For now features not supported on ARM64, the #ifdef is added to make that + * clear but not needed since ARM64 Kconfig selects function in generic code. 
+ */ +int kvm_arch_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { return -EINVAL; } +#endif static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev_addr) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 7bfc792..ca84331 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -889,6 +889,28 @@ void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) kvm_flush_remote_tlbs(kvm); spin_unlock(kvm-mmu_lock); } + +/** + * kvm_mmu_write_protected_pt_masked() - write protect dirty pages set in mask + * @kvm: The KVM pointer + * @slot: The memory slot associated with mask + * @gfn_offset:The gfn offset in memory slot + * @mask: The mask of dirty pages at offset 'gfn_offset' in this memory + * slot to be write protected + * + * Walks bits set in mask write protects the associated pte's. Caller must + * acquire kvm_mmu_lock. + */ +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + phys_addr_t base_gfn = slot-base_gfn + gfn_offset; + phys_addr_t start = (base_gfn + __ffs(mask)) PAGE_SHIFT; + phys_addr_t end = (base_gfn + __fls(mask) + 1) PAGE_SHIFT; + + stage2_wp_range(kvm, start, end); +} #endif static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 92242ce..b4a280b 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -200,4 +200,6 @@ static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr, hyp_stack_ptr, vector_ptr); } +int kvm_arch_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log); + #endif /* __ARM64_KVM_HOST_H__ */ diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 8ba85e9..9e21a8a 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -22,6 +22,7 @@ config KVM select PREEMPT_NOTIFIERS select ANON_INODES select 
HAVE_KVM_CPU_RELAX_INTERCEPT + select HAVE_KVM_ARCH_DIRTY_LOG select KVM_MMIO select KVM_ARM_HOST select KVM_ARM_VGIC diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h index db95f57..d79f520 100644 --- a/arch/ia64/include/asm/kvm_host.h +++ b/arch/ia64/include/asm/kvm_host.h @@ -594,6 +594,7 @@ void kvm_sal_emul(struct kvm_vcpu *vcpu); #define __KVM_HAVE_ARCH_VM_ALLOC 1 struct kvm *kvm_arch_alloc_vm(void); void kvm_arch_free_vm(struct kvm *kvm); +int kvm_arch_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log); #endif /* __ASSEMBLY__*/ diff --git a/arch/ia64/kvm/Kconfig b/arch/ia64/kvm/Kconfig index 990b864..32dd6c8 100644 --- a/arch/ia64/kvm/Kconfig +++ b/arch/ia64/kvm/Kconfig @@ -24,6 +24,7 @@ config KVM depends on BROKEN select PREEMPT_NOTIFIERS
[PATCH v9 2/4] arm: ARMv7 dirty page logging inital mem region write protect (w/no huge PUD support)
Patch adds support for initial write protection VM memlsot. This patch series assumes that huge PUDs will not be used in 2nd stage tables. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |1 + arch/arm/include/asm/kvm_mmu.h| 20 ++ arch/arm/include/asm/pgtable-3level.h |1 + arch/arm/kvm/arm.c|9 +++ arch/arm/kvm/mmu.c| 128 + 5 files changed, 159 insertions(+) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 042206f..6521a2d 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -231,5 +231,6 @@ int kvm_perf_teardown(void); u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); void kvm_arch_flush_remote_tlbs(struct kvm *); +void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 5cc0b0f..08ab5e8 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -114,6 +114,26 @@ static inline void kvm_set_s2pmd_writable(pmd_t *pmd) pmd_val(*pmd) |= L_PMD_S2_RDWR; } +static inline void kvm_set_s2pte_readonly(pte_t *pte) +{ + pte_val(*pte) = (pte_val(*pte) ~L_PTE_S2_RDWR) | L_PTE_S2_RDONLY; +} + +static inline bool kvm_s2pte_readonly(pte_t *pte) +{ + return (pte_val(*pte) L_PTE_S2_RDWR) == L_PTE_S2_RDONLY; +} + +static inline void kvm_set_s2pmd_readonly(pmd_t *pmd) +{ + pmd_val(*pmd) = (pmd_val(*pmd) ~L_PMD_S2_RDWR) | L_PMD_S2_RDONLY; +} + +static inline bool kvm_s2pmd_readonly(pmd_t *pmd) +{ + return (pmd_val(*pmd) L_PMD_S2_RDWR) == L_PMD_S2_RDONLY; +} + /* Open coded p*d_addr_end that can deal with 64bit addresses */ #define kvm_pgd_addr_end(addr, end)\ ({ u64 __boundary = ((addr) + PGDIR_SIZE) PGDIR_MASK;\ diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index 85c60ad..d8bb40b 100644 --- 
a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -129,6 +129,7 @@ #define L_PTE_S2_RDONLY(_AT(pteval_t, 1) 6) /* HAP[1] */ #define L_PTE_S2_RDWR (_AT(pteval_t, 3) 6) /* HAP[2:1] */ +#define L_PMD_S2_RDONLY(_AT(pteval_t, 1) 6) /* HAP[1] */ #define L_PMD_S2_RDWR (_AT(pmdval_t, 3) 6) /* HAP[2:1] */ /* diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 3c82b37..e11c2dd 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -242,6 +242,15 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old, enum kvm_mr_change change) { +#ifdef CONFIG_ARM + /* +* At this point memslot has been committed and there is an +* allocated dirty_bitmap[], dirty pages will be be tracked while the +* memory slot is write protected. +*/ + if ((change != KVM_MR_DELETE) (mem-flags KVM_MEM_LOG_DIRTY_PAGES)) + kvm_mmu_wp_memory_region(kvm, mem-slot); +#endif } void kvm_arch_flush_shadow_all(struct kvm *kvm) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 35254c6..7bfc792 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -763,6 +763,134 @@ static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap) return false; } +#ifdef CONFIG_ARM +/** + * stage2_wp_pte_range - write protect PTE range + * @pmd: pointer to pmd entry + * @addr: range start address + * @end: range end address + */ +static void stage2_wp_pte_range(pmd_t *pmd, phys_addr_t addr, phys_addr_t end) +{ + pte_t *pte; + + pte = pte_offset_kernel(pmd, addr); + do { + if (!pte_none(*pte)) { + if (!kvm_s2pte_readonly(pte)) + kvm_set_s2pte_readonly(pte); + } + } while (pte++, addr += PAGE_SIZE, addr != end); +} + +/** + * stage2_wp_pmd_range - write protect PMD range + * @pud: pointer to pud entry + * @addr: range start address + * @end: range end address + */ +static void stage2_wp_pmd_range(pud_t *pud, phys_addr_t addr, phys_addr_t end) +{ + pmd_t *pmd; + phys_addr_t next; + + pmd = pmd_offset(pud, addr); + + do { + next = 
kvm_pmd_addr_end(addr, end); + if (!pmd_none(*pmd)) { + if (kvm_pmd_huge(*pmd)) { + if (!kvm_s2pmd_readonly(pmd)) + kvm_set_s2pmd_readonly(pmd); + } else + stage2_wp_pte_range(pmd, addr, next); + + } + } while (pmd
[PATCH v9 4/4] arm: ARMv7 dirty page logging 2nd stage page fault handling support
This patch adds support for handling 2nd stage page faults during migration, it disables faulting in huge pages, and dissolves huge pages to page tables. In case migration is canceled huge pages will be used again. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/kvm/mmu.c | 31 +-- 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index ca84331..a17812a 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -642,7 +642,8 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache } static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, - phys_addr_t addr, const pte_t *new_pte, bool iomap) + phys_addr_t addr, const pte_t *new_pte, bool iomap, + bool logging_active) { pmd_t *pmd; pte_t *pte, old_pte; @@ -657,6 +658,15 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, return 0; } + /* +* While dirty memory logging, clear PMD entry for huge page and split +* into smaller pages, to track dirty memory at page granularity. 
+*/ + if (logging_active kvm_pmd_huge(*pmd)) { + phys_addr_t ipa = pmd_pfn(*pmd) PAGE_SHIFT; + clear_pmd_entry(kvm, pmd, ipa); + } + /* Create stage-2 page mappings - Level 2 */ if (pmd_none(*pmd)) { if (!cache) @@ -709,7 +719,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, if (ret) goto out; spin_lock(kvm-mmu_lock); - ret = stage2_set_pte(kvm, cache, addr, pte, true); + ret = stage2_set_pte(kvm, cache, addr, pte, true, false); spin_unlock(kvm-mmu_lock); if (ret) goto out; @@ -926,6 +936,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_mmu_memory_cache *memcache = vcpu-arch.mmu_page_cache; struct vm_area_struct *vma; pfn_t pfn; + /* Get logging status, if dirty_bitmap is not NULL then logging is on */ + #ifdef CONFIG_ARM + bool logging_active = !!memslot-dirty_bitmap; + #else + bool logging_active = false; + #endif write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu)); if (fault_status == FSC_PERM !write_fault) { @@ -936,7 +952,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, /* Let's check if we will get back a huge page backed by hugetlbfs */ down_read(current-mm-mmap_sem); vma = find_vma_intersection(current-mm, hva, hva + 1); - if (is_vm_hugetlb_page(vma)) { + if (is_vm_hugetlb_page(vma) !logging_active) { hugetlb = true; gfn = (fault_ipa PMD_MASK) PAGE_SHIFT; } else { @@ -979,7 +995,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, spin_lock(kvm-mmu_lock); if (mmu_notifier_retry(kvm, mmu_seq)) goto out_unlock; - if (!hugetlb !force_pte) + if (!hugetlb !force_pte !logging_active) hugetlb = transparent_hugepage_adjust(pfn, fault_ipa); if (hugetlb) { @@ -998,9 +1014,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, kvm_set_pfn_dirty(pfn); } coherent_cache_guest_page(vcpu, hva, PAGE_SIZE); - ret = stage2_set_pte(kvm, memcache, fault_ipa, new_pte, false); + ret = stage2_set_pte(kvm, memcache, fault_ipa, new_pte, false, 
+ logging_active); } + if (write_fault) + mark_page_dirty(kvm, gfn); out_unlock: spin_unlock(kvm-mmu_lock); @@ -1151,7 +1170,7 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data) { pte_t *pte = (pte_t *)data; - stage2_set_pte(kvm, NULL, gpa, pte, false); + stage2_set_pte(kvm, NULL, gpa, pte, false, false); } -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v9 1/4] arm: add ARMv7 HYP API to flush VM TLBs, change generic TLB flush to support arch flush
On 07/24/2014 11:12 PM, Alexander Graf wrote: On 25.07.14 02:56, Mario Smarduch wrote: Patch adds HYP interface for global VM TLB invalidation without address parameter. Generic VM TLB flush calls ARMv7 arch defined TLB flush function. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_asm.h |1 + arch/arm/include/asm/kvm_host.h |1 + arch/arm/kvm/Kconfig|1 + arch/arm/kvm/interrupts.S | 12 arch/arm/kvm/mmu.c | 17 + virt/kvm/Kconfig|3 +++ virt/kvm/kvm_main.c |4 7 files changed, 39 insertions(+) diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index 53b3c4a..21bc519 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -78,6 +78,7 @@ extern char __kvm_hyp_code_end[]; extern void __kvm_flush_vm_context(void); extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa); +extern void __kvm_tlb_flush_vmid(struct kvm *kvm); extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); #endif diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 193ceaf..042206f 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -230,5 +230,6 @@ int kvm_perf_teardown(void); u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); +void kvm_arch_flush_remote_tlbs(struct kvm *); #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig index 466bd29..44d3b6f 100644 --- a/arch/arm/kvm/Kconfig +++ b/arch/arm/kvm/Kconfig @@ -22,6 +22,7 @@ config KVM select ANON_INODES select HAVE_KVM_CPU_RELAX_INTERCEPT select KVM_MMIO +select HAVE_KVM_ARCH_TLB_FLUSH_ALL select KVM_ARM_HOST depends on ARM_VIRT_EXT ARM_LPAE ---help--- diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S index 0d68d40..1258d46 100644 --- a/arch/arm/kvm/interrupts.S +++ b/arch/arm/kvm/interrupts.S @@ -66,6 +66,18 @@ ENTRY(__kvm_tlb_flush_vmid_ipa) bxlr 
ENDPROC(__kvm_tlb_flush_vmid_ipa) +/** + * void __kvm_tlb_flush_vmid(struct kvm *kvm) - Flush per-VMID TLBs + * + * Reuses __kvm_tlb_flush_vmid_ipa() for ARMv7, without passing address + * parameter + */ + +ENTRY(__kvm_tlb_flush_vmid) +b__kvm_tlb_flush_vmid_ipa +ENDPROC(__kvm_tlb_flush_vmid) + + / * Flush TLBs and instruction caches of all CPUs inside the inner-shareable * domain, for all VMIDs diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 2ac9588..35254c6 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -56,6 +56,23 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa); } +#ifdef CONFIG_ARM Why the ifdef? We're in ARM code here, no? For the time being to compile ARM64. +/** + * kvm_arch_flush_remote_tlbs() - flush all VM TLB entries + * @kvm: pointer to kvm structure. + * + * Interface to HYP function to flush all VM TLB entries without address + * parameter. In HYP mode reuses __kvm_tlb_flush_vmid_ipa() function used by + * kvm_tlb_flush_vmid_ipa(). + */ +void kvm_arch_flush_remote_tlbs(struct kvm *kvm) +{ +if (kvm) +kvm_call_hyp(__kvm_tlb_flush_vmid, kvm); I don't see why we should ever call this function with kvm==NULL. Yes that true, I copied a generic arm/arm64 mmu function. But it's use here guarantees kvm != NULL. Alex Thanks, Mario -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v9 2/4] arm: ARMv7 dirty page logging inital mem region write protect (w/no huge PUD support)
On 07/24/2014 11:16 PM, Alexander Graf wrote: On 25.07.14 02:56, Mario Smarduch wrote: Patch adds support for initial write protection VM memlsot. This patch series assumes that huge PUDs will not be used in 2nd stage tables. Is this a valid assumption? Right now it's unclear if PUDs will be used to back guest memory, assuming so required quite a bit of additional code. After discussing on mailing list it was recommended to treat this as BUG_ON case for now. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |1 + arch/arm/include/asm/kvm_mmu.h| 20 ++ arch/arm/include/asm/pgtable-3level.h |1 + arch/arm/kvm/arm.c|9 +++ arch/arm/kvm/mmu.c| 128 + 5 files changed, 159 insertions(+) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 042206f..6521a2d 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -231,5 +231,6 @@ int kvm_perf_teardown(void); u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); void kvm_arch_flush_remote_tlbs(struct kvm *); +void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 5cc0b0f..08ab5e8 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -114,6 +114,26 @@ static inline void kvm_set_s2pmd_writable(pmd_t *pmd) pmd_val(*pmd) |= L_PMD_S2_RDWR; } +static inline void kvm_set_s2pte_readonly(pte_t *pte) +{ +pte_val(*pte) = (pte_val(*pte) ~L_PTE_S2_RDWR) | L_PTE_S2_RDONLY; +} + +static inline bool kvm_s2pte_readonly(pte_t *pte) +{ +return (pte_val(*pte) L_PTE_S2_RDWR) == L_PTE_S2_RDONLY; +} + +static inline void kvm_set_s2pmd_readonly(pmd_t *pmd) +{ +pmd_val(*pmd) = (pmd_val(*pmd) ~L_PMD_S2_RDWR) | L_PMD_S2_RDONLY; +} + +static inline bool kvm_s2pmd_readonly(pmd_t *pmd) +{ +return (pmd_val(*pmd) L_PMD_S2_RDWR) == L_PMD_S2_RDONLY; 
+} + /* Open coded p*d_addr_end that can deal with 64bit addresses */ #define kvm_pgd_addr_end(addr, end)\ ({u64 __boundary = ((addr) + PGDIR_SIZE) PGDIR_MASK;\ diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index 85c60ad..d8bb40b 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -129,6 +129,7 @@ #define L_PTE_S2_RDONLY(_AT(pteval_t, 1) 6) /* HAP[1] */ #define L_PTE_S2_RDWR(_AT(pteval_t, 3) 6) /* HAP[2:1] */ +#define L_PMD_S2_RDONLY(_AT(pteval_t, 1) 6) /* HAP[1] */ #define L_PMD_S2_RDWR(_AT(pmdval_t, 3) 6) /* HAP[2:1] */ /* diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 3c82b37..e11c2dd 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -242,6 +242,15 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old, enum kvm_mr_change change) { +#ifdef CONFIG_ARM Same question on CONFIG_ARM here. Is this the define used to distinguish between 32bit and 64bit? Yes let ARM64 compile. Eventually we'll come back to ARM64 soon, and these will go. Alex -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v9 1/4] arm: add ARMv7 HYP API to flush VM TLBs ... - looking for comments
- place guest/host into page reclaim/swap mode - by whatever means in this case run multiple copies of 'dirtyram.ram' on host - issue migrate command(s) on source - Top result is 409600, 8192, 5 o QEMU is instrumented to save RAM memory regions on source and destination after memory is migrated, but before guest started. Later files are checksummed on both ends for correctness, given VMs are small this works. o Guest kernel is instrumented to capture current cycle counter - last cycle and compare to qemu down time to test arch timer accuracy. o Network failover is at L3 due to interface limitations, ping continues working transparently o Also tested 'migrate_cancel' to test reassemble of huge pages (inserted low level instrumentation code). - Basic Network Test - Assuming one ethernet interface available Source host IP 192.168.10.101/24, VM tap0 192.168.2.1/24 and VM eth0 192.168.2.100/24 with default route 192.168.2.1 Destination host IP 192.168.10.100/24, VM same settings as above. Both VMs have identical MAC addresses. Initially NFS server route to 192.168.2.100 is via 192.168.10.101 - ssh 192.168.2.100 - start migration from source to destination - after migration ends - on NFS server switch routes. route add -host 192.168.2.100 gw 192.168.10.100 ssh should resume after route switch. ping as well should work seamlessly. Mario Smarduch (4): add ARMv7 HYP API to flush VM TLBs, change generic TLB flush to support arch flush ARMv7 dirty page logging inital mem region write protect (w/no huge PUD support) dirty log write protect mgmt. 
Moved x86, armv7 to generic, set armv8 ia64 mips powerpc s390 arch specific ARMv7 dirty page logging 2nd stage page fault handling support arch/arm/include/asm/kvm_asm.h|1 + arch/arm/include/asm/kvm_host.h |2 + arch/arm/include/asm/kvm_mmu.h| 20 arch/arm/include/asm/pgtable-3level.h |1 + arch/arm/kvm/Kconfig |1 + arch/arm/kvm/arm.c| 17 ++- arch/arm/kvm/interrupts.S | 12 ++ arch/arm/kvm/mmu.c| 198 - arch/arm64/include/asm/kvm_host.h |2 + arch/arm64/kvm/Kconfig|1 + arch/ia64/include/asm/kvm_host.h |1 + arch/ia64/kvm/Kconfig |1 + arch/ia64/kvm/kvm-ia64.c |2 +- arch/mips/include/asm/kvm_host.h |2 +- arch/mips/kvm/Kconfig |1 + arch/mips/kvm/kvm_mips.c |2 +- arch/powerpc/include/asm/kvm_host.h |2 + arch/powerpc/kvm/Kconfig |1 + arch/powerpc/kvm/book3s.c |2 +- arch/powerpc/kvm/booke.c |2 +- arch/s390/include/asm/kvm_host.h |2 + arch/s390/kvm/Kconfig |1 + arch/s390/kvm/kvm-s390.c |2 +- arch/x86/kvm/x86.c| 86 -- include/linux/kvm_host.h |3 + virt/kvm/Kconfig |6 + virt/kvm/kvm_main.c | 94 27 files changed, 366 insertions(+), 99 deletions(-) -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v9 1/4] arm: add ARMv7 HYP API to flush VM TLBs, change generic TLB flush to support arch flush
On 08/11/2014 12:12 PM, Christoffer Dall wrote: On Thu, Jul 24, 2014 at 05:56:05PM -0700, Mario Smarduch wrote: Patch adds HYP interface for global VM TLB invalidation without address parameter. Generic VM TLB flush calls ARMv7 arch defined TLB flush function. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_asm.h |1 + arch/arm/include/asm/kvm_host.h |1 + arch/arm/kvm/Kconfig|1 + arch/arm/kvm/interrupts.S | 12 arch/arm/kvm/mmu.c | 17 + virt/kvm/Kconfig|3 +++ virt/kvm/kvm_main.c |4 7 files changed, 39 insertions(+) diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index 53b3c4a..21bc519 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -78,6 +78,7 @@ extern char __kvm_hyp_code_end[]; extern void __kvm_flush_vm_context(void); extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa); +extern void __kvm_tlb_flush_vmid(struct kvm *kvm); extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); #endif diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 193ceaf..042206f 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -230,5 +230,6 @@ int kvm_perf_teardown(void); u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); +void kvm_arch_flush_remote_tlbs(struct kvm *); #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig index 466bd29..44d3b6f 100644 --- a/arch/arm/kvm/Kconfig +++ b/arch/arm/kvm/Kconfig @@ -22,6 +22,7 @@ config KVM select ANON_INODES select HAVE_KVM_CPU_RELAX_INTERCEPT select KVM_MMIO +select HAVE_KVM_ARCH_TLB_FLUSH_ALL select KVM_ARM_HOST depends on ARM_VIRT_EXT ARM_LPAE ---help--- diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S index 0d68d40..1258d46 100644 --- a/arch/arm/kvm/interrupts.S +++ b/arch/arm/kvm/interrupts.S @@ -66,6 +66,18 @@ 
ENTRY(__kvm_tlb_flush_vmid_ipa) bx lr ENDPROC(__kvm_tlb_flush_vmid_ipa) +/** + * void __kvm_tlb_flush_vmid(struct kvm *kvm) - Flush per-VMID TLBs + * + * Reuses __kvm_tlb_flush_vmid_ipa() for ARMv7, without passing address + * parameter + */ + +ENTRY(__kvm_tlb_flush_vmid) +b __kvm_tlb_flush_vmid_ipa +ENDPROC(__kvm_tlb_flush_vmid) + + / * Flush TLBs and instruction caches of all CPUs inside the inner-shareable * domain, for all VMIDs diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 2ac9588..35254c6 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -56,6 +56,23 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa); } +#ifdef CONFIG_ARM I assume this is here because of arm vs. arm64, use static inlines in the header files to differentiate instead. Yes that's right, will move it. +/** + * kvm_arch_flush_remote_tlbs() - flush all VM TLB entries + * @kvm: pointer to kvm structure. + * + * Interface to HYP function to flush all VM TLB entries without address + * parameter. In HYP mode reuses __kvm_tlb_flush_vmid_ipa() function used by + * kvm_tlb_flush_vmid_ipa(). remove the last sentence from here, it's repetitive. Ok. 
+ */ +void kvm_arch_flush_remote_tlbs(struct kvm *kvm) +{ +if (kvm) +kvm_call_hyp(__kvm_tlb_flush_vmid, kvm); +} + +#endif + static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, int min, int max) { diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 13f2d19..f1efaa5 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -34,3 +34,6 @@ config HAVE_KVM_CPU_RELAX_INTERCEPT config KVM_VFIO bool + +config HAVE_KVM_ARCH_TLB_FLUSH_ALL + bool diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index fa70c6e..258f3d9 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -186,12 +186,16 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) void kvm_flush_remote_tlbs(struct kvm *kvm) { +#ifdef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL +kvm_arch_flush_remote_tlbs(kvm); +#else long dirty_count = kvm-tlbs_dirty; smp_mb(); if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) ++kvm-stat.remote_tlb_flush; cmpxchg(kvm-tlbs_dirty, dirty_count, 0); +#endif I would split this into two patches, one trivial one for the KVM generic solution, and one to add the arm-specific part. That will make your commit text and title much nicer to read too. Yes makes sense easier to review generic and arch layers. Thanks
Re: [PATCH v9 2/4] arm: ARMv7 dirty page logging initial mem region write protect (w/no huge PUD support)
On 08/11/2014 12:12 PM, Christoffer Dall wrote: Remove the parenthesis from the subject line. Hmmm have to check this don't see it my patch file. On Thu, Jul 24, 2014 at 05:56:06PM -0700, Mario Smarduch wrote: Patch adds support for initial write protection VM memlsot. This patch series ^^^ stray whitespace of Need to watch out for these adds delays to review cycle. assumes that huge PUDs will not be used in 2nd stage tables. may be worth mentioning that this is always valid on ARMv7. Yep definitely. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |1 + arch/arm/include/asm/kvm_mmu.h| 20 ++ arch/arm/include/asm/pgtable-3level.h |1 + arch/arm/kvm/arm.c|9 +++ arch/arm/kvm/mmu.c| 128 + 5 files changed, 159 insertions(+) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 042206f..6521a2d 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -231,5 +231,6 @@ int kvm_perf_teardown(void); u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); void kvm_arch_flush_remote_tlbs(struct kvm *); +void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 5cc0b0f..08ab5e8 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -114,6 +114,26 @@ static inline void kvm_set_s2pmd_writable(pmd_t *pmd) pmd_val(*pmd) |= L_PMD_S2_RDWR; } +static inline void kvm_set_s2pte_readonly(pte_t *pte) +{ +pte_val(*pte) = (pte_val(*pte) ~L_PTE_S2_RDWR) | L_PTE_S2_RDONLY; +} + +static inline bool kvm_s2pte_readonly(pte_t *pte) +{ +return (pte_val(*pte) L_PTE_S2_RDWR) == L_PTE_S2_RDONLY; +} + +static inline void kvm_set_s2pmd_readonly(pmd_t *pmd) +{ +pmd_val(*pmd) = (pmd_val(*pmd) ~L_PMD_S2_RDWR) | L_PMD_S2_RDONLY; +} + +static inline bool kvm_s2pmd_readonly(pmd_t *pmd) +{ +return 
(pmd_val(*pmd) L_PMD_S2_RDWR) == L_PMD_S2_RDONLY; +} + /* Open coded p*d_addr_end that can deal with 64bit addresses */ #define kvm_pgd_addr_end(addr, end) \ ({ u64 __boundary = ((addr) + PGDIR_SIZE) PGDIR_MASK;\ diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index 85c60ad..d8bb40b 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -129,6 +129,7 @@ #define L_PTE_S2_RDONLY (_AT(pteval_t, 1) 6) /* HAP[1] */ #define L_PTE_S2_RDWR (_AT(pteval_t, 3) 6) /* HAP[2:1] */ +#define L_PMD_S2_RDONLY (_AT(pteval_t, 1) 6) /* HAP[1] */ #define L_PMD_S2_RDWR (_AT(pmdval_t, 3) 6) /* HAP[2:1] */ /* diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 3c82b37..e11c2dd 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -242,6 +242,15 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old, enum kvm_mr_change change) { +#ifdef CONFIG_ARM +/* + * At this point memslot has been committed and there is an + * allocated dirty_bitmap[], dirty pages will be be tracked while the + * memory slot is write protected. 
+ */ +if ((change != KVM_MR_DELETE) (mem-flags KVM_MEM_LOG_DIRTY_PAGES)) +kvm_mmu_wp_memory_region(kvm, mem-slot); +#endif } void kvm_arch_flush_shadow_all(struct kvm *kvm) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 35254c6..7bfc792 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -763,6 +763,134 @@ static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap) return false; } +#ifdef CONFIG_ARM +/** + * stage2_wp_pte_range - write protect PTE range + * @pmd:pointer to pmd entry + * @addr: range start address + * @end:range end address + */ +static void stage2_wp_pte_range(pmd_t *pmd, phys_addr_t addr, phys_addr_t end) +{ +pte_t *pte; + +pte = pte_offset_kernel(pmd, addr); +do { +if (!pte_none(*pte)) { +if (!kvm_s2pte_readonly(pte)) +kvm_set_s2pte_readonly(pte); +} +} while (pte++, addr += PAGE_SIZE, addr != end); +} + +/** + * stage2_wp_pmd_range - write protect PMD range + * @pud:pointer to pud entry + * @addr: range start address + * @end:range end address + */ +static void stage2_wp_pmd_range(pud_t *pud, phys_addr_t addr, phys_addr_t end
Re: [PATCH v9 3/4] arm: dirty log write protect mgmt. Moved x86, armv7 to generic, set armv8 ia64 mips powerpc s390 arch specific
On 08/11/2014 12:13 PM, Christoffer Dall wrote: On Thu, Jul 24, 2014 at 05:56:07PM -0700, Mario Smarduch wrote: This patch adds support for keeping track of VM dirty pages. As dirty page log is retrieved, the pages that have been written are write protected again for next write and log read. The dirty log read function is generic for armv7 and x86, and arch specific for arm64, ia64, mips, powerpc, s390. So I would also split up this patch. One that only modifies the existing functionality, but does not introduce any new functionality for ARM. Put this first patch in the beginning of the patch series with the other prepatory patch, so that you get something like this: [PATCH 1/X] KVM: Add architecture-specific TLB flush implementations [PATCH 2/X] KVM: Add generic implementation of kvm_vm_ioctl_get_dirty_log [PATCH 3/X] arm: KVM: Add ARMv7 API to flush TLBs [PATCH 4/X] arm: KVM: Add initial dirty page locking infrastructure ... Yes definitely, thanks for the advice makes the patch series easier to review. That will make it easier to get the patches accepted and for us to review... 
Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/kvm/arm.c |8 +++- arch/arm/kvm/mmu.c | 22 + arch/arm64/include/asm/kvm_host.h |2 + arch/arm64/kvm/Kconfig |1 + arch/ia64/include/asm/kvm_host.h|1 + arch/ia64/kvm/Kconfig |1 + arch/ia64/kvm/kvm-ia64.c|2 +- arch/mips/include/asm/kvm_host.h|2 +- arch/mips/kvm/Kconfig |1 + arch/mips/kvm/kvm_mips.c|2 +- arch/powerpc/include/asm/kvm_host.h |2 + arch/powerpc/kvm/Kconfig|1 + arch/powerpc/kvm/book3s.c |2 +- arch/powerpc/kvm/booke.c|2 +- arch/s390/include/asm/kvm_host.h|2 + arch/s390/kvm/Kconfig |1 + arch/s390/kvm/kvm-s390.c|2 +- arch/x86/kvm/x86.c | 86 - include/linux/kvm_host.h|3 ++ virt/kvm/Kconfig|3 ++ virt/kvm/kvm_main.c | 90 +++ 21 files changed, 143 insertions(+), 93 deletions(-) diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index e11c2dd..f7739a0 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -783,10 +783,16 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } } -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) +#ifdef CONFIG_ARM64 +/* + * For now features not supported on ARM64, the #ifdef is added to make that + * clear but not needed since ARM64 Kconfig selects function in generic code. + */ I don't think this comment is needed, but if you really want it, it should be something like: /* * ARM64 does not support dirty logging and therefore selects * CONFIG_HAVE_KVM_ARCH_DIRTY_LOG. Provide a -EINVAL stub. */ I think it could go since I'm doing arm64 now. 
+int kvm_arch_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { return -EINVAL; } +#endif static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev_addr) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 7bfc792..ca84331 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -889,6 +889,28 @@ void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) kvm_flush_remote_tlbs(kvm); spin_unlock(kvm-mmu_lock); } + +/** + * kvm_mmu_write_protected_pt_masked() - write protect dirty pages set in mask + * @kvm:The KVM pointer + * @slot: The memory slot associated with mask + * @gfn_offset: The gfn offset in memory slot + * @mask: The mask of dirty pages at offset 'gfn_offset' in this memory + * slot to be write protected + * + * Walks bits set in mask write protects the associated pte's. Caller must + * acquire kvm_mmu_lock. + */ +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, +struct kvm_memory_slot *slot, +gfn_t gfn_offset, unsigned long mask) +{ +phys_addr_t base_gfn = slot-base_gfn + gfn_offset; +phys_addr_t start = (base_gfn + __ffs(mask)) PAGE_SHIFT; +phys_addr_t end = (base_gfn + __fls(mask) + 1) PAGE_SHIFT; __fls(x) + 1 is the same as fls(x) For me the __fls(x) + 1 is easier to see the covered range. Unless it really breaks the convention I'd prefer to keep the '+1'. Either way no problem. + +stage2_wp_range(kvm, start, end); +} #endif static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 92242ce..b4a280b 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -200,4 +200,6 @@ static
Re: [PATCH v9 4/4] arm: ARMv7 dirty page logging 2nd stage page fault handling support
On 08/11/2014 12:13 PM, Christoffer Dall wrote: On Thu, Jul 24, 2014 at 05:56:08PM -0700, Mario Smarduch wrote: This patch adds support for handling 2nd stage page faults during migration, it disables faulting in huge pages, and dissolves huge pages to page tables. In case migration is canceled huge pages will be used again. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/kvm/mmu.c | 31 +-- 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index ca84331..a17812a 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -642,7 +642,8 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache } static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, - phys_addr_t addr, const pte_t *new_pte, bool iomap) + phys_addr_t addr, const pte_t *new_pte, bool iomap, + bool logging_active) { pmd_t *pmd; pte_t *pte, old_pte; @@ -657,6 +658,15 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, return 0; } +/* + * While dirty memory logging, clear PMD entry for huge page and split + * into smaller pages, to track dirty memory at page granularity. + */ +if (logging_active kvm_pmd_huge(*pmd)) { +phys_addr_t ipa = pmd_pfn(*pmd) PAGE_SHIFT; +clear_pmd_entry(kvm, pmd, ipa); clear_pmd_entry has a VM_BUG_ON(kvm_pmd_huge(*pmd)) so that is definitely not the right thing to call. I don't see that in 3.15rc1/rc4 - static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr) { if (kvm_pmd_huge(*pmd)) { pmd_clear(pmd); kvm_tlb_flush_vmid_ipa(kvm, addr); } else { [] } I thought the purpose of this function was to clear PMD entry. Also ran hundreds of tests no problems. Hmmm confused. 
+} + /* Create stage-2 page mappings - Level 2 */ if (pmd_none(*pmd)) { if (!cache) @@ -709,7 +719,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, if (ret) goto out; spin_lock(kvm-mmu_lock); -ret = stage2_set_pte(kvm, cache, addr, pte, true); +ret = stage2_set_pte(kvm, cache, addr, pte, true, false); spin_unlock(kvm-mmu_lock); if (ret) goto out; @@ -926,6 +936,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_mmu_memory_cache *memcache = vcpu-arch.mmu_page_cache; struct vm_area_struct *vma; pfn_t pfn; +/* Get logging status, if dirty_bitmap is not NULL then logging is on */ +#ifdef CONFIG_ARM +bool logging_active = !!memslot-dirty_bitmap; +#else +bool logging_active = false; +#endif can you make this an inline in the header files for now please? Yes definitely. write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu)); if (fault_status == FSC_PERM !write_fault) { @@ -936,7 +952,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, /* Let's check if we will get back a huge page backed by hugetlbfs */ down_read(current-mm-mmap_sem); vma = find_vma_intersection(current-mm, hva, hva + 1); -if (is_vm_hugetlb_page(vma)) { +if (is_vm_hugetlb_page(vma) !logging_active) { hugetlb = true; gfn = (fault_ipa PMD_MASK) PAGE_SHIFT; } else { @@ -979,7 +995,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, spin_lock(kvm-mmu_lock); if (mmu_notifier_retry(kvm, mmu_seq)) goto out_unlock; -if (!hugetlb !force_pte) +if (!hugetlb !force_pte !logging_active) hugetlb = transparent_hugepage_adjust(pfn, fault_ipa); if (hugetlb) { @@ -998,9 +1014,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, kvm_set_pfn_dirty(pfn); } coherent_cache_guest_page(vcpu, hva, PAGE_SIZE); -ret = stage2_set_pte(kvm, memcache, fault_ipa, new_pte, false); +ret = stage2_set_pte(kvm, memcache, fault_ipa, new_pte, false, +logging_active); } +if (write_fault) +mark_page_dirty(kvm, 
gfn); out_unlock: spin_unlock(kvm-mmu_lock); @@ -1151,7 +1170,7 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data) { pte_t *pte = (pte_t *)data; -stage2_set_pte(kvm, NULL, gpa, pte, false); +stage2_set_pte(kvm, NULL, gpa, pte, false, false); why is logging never active if we are called from MMU notifiers? mmu notifiers update sptes, but I don't see how these updates can
Re: [PATCH v9 2/4] arm: ARMv7 dirty page logging initial mem region write protect (w/no huge PUD support)
On 08/11/2014 12:12 PM, Christoffer Dall wrote: Remove the parenthesis from the subject line. On Thu, Jul 24, 2014 at 05:56:06PM -0700, Mario Smarduch wrote: Patch adds support for initial write protection VM memlsot. This patch series ^^^ stray whitespace of assumes that huge PUDs will not be used in 2nd stage tables. may be worth mentioning that this is always valid on ARMv7. Signed-off-by: Mario Smarduch m.smard...@samsung.com --- arch/arm/include/asm/kvm_host.h |1 + arch/arm/include/asm/kvm_mmu.h| 20 ++ arch/arm/include/asm/pgtable-3level.h |1 + arch/arm/kvm/arm.c|9 +++ arch/arm/kvm/mmu.c| 128 + 5 files changed, 159 insertions(+) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 042206f..6521a2d 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -231,5 +231,6 @@ int kvm_perf_teardown(void); u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); void kvm_arch_flush_remote_tlbs(struct kvm *); +void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 5cc0b0f..08ab5e8 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -114,6 +114,26 @@ static inline void kvm_set_s2pmd_writable(pmd_t *pmd) pmd_val(*pmd) |= L_PMD_S2_RDWR; } +static inline void kvm_set_s2pte_readonly(pte_t *pte) +{ +pte_val(*pte) = (pte_val(*pte) ~L_PTE_S2_RDWR) | L_PTE_S2_RDONLY; +} + +static inline bool kvm_s2pte_readonly(pte_t *pte) +{ +return (pte_val(*pte) L_PTE_S2_RDWR) == L_PTE_S2_RDONLY; +} + +static inline void kvm_set_s2pmd_readonly(pmd_t *pmd) +{ +pmd_val(*pmd) = (pmd_val(*pmd) ~L_PMD_S2_RDWR) | L_PMD_S2_RDONLY; +} + +static inline bool kvm_s2pmd_readonly(pmd_t *pmd) +{ +return (pmd_val(*pmd) L_PMD_S2_RDWR) == L_PMD_S2_RDONLY; +} + /* Open coded p*d_addr_end that can deal with 64bit addresses */ #define 
kvm_pgd_addr_end(addr, end) \ ({ u64 __boundary = ((addr) + PGDIR_SIZE) PGDIR_MASK;\ diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index 85c60ad..d8bb40b 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -129,6 +129,7 @@ #define L_PTE_S2_RDONLY (_AT(pteval_t, 1) 6) /* HAP[1] */ #define L_PTE_S2_RDWR (_AT(pteval_t, 3) 6) /* HAP[2:1] */ +#define L_PMD_S2_RDONLY (_AT(pteval_t, 1) 6) /* HAP[1] */ #define L_PMD_S2_RDWR (_AT(pmdval_t, 3) 6) /* HAP[2:1] */ /* diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 3c82b37..e11c2dd 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -242,6 +242,15 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old, enum kvm_mr_change change) { +#ifdef CONFIG_ARM +/* + * At this point memslot has been committed and there is an + * allocated dirty_bitmap[], dirty pages will be be tracked while the + * memory slot is write protected. 
+ */ +if ((change != KVM_MR_DELETE) (mem-flags KVM_MEM_LOG_DIRTY_PAGES)) +kvm_mmu_wp_memory_region(kvm, mem-slot); +#endif } void kvm_arch_flush_shadow_all(struct kvm *kvm) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 35254c6..7bfc792 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -763,6 +763,134 @@ static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap) return false; } +#ifdef CONFIG_ARM +/** + * stage2_wp_pte_range - write protect PTE range + * @pmd:pointer to pmd entry + * @addr: range start address + * @end:range end address + */ +static void stage2_wp_pte_range(pmd_t *pmd, phys_addr_t addr, phys_addr_t end) +{ +pte_t *pte; + +pte = pte_offset_kernel(pmd, addr); +do { +if (!pte_none(*pte)) { +if (!kvm_s2pte_readonly(pte)) +kvm_set_s2pte_readonly(pte); +} +} while (pte++, addr += PAGE_SIZE, addr != end); +} + +/** + * stage2_wp_pmd_range - write protect PMD range + * @pud:pointer to pud entry + * @addr: range start address + * @end:range end address + */ +static void stage2_wp_pmd_range(pud_t *pud, phys_addr_t addr, phys_addr_t end) +{ +pmd_t *pmd; +phys_addr_t next; + +pmd = pmd_offset(pud, addr); + +do { +next