[PATCH] KVM: VMX: Make module parameters readable

2009-03-23 Thread Avi Kivity
From: Avi Kivity a...@redhat.com

Useful to see how the module was loaded.

Signed-off-by: Avi Kivity a...@redhat.com

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 5852443..d3919ac 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -39,19 +39,19 @@ MODULE_AUTHOR(Qumranet);
 MODULE_LICENSE(GPL);
 
 static int bypass_guest_pf = 1;
-module_param(bypass_guest_pf, bool, 0);
+module_param(bypass_guest_pf, bool, S_IRUGO);
 
 static int enable_vpid = 1;
-module_param(enable_vpid, bool, 0);
+module_param(enable_vpid, bool, 0444);
 
 static int flexpriority_enabled = 1;
-module_param(flexpriority_enabled, bool, 0);
+module_param(flexpriority_enabled, bool, S_IRUGO);
 
 static int enable_ept = 1;
-module_param(enable_ept, bool, 0);
+module_param(enable_ept, bool, S_IRUGO);
 
 static int emulate_invalid_guest_state = 0;
-module_param(emulate_invalid_guest_state, bool, 0);
+module_param(emulate_invalid_guest_state, bool, S_IRUGO);
 
 struct vmcs {
u32 revision_id;
--
To unsubscribe from this list: send the line unsubscribe kvm-commits in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] KVM: reuse (pop|push)_irq from svm.c in vmx.c

2009-03-23 Thread Avi Kivity
From: Gleb Natapov g...@redhat.com

The prioritized bit vector manipulation functions are useful in both vmx and
svm.

Signed-off-by: Gleb Natapov g...@redhat.com
Signed-off-by: Avi Kivity a...@redhat.com

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index da23fd3..04ee964 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -19,6 +19,7 @@
 #include irq.h
 #include mmu.h
 #include kvm_cache_regs.h
+#include x86.h
 
 #include linux/module.h
 #include linux/kernel.h
@@ -132,24 +133,6 @@ static inline u32 svm_has(u32 feat)
return svm_features  feat;
 }
 
-static inline u8 pop_irq(struct kvm_vcpu *vcpu)
-{
-   int word_index = __ffs(vcpu-arch.irq_summary);
-   int bit_index = __ffs(vcpu-arch.irq_pending[word_index]);
-   int irq = word_index * BITS_PER_LONG + bit_index;
-
-   clear_bit(bit_index, vcpu-arch.irq_pending[word_index]);
-   if (!vcpu-arch.irq_pending[word_index])
-   clear_bit(word_index, vcpu-arch.irq_summary);
-   return irq;
-}
-
-static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq)
-{
-   set_bit(irq, vcpu-arch.irq_pending);
-   set_bit(irq / BITS_PER_LONG, vcpu-arch.irq_summary);
-}
-
 static inline void clgi(void)
 {
asm volatile (__ex(SVM_CLGI));
@@ -1114,7 +1097,7 @@ static int pf_interception(struct vcpu_svm *svm, struct 
kvm_run *kvm_run)
if (!irqchip_in_kernel(kvm) 
is_external_interrupt(exit_int_info)) {
event_injection = true;
-   push_irq(svm-vcpu, exit_int_info  SVM_EVTINJ_VEC_MASK);
+   kvm_push_irq(svm-vcpu, exit_int_info  SVM_EVTINJ_VEC_MASK);
}
 
fault_address  = svm-vmcb-control.exit_info_2;
@@ -2334,7 +2317,7 @@ static void kvm_reput_irq(struct vcpu_svm *svm)
if ((control-int_ctl  V_IRQ_MASK)
 !irqchip_in_kernel(svm-vcpu.kvm)) {
control-int_ctl = ~V_IRQ_MASK;
-   push_irq(svm-vcpu, control-int_vector);
+   kvm_push_irq(svm-vcpu, control-int_vector);
}
 
svm-vcpu.arch.interrupt_window_open =
@@ -2344,7 +2327,7 @@ static void kvm_reput_irq(struct vcpu_svm *svm)
 
 static void svm_do_inject_vector(struct vcpu_svm *svm)
 {
-   svm_inject_irq(svm, pop_irq(svm-vcpu));
+   svm_inject_irq(svm, kvm_pop_irq(svm-vcpu));
 }
 
 static void do_interrupt_requests(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 5cf28df..5852443 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2489,18 +2489,6 @@ static void vmx_update_window_states(struct kvm_vcpu 
*vcpu)
 GUEST_INTR_STATE_MOV_SS)));
 }
 
-static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
-{
-   int word_index = __ffs(vcpu-arch.irq_summary);
-   int bit_index = __ffs(vcpu-arch.irq_pending[word_index]);
-   int irq = word_index * BITS_PER_LONG + bit_index;
-
-   clear_bit(bit_index, vcpu-arch.irq_pending[word_index]);
-   if (!vcpu-arch.irq_pending[word_index])
-   clear_bit(word_index, vcpu-arch.irq_summary);
-   kvm_queue_interrupt(vcpu, irq);
-}
-
 static void do_interrupt_requests(struct kvm_vcpu *vcpu,
   struct kvm_run *kvm_run)
 {
@@ -2534,7 +2522,7 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
 
if (vcpu-arch.interrupt_window_open) {
if (vcpu-arch.irq_summary  !vcpu-arch.interrupt.pending)
-   kvm_do_inject_irq(vcpu);
+   kvm_queue_interrupt(vcpu, kvm_pop_irq(vcpu));
 
if (vcpu-arch.interrupt.pending)
vmx_inject_irq(vcpu, vcpu-arch.interrupt.nr);
@@ -2619,8 +2607,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct 
kvm_run *kvm_run)
 
if (!irqchip_in_kernel(vcpu-kvm)  is_external_interrupt(vect_info)) {
int irq = vect_info  VECTORING_INFO_VECTOR_MASK;
-   set_bit(irq, vcpu-arch.irq_pending);
-   set_bit(irq / BITS_PER_LONG, vcpu-arch.irq_summary);
+   kvm_push_irq(vcpu, irq);
}
 
if ((intr_info  INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 6a4be78..2ab6791 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -19,4 +19,22 @@ static inline void kvm_clear_interrupt_queue(struct kvm_vcpu 
*vcpu)
vcpu-arch.interrupt.pending = false;
 }
 
+static inline u8 kvm_pop_irq(struct kvm_vcpu *vcpu)
+{
+   int word_index = __ffs(vcpu-arch.irq_summary);
+   int bit_index = __ffs(vcpu-arch.irq_pending[word_index]);
+   int irq = word_index * BITS_PER_LONG + bit_index;
+
+   clear_bit(bit_index, vcpu-arch.irq_pending[word_index]);
+   if (!vcpu-arch.irq_pending[word_index])
+   clear_bit(word_index, vcpu-arch.irq_summary);
+   return irq;
+}
+
+static inline void kvm_push_irq(struct kvm_vcpu *vcpu, u8 irq)
+{
+set_bit(irq, vcpu-arch.irq_pending);

Re: Can i shrink the image file size?

2009-03-23 Thread Alberto Treviño
On Saturday 21 March 2009 10:45:58 am John Wong wrote:
 I create the 30G qcow2 image file, installed winxp, winxp show me the
 harddisk used 5G, but the image file size is 12G now.
 can i shrink the image file size? how?

I wrote a simple utility in .Net 2.0 that creates a file in Windows with all 
0's until the drive is full, then deletes it.  This allows qemu-img to 
recreate the image and reclaim any sectors that are all 0's.  This is what I 
do:

1. Defragment the Windows drive(s).  Run the defragmenter at least 3 times 
to make sure you get good results.

2. Run my tool (http://mel.byu.edu/zerofill.exe).  It will automatically 
detect all drive letters, create a file filled with zeros and delete it when 
it's done.  Make sure the VM is not under a lot of use or the continually 
decreasing amount of disk space may interfere with whatever may be running.

3. Run qemu-img and recreate the virtual drive with the same format 
parameters.  For example, if your image is in qcow2 format you run:

  qemu-img convert -O qcow2 win_xp.img win_xp.img.new

4. Rename win_xp.img to win_xp.img.old and win_xp.img.new to win_xp.img.  
Run your VM and make sure everything works.  If it does, remove 
win_xp.img.old and go on with life.

I would recommend you back up your drive image before you start the first 
couple of times, just to be sure.  It's not fun when you screw up and can't 
get back into your VM.  I've done that a few times while developing this 
technique. :-)

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: virtio block drivers not working

2009-03-23 Thread Markus Armbruster
Caleb Tennis ca...@aei-tech.com writes:

 On Mar 22, 2009, at 5:30 PM, Christoph Hellwig wrote:

 I have been using virtio block in a very similar setup to yours (fully static
 kernel, -kernel option to kvm/qemu) successfully for quite a while.

 Can you post your kernel .config and the contents of /proc/devices
 and /proc/partitions to debug this further?


 I've found the problem.  In my testing, I was changing my second drive
 from ide to virtio, so it looked like this:

 -drive index=0,media=disk,if=ide,file=/root
 -drive index=1,media=disk,if=virtio,file=/data

 This doesn't work though, even as vdb, as since the interface is the
 first virtio, the index needs to be set back to 0.  This gives me a /
 dev/vda like I was expecting.

 Thanks,
 Caleb

Yes, the common loop to create virtio block devices (all three copies of
it) stops at the first unused unit number / index.  A rude trap for the
unwary.  The device configuration patch I'm working on upstream doesn't
do that.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v4 0/6] PCI: support the ATS capability

2009-03-23 Thread Yu Zhao
This patch series implements Address Translation Service support for
the Intel IOMMU. The PCIe Endpoint that supports ATS capability can
request the DMA address translation from the IOMMU and cache the
translation itself. This can alleviate IOMMU TLB pressure and improve
the hardware performance in the I/O virtualization environment.

The ATS is one of PCI-SIG I/O Virtualization (IOV) Specifications. The
spec can be found at: http://www.pcisig.com/specifications/iov/ats/
(it requires membership).


Changelog:
v3 - v4
  1, coding style fixes (Grant Grundler)
  2, support the Virtual Function ATS capability

v2 - v3
  1, throw error message if VT-d hardware detects invalid descriptor
 on Queued Invalidation interface (David Woodhouse)
  2, avoid using pci_find_ext_capability every time when reading ATS
 Invalidate Queue Depth (Matthew Wilcox)

v1 - v2
  added 'static' prefix to a local LIST_HEAD (Andrew Morton)


Yu Zhao (6):
  PCI: support the ATS capability
  PCI: handle Virtual Function ATS enabling
  VT-d: parse ATSR in DMA Remapping Reporting Structure
  VT-d: add device IOTLB invalidation support
  VT-d: cleanup iommu_flush_iotlb_psi and flush_unmaps
  VT-d: support the device IOTLB

 drivers/pci/dmar.c  |  189 +++---
 drivers/pci/intel-iommu.c   |  139 ++--
 drivers/pci/iov.c   |  155 ++--
 drivers/pci/pci.h   |   39 +
 include/linux/dmar.h|9 ++
 include/linux/intel-iommu.h |   16 -
 include/linux/pci.h |2 +
 include/linux/pci_regs.h|   10 +++
 8 files changed, 514 insertions(+), 45 deletions(-)

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v4 1/6] PCI: support the ATS capability

2009-03-23 Thread Yu Zhao
The PCIe ATS capability enables the Endpoint to request the
DMA address translation from the IOMMU and cache the translation
on the device side, thus alleviating IOMMU TLB pressure and improving
the hardware performance in the I/O virtualization environment.

Signed-off-by: Yu Zhao yu.z...@intel.com
---
 drivers/pci/iov.c|  105 ++
 drivers/pci/pci.h|   37 
 include/linux/pci.h  |2 +
 include/linux/pci_regs.h |   10 
 4 files changed, 154 insertions(+), 0 deletions(-)

diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index 7227efc..8a9817c 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -5,6 +5,7 @@
  *
  * PCI Express I/O Virtualization (IOV) support.
  *   Single Root IOV 1.0
+ *   Address Translation Service 1.0
  */
 
 #include linux/pci.h
@@ -678,3 +679,107 @@ irqreturn_t pci_sriov_migration(struct pci_dev *dev)
return sriov_migration(dev) ? IRQ_HANDLED : IRQ_NONE;
 }
 EXPORT_SYMBOL_GPL(pci_sriov_migration);
+
+static int ats_alloc_one(struct pci_dev *dev, int pgshift)
+{
+   int pos;
+   u16 cap;
+   struct pci_ats *ats;
+
+   pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ATS);
+   if (!pos)
+   return -ENODEV;
+
+   ats = kzalloc(sizeof(*ats), GFP_KERNEL);
+   if (!ats)
+   return -ENOMEM;
+
+   ats-pos = pos;
+   ats-stu = pgshift;
+   pci_read_config_word(dev, pos + PCI_ATS_CAP, cap);
+   ats-qdep = PCI_ATS_CAP_QDEP(cap) ? PCI_ATS_CAP_QDEP(cap) :
+   PCI_ATS_MAX_QDEP;
+   dev-ats = ats;
+
+   return 0;
+}
+
+static void ats_free_one(struct pci_dev *dev)
+{
+   kfree(dev-ats);
+   dev-ats = NULL;
+}
+
+/**
+ * pci_enable_ats - enable the ATS capability
+ * @dev: the PCI device
+ * @pgshift: the IOMMU page shift
+ *
+ * Returns 0 on success, or negative on failure.
+ */
+int pci_enable_ats(struct pci_dev *dev, int pgshift)
+{
+   int rc;
+   u16 ctrl;
+
+   BUG_ON(dev-ats);
+
+   if (pgshift  PCI_ATS_MIN_STU)
+   return -EINVAL;
+
+   rc = ats_alloc_one(dev, pgshift);
+   if (rc)
+   return rc;
+
+   ctrl = PCI_ATS_CTRL_ENABLE;
+   ctrl |= PCI_ATS_CTRL_STU(pgshift - PCI_ATS_MIN_STU);
+   pci_write_config_word(dev, dev-ats-pos + PCI_ATS_CTRL, ctrl);
+
+   return 0;
+}
+
+/**
+ * pci_disable_ats - disable the ATS capability
+ * @dev: the PCI device
+ */
+void pci_disable_ats(struct pci_dev *dev)
+{
+   u16 ctrl;
+
+   BUG_ON(!dev-ats);
+
+   pci_read_config_word(dev, dev-ats-pos + PCI_ATS_CTRL, ctrl);
+   ctrl = ~PCI_ATS_CTRL_ENABLE;
+   pci_write_config_word(dev, dev-ats-pos + PCI_ATS_CTRL, ctrl);
+
+   ats_free_one(dev);
+}
+
+/**
+ * pci_ats_queue_depth - query the ATS Invalidate Queue Depth
+ * @dev: the PCI device
+ *
+ * Returns the queue depth on success, or negative on failure.
+ *
+ * The ATS spec uses 0 in the Invalidate Queue Depth field to
+ * indicate that the function can accept 32 Invalidate Request.
+ * But here we use the `real' values (i.e. 1~32) for the Queue
+ * Depth.
+ */
+int pci_ats_queue_depth(struct pci_dev *dev)
+{
+   int pos;
+   u16 cap;
+
+   if (dev-ats)
+   return dev-ats-qdep;
+
+   pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ATS);
+   if (!pos)
+   return -ENODEV;
+
+   pci_read_config_word(dev, pos + PCI_ATS_CAP, cap);
+
+   return PCI_ATS_CAP_QDEP(cap) ? PCI_ATS_CAP_QDEP(cap) :
+  PCI_ATS_MAX_QDEP;
+}
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index dd7c63f..9f0db6a 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -218,6 +218,13 @@ struct pci_sriov {
u8 __iomem *mstate; /* VF Migration State Array */
 };
 
+/* Address Translation Service */
+struct pci_ats {
+   int pos;/* capability position */
+   int stu;/* Smallest Translation Unit */
+   int qdep;   /* Invalidate Queue Depth */
+};
+
 #ifdef CONFIG_PCI_IOV
 extern int pci_iov_init(struct pci_dev *dev);
 extern void pci_iov_release(struct pci_dev *dev);
@@ -225,6 +232,20 @@ extern int pci_iov_resource_bar(struct pci_dev *dev, int 
resno,
enum pci_bar_type *type);
 extern void pci_restore_iov_state(struct pci_dev *dev);
 extern int pci_iov_bus_range(struct pci_bus *bus);
+
+extern int pci_enable_ats(struct pci_dev *dev, int pgshift);
+extern void pci_disable_ats(struct pci_dev *dev);
+extern int pci_ats_queue_depth(struct pci_dev *dev);
+/**
+ * pci_ats_enabled - query the ATS status
+ * @dev: the PCI device
+ *
+ * Returns 1 if ATS capability is enabled, or 0 if not.
+ */
+static inline int pci_ats_enabled(struct pci_dev *dev)
+{
+   return !!dev-ats;
+}
 #else
 static inline int pci_iov_init(struct pci_dev *dev)
 {
@@ -246,6 +267,22 @@ static inline int pci_iov_bus_range(struct pci_bus *bus)
 {

[PATCH v4 3/6] VT-d: parse ATSR in DMA Remapping Reporting Structure

2009-03-23 Thread Yu Zhao
Parse the Root Port ATS Capability Reporting Structure in the DMA
Remapping Reporting Structure ACPI table.

Signed-off-by: Yu Zhao yu.z...@intel.com
---
 drivers/pci/dmar.c  |  112 --
 include/linux/dmar.h|9 
 include/linux/intel-iommu.h |1 +
 3 files changed, 116 insertions(+), 6 deletions(-)

diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c
index 26c536b..106bc45 100644
--- a/drivers/pci/dmar.c
+++ b/drivers/pci/dmar.c
@@ -254,6 +254,84 @@ rmrr_parse_dev(struct dmar_rmrr_unit *rmrru)
}
return ret;
 }
+
+static LIST_HEAD(dmar_atsr_units);
+
+static int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
+{
+   struct acpi_dmar_atsr *atsr;
+   struct dmar_atsr_unit *atsru;
+
+   atsr = container_of(hdr, struct acpi_dmar_atsr, header);
+   atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
+   if (!atsru)
+   return -ENOMEM;
+
+   atsru-hdr = hdr;
+   atsru-include_all = atsr-flags  0x1;
+
+   list_add(atsru-list, dmar_atsr_units);
+
+   return 0;
+}
+
+static int __init atsr_parse_dev(struct dmar_atsr_unit *atsru)
+{
+   int rc;
+   struct acpi_dmar_atsr *atsr;
+
+   if (atsru-include_all)
+   return 0;
+
+   atsr = container_of(atsru-hdr, struct acpi_dmar_atsr, header);
+   rc = dmar_parse_dev_scope((void *)(atsr + 1),
+   (void *)atsr + atsr-header.length,
+   atsru-devices_cnt, atsru-devices,
+   atsr-segment);
+   if (rc || !atsru-devices_cnt) {
+   list_del(atsru-list);
+   kfree(atsru);
+   }
+
+   return rc;
+}
+
+int dmar_find_matched_atsr_unit(struct pci_dev *dev)
+{
+   int i;
+   struct pci_bus *bus;
+   struct acpi_dmar_atsr *atsr;
+   struct dmar_atsr_unit *atsru;
+
+   list_for_each_entry(atsru, dmar_atsr_units, list) {
+   atsr = container_of(atsru-hdr, struct acpi_dmar_atsr, header);
+   if (atsr-segment == pci_domain_nr(dev-bus))
+   goto found;
+   }
+
+   return 0;
+
+found:
+   for (bus = dev-bus; bus; bus = bus-parent) {
+   struct pci_dev *bridge = bus-self;
+
+   if (!bridge || !bridge-is_pcie ||
+   bridge-pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
+   return 0;
+
+   if (bridge-pcie_type == PCI_EXP_TYPE_ROOT_PORT) {
+   for (i = 0; i  atsru-devices_cnt; i++)
+   if (atsru-devices[i] == bridge)
+   return 1;
+   break;
+   }
+   }
+
+   if (atsru-include_all)
+   return 1;
+
+   return 0;
+}
 #endif
 
 static void __init
@@ -261,22 +339,28 @@ dmar_table_print_dmar_entry(struct acpi_dmar_header 
*header)
 {
struct acpi_dmar_hardware_unit *drhd;
struct acpi_dmar_reserved_memory *rmrr;
+   struct acpi_dmar_atsr *atsr;
 
switch (header-type) {
case ACPI_DMAR_TYPE_HARDWARE_UNIT:
-   drhd = (struct acpi_dmar_hardware_unit *)header;
+   drhd = container_of(header, struct acpi_dmar_hardware_unit,
+   header);
printk (KERN_INFO PREFIX
-   DRHD (flags: 0x%08x)base: 0x%016Lx\n,
-   drhd-flags, (unsigned long long)drhd-address);
+   DRHD base: %#016Lx flags: %#x\n,
+   (unsigned long long)drhd-address, drhd-flags);
break;
case ACPI_DMAR_TYPE_RESERVED_MEMORY:
-   rmrr = (struct acpi_dmar_reserved_memory *)header;
-
+   rmrr = container_of(header, struct acpi_dmar_reserved_memory,
+   header);
printk (KERN_INFO PREFIX
-   RMRR base: 0x%016Lx end: 0x%016Lx\n,
+   RMRR base: %#016Lx end: %#016Lx\n,
(unsigned long long)rmrr-base_address,
(unsigned long long)rmrr-end_address);
break;
+   case ACPI_DMAR_TYPE_ATSR:
+   atsr = container_of(header, struct acpi_dmar_atsr, header);
+   printk(KERN_INFO PREFIX ATSR flags: %#x\n, atsr-flags);
+   break;
}
 }
 
@@ -349,6 +433,11 @@ parse_dmar_table(void)
ret = dmar_parse_one_rmrr(entry_header);
 #endif
break;
+   case ACPI_DMAR_TYPE_ATSR:
+#ifdef CONFIG_DMAR
+   ret = dmar_parse_one_atsr(entry_header);
+#endif
+   break;
default:
printk(KERN_WARNING PREFIX
Unknown DMAR structure type\n);
@@ -417,11 +506,19 @@ int __init dmar_dev_scope_init(void)
 #ifdef CONFIG_DMAR
{
struct 

[PATCH v4 2/6] PCI: handle Virtual Function ATS enabling

2009-03-23 Thread Yu Zhao
The SR-IOV spec requires the Smallest Translation Unit and the
Invalidate Queue Depth fields in the Virtual Function ATS capability
to be hardwired to 0. If a function is a Virtual Function, then
set its Physical Function's STU before enabling the ATS.

Signed-off-by: Yu Zhao yu.z...@intel.com
---
 drivers/pci/iov.c |   66 +---
 drivers/pci/pci.h |4 ++-
 2 files changed, 55 insertions(+), 15 deletions(-)

diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index 8a9817c..0bf23fc 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -491,10 +491,10 @@ found:
 
if (pdev)
iov-dev = pci_dev_get(pdev);
-   else {
+   else
iov-dev = dev;
-   mutex_init(iov-lock);
-   }
+
+   mutex_init(iov-lock);
 
dev-sriov = iov;
dev-is_physfn = 1;
@@ -514,11 +514,11 @@ static void sriov_release(struct pci_dev *dev)
 {
BUG_ON(dev-sriov-nr_virtfn);
 
-   if (dev == dev-sriov-dev)
-   mutex_destroy(dev-sriov-lock);
-   else
+   if (dev != dev-sriov-dev)
pci_dev_put(dev-sriov-dev);
 
+   mutex_destroy(dev-sriov-lock);
+
kfree(dev-sriov);
dev-sriov = NULL;
 }
@@ -722,19 +722,40 @@ int pci_enable_ats(struct pci_dev *dev, int pgshift)
int rc;
u16 ctrl;
 
-   BUG_ON(dev-ats);
+   BUG_ON(dev-ats  dev-ats-is_enabled);
 
if (pgshift  PCI_ATS_MIN_STU)
return -EINVAL;
 
-   rc = ats_alloc_one(dev, pgshift);
-   if (rc)
-   return rc;
+   if (dev-is_physfn || dev-is_virtfn) {
+   struct pci_dev *pdev = dev-is_physfn ? dev : dev-physfn;
+
+   mutex_lock(pdev-sriov-lock);
+   if (pdev-ats)
+   rc = pdev-ats-stu == pgshift ? 0 : -EINVAL;
+   else
+   rc = ats_alloc_one(pdev, pgshift);
+
+   if (!rc)
+   pdev-ats-ref_cnt++;
+   mutex_unlock(pdev-sriov-lock);
+   if (rc)
+   return rc;
+   }
+
+   if (!dev-is_physfn) {
+   rc = ats_alloc_one(dev, pgshift);
+   if (rc)
+   return rc;
+   }
 
ctrl = PCI_ATS_CTRL_ENABLE;
-   ctrl |= PCI_ATS_CTRL_STU(pgshift - PCI_ATS_MIN_STU);
+   if (!dev-is_virtfn)
+   ctrl |= PCI_ATS_CTRL_STU(pgshift - PCI_ATS_MIN_STU);
pci_write_config_word(dev, dev-ats-pos + PCI_ATS_CTRL, ctrl);
 
+   dev-ats-is_enabled = 1;
+
return 0;
 }
 
@@ -746,13 +767,26 @@ void pci_disable_ats(struct pci_dev *dev)
 {
u16 ctrl;
 
-   BUG_ON(!dev-ats);
+   BUG_ON(!dev-ats || !dev-ats-is_enabled);
 
pci_read_config_word(dev, dev-ats-pos + PCI_ATS_CTRL, ctrl);
ctrl = ~PCI_ATS_CTRL_ENABLE;
pci_write_config_word(dev, dev-ats-pos + PCI_ATS_CTRL, ctrl);
 
-   ats_free_one(dev);
+   dev-ats-is_enabled = 0;
+
+   if (dev-is_physfn || dev-is_virtfn) {
+   struct pci_dev *pdev = dev-is_physfn ? dev : dev-physfn;
+
+   mutex_lock(pdev-sriov-lock);
+   pdev-ats-ref_cnt--;
+   if (!pdev-ats-ref_cnt)
+   ats_free_one(pdev);
+   mutex_unlock(pdev-sriov-lock);
+   }
+
+   if (!dev-is_physfn)
+   ats_free_one(dev);
 }
 
 /**
@@ -764,13 +798,17 @@ void pci_disable_ats(struct pci_dev *dev)
  * The ATS spec uses 0 in the Invalidate Queue Depth field to
  * indicate that the function can accept 32 Invalidate Request.
  * But here we use the `real' values (i.e. 1~32) for the Queue
- * Depth.
+ * Depth; and 0 indicates the function shares the Queue with
+ * other functions (doesn't exclusively own a Queue).
  */
 int pci_ats_queue_depth(struct pci_dev *dev)
 {
int pos;
u16 cap;
 
+   if (dev-is_virtfn)
+   return 0;
+
if (dev-ats)
return dev-ats-qdep;
 
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 9f0db6a..8ecd185 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -223,6 +223,8 @@ struct pci_ats {
int pos;/* capability position */
int stu;/* Smallest Translation Unit */
int qdep;   /* Invalidate Queue Depth */
+   int ref_cnt;/* Physical Function reference count */
+   int is_enabled:1;   /* Enable bit is set */
 };
 
 #ifdef CONFIG_PCI_IOV
@@ -244,7 +246,7 @@ extern int pci_ats_queue_depth(struct pci_dev *dev);
  */
 static inline int pci_ats_enabled(struct pci_dev *dev)
 {
-   return !!dev-ats;
+   return dev-ats  dev-ats-is_enabled;
 }
 #else
 static inline int pci_iov_init(struct pci_dev *dev)
-- 
1.5.6.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v4 6/6] VT-d: support the device IOTLB

2009-03-23 Thread Yu Zhao
Enable the device IOTLB (i.e. ATS) for both the bare metal and KVM
environments.

Signed-off-by: Yu Zhao yu.z...@intel.com
---
 drivers/pci/intel-iommu.c   |   99 +-
 include/linux/intel-iommu.h |1 +
 2 files changed, 97 insertions(+), 3 deletions(-)

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 3145368..799bbe5 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -127,6 +127,7 @@ static inline void context_set_fault_enable(struct 
context_entry *context)
 }
 
 #define CONTEXT_TT_MULTI_LEVEL 0
+#define CONTEXT_TT_DEV_IOTLB   1
 
 static inline void context_set_translation_type(struct context_entry *context,
unsigned long value)
@@ -242,6 +243,7 @@ struct device_domain_info {
struct list_head global; /* link to global list */
u8 bus; /* PCI bus numer */
u8 devfn;   /* PCI devfn number */
+   struct intel_iommu *iommu; /* IOMMU used by this device */
struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
struct dmar_domain *domain; /* pointer to domain */
 };
@@ -924,6 +926,80 @@ static int __iommu_flush_iotlb(struct intel_iommu *iommu, 
u16 did,
return 0;
 }
 
+static struct device_domain_info *
+iommu_support_dev_iotlb(struct dmar_domain *domain, u8 bus, u8 devfn)
+{
+   int found = 0;
+   unsigned long flags;
+   struct device_domain_info *info;
+   struct intel_iommu *iommu = device_to_iommu(bus, devfn);
+
+   if (!ecap_dev_iotlb_support(iommu-ecap))
+   return NULL;
+
+   if (!iommu-qi)
+   return NULL;
+
+   spin_lock_irqsave(device_domain_lock, flags);
+   list_for_each_entry(info, domain-devices, link)
+   if (info-bus == bus  info-devfn == devfn) {
+   found = 1;
+   break;
+   }
+   spin_unlock_irqrestore(device_domain_lock, flags);
+
+   if (!found || !info-dev)
+   return NULL;
+
+   if (!pci_find_ext_capability(info-dev, PCI_EXT_CAP_ID_ATS))
+   return NULL;
+
+   if (!dmar_find_matched_atsr_unit(info-dev))
+   return NULL;
+
+   info-iommu = iommu;
+
+   return info;
+}
+
+static void iommu_enable_dev_iotlb(struct device_domain_info *info)
+{
+   if (!info)
+   return;
+
+   pci_enable_ats(info-dev, VTD_PAGE_SHIFT);
+}
+
+static void iommu_disable_dev_iotlb(struct device_domain_info *info)
+{
+   if (!info-dev || !pci_ats_enabled(info-dev))
+   return;
+
+   pci_disable_ats(info-dev);
+}
+
+static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
+ u64 addr, unsigned mask)
+{
+   int rc;
+   u16 sid, qdep;
+   unsigned long flags;
+   struct device_domain_info *info;
+
+   spin_lock_irqsave(device_domain_lock, flags);
+   list_for_each_entry(info, domain-devices, link) {
+   if (!info-dev || !pci_ats_enabled(info-dev))
+   continue;
+
+   sid = info-bus  8 | info-devfn;
+   qdep = pci_ats_queue_depth(info-dev);
+   rc = qi_flush_dev_iotlb(info-iommu, sid, qdep, addr, mask);
+   if (rc)
+   dev_err(info-dev-dev, flush IOTLB failed\n);
+   }
+   spin_unlock_irqrestore(device_domain_lock, flags);
+}
+
 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
u64 addr, unsigned int pages, int non_present_entry_flush)
 {
@@ -947,6 +1023,9 @@ static int iommu_flush_iotlb_psi(struct intel_iommu 
*iommu, u16 did,
rc = iommu-flush.flush_iotlb(iommu, did, addr, mask,
DMA_TLB_PSI_FLUSH,
non_present_entry_flush);
+   if (!rc  !non_present_entry_flush)
+   iommu_flush_dev_iotlb(iommu-domains[did], addr, mask);
+
return rc;
 }
 
@@ -1471,6 +1550,7 @@ static int domain_context_mapping_one(struct dmar_domain 
*domain,
unsigned long ndomains;
int id;
int agaw;
+   struct device_domain_info *info;
 
pr_debug(Set context mapping for %02x:%02x.%d\n,
bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
@@ -1536,7 +1616,9 @@ static int domain_context_mapping_one(struct dmar_domain 
*domain,
context_set_domain_id(context, id);
context_set_address_width(context, iommu-agaw);
context_set_address_root(context, virt_to_phys(pgd));
-   context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
+   info = iommu_support_dev_iotlb(domain, bus, devfn);
+   context_set_translation_type(context,
+   info ? CONTEXT_TT_DEV_IOTLB : CONTEXT_TT_MULTI_LEVEL);
context_set_fault_enable(context);
context_set_present(context);
domain_flush_cache(domain, context, 

Re: phenom, amd780g, tsc, hpet, kvm, kernel -- who's at fault?

2009-03-23 Thread Ingo Molnar

* Michael Tokarev m...@tls.msk.ru wrote:

 Today (Friday, the 13th) I had a very bad sequence of failures
 with our servers leading to data loss and almost the whole day
 of very hard work.  And now I'm *really* interested where the
 fault(s) is(are).

 What I have here is an AMD780G-based system (Asus M3A-H/HDMI
 motherboard, latest BIOS) with AMD Phenom 9750 CPU and 8Gig of
 ECC memory.  The system is built for KVM (kernel virtual machine)
 work, and is running several guests, but I'm not sure anymore
 that KVM is related to the problem at hand.

 The problem is that - it seems - timekeeping on this machine is
 quite unreliable.

 It's Phenom, so TSC should be synced.  And it is being chosen
 at bootup as clocksource.  But regardless of current_clocksource
 (tsc), it constantly increases hpet min_delta_ns, like this:

 Mar 13 19:58:02 gate kernel: CE: hpet increasing min_delta_ns to 15000 nsec
 Mar 13 19:59:16 gate kernel: CE: hpet increasing min_delta_ns to 22500 nsec
 Mar 13 19:59:16 gate kernel: CE: hpet increasing min_delta_ns to 33750 nsec
 Mar 13 19:59:16 gate kernel: CE: hpet increasing min_delta_ns to 50624 nsec
 Mar 13 20:47:02 gate kernel: CE: hpet increasing min_delta_ns to 75936 nsec
 Mar 13 20:48:17 gate kernel: CE: hpet increasing min_delta_ns to 113904 nsec
 Mar 13 21:02:23 gate kernel: CE: hpet increasing min_delta_ns to 170856 nsec
 Mar 13 21:05:27 gate kernel: CE: hpet increasing min_delta_ns to 256284 nsec
 Mar 13 21:07:28 gate kernel: Clocksource tsc unstable (delta = 751920452 ns)
 Mar 13 21:09:12 gate kernel: CE: hpet increasing min_delta_ns to 384426 nsec

 and finally, it declares that TSC is unstable (pre-last line) and
 switches to the (unstable) HPET.

 HPET min_delta_ns will be increasing further and further, i've seen it
 increased to 576638 and more.

 And no doubt the system is unstable with KVM like crazy, especially under
 some, even light, load.

 Today I were copying some relatively large amount of data over network from
 another to this machine (to the host itself, not to any virtual guest), and
 had numerous guest and host stalls and lockups.  At times, host stops doing
 anything at all, all guests stalling too, load average jumps to 80 and more,
 and nothing happens.  I can do something over console still, like running
 top/strace, but nothing interesting shows.  I captured Sysrq+T of this 
 situation
 here: http://www.corpit.ru/mjt/host-high-la -- everything I was able to find
 in kern.log.

403

 After some time, sometimes it's several seconds, sometimes it's up to 10
 minutes, the thing unstucks and continues working.  Today it happened after
 about 10 minutes.  But after it continued, 2 of the KVM guests were eating
 100% CPU and did not respond at all.  The Sysrq+T of this is available at
 http://www.corpit.ru/mjt/guest-stuck -- two KVM guests were not responsive.

403 too.

 It's even more - the system started showing sporadic, random I/O 
 errors unrelated to the disks - for example, one of software RAID5 
 arrays started behaving really oddly, so that finally, after a 
 reboot, I had to re-create the array and some of the filesystems 
 on it (which I never saw in last ~10 years I'm using softraid on 
 linux, on many different systems and disks and with various 
 failure cases).

 Now, I switched to acpi_pm clocksource.  And also tried to disable 
 nested page tables with kvm (kvm_amd npt=0).  With that, 
 everything is slow and sluggish, but I was finally able to copy 
 that data without errors, while the guests were running.

 It were about to stuck as before, but I noticed it switched to 
 hpet (see tsc is unstable above) and I forced it to use acpi_pm 
 instead, and it survived.

 So, to the hell out of it all, and ignoring the magical Friday the 13th --
 who's fault it is?

  o why it declares tsc is unstable while phenom supposed to keep it ok?

the TSC can drift slowly between cores, and it might not be in sync 
at bootup time already. You can check the TSC from user-space (on 
any kernel) via time-warp-test:

http://redhat.com/~mingo/time-warp-test/MINI-HOWTO

  o why hpet is malfunctioning?

That's a question for Thomas i guess.

  o why the system time on this machine is dog slow without special
adjtimex adjustments, while it worked before (circa 2.6.26) and
windows works ok here?

 For reference:

  
 https://sourceforge.net/tracker/?func=detailatid=893831aid=2351676group_id=180599
   -- kvm bug on sourceforge, without any visible interest in even looking at 
 it

  http://www.google.com/search?q=CE%3A+hpet+increasing+min_delta_ns
   -- numerous references to that CE: hpet increasing min_delta_ns on the 
 'net,
   mostly for C2Ds, mentioning various lockup issues

  http://marc.info/?t=12324627002r=1w=2 --
   slow clock on AMD 740G chipset -- it's about the clock issue, also without
   any visible interest.

 What's the next thing to do here?  I for one don't want to see 
 todays failures again, it was very, and I mean *very* 

Re: phenom, amd780g, tsc, hpet, kvm, kernel -- who's at fault?

2009-03-23 Thread Michael Tokarev

Ingo, I lost any hope already to hear anything about this one..
Surprise.  Thank you for replying!

Ingo Molnar wrote:
[]

top/strace, but nothing interesting shows.  I captured Sysrq+T of this situation
here: http://www.corpit.ru/mjt/host-high-la -- everything I was able to find
in kern.log.


403


Fixed both.  Didn't notice it was 0640 (i copied the kern.log).  I just
checked my apache access.log - no one but several bots even looked at
those pages before you.  Oh well.

[]

So, to the hell out of it all, and ignoring the magical Friday the 13th --
who's fault it is?

 o why it declares tsc is unstable while phenom supposed to keep it ok?


the TSC can drift slowly between cores, and it might not be in sync 
at bootup time already. You can check the TSC from user-space (on 
any kernel) via time-warp-test:


http://redhat.com/~mingo/time-warp-test/MINI-HOWTO


Aha.  Will do.  But see below.


 o why hpet is malfunctioning?


That's a question for Thomas i guess.


 o why the system time on this machine is dog slow without special
   adjtimex adjustments, while it worked before (circa 2.6.26) and
   windows works ok here?

For reference:

 
https://sourceforge.net/tracker/?func=detailatid=893831aid=2351676group_id=180599
  -- kvm bug on sourceforge, without any visible interest in even looking at it

 http://www.google.com/search?q=CE%3A+hpet+increasing+min_delta_ns
  -- numerous references to that CE: hpet increasing min_delta_ns on the 'net,
  mostly for C2Ds, mentioning various lockup issues

 http://marc.info/?t=12324627002r=1w=2 --
  slow clock on AMD 740G chipset -- it's about the clock issue, also without
  any visible interest.

What's the next thing to do here?  I for one don't want to see 
todays failures again, it was very, and I mean *very* difficult 
day to restore the functionality of this system that (and it isn't 
restored at full because of the slowness of its current state).


it's not clear which kernel you tried - if you tried a recent enough 
one then i'd chalk this up as a yet-unfixed timekeeping problem - 
which probably had ripple effects on KVM and the rest of the system.


It is 2.6.28.7 compiled for x86-64 (64 bits).
Config is at http://www.corpit.ru/mjt/2.6.28.7-x86-64.config

What would be helpful is to debug the problem :-) First verify that 
basic timekeeping is OK: does 'time sleep 10' really take precisely 
10 seconds? Does 'date' advance precisely one second per physical 
second?

[...]

I'll try - maybe today.  The thing is that this is a production machine
running quite several of various (virtual) servers which are all our
infrastructure.  When it started misbehaving at 13th (just because there
was high load, not because of failures or any other changes), all our
office was stopped... ;)

Now, after quite some googling around, I tried to disable hpet, booting
with hpet=disable parameter.  And that one fixed all the problems at once.
7 days uptime, I stress-tested it several times, it works with TSC as
timesource (still a problem within guests as those shows unstable TSC
anyway) since boot, no issues logged.  Even cpufreq works as expected...

Note that i tried to disable hpet as clocksource several times but without
any noticeable effect - kernel still used hpet and hpet2 for something,
and printed that scary increasing min_delay message on a semi-regular
basis usually after the next 'stuck' state


A generic hw/sw state output of:

  http://people.redhat.com/mingo/cfs-scheduler/tools/cfs-debug-info.sh

would also help people taking a look at this problem.

If the problem persists, there might be a chance to debug it without 
rebooting the system. Rebooting and trying out various patches wont 
really work well for a server i guess.


..so it has to be rebooted back to enable hpet.  Hence I'll do it not
before evening.  But I really want to debug and fix the issue, as it
gave me quite some headaches and I want to kill it once and for all ;)

Thanks for noticing this!

/mjt
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] fix ia64 compilation

2009-03-23 Thread Avi Kivity

Gleb Natapov wrote:

Signed-off-by: Gleb Natapov g...@redhat.com
  


Applied, thanks.

--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[ kvm-Bugs-2703537 ] CPU hotplug causes segfault

2009-03-23 Thread SourceForge.net
Bugs item #2703537, was opened at 2009-03-22 18:07
Message generated for change (Comment added) made by glebn
You can respond by visiting: 
https://sourceforge.net/tracker/?func=detail&atid=893831&aid=2703537&group_id=180599

Please note that this message will contain a full copy of the comment thread,
including the initial issue submission, for this request,
not just the latest update.
Category: None
Group: None
Status: Open
Resolution: None
Priority: 7
Private: No
Submitted By: Technologov (technologov)
Assigned to: Nobody/Anonymous (nobody)
Summary: CPU hotplug causes segfault

Initial Comment:

Host: kvm-84 on Intel or AMD. (RHEL 5/x64)

When I use empty VM and add CPU hotplug several times I get a segfault.

Empty VM is started with PXE networking.
# qemu-kvm -boot n

Now I connect to Qemu monitor and use:
(qemu) cpu_set 1 online
(qemu) cpu_set 2 online
(qemu) cpu_set 3 online
(qemu) cpu_set 4 online
(qemu) cpu_set 5 online

When I add several CPUs like this - Qemu/KVM userspace will crash.

Very easy to reproduce.

GDB:
Program received signal SIGSEGV, Segmentation fault.
[Switching to Thread 0x43c2b940 (LWP 3667)]
0x00459167 in cpu_get_apic_tpr (env=0x1cef87b0)
at /root/Linstall/kvm-84rc1/qemu/hw/apic.c:316
316 {
(gdb) c
Continuing.
[Thread 0x4322a940 (LWP 3666) exited]
[Thread 0x43c2b940 (LWP 3667) exited]
[Thread 0x41e28940 (LWP 3660) exited]
[Thread 0x42829940 (LWP 3661) exited]

Program terminated with signal SIGSEGV, Segmentation fault.
The program no longer exists.

Dmesg:
There are no interesting messages in dmesg.

-Alexey, 22.3.2009.

--

Comment By: Gleb Natapov (glebn)
Date: 2009-03-23 11:25

Message:
It is a known problem (at least by me). There is a race when APIC is
accessed before it is created.

--

Comment By: Technologov (technologov)
Date: 2009-03-23 11:16

Message:
Full GDB output attached.

-Alexey, 23.3.2009.
File Added: gdb_of_cpu_hotplug_crash.txt

--

Comment By: Glauber de Oliveira Costa (glommer)
Date: 2009-03-22 18:34

Message:
Can you get a backtrace for all threads in qemu in the moment of the
crash?

thanks

--

You can respond by visiting: 
https://sourceforge.net/tracker/?func=detail&atid=893831&aid=2703537&group_id=180599
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] reuse (pop|push)_irq from svm.c in vmx.c

2009-03-23 Thread Avi Kivity

Gleb Natapov wrote:

Applied, thanks.

--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/2] Timer event should not unconditionally unhalt vcpu.

2009-03-23 Thread Avi Kivity

Gleb Natapov wrote:

On Mon, Mar 23, 2009 at 12:12:06PM +0200, Gleb Natapov wrote:
  

Currently timer events are processed before entering guest mode. Move it
to main vcpu event loop since timer events should be processed even while
vcpu is halted. Timer may cause interrupt/nmi to be injected and only then
vcpu will be unhalted.



Use this one instead. Previous broke -no-kvm-irqchip option.
  


Looks good to me.  But this is tricky code.  Marcelo, Sheng, your opinions?

--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] Interrupt unhalts vcpu when it shouldn't

2009-03-23 Thread Avi Kivity

Avi Kivity wrote:

Gleb Natapov wrote:

kvm_vcpu_block() unhalts vcpu on an interrupt/timer without checking
if interrupt window is actually opened.

 
+static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)

+{
+struct vcpu_svm *svm = to_svm(vcpu);
+struct vmcb *vmcb = svm-vmcb;
+return (vmcb-save.rflags  X86_EFLAGS_IF)  +
!(vmcb-control.int_state  SVM_INTERRUPT_SHADOW_MASK) 

+(svm-vcpu.arch.hflags  HF_GIF_MASK);
+}
+
 
+static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)

+{
+vmx_update_window_states(vcpu);
+return vcpu-arch.interrupt_window_open;
+}
+
 static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
 }
+
+int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
+{
+return kvm_x86_ops-interrupt_allowed(vcpu);
+}
  


If the guest enables interrupts but sets tpr/cr8 to block interrupts, 
we'll spin (like we do now).


So I think this should be called kvm_arch_can_accept_interrupt() and 
take tpr into account.




kvm_cpu_has_interrupt() takes the tpr into account, so we're okay here.

Marcelo, Sheng?

--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: phenom, amd780g, tsc, hpet, kvm, kernel -- who's at fault?

2009-03-23 Thread Ingo Molnar

* Michael Tokarev m...@tls.msk.ru wrote:

 Now, after quite some googling around, I tried to disable hpet, 
 booting with hpet=disable parameter.  And that one fixed all the 
 problems at once. 7 days uptime, I stress-tested it several times, 
 it works with TSC as timesource (still a problem within guests as 
 those shows unstable TSC anyway) since boot, no issues logged.  
 Even cpufreq works as expected...

 Note that i tried to disable hpet as clocksource several times but 
 without any noticeable effect - kernel still used hpet and hpet2 
 for something, and printed that scary increasing min_delay 
 message on a semi-regular basis usually after the next 'stuck' 
 state

It could again go bad like it did before - those messages are signs 
of HPET weirdnesses.

Probably your box's hpet needs to be blacklisted, so that it gets 
disabled automatically on bootup.

Ingo
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH][RFC] Linux AIO support when using O_DIRECT

2009-03-23 Thread Anthony Liguori
This is just a first cut.  It needs a fair bit of cleanup before it can be
committed.  I also think we need to fixup the AIO abstractions a bit.

I wanted to share though in case anyone is interested in doing some performance
comparisons.  It seems to work although I haven't exercised it very much.

diff --git a/Makefile b/Makefile
index 82fec80..afc6b41 100644
--- a/Makefile
+++ b/Makefile
@@ -61,6 +61,9 @@ else
 ifdef CONFIG_AIO
 BLOCK_OBJS += posix-aio-compat.o
 endif
+ifdef CONFIG_LINUX_AIO
+BLOCK_OBJS += linux-aio.o
+endif
 BLOCK_OBJS += block-raw-posix.o
 endif
 
diff --git a/Makefile.target b/Makefile.target
index 41366ee..df2a794 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -514,6 +514,9 @@ else
 ifdef CONFIG_AIO
 OBJS+=posix-aio-compat.o
 endif
+ifdef CONFIG_LINUX_AIO
+OBJS+=linux-aio.o
+endif
 OBJS+=block-raw-posix.o
 endif
 
diff --git a/block-raw-posix.c b/block-raw-posix.c
index 1a1a178..e355cf4 100644
--- a/block-raw-posix.c
+++ b/block-raw-posix.c
@@ -29,6 +29,9 @@
 #ifdef CONFIG_AIO
 #include posix-aio-compat.h
 #endif
+#ifdef CONFIG_LINUX_AIO
+#include linux-aio.h
+#endif
 
 #ifdef CONFIG_COCOA
 #include paths.h
@@ -68,6 +71,10 @@
 #include sys/diskslice.h
 #endif
 
+#ifdef CONFIG_LINUX_AIO
+#include linux-aio.h
+#endif
+
 //#define DEBUG_FLOPPY
 
 //#define DEBUG_BLOCK
@@ -98,6 +105,17 @@
reopen it to see if the disk has been changed */
 #define FD_OPEN_TIMEOUT 1000
 
+typedef struct AIOOperations
+{
+struct qemu_aiocb *(*get_aiocb)(void);
+void (*put_aiocb)(struct qemu_aiocb *);
+int (*read)(struct qemu_aiocb *);
+int (*write)(struct qemu_aiocb *);
+int (*error)(struct qemu_aiocb *);
+ssize_t (*get_result)(struct qemu_aiocb *aiocb);
+int (*cancel)(int fd, struct qemu_aiocb *aiocb);
+} AIOOperations;
+
 typedef struct BDRVRawState {
 int fd;
 int type;
@@ -111,8 +129,31 @@ typedef struct BDRVRawState {
 int fd_media_changed;
 #endif
 uint8_t* aligned_buf;
+AIOOperations *aio_ops;
 } BDRVRawState;
 
+static AIOOperations posix_aio_ops = {
+.get_aiocb = qemu_paio_get_aiocb,
+.put_aiocb = qemu_paio_put_aiocb,
+.read = qemu_paio_read,
+.write = qemu_paio_write,
+.error = qemu_paio_error,
+.get_result = qemu_paio_return,
+.cancel = qemu_paio_cancel,
+};
+
+#ifdef CONFIG_LINUX_AIO
+static AIOOperations linux_aio_ops = {
+.get_aiocb = qemu_laio_get_aiocb,
+.put_aiocb = qemu_laio_put_aiocb,
+.read = qemu_laio_read,
+.write = qemu_laio_write,
+.error = qemu_laio_error,
+.get_result = qemu_laio_return,
+.cancel = qemu_laio_cancel,
+};
+#endif
+
 static int posix_aio_init(void);
 
 static int fd_open(BlockDriverState *bs);
@@ -124,6 +165,14 @@ static int raw_open(BlockDriverState *bs, const char 
*filename, int flags)
 
 posix_aio_init();
 
+#ifdef CONFIG_LINUX_AIO
+if ((flags  BDRV_O_NOCACHE)) {
+qemu_laio_init();
+s-aio_ops = linux_aio_ops;
+} else
+#endif
+s-aio_ops = posix_aio_ops;
+
 s-lseek_err_cnt = 0;
 
 open_flags = O_BINARY;
@@ -463,7 +512,7 @@ static int raw_write(BlockDriverState *bs, int64_t 
sector_num,
 
 typedef struct RawAIOCB {
 BlockDriverAIOCB common;
-struct qemu_paiocb aiocb;
+struct qemu_aiocb *aiocb;
 struct RawAIOCB *next;
 int ret;
 } RawAIOCB;
@@ -496,19 +545,24 @@ static void posix_aio_read(void *opaque)
 for(;;) {
 pacb = s-first_aio;
 for(;;) {
+BDRVRawState *s;
+
 acb = *pacb;
 if (!acb)
 goto the_end;
-ret = qemu_paio_error(acb-aiocb);
+
+s = acb-common.bs-opaque;
+ret = s-aio_ops-error(acb-aiocb);
 if (ret == ECANCELED) {
 /* remove the request */
 *pacb = acb-next;
+s-aio_ops-put_aiocb(acb-aiocb);
 qemu_aio_release(acb);
 } else if (ret != EINPROGRESS) {
 /* end of aio */
 if (ret == 0) {
-ret = qemu_paio_return(acb-aiocb);
-if (ret == acb-aiocb.aio_nbytes)
+ret = s-aio_ops-get_result(acb-aiocb);
+if (ret == acb-aiocb-aio_nbytes)
 ret = 0;
 else
 ret = -EINVAL;
@@ -519,6 +573,7 @@ static void posix_aio_read(void *opaque)
 *pacb = acb-next;
 /* call the callback */
 acb-common.cb(acb-common.opaque, ret);
+s-aio_ops-put_aiocb(acb-aiocb);
 qemu_aio_release(acb);
 break;
 } else {
@@ -553,7 +608,6 @@ static int posix_aio_init(void)
 struct sigaction act;
 PosixAioState *s;
 int fds[2];
-struct qemu_paioinit ai;
   
 if (posix_aio_state)
 return 0;
@@ -579,6 +633,8 @@ static int posix_aio_init(void)
 
 qemu_aio_set_fd_handler(s-rfd, posix_aio_read, NULL, posix_aio_flush, s);
 
+

Re: [Qemu-devel] [PATCH][RFC] Linux AIO support when using O_DIRECT

2009-03-23 Thread Avi Kivity

Anthony Liguori wrote:

This is just a first cut.  It needs a fair bit of cleanup before it can be
committed.  I also think we need to fixup the AIO abstractions a bit.

I wanted to share though in case anyone is interested in doing some performance
comparisons.  It seems to work although I haven't exercised it very much.

 
+typedef struct AIOOperations

+{
+struct qemu_aiocb *(*get_aiocb)(void);
+void (*put_aiocb)(struct qemu_aiocb *);
+int (*read)(struct qemu_aiocb *);
+int (*write)(struct qemu_aiocb *);
+int (*error)(struct qemu_aiocb *);
+ssize_t (*get_result)(struct qemu_aiocb *aiocb);
+int (*cancel)(int fd, struct qemu_aiocb *aiocb);
+} AIOOperations;
+
  



Instead of introducing yet another layer of indirection, you could add 
block-raw-linux-aio, which would be registered before block-raw-posix 
(which is really block-raw-threadpool...), and resist a ->probe() if 
caching is enabled.


--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/2] Timer event should not unconditionally unhalt vcpu.

2009-03-23 Thread Marcelo Tosatti
On Mon, Mar 23, 2009 at 04:26:34PM +0200, Avi Kivity wrote:
 Gleb Natapov wrote:
 On Mon, Mar 23, 2009 at 12:12:06PM +0200, Gleb Natapov wrote:
   
 Currently timer events are processed before entering guest mode. Move it
 to main vcpu event loop since timer events should be processed even while
 vcpu is halted. Timer may cause interrupt/nmi to be injected and only then
 vcpu will be unhalted.

 
 Use this one instead. Previous broke -no-kvm-irqchip option.
   

 Looks good to me.  But this is tricky code.  Marcelo, Sheng, your opinions?

Looks good. Checking for timer interrupts after guest entry is strange,
but it can be changed in the future.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [PATCH][RFC] Linux AIO support when using O_DIRECT

2009-03-23 Thread Anthony Liguori

Avi Kivity wrote:


Instead of introducing yet another layer of indirection, you could add 
block-raw-linux-aio, which would be registered before block-raw-posix 
(which is really block-raw-threadpool...), and resist a ->probe() if 
caching is enabled.


block-raw-posix needs a major overhaul.  That's why I'm not even 
considering committing the patch as is.


I'd like to see the O_DIRECT bounce buffering removed in favor of the 
DMA API bouncing.  Once that happens, raw_read and raw_pread can 
disappear.  block-raw-posix becomes much simpler.


We would drop the signaling stuff and have the thread pool use an fd to 
signal.  The big problem with that right now is that it'll cause a 
performance regression for certain platforms until we have the IO thread 
in place.


Regards,

Anthony Liguori


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [PATCH][RFC] Linux AIO support when using O_DIRECT

2009-03-23 Thread Christoph Hellwig
On Mon, Mar 23, 2009 at 06:17:36PM +0200, Avi Kivity wrote:
 Instead of introducing yet another layer of indirection, you could add  
 block-raw-linux-aio, which would be registered before block-raw-posix  
 (which is really block-raw-threadpool...), and resist a ->probe() if  
 caching is enabled.

Exactly the kind of comment I was about to make, but I need to read a
little deeper to understand all the details.

But my gut feeling is that this abstraction doesn't help us very much,
especially with Avi's aiocb pools in place.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [PATCH][RFC] Linux AIO support when using O_DIRECT

2009-03-23 Thread Christoph Hellwig
On Mon, Mar 23, 2009 at 12:14:58PM -0500, Anthony Liguori wrote:
 I'd like to see the O_DIRECT bounce buffering removed in favor of the  
 DMA API bouncing.  Once that happens, raw_read and raw_pread can  
 disappear.  block-raw-posix becomes much simpler.

See my vectored I/O patches for doing the bounce buffering at the
optimal place for the aio path. Note that from my reading of the
qcow/qcow2 code they might send down unaligned requests, which is
something the dma api would not help with.

For the buffered I/O path we will always have to do some sort of buffering
due to all the partition header reading / etc.  And given how that part
isn't performance critical my preference would be to keep doing it in
bdrv_pread/write and guarantee the lowlevel drivers proper alignment.

 We would drop the signaling stuff and have the thread pool use an fd to  
 signal.  The big problem with that right now is that it'll cause a  
 performance regression for certain platforms until we have the IO thread  
 in place.

Talking about signaling, does anyone remember why the Linux signalfd/
eventfd support is only in kvm but not in upstream qemu?

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: cr3 OOS optimisation breaks 32-bit GNU/kFreeBSD guest

2009-03-23 Thread Marcelo Tosatti
On Sun, Mar 22, 2009 at 11:35:00AM +0200, Avi Kivity wrote:
 Good catch, indeed.  But is it sufficient?  We could unlink a page  
 through other means, for example by the guest zapping a page directory  
 entry.  

Yep.

 Maybe it's best to resync when relinking a global page?

How about this. It will shorten the unsync period of global pages,
unfortunately.

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 2a36f7f..bccdcc7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1238,6 +1238,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct 
kvm_vcpu *vcpu,
set_bit(KVM_REQ_MMU_SYNC, vcpu-requests);
kvm_mmu_mark_parents_unsync(vcpu, sp);
}
+   if (role.level != PT_PAGE_TABLE_LEVEL 
+   !list_empty(vcpu-kvm-arch.oos_global_pages))
+   set_bit(KVM_REQ_MMU_GLOBAL_SYNC, 
vcpu-requests);
+
pgprintk(%s: found\n, __func__);
return sp;
}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2ea8262..48169d7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3109,6 +3109,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct 
kvm_run *kvm_run)
kvm_write_guest_time(vcpu);
if (test_and_clear_bit(KVM_REQ_MMU_SYNC, vcpu-requests))
kvm_mmu_sync_roots(vcpu);
+   if (test_and_clear_bit(KVM_REQ_MMU_GLOBAL_SYNC, 
vcpu-requests))
+   kvm_mmu_sync_global(vcpu);
if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, vcpu-requests))
kvm_x86_ops-tlb_flush(vcpu);
if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 11eb702..8efd6e3 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -37,7 +37,8 @@
 #define KVM_REQ_PENDING_TIMER  5
 #define KVM_REQ_UNHALT 6
 #define KVM_REQ_MMU_SYNC   7
-#define KVM_REQ_KVMCLOCK_UPDATE8
+#define KVM_REQ_MMU_GLOBAL_SYNC8 
+#define KVM_REQ_KVMCLOCK_UPDATE9
 
 #define KVM_USERSPACE_IRQ_SOURCE_ID0
 
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [PATCH][RFC] Linux AIO support when using O_DIRECT

2009-03-23 Thread Christoph Hellwig
On Mon, Mar 23, 2009 at 12:14:58PM -0500, Anthony Liguori wrote:
 block-raw-posix needs a major overhaul.  That's why I'm not even  
 considering committing the patch as is.

I have some WIP patches that split out the host device bits into
separate files to get block-raw-posix down to the pure file handling
bits without all the host-specific host device mess.  But it's at the
end of a really large pile, which needs to be rebased once we have the
patches already on the list in some form.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


kvm: external module: do not hardcode tsc_khz

2009-03-23 Thread Marcelo Tosatti

external module compat hard codes tsc_khz as 200 if KERNEL_VERSION < 
(2,6,23).

This breaks kvmclock on hosts with different frequency.

While tsc_khz was only exported on 2.6.23, the majority of relevant
older v2.6 based distros seem to have it exported.

Signed-off-by: Marcelo Tosatti mtosa...@redhat.com

diff --git a/kernel/external-module-compat-comm.h 
b/kernel/external-module-compat-comm.h
index a14cea2..e0dc577 100644
--- a/kernel/external-module-compat-comm.h
+++ b/kernel/external-module-compat-comm.h
@@ -387,15 +387,11 @@ static inline struct page *__kvm_vm_fault(struct 
vm_area_struct *vma,
 #endif
 
 #if LINUX_VERSION_CODE  KERNEL_VERSION(2,6,23)
-
-static unsigned  __attribute__((__used__)) kvm_tsc_khz = 200;
-
-#else
+extern unsigned int tsc_khz;
+#endif
 
 #define kvm_tsc_khz tsc_khz
 
-#endif
-
 #if LINUX_VERSION_CODE = KERNEL_VERSION(2,6,21)
 
 #include linux/ktime.h
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [PATCH][RFC] Linux AIO support when using O_DIRECT

2009-03-23 Thread Anthony Liguori

Christoph Hellwig wrote:

On Mon, Mar 23, 2009 at 12:14:58PM -0500, Anthony Liguori wrote:
  
I'd like to see the O_DIRECT bounce buffering removed in favor of the  
DMA API bouncing.  Once that happens, raw_read and raw_pread can  
disappear.  block-raw-posix becomes much simpler.



See my vectored I/O patches for doing the bounce buffering at the
optimal place for the aio path. Note that from my reading of the
qcow/qcow2 code they might send down unaligned requests, which is
something the dma api would not help with.
  


I was going to look today at applying those.


For the buffered I/O path we will always have to do some sort of buffering
due to all the partition header reading / etc.  And given how that part
isn't performance critical my preference would be to keep doing it in
bdrv_pread/write and guarantee the lowlevel drivers proper alignment.
  


I really dislike having so many APIs.  I'd rather have an aio API that 
took byte accesses or have pread/pwrite always be emulated with a full 
sector read/write


We would drop the signaling stuff and have the thread pool use an fd to  
signal.  The big problem with that right now is that it'll cause a  
performance regression for certain platforms until we have the IO thread  
in place.



Talking about signaling, does anyone remember why the Linux signalfd/
eventfd support is only in kvm but not in upstream qemu?
  


Because upstream QEMU doesn't yet have an IO thread.

TCG chains together TBs and if you have a tight loop in a VCPU, then the 
only way to break out of the loop is to receive a signal.  The signal 
handler will call cpu_interrupt() which will unchain TBs allowing TCG 
execution to break once you return from the signal handler.


An IO thread solves this in a different way by letting select() always 
run in parallel to TCG VCPU execution.  When select() returns you can 
send a signal to the TCG VCPU thread to break it out of chained TBs.


Not all IO in qemu generates a signal, so this is a potential problem, but in
practice, if we don't generate a signal for disk IO completion, a number
of real-world guests break (mostly non-x86 boards).


Regards,

Anthony Liguori
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] KVM: Improvements for task switching

2009-03-23 Thread Julian Stecklina
Kohl, Bernhard (NSN - DE/Munich) bernhard.k...@nsn.com writes:

 Jan Kiszka Wrote:
[...]
 OK, after the discussion has finished, I will submit separate patches.

Is there any progress on this? I've been using this patch for several
days now with no ill effects.

The patch fixes Bug 2681442 for me:
https://sourceforge.net/tracker/?func=detail&atid=893831&aid=2681442&group_id=180599

Regards,
-- 
Julian Stecklina

The day Microsoft makes something that doesn't suck is probably the day
they start making vacuum cleaners - Ernst Jan Plugge

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Kernel GPF in vmx_save_host_state()

2009-03-23 Thread Avi Kivity

Benjamin Gilbert wrote:

vmx_set_msr: msr_index 0xc080 msr-index 0xc080 msr-data 0x100


How did that get in there?!

Please add a dump_stack() after that printk().


--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: KVM on Via Nano (Isaiah) CPUs? Virus checked

2009-03-23 Thread Avi Kivity

Andreas Tanz wrote:

[ 3732.020033] returning from kvm_handle_exit, cause 3, retval = 1, exit_reason 
= 7
  


Here, vmx tells us that the guest is ready to accept interrupts (having 
executed the sti instruction)



[ 3732.020044] vmx-vmx_vcpu_run() 00 : vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) 
returned 0x8408
  


... noticing that, kvm injects a timer interrupt that was previously 
blocked ...



[ 3732.020056] vmx-handle_exception 00 : giving some infos
[ 3732.020062] vmx-handle_exception 01 : vect_info: 0x0
[ 3732.020067] vmx-handle_exception 02 : intr_info: 0x8b0d, 
is_page_fault()==0
  
... and the Nano rewards us with a General Protection Fault instead of 
injecting the interrupt.


Will talk to the specification and come up with further tests.

--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Kernel GPF in vmx_save_host_state()

2009-03-23 Thread Benjamin Gilbert

Avi Kivity wrote:

Benjamin Gilbert wrote:

vmx_set_msr: msr_index 0xc080 msr-index 0xc080 msr-data 0x100


How did that get in there?!

Please add a dump_stack() after that printk().


Pid: 2381, comm: qemu-system-x86 Not tainted 2.6.28-686 #4
Call Trace:
 [f8cf2fdc] vmx_set_msr+0x150/0x178 [kvm_intel]
 [f8cf325a] handle_wrmsr+0x71/0x9d [kvm_intel]
 [f8cf4fb0] kvm_handle_exit+0x1c8/0x1e5 [kvm_intel]
 [f7c58e34] kvm_arch_vcpu_ioctl_run+0x6f2/0x918 [kvm]
 [c0406469] ? _spin_unlock_irqrestore+0x59/0x5d
 [c0403ede] ? preempt_schedule+0x30/0x3f
 [f7c5467d] kvm_vcpu_ioctl+0xf4/0x40f [kvm]
 [c013ae80] ? up_read+0x1b/0x2f
 [c0148144] ? futex_wake+0xd0/0xdb
 [c0148e4c] ? do_futex+0x81/0x6c9
 [f7c54589] ? kvm_vcpu_ioctl+0x0/0x40f [kvm]
 [c018e345] vfs_ioctl+0x27/0x6c
 [c018e7ec] do_vfs_ioctl+0x394/0x3d8
 [c0184c88] ? fget_light+0xc8/0xe4
 [c018e84c] ? sys_ioctl+0x1c/0x5f
 [c011e06f] ? sub_preempt_count+0x9d/0xab
 [c018e875] sys_ioctl+0x45/0x5f
 [c0102e25] sysenter_do_call+0x12/0x35

--Benjamin Gilbert
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [PATCH][RFC] Linux AIO support when using O_DIRECT

2009-03-23 Thread Christoph Hellwig
On Mon, Mar 23, 2009 at 01:10:30PM -0500, Anthony Liguori wrote:
 I really dislike having so many APIs.  I'd rather have an aio API that 
 took byte accesses or have pread/pwrite always be emulated with a full 
 sector read/write

I had patches to change the aio API to byte based access, and get rid
of the read/write methods to only have the byte based pread/pwrite
APIs, but they got obsoleted by Avi's patch to kill the pread/pwrite
ops.  We could put in byte-based AIO without byte-based read/write,
though.  In my patches I put a flag into BlockDriverState whether we
allow byte-based access to this instance or otherwise emulate it in
the block layer.  We still need this as many of the image formats can't
deal with byte-granularity access without read-modify-write cycles,
and I think we're better off having one read-modify-write handler in
the block handler than one per image format that needs it.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Kernel GPF in vmx_save_host_state()

2009-03-23 Thread Benjamin Gilbert

Avi Kivity wrote:
On 32-bit, we might actually reach the default: label of the switch in 
vmx_set_msr().  Can you add a printk() there? print both msr_index, and, 
if msr is not NULL, msr->index and msr->data.


Sure:

vmx_set_msr: msr_index 0xc081 msr-index 0xc081 msr-data 0x0
vmx_set_msr: msr_index 0xc0010117
vmx_set_msr: msr_index 0x250
vmx_set_msr: msr_index 0x258
vmx_set_msr: msr_index 0x259
vmx_set_msr: msr_index 0x268
vmx_set_msr: msr_index 0x269
vmx_set_msr: msr_index 0x26a
vmx_set_msr: msr_index 0x26b
vmx_set_msr: msr_index 0x26c
vmx_set_msr: msr_index 0x26d
vmx_set_msr: msr_index 0x26e
vmx_set_msr: msr_index 0x26f
vmx_set_msr: msr_index 0x200
vmx_set_msr: msr_index 0x201
vmx_set_msr: msr_index 0x2ff
vmx_set_msr: msr_index 0xc080 msr-index 0xc080 msr-data 0x100
general protection fault:  [#1] PREEMPT SMP
[...]

The printk is after msr->data is set to the value of the data parameter.

--Benjamin Gilbert
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/2] qemu: SMBIOS passing support

2009-03-23 Thread Alex Williamson

This series adds a new -smbios option for x86 that allows individual
SMBIOS entries to be passed into the guest VM.  This follows the same
basic path as the support for loading ACPI tables.  While SMBIOS is
independent of ACPI, I chose to add the smbios_entry_add() function to
acpi.c because they're both somewhat PC BIOS related (and ia64 can
support SMBIOS and might be able to make use of it there).

This feature allows the guest to see certain properties of the host if
configured correctly.  For instance, the system model and serial number
in the type 1 entry.  Obviously it's only built at boot, so it doesn't get
updated for migration scenarios.  User-provided entries will supersede
generated entries, so care should be taken when passing entries which
describe physical properties, such as memory size and address ranges.
Thanks,

Alex 

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/2] qemu: Allow SMBIOS entries to be loaded and provided to the VM BIOS

2009-03-23 Thread Alex Williamson

Create a new -smbios option that takes binary SMBIOS entries
to provide to the VM BIOS.  The binary can be easily generated
using something like:

dmidecode -t 1 -u | grep $'^\t\t[^"]' | xargs -n1 | \
perl -lne 'printf "%c", hex($_)' > smbios_type_1.bin

For some inventory tools, this makes the VM report the system
information for the host.  One entry per binary file, multiple
files can be chained together as:

  -smbios file1,file2,...

or specified independently:

  -smbios file1 -smbios file2

Signed-off-by: Alex Williamson alex.william...@hp.com
--

diff --git a/hw/acpi.c b/hw/acpi.c
index 52f50a0..0bd93bf 100644
--- a/hw/acpi.c
+++ b/hw/acpi.c
@@ -915,3 +915,69 @@ out:
 }
 return -1;
 }
+
+char *smbios_entries;
+size_t smbios_entries_len;
+
+int smbios_entry_add(const char *t)
+{
+struct stat s;
+char file[1024], *p, *f, *n;
+int fd, r;
+size_t len, off;
+
+f = (char *)t;
+do {
+n = strchr(f, ',');
+if (n) {
+strncpy(file, f, (n - f));
+file[n - f] = '\0';
+f = n + 1;
+} else {
+strcpy(file, f);
+f += strlen(file);
+}
+
+fd = open(file, O_RDONLY);
+if (fd  0)
+return -1;
+
+if (fstat(fd, s)  0) {
+close(fd);
+return -1;
+}
+
+if (!smbios_entries) {
+smbios_entries_len = sizeof(uint16_t);
+smbios_entries = qemu_mallocz(smbios_entries_len);
+}
+
+len = s.st_size;
+smbios_entries = qemu_realloc(smbios_entries, smbios_entries_len +
+  len + sizeof(uint16_t));
+p = smbios_entries + smbios_entries_len;
+
+*(uint16_t *)p = cpu_to_le32(len);
+p += sizeof(uint16_t);
+
+off = 0;
+do {
+r = read(fd, p + off, len);
+if (r  0) {
+off += r;
+len -= r;
+} else if ((r  0  errno != EINTR) || r == 0) {
+close(fd);
+return -1;
+}
+} while (len);
+
+close(fd);
+
+smbios_entries_len += s.st_size + sizeof(uint16_t);
+(*(uint16_t *)smbios_entries) =
+   cpu_to_le32(le32_to_cpu(*(uint16_t *)smbios_entries) + 1);
+} while (*f);
+
+return 0;
+}
diff --git a/hw/pc.c b/hw/pc.c
index 69f25f3..ec65e33 100644
--- a/hw/pc.c
+++ b/hw/pc.c
@@ -51,6 +51,7 @@
 #define ACPI_DATA_SIZE   0x1
 #define BIOS_CFG_IOPORT 0x510
 #define FW_CFG_ACPI_TABLES (FW_CFG_ARCH_LOCAL + 0)
+#define FW_CFG_SMBIOS_ENTRIES (FW_CFG_ARCH_LOCAL + 1)
 
 #define MAX_IDE_BUS 2
 
@@ -442,6 +443,8 @@ static void bochs_bios_init(void)
 fw_cfg_add_i64(fw_cfg, FW_CFG_RAM_SIZE, (uint64_t)ram_size);
 fw_cfg_add_bytes(fw_cfg, FW_CFG_ACPI_TABLES, (uint8_t *)acpi_tables,
  acpi_tables_len);
+fw_cfg_add_bytes(fw_cfg, FW_CFG_SMBIOS_ENTRIES, (uint8_t *)smbios_entries,
+ smbios_entries_len);
 }
 
 /* Generate an initial boot sector which sets state and jump to
diff --git a/hw/pc.h b/hw/pc.h
index 5b378d4..6c200b3 100644
--- a/hw/pc.h
+++ b/hw/pc.h
@@ -106,12 +106,15 @@ int ioport_get_a20(void);
 extern int acpi_enabled;
 extern char *acpi_tables;
 extern size_t acpi_tables_len;
+extern char *smbios_entries;
+extern size_t smbios_entries_len;
 
 i2c_bus *piix4_pm_init(PCIBus *bus, int devfn, uint32_t smb_io_base,
qemu_irq sci_irq);
 void piix4_smbus_register_device(SMBusDevice *dev, uint8_t addr);
 void acpi_bios_init(void);
 int acpi_table_add(const char *table_desc);
+int smbios_entry_add(const char *smbios_entry);
 
 /* hpet.c */
 extern int no_hpet;
diff --git a/vl.c b/vl.c
index b62a2d4..372b83c 100644
--- a/vl.c
+++ b/vl.c
@@ -4061,6 +4061,7 @@ static void help(int exitcode)
-no-hpetdisable HPET\n
-acpitable 
[sig=str][,rev=n][,oem_id=str][,oem_table_id=str][,oem_rev=n][,asl_compiler_id=str][,asl_compiler_rev=n][,data=file1[:file2]...]\n
ACPI table description\n
+   -smbios file1[,file2]  SMBIOS entry\n
 #endif
Linux boot specific:\n
-kernel bzImage use 'bzImage' as kernel image\n
@@ -4201,6 +4202,7 @@ enum {
 QEMU_OPTION_no_acpi,
 QEMU_OPTION_no_hpet,
 QEMU_OPTION_acpitable,
+QEMU_OPTION_smbios,
 
 /* Linux boot specific: */
 QEMU_OPTION_kernel,
@@ -4322,6 +4324,7 @@ static const QEMUOption qemu_options[] = {
 { no-acpi, 0, QEMU_OPTION_no_acpi },
 { no-hpet, 0, QEMU_OPTION_no_hpet },
 { acpitable, HAS_ARG, QEMU_OPTION_acpitable },
+{ smbios, HAS_ARG, QEMU_OPTION_smbios },
 #endif
 
 /* Linux boot specific: */
@@ -5152,6 +5155,12 @@ int main(int argc, char **argv, char **envp)
 exit(1);
 }
 break;
+case QEMU_OPTION_smbios:
+if(smbios_entry_add(optarg)  0) {
+

[PATCH 2/2] qemu:bios: Read external SMBIOS entries from the VM

2009-03-23 Thread Alex Williamson

SMBIOS entries can be read from the VM using the same mechanism
as additional ACPI tables.  External entries will supersede
generated entries.

Signed-off-by: Alex Williamson alex.william...@hp.com
--

diff --git a/bios/rombios32.c b/bios/rombios32.c
index 7be4216..f0e0f8c 100644
--- a/bios/rombios32.c
+++ b/bios/rombios32.c
@@ -471,6 +471,7 @@ void wrmsr_smp(uint32_t index, uint64_t val)
 #define QEMU_CFG_UUID   0x02
 #define QEMU_CFG_ARCH_LOCAL 0x8000
 #define QEMU_CFG_ACPI_TABLES  (QEMU_CFG_ARCH_LOCAL + 0)
+#define QEMU_CFG_SMBIOS_ENTRIES  (QEMU_CFG_ARCH_LOCAL + 1)
 
 int qemu_cfg_port;
 
@@ -519,6 +520,16 @@ static int acpi_load_table(int i, uint32_t addr, uint16_t 
*len)
 qemu_cfg_read((uint8_t*)addr, *len);
 return 0;
 }
+
+static uint16_t smbios_entries(void)
+{
+uint16_t cnt;
+
+qemu_cfg_select(QEMU_CFG_SMBIOS_ENTRIES);
+qemu_cfg_read((uint8_t*)cnt, sizeof(cnt));
+
+return cnt;
+}
 #endif
 
 void uuid_probe(void)
@@ -1966,7 +1977,7 @@ smbios_entry_point_init(void *start,
 /* Type 0 -- BIOS Information */
 #define RELEASE_DATE_STR 01/01/2007
 static void *
-smbios_type_0_init(void *start)
+smbios_init_type_0(void *start)
 {
 struct smbios_type_0 *p = (struct smbios_type_0 *)start;
 
@@ -2002,7 +2013,7 @@ smbios_type_0_init(void *start)
 
 /* Type 1 -- System Information */
 static void *
-smbios_type_1_init(void *start)
+smbios_init_type_1(void *start)
 {
 struct smbios_type_1 *p = (struct smbios_type_1 *)start;
 p-header.type = 1;
@@ -2028,7 +2039,7 @@ smbios_type_1_init(void *start)
 
 /* Type 3 -- System Enclosure */
 static void *
-smbios_type_3_init(void *start)
+smbios_init_type_3(void *start)
 {
 struct smbios_type_3 *p = (struct smbios_type_3 *)start;
 
@@ -2058,7 +2069,7 @@ smbios_type_3_init(void *start)
 
 /* Type 4 -- Processor Information */
 static void *
-smbios_type_4_init(void *start, unsigned int cpu_number)
+smbios_init_type_4(void *start, unsigned int cpu_number)
 {
 struct smbios_type_4 *p = (struct smbios_type_4 *)start;
 
@@ -2098,7 +2109,7 @@ smbios_type_4_init(void *start, unsigned int cpu_number)
 
 /* Type 16 -- Physical Memory Array */
 static void *
-smbios_type_16_init(void *start, uint32_t memsize, int nr_mem_devs)
+smbios_init_type_16(void *start, uint32_t memsize, int nr_mem_devs)
 {
 struct smbios_type_16 *p = (struct smbios_type_16*)start;
 
@@ -2121,7 +2132,7 @@ smbios_type_16_init(void *start, uint32_t memsize, int 
nr_mem_devs)
 
 /* Type 17 -- Memory Device */
 static void *
-smbios_type_17_init(void *start, uint32_t memory_size_mb, int instance)
+smbios_init_type_17(void *start, uint32_t memory_size_mb, int instance)
 {
 struct smbios_type_17 *p = (struct smbios_type_17 *)start;
 
@@ -2151,7 +2162,7 @@ smbios_type_17_init(void *start, uint32_t memory_size_mb, 
int instance)
 
 /* Type 19 -- Memory Array Mapped Address */
 static void *
-smbios_type_19_init(void *start, uint32_t memory_size_mb, int instance)
+smbios_init_type_19(void *start, uint32_t memory_size_mb, int instance)
 {
 struct smbios_type_19 *p = (struct smbios_type_19 *)start;
 
@@ -2172,7 +2183,7 @@ smbios_type_19_init(void *start, uint32_t memory_size_mb, 
int instance)
 
 /* Type 20 -- Memory Device Mapped Address */
 static void *
-smbios_type_20_init(void *start, uint32_t memory_size_mb, int instance)
+smbios_init_type_20(void *start, uint32_t memory_size_mb, int instance)
 {
 struct smbios_type_20 *p = (struct smbios_type_20 *)start;
 
@@ -2196,7 +2207,7 @@ smbios_type_20_init(void *start, uint32_t memory_size_mb, 
int instance)
 
 /* Type 32 -- System Boot Information */
 static void *
-smbios_type_32_init(void *start)
+smbios_init_type_32(void *start)
 {
 struct smbios_type_32 *p = (struct smbios_type_32 *)start;
 
@@ -2214,7 +2225,7 @@ smbios_type_32_init(void *start)
 
 /* Type 127 -- End of Table */
 static void *
-smbios_type_127_init(void *start)
+smbios_init_type_127(void *start)
 {
 struct smbios_type_127 *p = (struct smbios_type_127 *)start;
 
@@ -2228,6 +2239,91 @@ smbios_type_127_init(void *start)
 return start + 2;
 }
 
+static int
+smbios_load_external(int type, char **p, char **q, unsigned *nr_structs,
+ unsigned *max_struct_size)
+{
+#ifdef BX_QEMU
+static uint64_t used_bitmap[4] = { 0 };
+static uint16_t used_cnt = 0;
+char *start = *p;
+uint16_t len;
+int i;
+
+/* Keep track of the entry types we've already processed */
+if (used_bitmap[(type  6)  0x3]  (1ULL  (type  0x3f)))
+return 1;
+
+/* Skip end markers, they could lead to bogus tables */
+if (type == 127)
+return 0;
+
+/* Check if there are any tables left to report, also reset read index */
+i = smbios_entries();
+if (used_cnt == i)
+return 0;
+
+for (; i  0; *q = *p, i--) {
+int string_data;
+qemu_cfg_read((uint8_t*)len, sizeof(len));
+if (!len)
+continue;
+if (len  sizeof(struct 

Re: Kernel GPF in vmx_save_host_state()

2009-03-23 Thread Avi Kivity

Benjamin Gilbert wrote:

vmx_set_msr: msr_index 0xc080 msr-index 0xc080 msr-data 0x100


How did that get in there?!

Please add a dump_stack() after that printk().


Pid: 2381, comm: qemu-system-x86 Not tainted 2.6.28-686 #4
Call Trace:
 [f8cf2fdc] vmx_set_msr+0x150/0x178 [kvm_intel]
 [f8cf325a] handle_wrmsr+0x71/0x9d [kvm_intel]


Duh, I noted this hole in a previous email.

Attached patch should fix.

--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 88ef094..da6461d 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -942,11 +942,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 	int ret = 0;
 
 	switch (msr_index) {
-#ifdef CONFIG_X86_64
 	case MSR_EFER:
 		vmx_load_host_state(vmx);
 		ret = kvm_set_msr_common(vcpu, msr_index, data);
 		break;
+#ifdef CONFIG_X86_64
 	case MSR_FS_BASE:
 		vmcs_writel(GUEST_FS_BASE, data);
 		break;


[PATCH] mm/memory.c:unmap_vmas(): fix NULL * deref

2009-03-23 Thread john cooper

This cropped up in stress testing of a backport
of the mmu notifier mechanism, however it still
exists in 2.6.28.8 as well.  Patch attached.

Signed-off-by: john.coo...@redhat.com

--
john.coo...@third-harmonic.com
 mm/memory.c |8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)
=
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -899,9 +899,10 @@ unsigned long unmap_vmas(struct mmu_gath
 	unsigned long start = start_addr;
 	spinlock_t *i_mmap_lock = details? details-i_mmap_lock: NULL;
 	int fullmm = (*tlbp)-fullmm;
-	struct mm_struct *mm = vma-vm_mm;
+	struct mm_struct *mm = vma ? vma-vm_mm : NULL;
 
-	mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
+	if (mm)
+		mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
 	for ( ; vma  vma-vm_start  end_addr; vma = vma-vm_next) {
 		unsigned long end;
 
@@ -966,7 +967,8 @@ unsigned long unmap_vmas(struct mmu_gath
 		}
 	}
 out:
-	mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
+	if (mm)
+		mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
 	return start;	/* which is now the end (or restart) address */
 }
 


Re: [Qemu-devel] [PATCH][RFC] Linux AIO support when using O_DIRECT

2009-03-23 Thread Avi Kivity

Christoph Hellwig wrote:

On Mon, Mar 23, 2009 at 01:10:30PM -0500, Anthony Liguori wrote:
  
I really dislike having so many APIs.  I'd rather have an aio API that 
took byte accesses or have pread/pwrite always be emulated with a full 
sector read/write



I had patches to change the aio API to byte based access, and get rid
of the read/write methods to only have the byte based pread/pwrite
APIs, but they got obsoleted by Avi's patch to kill the pread/pwrite
ops.  We could put in byte-based AIO without byte-based read/write,
though.  In my patches I put a flag into BlockDriverState whether we
allow byte-based access to this instance or otherwise emulate it in
the block layer.  


I like this approach.  An additional flag could tell us what buffer 
alignment the format driver wants, so we can eliminate the alignment 
bounce from format driver code.  Oh, and a flag to indicate we don't 
support vectors, so the generic layer will bounce and send us a length 
one iovec.


Note the align flag is in the device state, not the format driver, as it 
depends on the cache= settings.


--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [PATCH][RFC] Linux AIO support when using O_DIRECT

2009-03-23 Thread Avi Kivity

Anthony Liguori wrote:

Avi Kivity wrote:


Instead of introducing yet another layer of indirection, you could 
add block-raw-linux-aio, which would be registered before 
block-raw-posix (which is really block-raw-threadpool...), and resist
a ->probe() if caching is enabled.


block-raw-posix needs a major overhaul.  That's why I'm not even 
considering committing the patch as is.


That would suggest block-raw-linux-aio-bork-bork-bork.c even more, no?



I'd like to see the O_DIRECT bounce buffering removed in favor of the 
DMA API bouncing.  Once that happens, raw_read and raw_pread can 
disappear.  block-raw-posix becomes much simpler.


They aren't really related... note that DMA API requests are likely to 
be aligned anyway, since the guest generates them with the expectation 
that alignment is required.  We need to align at a lower level so we can
take care of non-dma-api callers (mostly qemu internal).




We would drop the signaling stuff and have the thread pool use an fd 
to signal.  The big problem with that right now is that it'll cause a 
performance regression for certain platforms until we have the IO 
thread in place. 


Well, let's merge this after the iothread?

--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Kernel GPF in vmx_save_host_state()

2009-03-23 Thread Benjamin Gilbert

Avi Kivity wrote:

Duh, I noted this hole in a previous email.

Attached patch should fix.


It does, thanks.

--Benjamin Gilbert
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [PATCH][RFC] Linux AIO support when using O_DIRECT

2009-03-23 Thread Anthony Liguori

Avi Kivity wrote:




We would drop the signaling stuff and have the thread pool use an fd 
to signal.  The big problem with that right now is that it'll cause a 
performance regression for certain platforms until we have the IO 
thread in place. 


Well, let's merge this after the iothread?


Yup.  Just posted that patch in case anyone was interested.  I needed it 
so that we could do some performance testing...


Regards,

Anthony Liguori

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: KVM on Via Nano (Isaiah) CPUs? Virus checked

2009-03-23 Thread Avi Kivity

Avi Kivity wrote:


Will talk to the specification and come up with further tests.



Please printk() vmcs_readl(GUEST_RFLAGS) (where you printk kvm_rip_read()).

--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] compute checksum for roms bigger than a segment

2009-03-23 Thread Glauber Costa
Some option roms (e1000 provided by gpxe project as an example)
are bigger than a segment. The current algorithm to compute the
checksum fails in such a case. To properly compute the checksum, this
patch deals with the possibility of the rom's size crossing a
segment border.

We don't need to worry about it crossing more than one segment
border, since the option rom format only uses one byte to store
the image size (thus, maximum size = 0xff = 128k = 2 segments)

Signed-off-by: Glauber Costa glom...@redhat.com
---
 bios/rombios.c |   33 +++--
 1 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/bios/rombios.c b/bios/rombios.c
index bc43251..6fede17 100644
--- a/bios/rombios.c
+++ b/bios/rombios.c
@@ -10162,22 +10162,43 @@ no_serial:
   ret
 
 rom_checksum:
-  push ax
-  push bx
-  push cx
+  pusha
+  push ds
+
   xor  ax, ax
   xor  bx, bx
   xor  cx, cx
+  xor  dx, dx
+
   mov  ch, [2]
   shl  cx, #1
+
+  jnc checksum_loop
+  mov  dx, cx
+  mov  cx, #0x
+
 checksum_loop:
   add  al, [bx]
   inc  bx
   loop checksum_loop
+
+  cmp  dx, #0
+  je checksum_out
+
+  add  al, [bx]
+  mov  cx, dx
+  mov  dx, ds
+  add  dx, #0x1000
+  mov  ds, dx
+  xor  dx, dx 
+  xor  bx, bx
+
+  jmp  checksum_loop
+
+checksum_out:
   and  al, #0xff
-  pop  cx
-  pop  bx
-  pop  ax
+  pop  ds
+  popa 
   ret
 
 
-- 
1.6.1.2

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Release: USB image of OS Circular and deb packages of LBCAS (LoopBack Content Addressable Storage)

2009-03-23 Thread Kuniyasu Suzaki

Hello Alex,

 From: Alexander Graf ag...@suse.de
 Subject: Re: Release: USB image of OS Circular and deb packages of LBCAS 
 (LoopBack Content Addressable Storage)
 
 Kuniyasu Suzaki wrote:
  Hello,
 
  We released bootable USB image of OS circular and the Debian packages of 
  LBCAS (LoopBack Content Addressable Storage).
 
 Looking at the backend (LBCAS) that is the actually interesting part: Is
 there any reason you implemented this as FUSE plugin?

Because it is easy to implement a virtual loopback file.
The real content, which is saved to a block-file, is mapped to the
virtual loopback file when a block is accessed.  The mapping is a kind
of indirect addressing, which translates physical address to
block-file name (SHA1 digest of its contents). The management is
achieved by FUSE.

 I was thinking of implementing something like this myself for SUSE
 Studio and figured the best way to do it would be to make it a qemu
 block driver backend, en par with the posix-raw one. That way you
 wouldn't need FUSE to download images on demand and also enable this
 features for say Windows users.

Good idea.
OS Circular, however, is designed to be applied to any virtual
machines, e.g., VirtualBox, VMware, etc. OS Circular also aims to
be applied to REAL MACHINES.
Please refer to the big picture of the OS Circular project.
   http://openlab.jp/oscircular/big-picture.PNG

 So is there any plan on your side to write something like this for
 upstream Qemu? If not that's fine too, I just need to know if I'd have
 to do it myself :-).

Please make your backend driver :-).
I want to ask you to fix the blocking I/O of loopback device.
It would be another topic discussed in this ML threads.

--
suzaki
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] Interrupt unhalts vcpu when it shouldn't

2009-03-23 Thread Sheng Yang
On Monday 23 March 2009 23:17:42 Avi Kivity wrote:
 Avi Kivity wrote:
  Gleb Natapov wrote:
  kvm_vcpu_block() unhalts vcpu on an interrupt/timer without checking
  if interrupt window is actually opened.
 
 
  +static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
  +{
  +struct vcpu_svm *svm = to_svm(vcpu);
  +struct vmcb *vmcb = svm-vmcb;
  +return (vmcb-save.rflags  X86_EFLAGS_IF)  +
  !(vmcb-control.int_state  SVM_INTERRUPT_SHADOW_MASK) 
  +(svm-vcpu.arch.hflags  HF_GIF_MASK);
  +}
  +
 
  +static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
  +{
  +vmx_update_window_states(vcpu);
  +return vcpu-arch.interrupt_window_open;
  +}
  +
   static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
   }
  +
  +int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
  +{
  +return kvm_x86_ops-interrupt_allowed(vcpu);
  +}
 
  If the guest enables interrupts but sets tpr/cr8 to block interrupts,
  we'll spin (like we do now).
 
  So I think this should be called kvm_arch_can_accept_interrupt() and
  take tpr into account.

 kvm_cpu_has_interrupt() takes the tpr into account, so we're okay here.

 Marcelo, Sheng?

Yes, looks good to me.

-- 
regards
Yang, Sheng

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html