Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel

2011-02-18 Thread Sheng Yang
On Thursday 03 February 2011 09:05:55 Marcelo Tosatti wrote:
 On Sun, Jan 30, 2011 at 01:11:15PM +0800, Sheng Yang wrote:
  Then we can support mask bit operation of assigned devices now.
  
  Signed-off-by: Sheng Yang sh...@linux.intel.com
  
  +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
  +   struct kvm_msix_mmio_user *mmio_user)
  +{
  +   struct kvm_msix_mmio_dev *mmio_dev = kvm-msix_mmio_dev;
  +   struct kvm_msix_mmio *mmio = NULL;
  +   int r = 0, i;
  +
  +   mutex_lock(mmio_dev-lock);
  +   for (i = 0; i  mmio_dev-mmio_nr; i++) {
  +   if (mmio_dev-mmio[i].dev_id == mmio_user-dev_id 
  +   (mmio_dev-mmio[i].type  KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
  +   (mmio_user-type  KVM_MSIX_MMIO_TYPE_DEV_MASK)) {
  +   mmio = mmio_dev-mmio[i];
  +   if (mmio-max_entries_nr != mmio_user-max_entries_nr) {
  +   r = -EINVAL;
  +   goto out;
  +   }
  +   break;
  +   }
 
 Why allow this ioctl to succeed if there's an entry already present?
 This case is broken as mmio_dev-mmio_nr is increased below.

Oh, it's a bug to let mmio_nr increase even when the MMIO is already found. I've fixed it.

The reason we allow multiple calls is that userspace can register different types
of addresses here (table address and PBA address).

 PCI bits must be reviewed...

Pardon? PCI related things are already in 2.6.38-rc.

--
regards
Yang, Sheng


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/4 v9] MSI-X MMIO support for KVM

2011-02-18 Thread Sheng Yang
Sorry for the long delay; I just came back from vacation...

Change from v8:
1. Update struct kvm_run to contain MSI-X routing update exit specific
information.
2. Fix a mmio_nr counting bug.

Note that this patchset is still based on 2.6.37 due to a blocking bug with
assigned devices in upstream at the moment.

Sheng Yang (4):
  KVM: Move struct kvm_io_device to kvm_host.h
  KVM: Add kvm_io_ext_data to IO handler
  KVM: Emulate MSI-X table in kernel
  KVM: Add documents for MSI-X MMIO API

 Documentation/kvm/api.txt |   58 +
 arch/x86/kvm/Makefile |2 +-
 arch/x86/kvm/i8254.c  |6 +-
 arch/x86/kvm/i8259.c  |3 +-
 arch/x86/kvm/lapic.c  |3 +-
 arch/x86/kvm/x86.c|   40 +--
 include/linux/kvm.h   |   28 +
 include/linux/kvm_host.h  |   65 ++-
 virt/kvm/assigned-dev.c   |   44 +++
 virt/kvm/coalesced_mmio.c |3 +-
 virt/kvm/eventfd.c|2 +-
 virt/kvm/ioapic.c |2 +-
 virt/kvm/iodev.h  |   31 +
 virt/kvm/kvm_main.c   |   40 ++-
 virt/kvm/msix_mmio.c  |  293 +
 virt/kvm/msix_mmio.h  |   25 
 16 files changed, 594 insertions(+), 51 deletions(-)
 create mode 100644 virt/kvm/msix_mmio.c
 create mode 100644 virt/kvm/msix_mmio.h

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/4] KVM: Move struct kvm_io_device to kvm_host.h

2011-02-18 Thread Sheng Yang
This allows it to be used by other structs in kvm_host.h.

Signed-off-by: Sheng Yang sh...@linux.intel.com
---
 include/linux/kvm_host.h |   23 +++
 virt/kvm/iodev.h |   25 +
 2 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index b5021db..7d313e0 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -98,6 +98,29 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, 
gfn_t gfn,
 int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
 #endif
 
+struct kvm_io_device;
+
+/**
+ * kvm_io_device_ops are called under kvm slots_lock.
+ * read and write handlers return 0 if the transaction has been handled,
+ * or non-zero to have it passed to the next device.
+ **/
+struct kvm_io_device_ops {
+   int (*read)(struct kvm_io_device *this,
+   gpa_t addr,
+   int len,
+   void *val);
+   int (*write)(struct kvm_io_device *this,
+gpa_t addr,
+int len,
+const void *val);
+   void (*destructor)(struct kvm_io_device *this);
+};
+
+struct kvm_io_device {
+   const struct kvm_io_device_ops *ops;
+};
+
 struct kvm_vcpu {
struct kvm *kvm;
 #ifdef CONFIG_PREEMPT_NOTIFIERS
diff --git a/virt/kvm/iodev.h b/virt/kvm/iodev.h
index 12fd3ca..d1f5651 100644
--- a/virt/kvm/iodev.h
+++ b/virt/kvm/iodev.h
@@ -17,32 +17,9 @@
 #define __KVM_IODEV_H__
 
 #include linux/kvm_types.h
+#include linux/kvm_host.h
 #include asm/errno.h
 
-struct kvm_io_device;
-
-/**
- * kvm_io_device_ops are called under kvm slots_lock.
- * read and write handlers return 0 if the transaction has been handled,
- * or non-zero to have it passed to the next device.
- **/
-struct kvm_io_device_ops {
-   int (*read)(struct kvm_io_device *this,
-   gpa_t addr,
-   int len,
-   void *val);
-   int (*write)(struct kvm_io_device *this,
-gpa_t addr,
-int len,
-const void *val);
-   void (*destructor)(struct kvm_io_device *this);
-};
-
-
-struct kvm_io_device {
-   const struct kvm_io_device_ops *ops;
-};
-
 static inline void kvm_iodevice_init(struct kvm_io_device *dev,
 const struct kvm_io_device_ops *ops)
 {
-- 
1.7.0.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/4] KVM: Add kvm_io_ext_data to IO handler

2011-02-18 Thread Sheng Yang
Add a new parameter to the IO write handlers, so that we can pass information
from the IO handler back to the caller.

Signed-off-by: Sheng Yang sh...@linux.intel.com
---
 arch/x86/kvm/i8254.c  |6 --
 arch/x86/kvm/i8259.c  |3 ++-
 arch/x86/kvm/lapic.c  |3 ++-
 arch/x86/kvm/x86.c|   13 -
 include/linux/kvm_host.h  |   12 ++--
 virt/kvm/coalesced_mmio.c |3 ++-
 virt/kvm/eventfd.c|2 +-
 virt/kvm/ioapic.c |2 +-
 virt/kvm/iodev.h  |6 --
 virt/kvm/kvm_main.c   |4 ++--
 10 files changed, 36 insertions(+), 18 deletions(-)

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index efad723..bd8f0c5 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -439,7 +439,8 @@ static inline int pit_in_range(gpa_t addr)
 }
 
 static int pit_ioport_write(struct kvm_io_device *this,
-   gpa_t addr, int len, const void *data)
+   gpa_t addr, int len, const void *data,
+   struct kvm_io_ext_data *ext_data)
 {
struct kvm_pit *pit = dev_to_pit(this);
struct kvm_kpit_state *pit_state = pit-pit_state;
@@ -585,7 +586,8 @@ static int pit_ioport_read(struct kvm_io_device *this,
 }
 
 static int speaker_ioport_write(struct kvm_io_device *this,
-   gpa_t addr, int len, const void *data)
+   gpa_t addr, int len, const void *data,
+   struct kvm_io_ext_data *ext_data)
 {
struct kvm_pit *pit = speaker_to_pit(this);
struct kvm_kpit_state *pit_state = pit-pit_state;
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 3cece05..96b1070 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -480,7 +480,8 @@ static inline struct kvm_pic *to_pic(struct kvm_io_device 
*dev)
 }
 
 static int picdev_write(struct kvm_io_device *this,
-gpa_t addr, int len, const void *val)
+gpa_t addr, int len, const void *val,
+struct kvm_io_ext_data *ext_data)
 {
struct kvm_pic *s = to_pic(this);
unsigned char data = *(unsigned char *)val;
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 93cf9d0..f413e9c 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -836,7 +836,8 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, 
u32 val)
 }
 
 static int apic_mmio_write(struct kvm_io_device *this,
-   gpa_t address, int len, const void *data)
+   gpa_t address, int len, const void *data,
+   struct kvm_io_ext_data *ext_data)
 {
struct kvm_lapic *apic = to_lapic(this);
unsigned int offset = address - apic-base_address;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fa708c9..21b84e2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3571,13 +3571,14 @@ static void kvm_init_msr_list(void)
 }
 
 static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
-  const void *v)
+  const void *v, struct kvm_io_ext_data *ext_data)
 {
if (vcpu-arch.apic 
-   !kvm_iodevice_write(vcpu-arch.apic-dev, addr, len, v))
+   !kvm_iodevice_write(vcpu-arch.apic-dev, addr, len, v, ext_data))
return 0;
 
-   return kvm_io_bus_write(vcpu-kvm, KVM_MMIO_BUS, addr, len, v);
+   return kvm_io_bus_write(vcpu-kvm, KVM_MMIO_BUS,
+   addr, len, v, ext_data);
 }
 
 static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
@@ -3807,6 +3808,7 @@ static int emulator_write_emulated_onepage(unsigned long 
addr,
   struct kvm_vcpu *vcpu)
 {
gpa_t gpa;
+   struct kvm_io_ext_data ext_data;
 
gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
 
@@ -3825,7 +3827,7 @@ mmio:
/*
 * Is this MMIO handled locally?
 */
-   if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
+   if (!vcpu_mmio_write(vcpu, gpa, bytes, val, ext_data))
return X86EMUL_CONTINUE;
 
vcpu-mmio_needed = 1;
@@ -3940,6 +3942,7 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
 {
/* TODO: String I/O for in kernel device */
int r;
+   struct kvm_io_ext_data ext_data;
 
if (vcpu-arch.pio.in)
r = kvm_io_bus_read(vcpu-kvm, KVM_PIO_BUS, vcpu-arch.pio.port,
@@ -3947,7 +3950,7 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
else
r = kvm_io_bus_write(vcpu-kvm, KVM_PIO_BUS,
 vcpu-arch.pio.port, vcpu-arch.pio.size,
-pd);
+pd, ext_data);
return r;
 }
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 7d313e0..6bb211d 100644

[PATCH 4/4] KVM: Add documents for MSI-X MMIO API

2011-02-18 Thread Sheng Yang

Signed-off-by: Sheng Yang sh...@linux.intel.com
---
 Documentation/kvm/api.txt |   58 +
 1 files changed, 58 insertions(+), 0 deletions(-)

diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
index e1a9297..dd10c3b 100644
--- a/Documentation/kvm/api.txt
+++ b/Documentation/kvm/api.txt
@@ -1263,6 +1263,53 @@ struct kvm_assigned_msix_entry {
__u16 padding[3];
 };
 
+4.54 KVM_REGISTER_MSIX_MMIO
+
+Capability: KVM_CAP_MSIX_MMIO
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_msix_mmio_user (in)
+Returns: 0 on success, -1 on error
+
+This API registers the MSI-X MMIO address of a guest device. After that, all
+MMIO operations on it are handled by the kernel. When necessary (e.g. when MSI
+data/address has changed), KVM exits to userspace with
+KVM_EXIT_MSIX_ROUTING_UPDATE to indicate the MMIO modification and requires
+userspace to update the IRQ routing table.
+
+NOTICE: Writing to the MSI-X MMIO page after it has been registered with this
+API may be dangerous for a userspace program. Writing while the VM is running
+may result in synchronization issues, so that the assigned device can't work
+properly. Writing is allowed when the VM is not running and can be used as a
+save/restore mechanism.
+
+struct kvm_msix_mmio_user {
+   __u32 dev_id;
+   __u16 type; /* Device type and MMIO address type */
+   __u16 max_entries_nr;   /* Maximum entries supported */
+   __u64 base_addr;/* Guest physical address of MMIO */
+   __u64 base_va;  /* Host virtual address of MMIO mapping */
+   __u64 flags;/* Reserved for now */
+   __u64 reserved[4];
+};
+
+Current device type can be:
+#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV (1 << 0)
+
+Current MMIO type can be:
+#define KVM_MSIX_MMIO_TYPE_BASE_TABLE   (1 << 8)
+
+4.55 KVM_UNREGISTER_MSIX_MMIO
+
+Capability: KVM_CAP_MSIX_MMIO
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_msix_mmio_user (in)
+Returns: 0 on success, -1 on error
+
+This API unregisters the specified MSI-X MMIO, identified by the dev_id and
+type fields of struct kvm_msix_mmio_user.
+
 5. The kvm_run structure
 
 Application code obtains a pointer to the kvm_run structure by
@@ -1445,6 +1492,17 @@ Userspace can now handle the hypercall and when it's 
done modify the gprs as
 necessary. Upon guest entry all guest GPRs will then be replaced by the values
 in this struct.
 
+   /* KVM_EXIT_MSIX_ROUTING_UPDATE*/
+   struct {
+   __u32 dev_id;
+   __u16 type;
+   __u16 entry_idx;
+   __u64 flags;
+   } msix_routing;
+
+KVM_EXIT_MSIX_ROUTING_UPDATE indicates that one MSI-X entry has been modified,
+and userspace needs to update the corresponding routing table.
+
/* Fix the size of the union. */
char padding[256];
};
-- 
1.7.0.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/4 v9] qemu-kvm: MSI-X MMIO support for assigned device

2011-02-18 Thread Sheng Yang
Update with kernel patches v9.

Sheng Yang (4):
  qemu-kvm: device assignment: Enabling MSI-X according to the entries'
mask bit
  qemu-kvm: Ioctl for MSIX MMIO support
  qemu-kvm: Header file update for MSI-X MMIO support
  qemu-kvm: MSI-X MMIO support for assigned device

 hw/device-assignment.c  |  284 +--
 hw/device-assignment.h  |5 +-
 kvm/include/linux/kvm.h |   28 +
 qemu-kvm.c  |   60 ++
 qemu-kvm.h  |   26 +
 5 files changed, 366 insertions(+), 37 deletions(-)

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 4/4] qemu-kvm: MSI-X MMIO support for assigned device

2011-02-18 Thread Sheng Yang

Signed-off-by: Sheng Yang sh...@linux.intel.com
---
 hw/device-assignment.c |  106 +--
 hw/device-assignment.h |3 +
 qemu-kvm.c |   46 +
 qemu-kvm.h |   19 +
 4 files changed, 160 insertions(+), 14 deletions(-)

diff --git a/hw/device-assignment.c b/hw/device-assignment.c
index 5c162c4..09e3b99 100644
--- a/hw/device-assignment.c
+++ b/hw/device-assignment.c
@@ -71,6 +71,11 @@ static void assigned_device_pci_cap_write_config(PCIDevice 
*pci_dev,
 static uint32_t assigned_device_pci_cap_read_config(PCIDevice *pci_dev,
 uint32_t address, int len);
 
+static uint32_t calc_assigned_dev_id(uint16_t seg, uint8_t bus, uint8_t devfn)
+{
+return (uint32_t)seg  16 | (uint32_t)bus  8 | (uint32_t)devfn;
+}
+
 static uint32_t assigned_dev_ioport_rw(AssignedDevRegion *dev_region,
uint32_t addr, int len, uint32_t *val)
 {
@@ -274,6 +279,10 @@ static void assigned_dev_iomem_map(PCIDevice *pci_dev, int 
region_num,
 AssignedDevRegion *region = r_dev-v_addrs[region_num];
 PCIRegion *real_region = r_dev-real_device.regions[region_num];
 int ret = 0;
+#ifdef KVM_CAP_MSIX_MMIO
+int cap_mask = kvm_check_extension(kvm_state, KVM_CAP_MSIX_MMIO);
+struct kvm_msix_mmio_user msix_mmio;
+#endif
 
 DEBUG(e_phys=%08 FMT_PCIBUS  r_virt=%p type=%d len=%08 FMT_PCIBUS  
region_num=%d \n,
   e_phys, region-u.r_virtbase, type, e_size, region_num);
@@ -292,6 +301,23 @@ static void assigned_dev_iomem_map(PCIDevice *pci_dev, int 
region_num,
 
 cpu_register_physical_memory(e_phys + offset,
 TARGET_PAGE_SIZE, r_dev-mmio_index);
+#ifdef KVM_CAP_MSIX_MMIO
+if (cap_mask) {
+r_dev-guest_msix_table_addr = e_phys + offset;
+memset(msix_mmio, 0, sizeof msix_mmio);
+msix_mmio.dev_id = calc_assigned_dev_id(r_dev-h_segnr,
+r_dev-h_busnr, r_dev-h_devfn);
+msix_mmio.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV |
+   KVM_MSIX_MMIO_TYPE_BASE_TABLE;
+msix_mmio.base_addr = e_phys + offset;
+msix_mmio.base_va = (unsigned long)r_dev-msix_table_page;
+msix_mmio.max_entries_nr = r_dev-max_msix_entries_nr;
+msix_mmio.flags = 0;
+ret = kvm_register_msix_mmio(kvm_context, msix_mmio);
+if (ret)
+fprintf(stderr, fail to register in-kernel msix_mmio!\n);
+}
+#endif
 }
 }
 
@@ -854,11 +880,6 @@ static void free_assigned_device(AssignedDevice *dev)
 }
 }
 
-static uint32_t calc_assigned_dev_id(uint16_t seg, uint8_t bus, uint8_t devfn)
-{
-return (uint32_t)seg  16 | (uint32_t)bus  8 | (uint32_t)devfn;
-}
-
 static void assign_failed_examine(AssignedDevice *dev)
 {
 char name[PATH_MAX], dir[PATH_MAX], driver[PATH_MAX] = {}, *ns;
@@ -1268,6 +1289,9 @@ static int assigned_dev_update_msix_mmio(PCIDevice 
*pci_dev,
 return r;
 }
 
+static int assigned_dev_update_routing_handler(void *opaque,
+struct kvm_msix_routing_data *data);
+
 static void assigned_dev_update_msix(PCIDevice *pci_dev, unsigned int ctrl_pos)
 {
 struct kvm_assigned_irq assigned_irq_data;
@@ -1494,7 +1518,9 @@ static int assigned_device_pci_cap_init(PCIDevice 
*pci_dev)
 msix_table_entry = pci_get_long(pci_dev-config + pos + 
PCI_MSIX_TABLE);
 bar_nr = msix_table_entry  PCI_MSIX_BIR;
 msix_table_entry = ~PCI_MSIX_BIR;
-dev-msix_table_addr = pci_region[bar_nr].base_addr + msix_table_entry;
+dev-msix_table_addr = pci_region[bar_nr].base_addr +
+   msix_table_entry;
+
 dev-max_msix_entries_nr = get_msix_entries_max_nr(dev);
 }
 #endif
@@ -1678,11 +1704,10 @@ static uint32_t msix_mmio_readw(void *opaque, 
target_phys_addr_t addr)
 (8 * (addr  3)))  0x;
 }
 
-static void msix_mmio_writel(void *opaque,
- target_phys_addr_t addr, uint32_t val)
+static void assigned_dev_update_routing(void *opaque,
+struct kvm_msix_routing_data *data)
 {
 AssignedDevice *adev = opaque;
-unsigned int offset = addr  0xfff;
 void *page = adev-msix_table_page;
 int ctrl_word, index;
 struct kvm_irq_routing_entry new_entry = {};
@@ -1691,11 +1716,7 @@ static void msix_mmio_writel(void *opaque,
 struct PCIDevice *pci_dev = adev-dev;
 uint8_t cap = pci_find_capability(pci_dev, PCI_CAP_ID_MSIX);
 
-DEBUG(write to MSI-X entry table mmio offset 0x%lx, val 0x%x\n,
-   addr, val);
-memcpy((void *)((char *)page + offset), val, 4);
-
-index = offset / 16;
+index = data-entry_idx;
 
 /* Check if mask bit is being accessed */
 memcpy(msg_addr, (char *)page + index * 16, 4);

[PATCH 2/4] qemu-kvm: Ioctl for MSIX MMIO support

2011-02-18 Thread Sheng Yang

Signed-off-by: Sheng Yang sh...@linux.intel.com
---
 qemu-kvm.c |   14 ++
 qemu-kvm.h |7 +++
 2 files changed, 21 insertions(+), 0 deletions(-)

diff --git a/qemu-kvm.c b/qemu-kvm.c
index 49cd683..d282c95 100644
--- a/qemu-kvm.c
+++ b/qemu-kvm.c
@@ -1050,6 +1050,20 @@ int kvm_assign_set_msix_entry(kvm_context_t kvm,
 }
 #endif
 
+#ifdef KVM_CAP_MSIX_MMIO
+int kvm_register_msix_mmio(kvm_context_t kvm,
+   struct kvm_msix_mmio_user *mmio_user)
+{
+return kvm_vm_ioctl(kvm_state, KVM_REGISTER_MSIX_MMIO, mmio_user);
+}
+
+int kvm_unregister_msix_mmio(kvm_context_t kvm,
+ struct kvm_msix_mmio_user *mmio_user)
+{
+return kvm_vm_ioctl(kvm_state, KVM_UNREGISTER_MSIX_MMIO, mmio_user);
+}
+#endif
+
 #if defined(KVM_CAP_IRQFD)  defined(CONFIG_EVENTFD)
 
 #include sys/eventfd.h
diff --git a/qemu-kvm.h b/qemu-kvm.h
index 88cf276..48ff52d 100644
--- a/qemu-kvm.h
+++ b/qemu-kvm.h
@@ -602,6 +602,13 @@ int kvm_assign_set_msix_entry(kvm_context_t kvm,
   struct kvm_assigned_msix_entry *entry);
 #endif
 
+#ifdef KVM_CAP_MSIX_MMIO
+int kvm_register_msix_mmio(kvm_context_t kvm,
+   struct kvm_msix_mmio_user *mmio_user);
+int kvm_unregister_msix_mmio(kvm_context_t kvm,
+ struct kvm_msix_mmio_user *mmio_user);
+#endif
+
 #else   /* !CONFIG_KVM */
 
 typedef struct kvm_context *kvm_context_t;
-- 
1.7.0.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/4] qemu-kvm: Header file update for MSI-X MMIO support

2011-02-18 Thread Sheng Yang

Signed-off-by: Sheng Yang sh...@linux.intel.com
---
 kvm/include/linux/kvm.h |   28 
 1 files changed, 28 insertions(+), 0 deletions(-)

diff --git a/kvm/include/linux/kvm.h b/kvm/include/linux/kvm.h
index e46729e..dcb8f54 100644
--- a/kvm/include/linux/kvm.h
+++ b/kvm/include/linux/kvm.h
@@ -161,6 +161,7 @@ struct kvm_pit_config {
 #define KVM_EXIT_NMI  16
 #define KVM_EXIT_INTERNAL_ERROR   17
 #define KVM_EXIT_OSI  18
+#define KVM_EXIT_MSIX_ROUTING_UPDATE 19
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 #define KVM_INTERNAL_ERROR_EMULATION 1
@@ -264,6 +265,13 @@ struct kvm_run {
struct {
__u64 gprs[32];
} osi;
+   /* KVM_EXIT_MSIX_ROUTING_UPDATE*/
+   struct {
+   __u32 dev_id;
+   __u16 type;
+   __u16 entry_idx;
+   __u64 flags;
+   } msix_routing;
/* Fix the size of the union. */
char padding[256];
};
@@ -530,6 +538,7 @@ struct kvm_enable_cap {
 #ifdef __KVM_HAVE_XCRS
 #define KVM_CAP_XCRS 56
 #endif
+#define KVM_CAP_MSIX_MMIO 60
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -660,6 +669,9 @@ struct kvm_clock_data {
 #define KVM_XEN_HVM_CONFIG_IOW(KVMIO,  0x7a, struct kvm_xen_hvm_config)
 #define KVM_SET_CLOCK _IOW(KVMIO,  0x7b, struct kvm_clock_data)
 #define KVM_GET_CLOCK _IOR(KVMIO,  0x7c, struct kvm_clock_data)
+/* Available with KVM_CAP_MSIX_MMIO */
+#define KVM_REGISTER_MSIX_MMIO_IOW(KVMIO, 0x7d, struct kvm_msix_mmio_user)
+#define KVM_UNREGISTER_MSIX_MMIO  _IOW(KVMIO, 0x7e, struct kvm_msix_mmio_user)
 /* Available with KVM_CAP_PIT_STATE2 */
 #define KVM_GET_PIT2  _IOR(KVMIO,  0x9f, struct kvm_pit_state2)
 #define KVM_SET_PIT2  _IOW(KVMIO,  0xa0, struct kvm_pit_state2)
@@ -781,4 +793,20 @@ struct kvm_assigned_msix_entry {
__u16 padding[3];
 };
 
+#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV(1  0)
+
+#define KVM_MSIX_MMIO_TYPE_BASE_TABLE  (1  8)
+
+#define KVM_MSIX_MMIO_TYPE_DEV_MASK0x00ff
+#define KVM_MSIX_MMIO_TYPE_BASE_MASK   0xff00
+struct kvm_msix_mmio_user {
+   __u32 dev_id;
+   __u16 type;
+   __u16 max_entries_nr;
+   __u64 base_addr;
+   __u64 base_va;
+   __u64 flags;
+   __u64 reserved[4];
+};
+
 #endif /* __LINUX_KVM_H */
-- 
1.7.0.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/4] qemu-kvm: device assignment: Enabling MSI-X according to the entries' mask bit

2011-02-18 Thread Sheng Yang
The old MSI-X enabling method assumes that the entries are written before MSI-X
is enabled, but some OSes do not obey this, e.g. FreeBSD. This patch fixes
that.

Also, according to the PCI spec, the mask bit of each MSI-X table entry should be
set after reset.

Signed-off-by: Sheng Yang sh...@linux.intel.com
---
 hw/device-assignment.c |  188 +---
 hw/device-assignment.h |2 +-
 2 files changed, 162 insertions(+), 28 deletions(-)

diff --git a/hw/device-assignment.c b/hw/device-assignment.c
index e5205cf..5c162c4 100644
--- a/hw/device-assignment.c
+++ b/hw/device-assignment.c
@@ -1146,15 +1146,12 @@ static void assigned_dev_update_msi(PCIDevice *pci_dev, 
unsigned int ctrl_pos)
 #endif
 
 #ifdef KVM_CAP_DEVICE_MSIX
-static int assigned_dev_update_msix_mmio(PCIDevice *pci_dev)
+
+#define PCI_MSIX_CTRL_MASKBIT  1ul
+static int get_msix_entries_max_nr(AssignedDevice *adev)
 {
-AssignedDevice *adev = container_of(pci_dev, AssignedDevice, dev);
-uint16_t entries_nr = 0, entries_max_nr;
-int pos = 0, i, r = 0;
-uint32_t msg_addr, msg_upper_addr, msg_data, msg_ctrl;
-struct kvm_assigned_msix_nr msix_nr;
-struct kvm_assigned_msix_entry msix_entry;
-void *va = adev-msix_table_page;
+int pos, entries_max_nr;
+PCIDevice *pci_dev = adev-dev;
 
 pos = pci_find_capability(pci_dev, PCI_CAP_ID_MSIX);
 
@@ -1162,20 +1159,48 @@ static int assigned_dev_update_msix_mmio(PCIDevice 
*pci_dev)
 entries_max_nr = PCI_MSIX_TABSIZE;
 entries_max_nr += 1;
 
+return entries_max_nr;
+}
+
+static int assigned_dev_msix_entry_masked(AssignedDevice *adev, int entry)
+{
+uint32_t msg_ctrl;
+void *va = adev-msix_table_page;
+
+memcpy(msg_ctrl, va + entry * 16 + 12, 4);
+return (msg_ctrl  PCI_MSIX_CTRL_MASKBIT);
+}
+
+static int get_msix_valid_entries_nr(AssignedDevice *adev,
+uint16_t entries_max_nr)
+{
+void *va = adev-msix_table_page;
+uint32_t msg_ctrl;
+uint16_t entries_nr = 0;
+int i;
+
 /* Get the usable entry number for allocating */
 for (i = 0; i  entries_max_nr; i++) {
 memcpy(msg_ctrl, va + i * 16 + 12, 4);
-memcpy(msg_data, va + i * 16 + 8, 4);
 /* Ignore unused entry even it's unmasked */
-if (msg_data == 0)
+if (assigned_dev_msix_entry_masked(adev, i))
 continue;
 entries_nr ++;
 }
+return entries_nr;
+}
+
+static int assigned_dev_update_msix_mmio(PCIDevice *pci_dev,
+ uint16_t entries_nr,
+ uint16_t entries_max_nr)
+{
+AssignedDevice *adev = container_of(pci_dev, AssignedDevice, dev);
+int i, r = 0;
+uint32_t msg_addr, msg_upper_addr, msg_data, msg_ctrl;
+struct kvm_assigned_msix_nr msix_nr;
+struct kvm_assigned_msix_entry msix_entry;
+void *va = adev-msix_table_page;
 
-if (entries_nr == 0) {
-fprintf(stderr, MSI-X entry number is zero!\n);
-return -EINVAL;
-}
 msix_nr.assigned_dev_id = calc_assigned_dev_id(adev-h_segnr, 
adev-h_busnr,
   (uint8_t)adev-h_devfn);
 msix_nr.entry_nr = entries_nr;
@@ -1187,6 +1212,8 @@ static int assigned_dev_update_msix_mmio(PCIDevice 
*pci_dev)
 }
 
 free_dev_irq_entries(adev);
+memset(pci_dev-msix_entry_used, 0, KVM_MAX_MSIX_PER_DEV *
+sizeof(*pci_dev-msix_entry_used));
 adev-irq_entries_nr = entries_nr;
 adev-entry = calloc(entries_nr, sizeof(struct kvm_irq_routing_entry));
 if (!adev-entry) {
@@ -1200,10 +1227,10 @@ static int assigned_dev_update_msix_mmio(PCIDevice 
*pci_dev)
 if (entries_nr = msix_nr.entry_nr)
 break;
 memcpy(msg_ctrl, va + i * 16 + 12, 4);
-memcpy(msg_data, va + i * 16 + 8, 4);
-if (msg_data == 0)
+if (assigned_dev_msix_entry_masked(adev, i))
 continue;
 
+memcpy(msg_data, va + i * 16 + 8, 4);
 memcpy(msg_addr, va + i * 16, 4);
 memcpy(msg_upper_addr, va + i * 16 + 4, 4);
 
@@ -1217,17 +1244,18 @@ static int assigned_dev_update_msix_mmio(PCIDevice 
*pci_dev)
 adev-entry[entries_nr].u.msi.address_lo = msg_addr;
 adev-entry[entries_nr].u.msi.address_hi = msg_upper_addr;
 adev-entry[entries_nr].u.msi.data = msg_data;
-DEBUG(MSI-X data 0x%x, MSI-X addr_lo 0x%x\n!, msg_data, msg_addr);
-   kvm_add_routing_entry(adev-entry[entries_nr]);
+DEBUG(MSI-X data 0x%x, MSI-X addr_lo 0x%x!\n, msg_data, msg_addr);
+kvm_add_routing_entry(adev-entry[entries_nr]);
 
 msix_entry.gsi = adev-entry[entries_nr].gsi;
 msix_entry.entry = i;
+pci_dev-msix_entry_used[i] = 1;
 r = kvm_assign_set_msix_entry(kvm_context, msix_entry);
 if (r) {
 fprintf(stderr, fail to set MSI-X entry! %s\n, strerror(-r));
 break;
 }
-DEBUG(MSI-X 

[PATCH v2 06/15] Synchronize VCPU states before reset

2011-02-18 Thread Jan Kiszka
This is required to support keeping VCPU states across a system reset.
If we do not read the current state before the reset,
cpu_synchronize_all_post_reset may write back incorrect state
information.

The first user of this will be MCE MSR synchronization which currently
works around the missing cpu_synchronize_all_states.

Signed-off-by: Jan Kiszka jan.kis...@siemens.com
---
 vl.c |1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/vl.c b/vl.c
index b436952..7751843 100644
--- a/vl.c
+++ b/vl.c
@@ -1452,6 +1452,7 @@ static void main_loop(void)
 }
 if (qemu_reset_requested()) {
 pause_all_vcpus();
+cpu_synchronize_all_states();
 qemu_system_reset();
 resume_all_vcpus();
 }
-- 
1.7.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 08/15] kvm: Rename kvm_arch_process_irqchip_events to async_events

2011-02-18 Thread Jan Kiszka
We will broaden the scope of this function on x86 beyond irqchip events.

Signed-off-by: Jan Kiszka jan.kis...@siemens.com
---
 kvm-all.c  |2 +-
 kvm.h  |2 +-
 target-i386/kvm.c  |2 +-
 target-ppc/kvm.c   |2 +-
 target-s390x/kvm.c |2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/kvm-all.c b/kvm-all.c
index e6a7de4..6522a32 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -893,7 +893,7 @@ int kvm_cpu_exec(CPUState *env)
 
 DPRINTF(kvm_cpu_exec()\n);
 
-if (kvm_arch_process_irqchip_events(env)) {
+if (kvm_arch_process_async_events(env)) {
 env-exit_request = 0;
 return EXCP_HLT;
 }
diff --git a/kvm.h b/kvm.h
index 59b2c29..7bc04e0 100644
--- a/kvm.h
+++ b/kvm.h
@@ -102,7 +102,7 @@ void kvm_arch_post_run(CPUState *env, struct kvm_run *run);
 
 int kvm_arch_handle_exit(CPUState *env, struct kvm_run *run);
 
-int kvm_arch_process_irqchip_events(CPUState *env);
+int kvm_arch_process_async_events(CPUState *env);
 
 int kvm_arch_get_registers(CPUState *env);
 
diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index f909661..a416554 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -1675,7 +1675,7 @@ void kvm_arch_post_run(CPUState *env, struct kvm_run *run)
 cpu_set_apic_base(env-apic_state, run-apic_base);
 }
 
-int kvm_arch_process_irqchip_events(CPUState *env)
+int kvm_arch_process_async_events(CPUState *env)
 {
 if (kvm_irqchip_in_kernel()) {
 return 0;
diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
index 3924f4b..6c99a16 100644
--- a/target-ppc/kvm.c
+++ b/target-ppc/kvm.c
@@ -259,7 +259,7 @@ void kvm_arch_post_run(CPUState *env, struct kvm_run *run)
 {
 }
 
-int kvm_arch_process_irqchip_events(CPUState *env)
+int kvm_arch_process_async_events(CPUState *env)
 {
 return 0;
 }
diff --git a/target-s390x/kvm.c b/target-s390x/kvm.c
index b349812..5673a95 100644
--- a/target-s390x/kvm.c
+++ b/target-s390x/kvm.c
@@ -177,7 +177,7 @@ void kvm_arch_post_run(CPUState *env, struct kvm_run *run)
 {
 }
 
-int kvm_arch_process_irqchip_events(CPUState *env)
+int kvm_arch_process_async_events(CPUState *env)
 {
 return 0;
 }
-- 
1.7.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 14/15] Add qemu_ram_remap

2011-02-18 Thread Jan Kiszka
From: Huang Ying ying.hu...@intel.com

qemu_ram_remap() unmaps the specified RAM pages, then re-maps these
pages again.  This is used by KVM HWPoison support to clear HWPoisoned
page tables across guest rebooting, so that a new page may be
allocated later to recover the memory error.

[ Jan: style fixlets, WIN32 fix ]

Signed-off-by: Huang Ying ying.hu...@intel.com
Signed-off-by: Jan Kiszka jan.kis...@siemens.com
---
 cpu-all.h|4 +++
 cpu-common.h |1 +
 exec.c   |   63 +-
 3 files changed, 67 insertions(+), 1 deletions(-)

diff --git a/cpu-all.h b/cpu-all.h
index caf5e6c..4f4631d 100644
--- a/cpu-all.h
+++ b/cpu-all.h
@@ -863,10 +863,14 @@ target_phys_addr_t cpu_get_phys_page_debug(CPUState *env, 
target_ulong addr);
 extern int phys_ram_fd;
 extern ram_addr_t ram_size;
 
+/* RAM is pre-allocated and passed into qemu_ram_alloc_from_ptr */
+#define RAM_PREALLOC_MASK   (1  0)
+
 typedef struct RAMBlock {
 uint8_t *host;
 ram_addr_t offset;
 ram_addr_t length;
+uint32_t flags;
 char idstr[256];
 QLIST_ENTRY(RAMBlock) next;
 #if defined(__linux__)  !defined(TARGET_S390X)
diff --git a/cpu-common.h b/cpu-common.h
index 54d21d4..ef4e8da 100644
--- a/cpu-common.h
+++ b/cpu-common.h
@@ -50,6 +50,7 @@ ram_addr_t qemu_ram_alloc_from_ptr(DeviceState *dev, const 
char *name,
 ram_addr_t size, void *host);
 ram_addr_t qemu_ram_alloc(DeviceState *dev, const char *name, ram_addr_t size);
 void qemu_ram_free(ram_addr_t addr);
+void qemu_ram_remap(ram_addr_t addr, ram_addr_t length);
 /* This should only be used for ram local to a device.  */
 void *qemu_get_ram_ptr(ram_addr_t addr);
 /* Same but slower, to use for migration, where the order of
diff --git a/exec.c b/exec.c
index d611100..9308a97 100644
--- a/exec.c
+++ b/exec.c
@@ -2867,6 +2867,7 @@ ram_addr_t qemu_ram_alloc_from_ptr(DeviceState *dev, 
const char *name,
 
 if (host) {
 new_block-host = host;
+new_block-flags |= RAM_PREALLOC_MASK;
 } else {
 if (mem_path) {
 #if defined (__linux__)  !defined(TARGET_S390X)
@@ -2920,7 +2921,9 @@ void qemu_ram_free(ram_addr_t addr)
 QLIST_FOREACH(block, ram_list.blocks, next) {
 if (addr == block-offset) {
 QLIST_REMOVE(block, next);
-if (mem_path) {
+if (block-flags  RAM_PREALLOC_MASK) {
+;
+} else if (mem_path) {
 #if defined (__linux__)  !defined(TARGET_S390X)
 if (block-fd) {
 munmap(block-host, block-length);
@@ -2943,6 +2946,64 @@ void qemu_ram_free(ram_addr_t addr)
 
 }
 
+#ifndef _WIN32
+void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
+{
+RAMBlock *block;
+ram_addr_t offset;
+int flags;
+void *area, *vaddr;
+
+QLIST_FOREACH(block, ram_list.blocks, next) {
+offset = addr - block-offset;
+if (offset  block-length) {
+vaddr = block-host + offset;
+if (block-flags  RAM_PREALLOC_MASK) {
+;
+} else {
+flags = MAP_FIXED;
+munmap(vaddr, length);
+if (mem_path) {
+#if defined(__linux__)  !defined(TARGET_S390X)
+if (block-fd) {
+#ifdef MAP_POPULATE
+flags |= mem_prealloc ? MAP_POPULATE | MAP_SHARED :
+MAP_PRIVATE;
+#else
+flags |= MAP_PRIVATE;
+#endif
+area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
+flags, block-fd, offset);
+} else {
+flags |= MAP_PRIVATE | MAP_ANONYMOUS;
+area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
+flags, -1, 0);
+}
+#endif
+} else {
+#if defined(TARGET_S390X)  defined(CONFIG_KVM)
+flags |= MAP_SHARED | MAP_ANONYMOUS;
+area = mmap(vaddr, length, PROT_EXEC|PROT_READ|PROT_WRITE,
+flags, -1, 0);
+#else
+flags |= MAP_PRIVATE | MAP_ANONYMOUS;
+area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
+flags, -1, 0);
+#endif
+}
+if (area != vaddr) {
+fprintf(stderr, Could not remap addr: %lx@%lx\n,
+length, addr);
+exit(1);
+}
+qemu_madvise(vaddr, length, QEMU_MADV_MERGEABLE);
+}
+return;
+}
+}
+}
+#endif /* !_WIN32 */
+
 /* Return a host pointer to ram allocated with qemu_ram_alloc.
With the exception of the softmmu code in this file, this should
only be used for local memory (e.g. video ram) that the device owns,
-- 
1.7.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a 

[PATCH v2 15/15] KVM, MCE, unpoison memory address across reboot

2011-02-18 Thread Jan Kiszka
From: Huang Ying ying.hu...@intel.com

In the Linux kernel's HWPoison implementation, the virtual address
in each process that maps the faulty physical memory page is marked
as HWPoison, so that any further access to that virtual address
kills the corresponding process with SIGBUS.

If the error physical memory page is used by a KVM guest, the SIGBUS
will be sent to QEMU, and QEMU will simulate a MCE to report that
memory error to the guest OS.  If the guest OS can not recover from
the error (for example, the page is accessed by kernel code), guest OS
will reboot the system.  But because the underlying host virtual
address backing the guest physical memory is still poisoned, if the
guest system accesses the corresponding guest physical memory even
after rebooting, the SIGBUS will still be sent to QEMU and MCE will be
simulated.  That is, guest system can not recover via rebooting.

In fact, the contents of a guest physical memory page need not be
kept across a reboot.  We can allocate a new host physical page to
back the corresponding guest physical address.

This patch fixes the issue in QEMU-KVM by calling qemu_ram_remap()
to clear the corresponding page table entry, which makes it possible
to allocate a new page and recover from the error.

[ Jan: rebasing and tiny cleanups]

Signed-off-by: Huang Ying ying.hu...@intel.com
Signed-off-by: Jan Kiszka jan.kis...@siemens.com
---
 target-i386/kvm.c |   36 
 1 files changed, 36 insertions(+), 0 deletions(-)

diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 44e5504..7b7105d 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -173,7 +173,40 @@ static int get_para_features(CPUState *env)
 }
 #endif /* CONFIG_KVM_PARA */
 
+typedef struct HWPoisonPage {
+ram_addr_t ram_addr;
+QLIST_ENTRY(HWPoisonPage) list;
+} HWPoisonPage;
+
+static QLIST_HEAD(, HWPoisonPage) hwpoison_page_list =
+QLIST_HEAD_INITIALIZER(hwpoison_page_list);
+
+static void kvm_unpoison_all(void *param)
+{
+HWPoisonPage *page, *next_page;
+
+QLIST_FOREACH_SAFE(page, hwpoison_page_list, list, next_page) {
+QLIST_REMOVE(page, list);
+qemu_ram_remap(page-ram_addr, TARGET_PAGE_SIZE);
+qemu_free(page);
+}
+}
+
 #ifdef KVM_CAP_MCE
+static void kvm_hwpoison_page_add(ram_addr_t ram_addr)
+{
+HWPoisonPage *page;
+
+QLIST_FOREACH(page, hwpoison_page_list, list) {
+if (page-ram_addr == ram_addr) {
+return;
+}
+}
+page = qemu_malloc(sizeof(HWPoisonPage));
+page-ram_addr = ram_addr;
+QLIST_INSERT_HEAD(hwpoison_page_list, page, list);
+}
+
 static int kvm_get_mce_cap_supported(KVMState *s, uint64_t *mce_cap,
  int *max_banks)
 {
@@ -233,6 +266,7 @@ int kvm_arch_on_sigbus_vcpu(CPUState *env, int code, void 
*addr)
 hardware_memory_error();
 }
 }
+kvm_hwpoison_page_add(ram_addr);
 kvm_mce_inject(env, paddr, code);
 } else
 #endif /* KVM_CAP_MCE */
@@ -263,6 +297,7 @@ int kvm_arch_on_sigbus(int code, void *addr)
 QEMU itself instead of guest system!: %p\n, addr);
 return 0;
 }
+kvm_hwpoison_page_add(ram_addr);
 kvm_mce_inject(first_cpu, paddr, code);
 } else
 #endif /* KVM_CAP_MCE */
@@ -571,6 +606,7 @@ int kvm_arch_init(KVMState *s)
 fprintf(stderr, e820_add_entry() table is full\n);
 return ret;
 }
+qemu_register_reset(kvm_unpoison_all, NULL);
 
 return 0;
 }
-- 
1.7.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 13/15] kvm: x86: Fail kvm_arch_init_vcpu if MCE initialization fails

2011-02-18 Thread Jan Kiszka
There is no reason to continue if the kernel claims to support MCE but
then fails to process our request.

Signed-off-by: Jan Kiszka jan.kis...@siemens.com
CC: Huang Ying ying.hu...@intel.com
CC: Hidetoshi Seto seto.hideto...@jp.fujitsu.com
CC: Jin Dongming jin.dongm...@np.css.fujitsu.com
---
 target-i386/kvm.c |   30 +-
 1 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 486efb9..44e5504 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -437,20 +437,24 @@ int kvm_arch_init_vcpu(CPUState *env)
 int banks;
 int ret;
 
-if (kvm_get_mce_cap_supported(env-kvm_state, mcg_cap, banks)) {
-perror(kvm_get_mce_cap_supported FAILED);
-} else {
-if (banks  MCE_BANKS_DEF)
-banks = MCE_BANKS_DEF;
-mcg_cap = MCE_CAP_DEF;
-mcg_cap |= banks;
-ret = kvm_vcpu_ioctl(env, KVM_X86_SETUP_MCE, mcg_cap);
-if (ret  0) {
-fprintf(stderr, KVM_X86_SETUP_MCE: %s, strerror(-ret));
-} else {
-env-mcg_cap = mcg_cap;
-}
+ret = kvm_get_mce_cap_supported(env-kvm_state, mcg_cap, banks);
+if (ret  0) {
+fprintf(stderr, kvm_get_mce_cap_supported: %s, strerror(-ret));
+return ret;
 }
+
+if (banks  MCE_BANKS_DEF) {
+banks = MCE_BANKS_DEF;
+}
+mcg_cap = MCE_CAP_DEF;
+mcg_cap |= banks;
+ret = kvm_vcpu_ioctl(env, KVM_X86_SETUP_MCE, mcg_cap);
+if (ret  0) {
+fprintf(stderr, KVM_X86_SETUP_MCE: %s, strerror(-ret));
+return ret;
+}
+
+env-mcg_cap = mcg_cap;
 }
 #endif
 
-- 
1.7.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 10/15] x86: Run qemu_inject_x86_mce on target VCPU

2011-02-18 Thread Jan Kiszka
We will use the current TCG-only MCE injection path for KVM as well, and
then this read-modify-write of the target VCPU state has to be performed
synchronously in the corresponding thread.

Signed-off-by: Jan Kiszka jan.kis...@siemens.com
---
 target-i386/helper.c |   87 +
 1 files changed, 58 insertions(+), 29 deletions(-)

diff --git a/target-i386/helper.c b/target-i386/helper.c
index e3ef40c..a32960c 100644
--- a/target-i386/helper.c
+++ b/target-i386/helper.c
@@ -1067,29 +1067,42 @@ static void breakpoint_handler(CPUState *env)
 prev_debug_excp_handler(env);
 }
 
-static void
-qemu_inject_x86_mce(Monitor *mon, CPUState *cenv, int bank, uint64_t status,
-uint64_t mcg_status, uint64_t addr, uint64_t misc,
-int flags)
+typedef struct MCEInjectionParams {
+Monitor *mon;
+CPUState *env;
+int bank;
+uint64_t status;
+uint64_t mcg_status;
+uint64_t addr;
+uint64_t misc;
+int flags;
+} MCEInjectionParams;
+
+static void do_inject_x86_mce(void *data)
 {
-uint64_t mcg_cap = cenv-mcg_cap;
-uint64_t *banks = cenv-mce_banks + 4 * bank;
+MCEInjectionParams *params = data;
+CPUState *cenv = params-env;
+uint64_t *banks = cenv-mce_banks + 4 * params-bank;
+
+cpu_synchronize_state(cenv);
 
 /*
  * If there is an MCE exception being processed, ignore this SRAO MCE
  * unless unconditional injection was requested.
  */
-if (!(flags  MCE_INJECT_UNCOND_AO)  !(status  MCI_STATUS_AR)
+if (!(params-flags  MCE_INJECT_UNCOND_AO)
+ !(params-status  MCI_STATUS_AR)
  (cenv-mcg_status  MCG_STATUS_MCIP)) {
 return;
 }
-if (status  MCI_STATUS_UC) {
+
+if (params-status  MCI_STATUS_UC) {
 /*
  * if MSR_MCG_CTL is not all 1s, the uncorrected error
  * reporting is disabled
  */
-if ((mcg_cap  MCG_CTL_P)  cenv-mcg_ctl != ~(uint64_t)0) {
-monitor_printf(mon,
+if ((cenv-mcg_cap  MCG_CTL_P)  cenv-mcg_ctl != ~(uint64_t)0) {
+monitor_printf(params-mon,
CPU %d: Uncorrected error reporting disabled\n,
cenv-cpu_index);
 return;
@@ -1100,35 +1113,39 @@ qemu_inject_x86_mce(Monitor *mon, CPUState *cenv, int 
bank, uint64_t status,
  * reporting is disabled for the bank
  */
 if (banks[0] != ~(uint64_t)0) {
-monitor_printf(mon, CPU %d: Uncorrected error reporting disabled 
-   for bank %d\n, cenv-cpu_index, bank);
+monitor_printf(params-mon,
+   CPU %d: Uncorrected error reporting disabled for
+bank %d\n,
+   cenv-cpu_index, params-bank);
 return;
 }
 
 if ((cenv-mcg_status  MCG_STATUS_MCIP) ||
 !(cenv-cr[4]  CR4_MCE_MASK)) {
-monitor_printf(mon, CPU %d: Previous MCE still in progress, 
-raising triple fault\n, cenv-cpu_index);
+monitor_printf(params-mon,
+   CPU %d: Previous MCE still in progress, raising
+triple fault\n,
+   cenv-cpu_index);
 qemu_log_mask(CPU_LOG_RESET, Triple fault\n);
 qemu_system_reset_request();
 return;
 }
 if (banks[1]  MCI_STATUS_VAL) {
-status |= MCI_STATUS_OVER;
+params-status |= MCI_STATUS_OVER;
 }
-banks[2] = addr;
-banks[3] = misc;
-cenv-mcg_status = mcg_status;
-banks[1] = status;
+banks[2] = params-addr;
+banks[3] = params-misc;
+cenv-mcg_status = params-mcg_status;
+banks[1] = params-status;
 cpu_interrupt(cenv, CPU_INTERRUPT_MCE);
 } else if (!(banks[1]  MCI_STATUS_VAL)
|| !(banks[1]  MCI_STATUS_UC)) {
 if (banks[1]  MCI_STATUS_VAL) {
-status |= MCI_STATUS_OVER;
+params-status |= MCI_STATUS_OVER;
 }
-banks[2] = addr;
-banks[3] = misc;
-banks[1] = status;
+banks[2] = params-addr;
+banks[3] = params-misc;
+banks[1] = params-status;
 } else {
 banks[1] |= MCI_STATUS_OVER;
 }
@@ -1138,6 +1155,16 @@ void cpu_x86_inject_mce(Monitor *mon, CPUState *cenv, 
int bank,
 uint64_t status, uint64_t mcg_status, uint64_t addr,
 uint64_t misc, int flags)
 {
+MCEInjectionParams params = {
+.mon = mon,
+.env = cenv,
+.bank = bank,
+.status = status,
+.mcg_status = mcg_status,
+.addr = addr,
+.misc = misc,
+.flags = flags,
+};
 unsigned bank_num = cenv-mcg_cap  0xff;
 CPUState *env;
 int flag = 0;
@@ -1167,17 +1194,19 @@ void cpu_x86_inject_mce(Monitor *mon, CPUState 

[PATCH v2 09/15] kvm: x86: Inject pending MCE events on state writeback

2011-02-18 Thread Jan Kiszka
The current way of injecting MCE events without updating of and
synchronizing with the CPUState is broken and causes spurious
corruptions of the MCE-related parts of the CPUState.

As a first step towards a fix, enhance the state writeback code with
support for injecting events that are pending in the CPUState. A pending
exception will then be signaled via cpu_interrupt(CPU_INTERRUPT_MCE).
And, just like for TCG, we need to leave the halt state when
CPU_INTERRUPT_MCE is pending (left broken for the to-be-removed old KVM
code).

This will also allow unifying the TCG and KVM injection code.

Signed-off-by: Jan Kiszka jan.kis...@siemens.com
CC: Huang Ying ying.hu...@intel.com
CC: Hidetoshi Seto seto.hideto...@jp.fujitsu.com
CC: Jin Dongming jin.dongm...@np.css.fujitsu.com
---
 target-i386/kvm.c |   60 +
 1 files changed, 60 insertions(+), 0 deletions(-)

diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index a416554..939edc8 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -467,6 +467,38 @@ void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t 
status,
 #endif /* !KVM_CAP_MCE*/
 }
 
+static int kvm_inject_mce_oldstyle(CPUState *env)
+{
+#ifdef KVM_CAP_MCE
+if (!kvm_has_vcpu_events()  env-exception_injected == EXCP12_MCHK) {
+unsigned int bank, bank_num = env-mcg_cap  0xff;
+struct kvm_x86_mce mce;
+
+env-exception_injected = -1;
+
+/*
+ * There must be at least one bank in use if an MCE is pending.
+ * Find it and use its values for the event injection.
+ */
+for (bank = 0; bank  bank_num; bank++) {
+if (env-mce_banks[bank * 4 + 1]  MCI_STATUS_VAL) {
+break;
+}
+}
+assert(bank  bank_num);
+
+mce.bank = bank;
+mce.status = env-mce_banks[bank * 4 + 1];
+mce.mcg_status = env-mcg_status;
+mce.addr = env-mce_banks[bank * 4 + 2];
+mce.misc = env-mce_banks[bank * 4 + 3];
+
+return kvm_vcpu_ioctl(env, KVM_X86_SET_MCE, mce);
+}
+#endif /* KVM_CAP_MCE */
+return 0;
+}
+
 static void cpu_update_state(void *opaque, int running, int reason)
 {
 CPUState *env = opaque;
@@ -1539,6 +1571,11 @@ int kvm_arch_put_registers(CPUState *env, int level)
 if (ret  0) {
 return ret;
 }
+/* must be before kvm_put_msrs */
+ret = kvm_inject_mce_oldstyle(env);
+if (ret  0) {
+return ret;
+}
 ret = kvm_put_msrs(env, level);
 if (ret  0) {
 return ret;
@@ -1677,6 +1714,29 @@ void kvm_arch_post_run(CPUState *env, struct kvm_run 
*run)
 
 int kvm_arch_process_async_events(CPUState *env)
 {
+if (env-interrupt_request  CPU_INTERRUPT_MCE) {
+/* We must not raise CPU_INTERRUPT_MCE if it's not supported. */
+assert(env-mcg_cap);
+
+env-interrupt_request = ~CPU_INTERRUPT_MCE;
+
+kvm_cpu_synchronize_state(env);
+
+if (env-exception_injected == EXCP08_DBLE) {
+/* this means triple fault */
+qemu_system_reset_request();
+env-exit_request = 1;
+return 0;
+}
+env-exception_injected = EXCP12_MCHK;
+env-has_error_code = 0;
+
+env-halted = 0;
+if (kvm_irqchip_in_kernel()  env-mp_state == KVM_MP_STATE_HALTED) {
+env-mp_state = KVM_MP_STATE_RUNNABLE;
+}
+}
+
 if (kvm_irqchip_in_kernel()) {
 return 0;
 }
-- 
1.7.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 12/15] kvm: x86: Clean up kvm_setup_mce

2011-02-18 Thread Jan Kiszka
There is nothing to abstract here. Fold kvm_setup_mce into its caller
and fix up the error reporting (return code of kvm_vcpu_ioctl holds the
error value).

Signed-off-by: Jan Kiszka jan.kis...@siemens.com
CC: Huang Ying ying.hu...@intel.com
CC: Hidetoshi Seto seto.hideto...@jp.fujitsu.com
CC: Jin Dongming jin.dongm...@np.css.fujitsu.com
---
 target-i386/kvm.c |   11 ---
 1 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index be896dd..486efb9 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -187,11 +187,6 @@ static int kvm_get_mce_cap_supported(KVMState *s, uint64_t 
*mce_cap,
 return -ENOSYS;
 }
 
-static int kvm_setup_mce(CPUState *env, uint64_t *mcg_cap)
-{
-return kvm_vcpu_ioctl(env, KVM_X86_SETUP_MCE, mcg_cap);
-}
-
 static void kvm_mce_inject(CPUState *env, target_phys_addr_t paddr, int code)
 {
 uint64_t status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN |
@@ -440,6 +435,7 @@ int kvm_arch_init_vcpu(CPUState *env)
  kvm_check_extension(env-kvm_state, KVM_CAP_MCE)  0) {
 uint64_t mcg_cap;
 int banks;
+int ret;
 
 if (kvm_get_mce_cap_supported(env-kvm_state, mcg_cap, banks)) {
 perror(kvm_get_mce_cap_supported FAILED);
@@ -448,8 +444,9 @@ int kvm_arch_init_vcpu(CPUState *env)
 banks = MCE_BANKS_DEF;
 mcg_cap = MCE_CAP_DEF;
 mcg_cap |= banks;
-if (kvm_setup_mce(env, mcg_cap)) {
-perror(kvm_setup_mce FAILED);
+ret = kvm_vcpu_ioctl(env, KVM_X86_SETUP_MCE, mcg_cap);
+if (ret  0) {
+fprintf(stderr, KVM_X86_SETUP_MCE: %s, strerror(-ret));
 } else {
 env-mcg_cap = mcg_cap;
 }
-- 
1.7.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 05/15] x86: Optionally avoid injecting AO MCEs while others are pending

2011-02-18 Thread Jan Kiszka
Allow callers to tell cpu_x86_inject_mce that it should ignore Action
Optional MCE events while the target VCPU is still processing another
one. This will be used by KVM soon.

Signed-off-by: Jan Kiszka jan.kis...@siemens.com
CC: Huang Ying ying.hu...@intel.com
CC: Hidetoshi Seto seto.hideto...@jp.fujitsu.com
CC: Jin Dongming jin.dongm...@np.css.fujitsu.com
---
 monitor.c|7 +--
 target-i386/cpu.h|5 -
 target-i386/helper.c |   26 +++---
 3 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/monitor.c b/monitor.c
index 662df7c..ae20927 100644
--- a/monitor.c
+++ b/monitor.c
@@ -2709,12 +2709,15 @@ static void do_inject_mce(Monitor *mon, const QDict 
*qdict)
 uint64_t mcg_status = qdict_get_int(qdict, mcg_status);
 uint64_t addr = qdict_get_int(qdict, addr);
 uint64_t misc = qdict_get_int(qdict, misc);
-int broadcast = qdict_get_try_bool(qdict, broadcast, 0);
+int flags = MCE_INJECT_UNCOND_AO;
 
+if (qdict_get_try_bool(qdict, broadcast, 0)) {
+flags |= MCE_INJECT_BROADCAST;
+}
 for (cenv = first_cpu; cenv != NULL; cenv = cenv-next_cpu) {
 if (cenv-cpu_index == cpu_index) {
 cpu_x86_inject_mce(mon, cenv, bank, status, mcg_status, addr, misc,
-   broadcast);
+   flags);
 break;
 }
 }
diff --git a/target-i386/cpu.h b/target-i386/cpu.h
index 486af1d..d0eae75 100644
--- a/target-i386/cpu.h
+++ b/target-i386/cpu.h
@@ -987,8 +987,11 @@ static inline void cpu_get_tb_cpu_state(CPUState *env, 
target_ulong *pc,
 void do_cpu_init(CPUState *env);
 void do_cpu_sipi(CPUState *env);
 
+#define MCE_INJECT_BROADCAST1
+#define MCE_INJECT_UNCOND_AO2
+
 void cpu_x86_inject_mce(Monitor *mon, CPUState *cenv, int bank,
 uint64_t status, uint64_t mcg_status, uint64_t addr,
-uint64_t misc, int broadcast);
+uint64_t misc, int flags);
 
 #endif /* CPU_I386_H */
diff --git a/target-i386/helper.c b/target-i386/helper.c
index 462d332..e3ef40c 100644
--- a/target-i386/helper.c
+++ b/target-i386/helper.c
@@ -1069,11 +1069,20 @@ static void breakpoint_handler(CPUState *env)
 
 static void
 qemu_inject_x86_mce(Monitor *mon, CPUState *cenv, int bank, uint64_t status,
-uint64_t mcg_status, uint64_t addr, uint64_t misc)
+uint64_t mcg_status, uint64_t addr, uint64_t misc,
+int flags)
 {
 uint64_t mcg_cap = cenv-mcg_cap;
 uint64_t *banks = cenv-mce_banks + 4 * bank;
 
+/*
+ * If there is an MCE exception being processed, ignore this SRAO MCE
+ * unless unconditional injection was requested.
+ */
+if (!(flags  MCE_INJECT_UNCOND_AO)  !(status  MCI_STATUS_AR)
+ (cenv-mcg_status  MCG_STATUS_MCIP)) {
+return;
+}
 if (status  MCI_STATUS_UC) {
 /*
  * if MSR_MCG_CTL is not all 1s, the uncorrected error
@@ -1127,7 +1136,7 @@ qemu_inject_x86_mce(Monitor *mon, CPUState *cenv, int 
bank, uint64_t status,
 
 void cpu_x86_inject_mce(Monitor *mon, CPUState *cenv, int bank,
 uint64_t status, uint64_t mcg_status, uint64_t addr,
-uint64_t misc, int broadcast)
+uint64_t misc, int flags)
 {
 unsigned bank_num = cenv-mcg_cap  0xff;
 CPUState *env;
@@ -1145,27 +1154,30 @@ void cpu_x86_inject_mce(Monitor *mon, CPUState *cenv, 
int bank,
 monitor_printf(mon, Invalid MCE status code\n);
 return;
 }
-if (broadcast  !cpu_x86_support_mca_broadcast(cenv)) {
+if ((flags  MCE_INJECT_BROADCAST)
+ !cpu_x86_support_mca_broadcast(cenv)) {
 monitor_printf(mon, Guest CPU does not support MCA broadcast\n);
 return;
 }
 
 if (kvm_enabled()) {
-if (broadcast) {
+if (flags  MCE_INJECT_BROADCAST) {
 flag |= MCE_BROADCAST;
 }
 
 kvm_inject_x86_mce(cenv, bank, status, mcg_status, addr, misc, flag);
 } else {
-qemu_inject_x86_mce(mon, cenv, bank, status, mcg_status, addr, misc);
-if (broadcast) {
+qemu_inject_x86_mce(mon, cenv, bank, status, mcg_status, addr, misc,
+flags);
+if (flags  MCE_INJECT_BROADCAST) {
 for (env = first_cpu; env != NULL; env = env-next_cpu) {
 if (cenv == env) {
 continue;
 }
 qemu_inject_x86_mce(mon, env, 1,
 MCI_STATUS_VAL | MCI_STATUS_UC,
-MCG_STATUS_MCIP | MCG_STATUS_RIPV, 0, 0);
+MCG_STATUS_MCIP | MCG_STATUS_RIPV, 0, 0,
+flags);
 }
 }
 }
-- 
1.7.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo 

[PATCH v2 01/15] x86: Account for MCE in cpu_has_work

2011-02-18 Thread Jan Kiszka
MCEs can be injected asynchronously, so they can also terminate the halt
state.

Signed-off-by: Jan Kiszka jan.kis...@siemens.com
CC: Huang Ying ying.hu...@intel.com
CC: Hidetoshi Seto seto.hideto...@jp.fujitsu.com
CC: Jin Dongming jin.dongm...@np.css.fujitsu.com
---
 target-i386/exec.h |   15 ++-
 1 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/target-i386/exec.h b/target-i386/exec.h
index fc8945b..d050dd0 100644
--- a/target-i386/exec.h
+++ b/target-i386/exec.h
@@ -293,15 +293,12 @@ static inline void load_eflags(int eflags, int 
update_mask)
 
 static inline int cpu_has_work(CPUState *env)
 {
-int work;
-
-work = (env-interrupt_request  CPU_INTERRUPT_HARD) 
-   (env-eflags  IF_MASK);
-work |= env-interrupt_request  CPU_INTERRUPT_NMI;
-work |= env-interrupt_request  CPU_INTERRUPT_INIT;
-work |= env-interrupt_request  CPU_INTERRUPT_SIPI;
-
-return work;
+return ((env-interrupt_request  CPU_INTERRUPT_HARD) 
+(env-eflags  IF_MASK)) ||
+   (env-interrupt_request  (CPU_INTERRUPT_NMI |
+  CPU_INTERRUPT_INIT |
+  CPU_INTERRUPT_SIPI |
+  CPU_INTERRUPT_MCE));
 }
 
 static inline int cpu_halted(CPUState *env) {
-- 
1.7.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 00/15] [uq/master] Patch queue, part IV (MCE edition)

2011-02-18 Thread Jan Kiszka
Round 2 of this part, primarily addressing review comments:
 - Reworked CPU_INTERRUPT_MCE -> exception translation
   (now done in kvm_arch_process_async_events, indeed much cleaner)
 - Add missing cpu_synchronize_state on pending MCE events for
   !kvm_irqchip_in_kernel
 - Split up KVM MCE code switch from old to new style into two patches
   and dropped some unneeded variable renamings
 - Fixed Windows build (qemu_ram_remap is POSIX-only)

Thanks for the feedback so far.

CC: Hidetoshi Seto seto.hideto...@jp.fujitsu.com
CC: Huang Ying ying.hu...@intel.com
CC: Jin Dongming jin.dongm...@np.css.fujitsu.com

Huang Ying (2):
  Add qemu_ram_remap
  KVM, MCE, unpoison memory address across reboot

Jan Kiszka (13):
  x86: Account for MCE in cpu_has_work
  x86: Perform implicit mcg_status reset
  x86: Small cleanups of MCE helpers
  x86: Refine error reporting of MCE injection services
  x86: Optionally avoid injecting AO MCEs while others are pending
  Synchronize VCPU states before reset
  kvm: x86: Move MCE functions together
  kvm: Rename kvm_arch_process_irqchip_events to async_events
  kvm: x86: Inject pending MCE events on state writeback
  x86: Run qemu_inject_x86_mce on target VCPU
  kvm: x86: Consolidate TCG and KVM MCE injection code
  kvm: x86: Clean up kvm_setup_mce
  kvm: x86: Fail kvm_arch_init_vcpu if MCE initialization fails

 cpu-all.h |8 +-
 cpu-common.h  |1 +
 exec.c|   63 +++-
 kvm-all.c |2 +-
 kvm.h |2 +-
 monitor.c |   11 +-
 qemu-common.h |6 +-
 target-i386/cpu.h |   11 +-
 target-i386/exec.h|   15 +-
 target-i386/helper.c  |  185 +---
 target-i386/kvm.c |  463 -
 target-i386/kvm_x86.h |   25 ---
 target-ppc/kvm.c  |2 +-
 target-s390x/kvm.c|2 +-
 vl.c  |1 +
 15 files changed, 399 insertions(+), 398 deletions(-)
 delete mode 100644 target-i386/kvm_x86.h

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 04/15] x86: Refine error reporting of MCE injection services

2011-02-18 Thread Jan Kiszka
As this service is used by the human monitor, make sure that errors get
reported to the right channel, and also raise the verbosity.

This requires moving the Monitor typedef up in qemu-common.h to resolve
the include dependency.

Signed-off-by: Jan Kiszka jan.kis...@siemens.com
CC: Huang Ying ying.hu...@intel.com
CC: Hidetoshi Seto seto.hideto...@jp.fujitsu.com
CC: Jin Dongming jin.dongm...@np.css.fujitsu.com
---
 monitor.c|4 +-
 qemu-common.h|6 ++--
 target-i386/cpu.h|6 ++--
 target-i386/helper.c |   79 +-
 4 files changed, 54 insertions(+), 41 deletions(-)

diff --git a/monitor.c b/monitor.c
index 45b0cc2..662df7c 100644
--- a/monitor.c
+++ b/monitor.c
@@ -2712,8 +2712,8 @@ static void do_inject_mce(Monitor *mon, const QDict 
*qdict)
 int broadcast = qdict_get_try_bool(qdict, broadcast, 0);
 
 for (cenv = first_cpu; cenv != NULL; cenv = cenv-next_cpu) {
-if (cenv-cpu_index == cpu_index  cenv-mcg_cap) {
-cpu_x86_inject_mce(cenv, bank, status, mcg_status, addr, misc,
+if (cenv-cpu_index == cpu_index) {
+cpu_x86_inject_mce(mon, cenv, bank, status, mcg_status, addr, misc,
broadcast);
 break;
 }
diff --git a/qemu-common.h b/qemu-common.h
index a4d9c21..6ac29cc 100644
--- a/qemu-common.h
+++ b/qemu-common.h
@@ -18,6 +18,9 @@ typedef struct QEMUFile QEMUFile;
 typedef struct QEMUBH QEMUBH;
 typedef struct DeviceState DeviceState;
 
+struct Monitor;
+typedef struct Monitor Monitor;
+
 /* we put basic includes here to avoid repeating them in device drivers */
 #include stdlib.h
 #include stdio.h
@@ -324,9 +327,6 @@ void qemu_iovec_to_buffer(QEMUIOVector *qiov, void *buf);
 void qemu_iovec_from_buffer(QEMUIOVector *qiov, const void *buf, size_t count);
 void qemu_iovec_memset(QEMUIOVector *qiov, int c, size_t count);
 
-struct Monitor;
-typedef struct Monitor Monitor;
-
 /* Convert a byte between binary and BCD.  */
 static inline uint8_t to_bcd(uint8_t val)
 {
diff --git a/target-i386/cpu.h b/target-i386/cpu.h
index 52bb48e..486af1d 100644
--- a/target-i386/cpu.h
+++ b/target-i386/cpu.h
@@ -987,8 +987,8 @@ static inline void cpu_get_tb_cpu_state(CPUState *env, 
target_ulong *pc,
 void do_cpu_init(CPUState *env);
 void do_cpu_sipi(CPUState *env);
 
-void cpu_x86_inject_mce(CPUState *cenv, int bank, uint64_t status,
-uint64_t mcg_status, uint64_t addr, uint64_t misc,
-int broadcast);
+void cpu_x86_inject_mce(Monitor *mon, CPUState *cenv, int bank,
+uint64_t status, uint64_t mcg_status, uint64_t addr,
+uint64_t misc, int broadcast);
 
 #endif /* CPU_I386_H */
diff --git a/target-i386/helper.c b/target-i386/helper.c
index ba3bed9..462d332 100644
--- a/target-i386/helper.c
+++ b/target-i386/helper.c
@@ -30,6 +30,7 @@
 #include kvm_x86.h
 #ifndef CONFIG_USER_ONLY
 #include sysemu.h
+#include monitor.h
 #endif
 
 //#define DEBUG_MMU
@@ -1067,33 +1068,38 @@ static void breakpoint_handler(CPUState *env)
 }
 
 static void
-qemu_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
+qemu_inject_x86_mce(Monitor *mon, CPUState *cenv, int bank, uint64_t status,
 uint64_t mcg_status, uint64_t addr, uint64_t misc)
 {
 uint64_t mcg_cap = cenv-mcg_cap;
-uint64_t *banks = cenv-mce_banks;
-
-/*
- * if MSR_MCG_CTL is not all 1s, the uncorrected error
- * reporting is disabled
- */
-if ((status  MCI_STATUS_UC)  (mcg_cap  MCG_CTL_P) 
-cenv-mcg_ctl != ~(uint64_t)0) {
-return;
-}
-banks += 4 * bank;
-/*
- * if MSR_MCi_CTL is not all 1s, the uncorrected error
- * reporting is disabled for the bank
- */
-if ((status  MCI_STATUS_UC)  banks[0] != ~(uint64_t)0) {
-return;
-}
+uint64_t *banks = cenv-mce_banks + 4 * bank;
+
 if (status  MCI_STATUS_UC) {
+/*
+ * if MSR_MCG_CTL is not all 1s, the uncorrected error
+ * reporting is disabled
+ */
+if ((mcg_cap  MCG_CTL_P)  cenv-mcg_ctl != ~(uint64_t)0) {
+monitor_printf(mon,
+   CPU %d: Uncorrected error reporting disabled\n,
+   cenv-cpu_index);
+return;
+}
+
+/*
+ * if MSR_MCi_CTL is not all 1s, the uncorrected error
+ * reporting is disabled for the bank
+ */
+if (banks[0] != ~(uint64_t)0) {
+monitor_printf(mon, CPU %d: Uncorrected error reporting disabled 
+   for bank %d\n, cenv-cpu_index, bank);
+return;
+}
+
 if ((cenv-mcg_status  MCG_STATUS_MCIP) ||
 !(cenv-cr[4]  CR4_MCE_MASK)) {
-fprintf(stderr, injects mce exception while previous 
-one is in progress!\n);
+monitor_printf(mon, CPU %d: Previous MCE still in progress, 
+   

[PATCH v2 02/15] x86: Perform implicit mcg_status reset

2011-02-18 Thread Jan Kiszka
Reorder mcg_status in CPUState to achieve automatic clearing on reset.

Signed-off-by: Jan Kiszka jan.kis...@siemens.com
CC: Huang Ying ying.hu...@intel.com
CC: Hidetoshi Seto seto.hideto...@jp.fujitsu.com
CC: Jin Dongming jin.dongm...@np.css.fujitsu.com
---
 target-i386/cpu.h|3 ++-
 target-i386/helper.c |2 --
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/target-i386/cpu.h b/target-i386/cpu.h
index 5f1df8b..75156e7 100644
--- a/target-i386/cpu.h
+++ b/target-i386/cpu.h
@@ -687,6 +687,8 @@ typedef struct CPUX86State {
 
 uint64_t pat;
 
+uint64_t mcg_status;
+
 /* exception/interrupt handling */
 int error_code;
 int exception_is_int;
@@ -741,7 +743,6 @@ typedef struct CPUX86State {
 struct DeviceState *apic_state;
 
 uint64_t mcg_cap;
-uint64_t mcg_status;
 uint64_t mcg_ctl;
 uint64_t mce_banks[MCE_BANKS_DEF*4];
 
diff --git a/target-i386/helper.c b/target-i386/helper.c
index f0c546d..f41416f 100644
--- a/target-i386/helper.c
+++ b/target-i386/helper.c
@@ -101,8 +101,6 @@ void cpu_reset(CPUX86State *env)
 env-dr[7] = DR7_FIXED_1;
 cpu_breakpoint_remove_all(env, BP_CPU);
 cpu_watchpoint_remove_all(env, BP_CPU);
-
-env-mcg_status = 0;
 }
 
 void cpu_x86_close(CPUX86State *env)
-- 
1.7.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 03/15] x86: Small cleanups of MCE helpers

2011-02-18 Thread Jan Kiszka
Fix some code style issues, use proper headers, and align to cpu_x86
naming scheme. No functional changes.

Signed-off-by: Jan Kiszka jan.kis...@siemens.com
CC: Huang Ying ying.hu...@intel.com
CC: Hidetoshi Seto seto.hideto...@jp.fujitsu.com
CC: Jin Dongming jin.dongm...@np.css.fujitsu.com
---
 cpu-all.h|4 
 monitor.c|2 +-
 target-i386/cpu.h|5 +
 target-i386/helper.c |   41 -
 4 files changed, 30 insertions(+), 22 deletions(-)

diff --git a/cpu-all.h b/cpu-all.h
index 87b0f86..caf5e6c 100644
--- a/cpu-all.h
+++ b/cpu-all.h
@@ -971,8 +971,4 @@ void dump_exec_info(FILE *f, fprintf_function cpu_fprintf);
 int cpu_memory_rw_debug(CPUState *env, target_ulong addr,
 uint8_t *buf, int len, int is_write);
 
-void cpu_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
-uint64_t mcg_status, uint64_t addr, uint64_t misc,
-int broadcast);
-
 #endif /* CPU_ALL_H */
diff --git a/monitor.c b/monitor.c
index 22ae3bb..45b0cc2 100644
--- a/monitor.c
+++ b/monitor.c
@@ -2713,7 +2713,7 @@ static void do_inject_mce(Monitor *mon, const QDict 
*qdict)
 
 for (cenv = first_cpu; cenv != NULL; cenv = cenv-next_cpu) {
 if (cenv-cpu_index == cpu_index  cenv-mcg_cap) {
-cpu_inject_x86_mce(cenv, bank, status, mcg_status, addr, misc,
+cpu_x86_inject_mce(cenv, bank, status, mcg_status, addr, misc,
broadcast);
 break;
 }
diff --git a/target-i386/cpu.h b/target-i386/cpu.h
index 75156e7..52bb48e 100644
--- a/target-i386/cpu.h
+++ b/target-i386/cpu.h
@@ -986,4 +986,9 @@ static inline void cpu_get_tb_cpu_state(CPUState *env, 
target_ulong *pc,
 
 void do_cpu_init(CPUState *env);
 void do_cpu_sipi(CPUState *env);
+
+void cpu_x86_inject_mce(CPUState *cenv, int bank, uint64_t status,
+uint64_t mcg_status, uint64_t addr, uint64_t misc,
+int broadcast);
+
 #endif /* CPU_I386_H */
diff --git a/target-i386/helper.c b/target-i386/helper.c
index f41416f..ba3bed9 100644
--- a/target-i386/helper.c
+++ b/target-i386/helper.c
@@ -28,6 +28,9 @@
 #include qemu-common.h
 #include kvm.h
 #include kvm_x86.h
+#ifndef CONFIG_USER_ONLY
+#include sysemu.h
+#endif
 
 //#define DEBUG_MMU
 
@@ -1063,11 +1066,9 @@ static void breakpoint_handler(CPUState *env)
 prev_debug_excp_handler(env);
 }
 
-/* This should come from sysemu.h - if we could include it here... */
-void qemu_system_reset_request(void);
-
-static void qemu_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
-uint64_t mcg_status, uint64_t addr, uint64_t misc)
+static void
+qemu_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
+uint64_t mcg_status, uint64_t addr, uint64_t misc)
 {
 uint64_t mcg_cap = cenv-mcg_cap;
 uint64_t *banks = cenv-mce_banks;
@@ -1077,15 +1078,17 @@ static void qemu_inject_x86_mce(CPUState *cenv, int 
bank, uint64_t status,
  * reporting is disabled
  */
 if ((status  MCI_STATUS_UC)  (mcg_cap  MCG_CTL_P) 
-cenv-mcg_ctl != ~(uint64_t)0)
+cenv-mcg_ctl != ~(uint64_t)0) {
 return;
+}
 banks += 4 * bank;
 /*
  * if MSR_MCi_CTL is not all 1s, the uncorrected error
  * reporting is disabled for the bank
  */
-if ((status  MCI_STATUS_UC)  banks[0] != ~(uint64_t)0)
+if ((status  MCI_STATUS_UC)  banks[0] != ~(uint64_t)0) {
 return;
+}
 if (status  MCI_STATUS_UC) {
 if ((cenv-mcg_status  MCG_STATUS_MCIP) ||
 !(cenv-cr[4]  CR4_MCE_MASK)) {
@@ -1095,8 +1098,9 @@ static void qemu_inject_x86_mce(CPUState *cenv, int bank, 
uint64_t status,
 qemu_system_reset_request();
 return;
 }
-if (banks[1]  MCI_STATUS_VAL)
+if (banks[1]  MCI_STATUS_VAL) {
 status |= MCI_STATUS_OVER;
+}
 banks[2] = addr;
 banks[3] = misc;
 cenv-mcg_status = mcg_status;
@@ -1104,16 +1108,18 @@ static void qemu_inject_x86_mce(CPUState *cenv, int 
bank, uint64_t status,
 cpu_interrupt(cenv, CPU_INTERRUPT_MCE);
 } else if (!(banks[1]  MCI_STATUS_VAL)
|| !(banks[1]  MCI_STATUS_UC)) {
-if (banks[1]  MCI_STATUS_VAL)
+if (banks[1]  MCI_STATUS_VAL) {
 status |= MCI_STATUS_OVER;
+}
 banks[2] = addr;
 banks[3] = misc;
 banks[1] = status;
-} else
+} else {
 banks[1] |= MCI_STATUS_OVER;
+}
 }
 
-void cpu_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
+void cpu_x86_inject_mce(CPUState *cenv, int bank, uint64_t status,
 uint64_t mcg_status, uint64_t addr, uint64_t misc,
 int broadcast)
 {
@@ -1155,15 +1161,16 @@ void cpu_inject_x86_mce(CPUState *cenv, int bank, 
uint64_t status,
 
 static void mce_init(CPUX86State 

[PATCH v2 07/15] kvm: x86: Move MCE functions together

2011-02-18 Thread Jan Kiszka
Pure function shuffling to avoid multiple #ifdef KVM_CAP_MCE sections,
no functional changes. While at it, annotate some #ifdef sections.

Signed-off-by: Jan Kiszka jan.kis...@siemens.com
CC: Huang Ying ying.hu...@intel.com
CC: Hidetoshi Seto seto.hideto...@jp.fujitsu.com
CC: Jin Dongming jin.dongm...@np.css.fujitsu.com
---
 target-i386/kvm.c |  346 ++---
 1 files changed, 171 insertions(+), 175 deletions(-)

diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 0aa0a41..f909661 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -172,7 +172,7 @@ static int get_para_features(CPUState *env)
 #endif
 return features;
 }
-#endif
+#endif /* CONFIG_KVM_PARA */
 
 #ifdef KVM_CAP_MCE
 static int kvm_get_mce_cap_supported(KVMState *s, uint64_t *mce_cap,
@@ -273,8 +273,174 @@ static void kvm_inject_x86_mce_on(CPUState *env, struct 
kvm_x86_mce *mce,
 run_on_cpu(env, kvm_do_inject_x86_mce, data);
 }
 
-static void kvm_mce_broadcast_rest(CPUState *env);
-#endif
+static void kvm_mce_broadcast_rest(CPUState *env)
+{
+struct kvm_x86_mce mce = {
+.bank = 1,
+.status = MCI_STATUS_VAL | MCI_STATUS_UC,
+.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV,
+.addr = 0,
+.misc = 0,
+};
+CPUState *cenv;
+
+/* Broadcast MCA signal for processor version 06H_EH and above */
+if (cpu_x86_support_mca_broadcast(env)) {
+for (cenv = first_cpu; cenv != NULL; cenv = cenv-next_cpu) {
+if (cenv == env) {
+continue;
+}
+kvm_inject_x86_mce_on(cenv, mce, ABORT_ON_ERROR);
+}
+}
+}
+
+static void kvm_mce_inj_srar_dataload(CPUState *env, target_phys_addr_t paddr)
+{
+struct kvm_x86_mce mce = {
+.bank = 9,
+.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
+  | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
+  | MCI_STATUS_AR | 0x134,
+.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV,
+.addr = paddr,
+.misc = (MCM_ADDR_PHYS  6) | 0xc,
+};
+int r;
+
+r = kvm_set_mce(env, mce);
+if (r  0) {
+fprintf(stderr, kvm_set_mce: %s\n, strerror(errno));
+abort();
+}
+kvm_mce_broadcast_rest(env);
+}
+
+static void kvm_mce_inj_srao_memscrub(CPUState *env, target_phys_addr_t paddr)
+{
+struct kvm_x86_mce mce = {
+.bank = 9,
+.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
+  | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
+  | 0xc0,
+.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV,
+.addr = paddr,
+.misc = (MCM_ADDR_PHYS  6) | 0xc,
+};
+int r;
+
+r = kvm_set_mce(env, mce);
+if (r  0) {
+fprintf(stderr, kvm_set_mce: %s\n, strerror(errno));
+abort();
+}
+kvm_mce_broadcast_rest(env);
+}
+
+static void kvm_mce_inj_srao_memscrub2(CPUState *env, target_phys_addr_t paddr)
+{
+struct kvm_x86_mce mce = {
+.bank = 9,
+.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
+  | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
+  | 0xc0,
+.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV,
+.addr = paddr,
+.misc = (MCM_ADDR_PHYS  6) | 0xc,
+};
+
+kvm_inject_x86_mce_on(env, mce, ABORT_ON_ERROR);
+kvm_mce_broadcast_rest(env);
+}
+#endif /* KVM_CAP_MCE */
+
+static void hardware_memory_error(void)
+{
+fprintf(stderr, Hardware memory error!\n);
+exit(1);
+}
+
+int kvm_arch_on_sigbus_vcpu(CPUState *env, int code, void *addr)
+{
+#ifdef KVM_CAP_MCE
+void *vaddr;
+ram_addr_t ram_addr;
+target_phys_addr_t paddr;
+
+if ((env-mcg_cap  MCG_SER_P)  addr
+ (code == BUS_MCEERR_AR
+|| code == BUS_MCEERR_AO)) {
+vaddr = (void *)addr;
+if (qemu_ram_addr_from_host(vaddr, ram_addr) ||
+!kvm_physical_memory_addr_from_ram(env-kvm_state, ram_addr, 
paddr)) {
+fprintf(stderr, Hardware memory error for memory used by 
+QEMU itself instead of guest system!\n);
+/* Hope we are lucky for AO MCE */
+if (code == BUS_MCEERR_AO) {
+return 0;
+} else {
+hardware_memory_error();
+}
+}
+
+if (code == BUS_MCEERR_AR) {
+/* Fake an Intel architectural Data Load SRAR UCR */
+kvm_mce_inj_srar_dataload(env, paddr);
+} else {
+/*
+ * If there is an MCE excpetion being processed, ignore
+ * this SRAO MCE
+ */
+if (!kvm_mce_in_progress(env)) {
+/* Fake an Intel architectural Memory scrubbing UCR */
+kvm_mce_inj_srao_memscrub(env, paddr);
+}
+}
+} else
+#endif /* KVM_CAP_MCE */
+{
+if (code == BUS_MCEERR_AO) {
+return 0;
+ 

[PATCH v2 11/15] kvm: x86: Consolidate TCG and KVM MCE injection code

2011-02-18 Thread Jan Kiszka
This switches KVM's MCE injection path to cpu_x86_inject_mce, both for
SIGBUS and monitor initiated events. This means we prepare the MCA MSRs
in the VCPUState also for KVM.

We have to drop the MSRs writeback restrictions for this purpose which
is now safe as every uncoordinated MSR injection is removed with this
patch.

Signed-off-by: Jan Kiszka jan.kis...@siemens.com
CC: Huang Ying ying.hu...@intel.com
CC: Hidetoshi Seto seto.hideto...@jp.fujitsu.com
CC: Jin Dongming jin.dongm...@np.css.fujitsu.com
---
 target-i386/helper.c  |   34 +++-
 target-i386/kvm.c |  238 +---
 target-i386/kvm_x86.h |   25 -
 3 files changed, 37 insertions(+), 260 deletions(-)
 delete mode 100644 target-i386/kvm_x86.h

diff --git a/target-i386/helper.c b/target-i386/helper.c
index a32960c..a08309f 100644
--- a/target-i386/helper.c
+++ b/target-i386/helper.c
@@ -27,7 +27,6 @@
 #include exec-all.h
 #include qemu-common.h
 #include kvm.h
-#include kvm_x86.h
 #ifndef CONFIG_USER_ONLY
 #include sysemu.h
 #include monitor.h
@@ -1167,7 +1166,6 @@ void cpu_x86_inject_mce(Monitor *mon, CPUState *cenv, int 
bank,
 };
 unsigned bank_num = cenv-mcg_cap  0xff;
 CPUState *env;
-int flag = 0;
 
 if (!cenv-mcg_cap) {
 monitor_printf(mon, MCE injection not supported\n);
@@ -1187,27 +1185,19 @@ void cpu_x86_inject_mce(Monitor *mon, CPUState *cenv, 
int bank,
 return;
 }
 
-if (kvm_enabled()) {
-if (flags  MCE_INJECT_BROADCAST) {
-flag |= MCE_BROADCAST;
-}
-
-kvm_inject_x86_mce(cenv, bank, status, mcg_status, addr, misc, flag);
-} else {
-run_on_cpu(cenv, do_inject_x86_mce, params);
-if (flags  MCE_INJECT_BROADCAST) {
-params.bank = 1;
-params.status = MCI_STATUS_VAL | MCI_STATUS_UC;
-params.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV;
-params.addr = 0;
-params.misc = 0;
-for (env = first_cpu; env != NULL; env = env-next_cpu) {
-if (cenv == env) {
-continue;
-}
-params.env = env;
-run_on_cpu(cenv, do_inject_x86_mce, params);
+run_on_cpu(cenv, do_inject_x86_mce, params);
+if (flags  MCE_INJECT_BROADCAST) {
+params.bank = 1;
+params.status = MCI_STATUS_VAL | MCI_STATUS_UC;
+params.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV;
+params.addr = 0;
+params.misc = 0;
+for (env = first_cpu; env != NULL; env = env-next_cpu) {
+if (cenv == env) {
+continue;
 }
+params.env = env;
+run_on_cpu(cenv, do_inject_x86_mce, params);
 }
 }
 }
diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 939edc8..be896dd 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -28,7 +28,6 @@
 #include hw/pc.h
 #include hw/apic.h
 #include ioport.h
-#include kvm_x86.h
 
 #ifdef CONFIG_KVM_PARA
 #include linux/kvm_para.h
@@ -193,164 +192,23 @@ static int kvm_setup_mce(CPUState *env, uint64_t 
*mcg_cap)
 return kvm_vcpu_ioctl(env, KVM_X86_SETUP_MCE, mcg_cap);
 }
 
-static int kvm_set_mce(CPUState *env, struct kvm_x86_mce *m)
+static void kvm_mce_inject(CPUState *env, target_phys_addr_t paddr, int code)
 {
-return kvm_vcpu_ioctl(env, KVM_X86_SET_MCE, m);
-}
-
-static int kvm_get_msr(CPUState *env, struct kvm_msr_entry *msrs, int n)
-{
-struct kvm_msrs *kmsrs = qemu_malloc(sizeof *kmsrs + n * sizeof *msrs);
-int r;
-
-kmsrs-nmsrs = n;
-memcpy(kmsrs-entries, msrs, n * sizeof *msrs);
-r = kvm_vcpu_ioctl(env, KVM_GET_MSRS, kmsrs);
-memcpy(msrs, kmsrs-entries, n * sizeof *msrs);
-free(kmsrs);
-return r;
-}
-
-/* FIXME: kill this and kvm_get_msr, use env-mcg_status instead */
-static int kvm_mce_in_progress(CPUState *env)
-{
-struct kvm_msr_entry msr_mcg_status = {
-.index = MSR_MCG_STATUS,
-};
-int r;
-
-r = kvm_get_msr(env, msr_mcg_status, 1);
-if (r == -1 || r == 0) {
-fprintf(stderr, Failed to get MCE status\n);
-return 0;
-}
-return !!(msr_mcg_status.data  MCG_STATUS_MCIP);
-}
-
-struct kvm_x86_mce_data
-{
-CPUState *env;
-struct kvm_x86_mce *mce;
-int abort_on_error;
-};
-
-static void kvm_do_inject_x86_mce(void *_data)
-{
-struct kvm_x86_mce_data *data = _data;
-int r;
-
-/* If there is an MCE exception being processed, ignore this SRAO MCE */
-if ((data-env-mcg_cap  MCG_SER_P) 
-!(data-mce-status  MCI_STATUS_AR)) {
-if (kvm_mce_in_progress(data-env)) {
-return;
-}
-}
-
-r = kvm_set_mce(data-env, data-mce);
-if (r  0) {
-perror(kvm_set_mce FAILED);
-if (data-abort_on_error) {
-abort();
-}
-}
-}
-
-static void kvm_inject_x86_mce_on(CPUState *env, struct kvm_x86_mce *mce,
-  int flag)
-{
-struct 

[Bug 29382] New: Panic: Unable to handle kernel paging request

2011-02-18 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=29382

   Summary: Panic: Unable to handle kernel paging request
   Product: Virtualization
   Version: unspecified
  Platform: All
OS/Version: Linux
  Tree: Mainline
Status: NEW
  Severity: high
  Priority: P1
 Component: kvm
AssignedTo: virtualization_...@kernel-bugs.osdl.org
ReportedBy: se...@seblu.net
Regression: No


Created an attachment (id=48302)
 -- (https://bugzilla.kernel.org/attachment.cgi?id=48302)
screen dump

Hello,

I have a KVM host whose kernel panics frequently.

extract from lshw
product: PowerEdge M610
vendor: Dell Inc.
version: PowerEdge M1000e
product: Intel(R) Xeon(R) CPU   X5650  @ 2.67GHz


# uname -a
Linux hkvm-cap-2 2.6.35.7 #1 SMP Mon Oct 25 15:28:33 UTC 2010 x86_64
GNU/Linux
# kvm  --version
QEMU emulator version 0.13.0 (qemu-kvm-0.13.0), Copyright (c) 2003-2008
Fabrice Bellard
# dpkg -l|grep -e libvirt
ii  libvirt-bin 0.8.7-1+sj1 the
programs for the libvirt library
ii  libvirt00.8.7-1+sj1
library for interfacing with different virtualization systems
ii  python-libvirt  0.8.7-1+sj1
libvirt Python bindings

I've taken some screenshots from iDRAC of the kernel panic trace. Do you
have any recommendations about this?

Regards,

-- 
Configure bugmail: https://bugzilla.kernel.org/userprefs.cgi?tab=email
--- You are receiving this mail because: ---
You are watching the assignee of the bug.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Bug 29382] Panic: Unable to handle kernel paging request

2011-02-18 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=29382





--- Comment #1 from Seb Lu se...@seblu.net  2011-02-18 14:59:41 ---
Created an attachment (id=48312)
 -- (https://bugzilla.kernel.org/attachment.cgi?id=48312)
screen dump

-- 
Configure bugmail: https://bugzilla.kernel.org/userprefs.cgi?tab=email
--- You are receiving this mail because: ---
You are watching the assignee of the bug.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Bug 29382] Panic: Unable to handle kernel paging request

2011-02-18 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=29382





--- Comment #2 from Seb Lu se...@seblu.net  2011-02-18 15:00:10 ---
Created an attachment (id=48322)
 -- (https://bugzilla.kernel.org/attachment.cgi?id=48322)
screen dump

-- 
Configure bugmail: https://bugzilla.kernel.org/userprefs.cgi?tab=email
--- You are receiving this mail because: ---
You are watching the assignee of the bug.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Bug 29382] Panic: Unable to handle kernel paging request

2011-02-18 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=29382





--- Comment #3 from Seb Lu se...@seblu.net  2011-02-18 15:00:46 ---
Created an attachment (id=48332)
 -- (https://bugzilla.kernel.org/attachment.cgi?id=48332)
screen dump

-- 
Configure bugmail: https://bugzilla.kernel.org/userprefs.cgi?tab=email
--- You are receiving this mail because: ---
You are watching the assignee of the bug.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Possible netfilter-related memory corruption in 2.6.37

2011-02-18 Thread Patrick McHardy
Am 14.02.2011 17:52, schrieb Patrick McHardy:
 Am 14.02.2011 17:48, schrieb Eric Dumazet:
 I am not sure, but I guess nf_reinject() needs a fix too ;)
 
 I agree. That one looks uglier though, I guess we'll have to
 iterate through all hooks to note the previous one.

How about this? Unfortunately I don't think we can avoid
iterating through all hooks without violating RCU rules.


diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index 74aebed..834bb07 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -235,6 +235,7 @@ int nf_queue(struct sk_buff *skb,
 void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
 {
struct sk_buff *skb = entry-skb;
+   struct nf_hook_ops *i, *prev;
struct list_head *elem = entry-elem-list;
const struct nf_afinfo *afinfo;
 
@@ -244,8 +245,21 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned 
int verdict)
 
/* Continue traversal iff userspace said ok... */
if (verdict == NF_REPEAT) {
-   elem = elem-prev;
-   verdict = NF_ACCEPT;
+   prev = NULL;
+   list_for_each_entry_rcu(i, nf_hooks[entry-pf][entry-hook],
+   list) {
+   if (i-list == elem)
+   break;
+   prev = i;
+   }
+
+   if (prev == NULL ||
+   i-list == nf_hooks[entry-pf][entry-hook])
+   verdict = NF_DROP;
+   else {
+   elem = prev-list;
+   verdict = NF_ACCEPT;
+   }
}
 
if (verdict == NF_ACCEPT) {


[PATCH] KVM test: Do not load acpiphp on RHEL 6.0

2011-02-18 Thread Lucas Meneghel Rodrigues
It turns out that hotplug for PCI devices is built into
the RHEL 6.0 kernel.

Signed-off-by: Lucas Meneghel Rodrigues l...@redhat.com
---
 client/tests/kvm/tests_base.cfg.sample |8 
 1 files changed, 8 insertions(+), 0 deletions(-)

diff --git a/client/tests/kvm/tests_base.cfg.sample 
b/client/tests/kvm/tests_base.cfg.sample
index 80362db..7eb3635 100644
--- a/client/tests/kvm/tests_base.cfg.sample
+++ b/client/tests/kvm/tests_base.cfg.sample
@@ -1691,6 +1691,10 @@ variants:
 
 - 6.0.i386:
 no setup
+nic_hotplug:
+modprobe_module =
+block_hotplug:
+modprobe_module =
 image_name = rhel6-32
 unattended_install:
 unattended_file = unattended/RHEL-6-series.ks
@@ -1706,6 +1710,10 @@ variants:
 
 - 6.0.x86_64:
 no setup
+nic_hotplug:
+modprobe_module =
+block_hotplug:
+modprobe_module =
 image_name = rhel6-64
 unattended_install:
 unattended_file = unattended/RHEL-6-series.ks
-- 
1.7.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] KVM test: nic_hotplug: Fix typo

2011-02-18 Thread Lucas Meneghel Rodrigues
Signed-off-by: Lucas Meneghel Rodrigues l...@redhat.com
---
 client/tests/kvm/tests/nic_hotplug.py |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/client/tests/kvm/tests/nic_hotplug.py 
b/client/tests/kvm/tests/nic_hotplug.py
index 50a3ce9..5a757e7 100644
--- a/client/tests/kvm/tests/nic_hotplug.py
+++ b/client/tests/kvm/tests/nic_hotplug.py
@@ -76,7 +76,7 @@ def run_nic_hotplug(test, params, env):
 vm.monitor.cmd(device_add_cmd)
 
 qdev = vm.monitor.info(qtree)
-if id not in qdev:
+if nic_id not in qdev:
 logging.error(qdev)
 raise error.TestFail(Device %s was not plugged into qdev
  tree % nic_id)
-- 
1.7.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] Remove ethtool from rtl8139 variant

2011-02-18 Thread Lucas Meneghel Rodrigues
As the original patch set doesn't define supported_features
for it.

Signed-off-by: Lucas Meneghel Rodrigues l...@redhat.com
---
 client/tests/kvm/tests/ethtool.py  |6 +-
 client/tests/kvm/tests_base.cfg.sample |1 +
 2 files changed, 6 insertions(+), 1 deletions(-)

diff --git a/client/tests/kvm/tests/ethtool.py 
b/client/tests/kvm/tests/ethtool.py
index 81e45d3..d7c6b57 100644
--- a/client/tests/kvm/tests/ethtool.py
+++ b/client/tests/kvm/tests/ethtool.py
@@ -191,7 +191,11 @@ def run_ethtool(test, params, env):
 filename = /tmp/ethtool.dd
 guest_ip = vm.get_address()
 ethname = kvm_test_utils.get_linux_ifname(session, vm.get_mac_address(0))
-supported_features = params.get(supported_features).split()
+supported_features = params.get(supported_features)
+if supported_features:
+supported_features = supported_features.split()
+else:
+supported_features = []
 test_matrix = {
 # type:(callback,(dependence), (exclude)
 tx:  (tx_callback, (), ()),
diff --git a/client/tests/kvm/tests_base.cfg.sample 
b/client/tests/kvm/tests_base.cfg.sample
index 7eb3635..816a94f 100644
--- a/client/tests/kvm/tests_base.cfg.sample
+++ b/client/tests/kvm/tests_base.cfg.sample
@@ -903,6 +903,7 @@ variants:
 variants:
 - @rtl8139:
 nic_model = rtl8139
+no ethtool
 jumbo:
 mtu = 1500
 - e1000:
-- 
1.7.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Possible netfilter-related memory corruption in 2.6.37

2011-02-18 Thread Eric Dumazet
Le vendredi 18 février 2011 à 19:37 +0100, Patrick McHardy a écrit :
 Am 14.02.2011 17:52, schrieb Patrick McHardy:
  Am 14.02.2011 17:48, schrieb Eric Dumazet:
  I am not sure, but I guess nf_reinject() needs a fix too ;)
  
  I agree. That one looks uglier though, I guess we'll have to
  iterate through all hooks to note the previous one.
 
 How about this? Unfortunately I don't think we can avoid
 iterating through all hooks without violating RCU rules.
 
 

   /* Continue traversal iff userspace said ok... */
if (verdict == NF_REPEAT) {
-   elem = elem-prev;
-   verdict = NF_ACCEPT;
+   prev = NULL;
+   list_for_each_entry_rcu(i,
nf_hooks[entry-pf][entry-hook],
+   list) {
+   if (i-list == elem)
+   break;
+   prev = i;


Hmm... what happens if elem was the first elem in list ?

We exit with prev = NULL  -- NF_DROP ?

I must be missing something...

+   }
+
+   if (prev == NULL ||
+   i-list == nf_hooks[entry-pf][entry-hook])
+   verdict = NF_DROP;
+   else {
+   elem = prev-list;
+   verdict = NF_ACCEPT;
+   }
}



--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] KVM test: Include start_vm = yes on nic_bonding

2011-02-18 Thread Lucas Meneghel Rodrigues
So the VM can be restarted to include the new NICs required
for the test.

Signed-off-by: Lucas Meneghel Rodrigues l...@redhat.com
---
 client/tests/kvm/tests_base.cfg.sample |1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/client/tests/kvm/tests_base.cfg.sample 
b/client/tests/kvm/tests_base.cfg.sample
index 816a94f..cfe343d 100644
--- a/client/tests/kvm/tests_base.cfg.sample
+++ b/client/tests/kvm/tests_base.cfg.sample
@@ -703,6 +703,7 @@ variants:
 - nic_bonding:
 type = nic_bonding
 nics += ' nic2 nic3 nic4'
+start_vm = yes
 image_snapshot = yes
 serial_login = yes
 test_timeout = 1000
-- 
1.7.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[KVM-AUTOTEST PATCH 1/2] KVM test: make_qemu_command(): properly deal with get_mac_address() failure

2011-02-18 Thread Michael Goldish
If VM params define a new NIC that didn't previously exist, then when
make_qemu_command() is called in order to see if the VM should be restarted,
it attempts to get the MAC address of the new (nonexistent) NIC, and an
exception is raised.  This exception is expected and should be caught.

Signed-off-by: Michael Goldish mgold...@redhat.com
---
 client/tests/kvm/kvm_vm.py |5 -
 1 files changed, 4 insertions(+), 1 deletions(-)

diff --git a/client/tests/kvm/kvm_vm.py b/client/tests/kvm/kvm_vm.py
index 969558b..d852784 100755
--- a/client/tests/kvm/kvm_vm.py
+++ b/client/tests/kvm/kvm_vm.py
@@ -638,7 +638,10 @@ class VM:
 except IndexError:
 netdev_id = None
 # Handle the '-net nic' part
-mac = vm.get_mac_address(vlan)
+try:
+mac = vm.get_mac_address(vlan)
+except VMAddressError:
+mac = None
 qemu_cmd += add_nic(help, vlan, nic_params.get(nic_model), mac,
 netdev_id, nic_params.get(nic_extra_params))
 # Handle the '-net tap' or '-net user' part
-- 
1.7.3.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[KVM-AUTOTEST PATCH 2/2] KVM test: kvm_vm.py: make 'nic_mac' trigger a VM restart when changed

2011-02-18 Thread Michael Goldish
get_mac_address() should first check if 'nic_mac' is defined and then check
the address pool.  This way, if 'nic_mac' is changed between tests,
make_qemu_command(), which calls get_mac_address(), will reveal the change and
trigger a VM restart.

Signed-off-by: Michael Goldish mgold...@redhat.com
---
 client/tests/kvm/kvm_vm.py |   16 
 1 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/client/tests/kvm/kvm_vm.py b/client/tests/kvm/kvm_vm.py
index d852784..1ceef7a 100755
--- a/client/tests/kvm/kvm_vm.py
+++ b/client/tests/kvm/kvm_vm.py
@@ -850,15 +850,12 @@ class VM:
 for vlan in range(num_nics):
 nic_name = params.objects(nics)[vlan]
 nic_params = params.object_params(nic_name)
-if nic_params.get(nic_mac, None):
-mac = nic_params.get(nic_mac)
+mac = (nic_params.get(nic_mac) or
+   mac_source and mac_source.get_mac_address(vlan))
+if mac:
 kvm_utils.set_mac_address(self.instance, vlan, mac)
 else:
-mac = mac_source and mac_source.get_mac_address(vlan)
-if mac:
-kvm_utils.set_mac_address(self.instance, vlan, mac)
-else:
-kvm_utils.generate_mac_address(self.instance, vlan)
+kvm_utils.generate_mac_address(self.instance, vlan)
 
 # Assign a PCI assignable device
 self.pci_assignable = None
@@ -1233,7 +1230,10 @@ class VM:
 @raise VMMACAddressMissingError: If no MAC address is defined for the
 requested NIC
 
-mac = kvm_utils.get_mac_address(self.instance, nic_index)
+nic_name = self.params.objects(nics)[nic_index]
+nic_params = self.params.object_params(nic_name)
+mac = (nic_params.get(nic_mac) or
+   kvm_utils.get_mac_address(self.instance, nic_index))
 if not mac:
 raise VMMACAddressMissingError(nic_index)
 return mac
-- 
1.7.3.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH] kvm: fix detection of BIOS disabling VMX

2011-02-18 Thread Cihula, Joseph
 From: Avi Kivity [mailto:a...@redhat.com]
 Sent: Thursday, February 10, 2011 2:02 AM
 
 On 02/08/2011 09:45 PM, Joseph Cihula wrote:
  This patch fixes the logic used to detect whether BIOS has disabled VMX.
 
 
 Applied, thanks.

Will you request this be taken by the -stable kernel?

Joe
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] KVM test: Include start_vm = yes on nic_bonding

2011-02-18 Thread Lucas Meneghel Rodrigues
On Fri, 2011-02-18 at 17:34 -0200, Lucas Meneghel Rodrigues wrote:
 So the VM can be restarted to include the new NICs required
 for the test.

Ignore this patch, please :)

 Signed-off-by: Lucas Meneghel Rodrigues l...@redhat.com
 ---
  client/tests/kvm/tests_base.cfg.sample |1 +
  1 files changed, 1 insertions(+), 0 deletions(-)
 
 diff --git a/client/tests/kvm/tests_base.cfg.sample 
 b/client/tests/kvm/tests_base.cfg.sample
 index 816a94f..cfe343d 100644
 --- a/client/tests/kvm/tests_base.cfg.sample
 +++ b/client/tests/kvm/tests_base.cfg.sample
 @@ -703,6 +703,7 @@ variants:
  - nic_bonding:
  type = nic_bonding
  nics += ' nic2 nic3 nic4'
 +start_vm = yes
  image_snapshot = yes
  serial_login = yes
  test_timeout = 1000


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] Remove ethtool from rtl8139 variant

2011-02-18 Thread Amos Kong
On Fri, Feb 18, 2011 at 05:14:09PM -0200, Lucas Meneghel Rodrigues wrote:
 As the original patch set don't define supported_features
 for it.
 
 Signed-off-by: Lucas Meneghel Rodrigues l...@redhat.com

Acked-by: Amos Kong ak...@redhat.com

 ---
  client/tests/kvm/tests/ethtool.py  |6 +-
  client/tests/kvm/tests_base.cfg.sample |1 +
  2 files changed, 6 insertions(+), 1 deletions(-)
 
 diff --git a/client/tests/kvm/tests/ethtool.py 
 b/client/tests/kvm/tests/ethtool.py
 index 81e45d3..d7c6b57 100644
 --- a/client/tests/kvm/tests/ethtool.py
 +++ b/client/tests/kvm/tests/ethtool.py
 @@ -191,7 +191,11 @@ def run_ethtool(test, params, env):
  filename = /tmp/ethtool.dd
  guest_ip = vm.get_address()
  ethname = kvm_test_utils.get_linux_ifname(session, vm.get_mac_address(0))
 -supported_features = params.get(supported_features).split()
 +supported_features = params.get(supported_features)
 +if supported_features:
 +supported_features = supported_features.split()
 +else:
 +supported_features = []
  test_matrix = {
  # type:(callback,(dependence), (exclude)
  tx:  (tx_callback, (), ()),
 diff --git a/client/tests/kvm/tests_base.cfg.sample 
 b/client/tests/kvm/tests_base.cfg.sample
 index 7eb3635..816a94f 100644
 --- a/client/tests/kvm/tests_base.cfg.sample
 +++ b/client/tests/kvm/tests_base.cfg.sample
 @@ -903,6 +903,7 @@ variants:
  variants:
  - @rtl8139:
  nic_model = rtl8139
 +no ethtool
  jumbo:
  mtu = 1500
  - e1000:
 -- 
 1.7.4
 
 --
 To unsubscribe from this list: send the line unsubscribe kvm in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html