From: William Roche <william.ro...@oracle.com>

Migrating a poisoned page as a zero-page can only be done when the
running guest kernel knows about this poison, so that it marks this
page as inaccessible and any access in the VM would fail.

But if the poison information is not relayed to the VM, the guest
kernel does not prevent access to the page. In this case, transforming
the poisoned page into a zero-page could cause silent data corruption.

So we have to keep track of poisons not injected into the guest,
such as the BUS_MCEERR_AO errors ignored by the ARM VM emulation.
When such a page exists, the migration has to be blocked.

Signed-off-by: William Roche <william.ro...@oracle.com>
---
 accel/kvm/kvm-all.c      | 27 ++++++++++++++++++++++++++-
 accel/stubs/kvm-stub.c   |  5 +++++
 include/sysemu/kvm.h     |  6 ++++++
 include/sysemu/kvm_int.h |  3 ++-
 migration/migration.c    |  6 ++++++
 target/arm/kvm64.c       |  6 +++++-
 target/i386/kvm/kvm.c    |  2 +-
 7 files changed, 51 insertions(+), 4 deletions(-)

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index 850577ea0e..2829b6372a 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -1133,8 +1133,17 @@ int kvm_vm_check_extension(KVMState *s, unsigned int extension)
     return ret;
 }
 
+/*
+ * We track the poisoned pages to be able to:
+ * - replace them on VM reset
+ * - skip them when migrating
+ * - block a migration for a VM where a poisoned page is ignored
+ *   as this VM kernel (not knowing about the error) could
+ *   incorrectly access the page.
+ */
 typedef struct HWPoisonPage {
     ram_addr_t ram_addr;
+    bool       vm_known;
     QLIST_ENTRY(HWPoisonPage) list;
 } HWPoisonPage;
 
@@ -1166,20 +1175,36 @@ bool kvm_hwpoisoned_page(RAMBlock *block, void *offset)
     return false;
 }
 
-void kvm_hwpoison_page_add(ram_addr_t ram_addr)
+void kvm_hwpoison_page_add(ram_addr_t ram_addr, bool known)
 {
     HWPoisonPage *page;
 
     QLIST_FOREACH(page, &hwpoison_page_list, list) {
         if (page->ram_addr == ram_addr) {
+            if (known && !page->vm_known) {
+                page->vm_known = true;
+            }
             return;
         }
     }
     page = g_new(HWPoisonPage, 1);
     page->ram_addr = ram_addr;
+    page->vm_known = known;
     QLIST_INSERT_HEAD(&hwpoison_page_list, page, list);
 }
 
+bool kvm_hwpoisoned_unknown(void)
+{
+    HWPoisonPage *pg;
+
+    QLIST_FOREACH(pg, &hwpoison_page_list, list) {
+        if (!pg->vm_known) {
+            return true;
+        }
+    }
+    return false;
+}
+
 static uint32_t adjust_ioeventfd_endianness(uint32_t val, uint32_t size)
 {
 #if HOST_BIG_ENDIAN != TARGET_BIG_ENDIAN
diff --git a/accel/stubs/kvm-stub.c b/accel/stubs/kvm-stub.c
index c0a31611df..c43de44263 100644
--- a/accel/stubs/kvm-stub.c
+++ b/accel/stubs/kvm-stub.c
@@ -138,3 +138,8 @@ bool kvm_hwpoisoned_page(RAMBlock *block, void *ram_addr)
 {
     return false;
 }
+
+bool kvm_hwpoisoned_unknown(void)
+{
+    return false;
+}
diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
index 858688227a..37c8316ce4 100644
--- a/include/sysemu/kvm.h
+++ b/include/sysemu/kvm.h
@@ -580,4 +580,10 @@ uint32_t kvm_dirty_ring_size(void);
  *          false: page not yet poisoned
  */
 bool kvm_hwpoisoned_page(RAMBlock *block, void *ram_addr);
+
+/**
+ * kvm_hwpoisoned_unknown - indicate if a QEMU-reported memory error
+ * is still unknown to (hasn't been injected into) the VM kernel.
+ */
+bool kvm_hwpoisoned_unknown(void);
 #endif
diff --git a/include/sysemu/kvm_int.h b/include/sysemu/kvm_int.h
index a5b9122cb8..2dfde40690 100644
--- a/include/sysemu/kvm_int.h
+++ b/include/sysemu/kvm_int.h
@@ -136,10 +136,11 @@ void kvm_set_max_memslot_size(hwaddr max_slot_size);
  *
  * Parameters:
  *  @ram_addr: the address in the RAM for the poisoned page
+ *  @known: indicates whether the error was injected into the VM kernel
  *
  * Add a poisoned page to the list
  *
  * Return: None.
  */
-void kvm_hwpoison_page_add(ram_addr_t ram_addr);
+void kvm_hwpoison_page_add(ram_addr_t ram_addr, bool known);
 #endif
diff --git a/migration/migration.c b/migration/migration.c
index 1c6c81ad49..27e9571aaf 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -66,6 +66,7 @@
 #include "sysemu/qtest.h"
 #include "options.h"
 #include "sysemu/dirtylimit.h"
+#include "sysemu/kvm.h"
 
 static NotifierList migration_state_notifiers =
     NOTIFIER_LIST_INITIALIZER(migration_state_notifiers);
@@ -1646,6 +1647,11 @@ static bool migrate_prepare(MigrationState *s, bool blk, bool blk_inc,
         return false;
     }
 
+    if (kvm_hwpoisoned_unknown()) {
+        error_setg(errp, "Can't migrate this vm with ignored poisoned page");
+        return false;
+    }
+
     if (migration_is_blocked(errp)) {
         return false;
     }
diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c
index 5e95c496bb..e8db6380c1 100644
--- a/target/arm/kvm64.c
+++ b/target/arm/kvm64.c
@@ -1158,7 +1158,6 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
         ram_addr = qemu_ram_addr_from_host(addr);
         if (ram_addr != RAM_ADDR_INVALID &&
             kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) {
-            kvm_hwpoison_page_add(ram_addr);
             /*
              * If this is a BUS_MCEERR_AR, we know we have been called
              * synchronously from the vCPU thread, so we can easily
@@ -1169,7 +1168,12 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
              * called synchronously from the vCPU thread, or a bit
              * later from the main thread, so doing the injection of
              * the error would be more complicated.
+             * In this case, BUS_MCEERR_AO errors are unknown to the
+             * guest, and we will prevent migration as long as this
+             * poisoned page hasn't generated a BUS_MCEERR_AR error
+             * that the guest takes into account.
              */
+            kvm_hwpoison_page_add(ram_addr, (code == BUS_MCEERR_AR));
             if (code == BUS_MCEERR_AR) {
                 kvm_cpu_synchronize_state(c);
                 if (!acpi_ghes_record_errors(ACPI_HEST_SRC_ID_SEA, paddr)) {
diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
index f6c7f7e268..f9365b4457 100644
--- a/target/i386/kvm/kvm.c
+++ b/target/i386/kvm/kvm.c
@@ -649,7 +649,7 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
         ram_addr = qemu_ram_addr_from_host(addr);
         if (ram_addr != RAM_ADDR_INVALID &&
             kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) {
-            kvm_hwpoison_page_add(ram_addr);
+            kvm_hwpoison_page_add(ram_addr, true);
             kvm_mce_inject(cpu, paddr, code);
 
             /*
-- 
2.39.3


Reply via email to