Paolo Bonzini <pbonz...@redhat.com> writes:
> The race is as follows: > > vCPU thread reader thread > ----------------------- ----------------------- > TLB check -> slow path > notdirty_mem_write > write to RAM > set dirty flag > clear dirty flag > TLB check -> fast path > read memory > write to RAM > > and the second write is missed by the reader. > > Fortunately, in order to fix it, no change is required to the > vCPU thread. However, the reader thread must delay the read after > the vCPU thread has finished the write. This can be approximated > conservatively by run_on_cpu, which waits for the end of the current > translation block. > > A similar technique is used by KVM, which has to do a synchronous TLB > flush after doing a test-and-clear of the dirty-page flags. > > Reported-by: Dr. David Alan Gilbert <dgilb...@redhat.com> > Signed-off-by: Paolo Bonzini <pbonz...@redhat.com> Reviewed-by: Alex Bennée <alex.ben...@linaro.org> Tested-by: Alex Bennée <alex.ben...@linaro.org> > --- > I tested this some time ago, and enough has changed that I don't > really trust those old results. Nevertheless, I am throwing out > the patch so that it is not forgotten. > > exec.c | 31 +++++++++++++++++++++++++++++++ > include/exec/memory.h | 12 ++++++++++++ > memory.c | 10 +++++++++- > migration/ram.c | 1 + > 4 files changed, 53 insertions(+), 1 deletion(-) > > diff --git a/exec.c b/exec.c > index 3e78de3b8f..ae68f72da4 100644 > --- a/exec.c > +++ b/exec.c > @@ -198,6 +198,7 @@ typedef struct subpage_t { > > static void io_mem_init(void); > static void memory_map_init(void); > +static void tcg_log_global_after_sync(MemoryListener *listener); > static void tcg_commit(MemoryListener *listener); > > static MemoryRegion io_mem_watch; > @@ -906,6 +907,7 @@ void cpu_address_space_init(CPUState *cpu, int asidx, > newas->cpu = cpu; > newas->as = as; > if (tcg_enabled()) { > + newas->tcg_as_listener.log_global_after_sync = > tcg_log_global_after_sync; > newas->tcg_as_listener.commit = tcg_commit; > memory_listener_register(&newas->tcg_as_listener, as); > } > @@ -3143,6 +3145,35 @@ void address_space_dispatch_free(AddressSpaceDispatch > *d) > g_free(d); > } > > +static void do_nothing(CPUState *cpu, run_on_cpu_data d) > +{ > +} > + > +static void tcg_log_global_after_sync(MemoryListener *listener) > +{ > + CPUAddressSpace *cpuas; > + > + /* Wait for the CPU to end the current TB. This avoids the following > + * incorrect race: > + * > + * vCPU migration > + * ---------------------- ------------------------- > + * TLB check -> slow path > + * notdirty_mem_write > + * write to RAM > + * mark dirty > + * clear dirty flag > + * TLB check -> fast path > + * read memory > + * write to RAM > + * > + * by pushing the migration thread's memory read after the vCPU thread > has > + * written the memory. > + */ > + cpuas = container_of(listener, CPUAddressSpace, tcg_as_listener); > + run_on_cpu(cpuas->cpu, do_nothing, RUN_ON_CPU_NULL); > +} > + > static void tcg_commit(MemoryListener *listener) > { > CPUAddressSpace *cpuas; > diff --git a/include/exec/memory.h b/include/exec/memory.h > index bb0961ddb9..b6bcf31b0a 100644 > --- a/include/exec/memory.h > +++ b/include/exec/memory.h > @@ -419,6 +419,7 @@ struct MemoryListener { > void (*log_clear)(MemoryListener *listener, MemoryRegionSection > *section); > void (*log_global_start)(MemoryListener *listener); > void (*log_global_stop)(MemoryListener *listener); > + void (*log_global_after_sync)(MemoryListener *listener); > void (*eventfd_add)(MemoryListener *listener, MemoryRegionSection > *section, > bool match_data, uint64_t data, EventNotifier *e); > void (*eventfd_del)(MemoryListener *listener, MemoryRegionSection > *section, > @@ -1681,6 +1682,17 @@ MemoryRegionSection memory_region_find(MemoryRegion > *mr, > */ > void memory_global_dirty_log_sync(void); > > +/** > + * memory_global_dirty_log_sync: synchronize the dirty log for all memory > + * > + * Synchronizes the vCPUs with a thread that is reading the dirty bitmap. > + * This function must be called after the dirty log bitmap is cleared, and > + * before dirty guest memory pages are read. If you are using > + * #DirtyBitmapSnapshot, memory_region_snapshot_and_clear_dirty() takes > + * care of doing this. > + */ > +void memory_global_after_dirty_log_sync(void); > + > /** > * memory_region_transaction_begin: Start a transaction. > * > diff --git a/memory.c b/memory.c > index e42d63a3a0..edd0c13c38 100644 > --- a/memory.c > +++ b/memory.c > @@ -2127,9 +2127,12 @@ DirtyBitmapSnapshot > *memory_region_snapshot_and_clear_dirty(MemoryRegion *mr, > hwaddr size, > unsigned client) > { > + DirtyBitmapSnapshot *snapshot; > assert(mr->ram_block); > memory_region_sync_dirty_bitmap(mr); > - return cpu_physical_memory_snapshot_and_clear_dirty(mr, addr, size, > client); > + snapshot = cpu_physical_memory_snapshot_and_clear_dirty(mr, addr, size, > client); > + memory_global_after_dirty_log_sync(); > + return snapshot; > } > > bool memory_region_snapshot_get_dirty(MemoryRegion *mr, DirtyBitmapSnapshot > *snap, > @@ -2620,6 +2623,11 @@ void memory_global_dirty_log_sync(void) > memory_region_sync_dirty_bitmap(NULL); > } > > +void memory_global_after_dirty_log_sync(void) > +{ > + MEMORY_LISTENER_CALL_GLOBAL(log_global_after_sync, Forward); > +} > + > static VMChangeStateEntry *vmstate_change; > > void memory_global_dirty_log_start(void) > diff --git a/migration/ram.c b/migration/ram.c > index 2b0774c2bf..b9d6a3921d 100644 > --- a/migration/ram.c > +++ b/migration/ram.c > @@ -1801,6 +1801,7 @@ static void migration_bitmap_sync(RAMState *rs) > rcu_read_unlock(); > qemu_mutex_unlock(&rs->bitmap_mutex); > > + memory_global_after_dirty_log_sync(); > trace_migration_bitmap_sync_end(rs->num_dirty_pages_period); > > end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); -- Alex Bennée