On Thu, 2018-11-29 at 10:26 +0100, Paolo Bonzini wrote: > On 29/11/18 00:15, Benjamin Herrenschmidt wrote: > > Afaik, this isn't well documented (at least it wasn't when I last looked) > > but OSes such as Linux rely on this behaviour: > > > > The HW updates to the page tables need to be done atomically with the > > checking of the present bit (and other permissions). > > > > This is what allows Linux to do simple xchg of PTEs with 0 and assume > > the value read has "final" stable dirty and accessed bits (the TLB > > invalidation is deferred). > > > > Signed-off-by: Benjamin Herrenschmidt <b...@kernel.crashing.org> > > --- > > > > This is lightly tested at this point, mostly to gather comments on the > > approach. > > Looks good, but please kill the notdirty stuff first. It's not needed > and it's a clear bug. When migrating, it can lead to PTEs being > migrated without accessed and dirty bits.
I thought that too but looking closely with rth, it seems it's still setting *those* dirty bits, it just avoids the collision test with the translator. Unless I missed something... Do you still want to kill them ? They are warty, no doubt... For powerpc I need a cmpxchgq variant too, I'll probably split the patch adding those mem ops from the rest as well. Annoyingly, fixing riscv will need some tests on target_ulong size. Cheers, Ben. > Paolo > > > I noticed that RiscV is attempting to do something similar but with endian > > bugs, at least from my reading of the code. > > > > include/exec/memory_ldst.inc.h | 3 + > > memory_ldst.inc.c | 38 ++++++++++++ > > target/i386/excp_helper.c | 104 +++++++++++++++++++++++++-------- > > 3 files changed, 121 insertions(+), 24 deletions(-) > > > > diff --git a/include/exec/memory_ldst.inc.h b/include/exec/memory_ldst.inc.h > > index 272c20f02e..b7a41a0ad4 100644 > > --- a/include/exec/memory_ldst.inc.h > > +++ b/include/exec/memory_ldst.inc.h > > @@ -28,6 +28,9 @@ extern uint64_t glue(address_space_ldq, SUFFIX)(ARG1_DECL, > > hwaddr addr, MemTxAttrs attrs, MemTxResult *result); > > extern void glue(address_space_stl_notdirty, SUFFIX)(ARG1_DECL, > > hwaddr addr, uint32_t val, MemTxAttrs attrs, MemTxResult *result); > > +extern uint32_t glue(address_space_cmpxchgl_notdirty, SUFFIX)(ARG1_DECL, > > + hwaddr addr, uint32_t old, uint32_t new, MemTxAttrs attrs, > > + MemTxResult *result); > > extern void glue(address_space_stw, SUFFIX)(ARG1_DECL, > > hwaddr addr, uint32_t val, MemTxAttrs attrs, MemTxResult *result); > > extern void glue(address_space_stl, SUFFIX)(ARG1_DECL, > > diff --git a/memory_ldst.inc.c b/memory_ldst.inc.c > > index acf865b900..63af8f7dd2 100644 > > --- a/memory_ldst.inc.c > > +++ b/memory_ldst.inc.c > > @@ -320,6 +320,44 @@ void glue(address_space_stl_notdirty, > > SUFFIX)(ARG1_DECL, > > RCU_READ_UNLOCK(); > > } > > > > +/* This is meant to be used for atomic PTE updates under MT-TCG */ > > +uint32_t glue(address_space_cmpxchgl_notdirty, SUFFIX)(ARG1_DECL, > > + hwaddr addr, uint32_t old, uint32_t new, MemTxAttrs attrs, MemTxResult > > *result) > > +{ > > + uint8_t *ptr; > > + MemoryRegion *mr; > > + hwaddr l = 4; > > + hwaddr addr1; > > + MemTxResult r; > > + uint8_t dirty_log_mask; > > + > > + /* Must test result */ > > + assert(result); > > + > > + RCU_READ_LOCK(); > > + mr = TRANSLATE(addr, &addr1, &l, true, attrs); > > + if (l < 4 || !memory_access_is_direct(mr, true)) { > > + r = MEMTX_ERROR; > > + } else { > > + uint32_t orig = old; > > + > > + ptr = qemu_map_ram_ptr(mr->ram_block, addr1); > > + old = atomic_cmpxchg(ptr, orig, new); > > + > > + if (old == orig) { > > + dirty_log_mask = memory_region_get_dirty_log_mask(mr); > > + dirty_log_mask &= ~(1 << DIRTY_MEMORY_CODE); > > + > > cpu_physical_memory_set_dirty_range(memory_region_get_ram_addr(mr) + addr, > > + 4, dirty_log_mask); > > + } > > + r = MEMTX_OK; > > + } > > + *result = r; > > + RCU_READ_UNLOCK(); > > + > > + return old; > > +} > > + > > /* warning: addr must be aligned */ > > static inline void glue(address_space_stl_internal, SUFFIX)(ARG1_DECL, > > hwaddr addr, uint32_t val, MemTxAttrs attrs, > > diff --git a/target/i386/excp_helper.c b/target/i386/excp_helper.c > > index 49231f6b69..5ccb9d6d6a 100644 > > --- a/target/i386/excp_helper.c > > +++ b/target/i386/excp_helper.c > > @@ -157,11 +157,45 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr > > addr, int size, > > > > #else > > > > +static inline uint64_t update_entry(CPUState *cs, target_ulong addr, > > + uint64_t orig_entry, uint32_t bits) > > +{ > > + uint64_t new_entry = orig_entry | bits; > > + > > + /* Write the updated bottom 32-bits */ > > + if (qemu_tcg_mttcg_enabled()) { > > + uint32_t old_le = cpu_to_le32(orig_entry); > > + uint32_t new_le = cpu_to_le32(new_entry); > > + MemTxResult result; > > + uint32_t old_ret; > > + > > + old_ret = address_space_cmpxchgl_notdirty(cs->as, addr, > > + old_le, new_le, > > + MEMTXATTRS_UNSPECIFIED, > > + &result); > > + if (result == MEMTX_OK) { > > + if (old_ret != old_le) { > > + new_entry = 0; > > + } > > + return new_entry; > > + } > > + > > + /* Do we need to support this case where PTEs aren't in RAM ? > > + * > > + * For now fallback to non-atomic case > > + */ > > + } > > + > > + x86_stl_phys_notdirty(cs, addr, new_entry); > > + > > + return new_entry; > > +} > > + > > static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType > > access_type, > > int *prot) > > { > > CPUX86State *env = &X86_CPU(cs)->env; > > - uint64_t rsvd_mask = PG_HI_RSVD_MASK; > > + uint64_t rsvd_mask; > > uint64_t ptep, pte; > > uint64_t exit_info_1 = 0; > > target_ulong pde_addr, pte_addr; > > @@ -172,6 +206,8 @@ static hwaddr get_hphys(CPUState *cs, hwaddr gphys, > > MMUAccessType access_type, > > return gphys; > > } > > > > + restart: > > + rsvd_mask = PG_HI_RSVD_MASK; > > if (!(env->nested_pg_mode & SVM_NPT_NXE)) { > > rsvd_mask |= PG_NX_MASK; > > } > > @@ -198,8 +234,10 @@ static hwaddr get_hphys(CPUState *cs, hwaddr gphys, > > MMUAccessType access_type, > > goto do_fault_rsvd; > > } > > if (!(pml4e & PG_ACCESSED_MASK)) { > > - pml4e |= PG_ACCESSED_MASK; > > - x86_stl_phys_notdirty(cs, pml4e_addr, pml4e); > > + pml4e = update_entry(cs, pml4e_addr, pml4e, > > PG_ACCESSED_MASK); > > + if (!pml4e) { > > + goto restart; > > + } > > } > > ptep &= pml4e ^ PG_NX_MASK; > > pdpe_addr = (pml4e & PG_ADDRESS_MASK) + > > @@ -213,8 +251,10 @@ static hwaddr get_hphys(CPUState *cs, hwaddr gphys, > > MMUAccessType access_type, > > } > > ptep &= pdpe ^ PG_NX_MASK; > > if (!(pdpe & PG_ACCESSED_MASK)) { > > - pdpe |= PG_ACCESSED_MASK; > > - x86_stl_phys_notdirty(cs, pdpe_addr, pdpe); > > + pdpe = update_entry(cs, pdpe_addr, pdpe, PG_ACCESSED_MASK); > > + if (!pdpe) { > > + goto restart; > > + } > > } > > if (pdpe & PG_PSE_MASK) { > > /* 1 GB page */ > > @@ -256,8 +296,10 @@ static hwaddr get_hphys(CPUState *cs, hwaddr gphys, > > MMUAccessType access_type, > > } > > /* 4 KB page */ > > if (!(pde & PG_ACCESSED_MASK)) { > > - pde |= PG_ACCESSED_MASK; > > - x86_stl_phys_notdirty(cs, pde_addr, pde); > > + pde = update_entry(cs, pde_addr, pde, PG_ACCESSED_MASK); > > + if (!pde) { > > + goto restart; > > + } > > } > > pte_addr = (pde & PG_ADDRESS_MASK) + (((gphys >> 12) & 0x1ff) << > > 3); > > pte = x86_ldq_phys(cs, pte_addr); > > @@ -295,8 +337,10 @@ static hwaddr get_hphys(CPUState *cs, hwaddr gphys, > > MMUAccessType access_type, > > } > > > > if (!(pde & PG_ACCESSED_MASK)) { > > - pde |= PG_ACCESSED_MASK; > > - x86_stl_phys_notdirty(cs, pde_addr, pde); > > + pde = update_entry(cs, pde_addr, pde, PG_ACCESSED_MASK); > > + if (!pde) { > > + goto restart; > > + } > > } > > > > /* page directory entry */ > > @@ -376,7 +420,7 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, > > int size, > > int error_code = 0; > > int is_dirty, prot, page_size, is_write, is_user; > > hwaddr paddr; > > - uint64_t rsvd_mask = PG_HI_RSVD_MASK; > > + uint64_t rsvd_mask; > > uint32_t page_offset; > > target_ulong vaddr; > > > > @@ -401,6 +445,8 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, > > int size, > > goto do_mapping; > > } > > > > + restart: > > + rsvd_mask = PG_HI_RSVD_MASK; > > if (!(env->efer & MSR_EFER_NXE)) { > > rsvd_mask |= PG_NX_MASK; > > } > > @@ -436,8 +482,10 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, > > int size, > > goto do_fault_rsvd; > > } > > if (!(pml5e & PG_ACCESSED_MASK)) { > > - pml5e |= PG_ACCESSED_MASK; > > - x86_stl_phys_notdirty(cs, pml5e_addr, pml5e); > > + pml5e = update_entry(cs, pml5e_addr, pml5e, > > PG_ACCESSED_MASK); > > + if (!pml5e) { > > + goto restart; > > + } > > } > > ptep = pml5e ^ PG_NX_MASK; > > } else { > > @@ -456,8 +504,10 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, > > int size, > > goto do_fault_rsvd; > > } > > if (!(pml4e & PG_ACCESSED_MASK)) { > > - pml4e |= PG_ACCESSED_MASK; > > - x86_stl_phys_notdirty(cs, pml4e_addr, pml4e); > > + pml4e = update_entry(cs, pml4e_addr, pml4e, > > PG_ACCESSED_MASK); > > + if (!pml4e) { > > + goto restart; > > + } > > } > > ptep &= pml4e ^ PG_NX_MASK; > > pdpe_addr = ((pml4e & PG_ADDRESS_MASK) + (((addr >> 30) & > > 0x1ff) << 3)) & > > @@ -472,8 +522,10 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, > > int size, > > } > > ptep &= pdpe ^ PG_NX_MASK; > > if (!(pdpe & PG_ACCESSED_MASK)) { > > - pdpe |= PG_ACCESSED_MASK; > > - x86_stl_phys_notdirty(cs, pdpe_addr, pdpe); > > + pdpe = update_entry(cs, pdpe_addr, pdpe, PG_ACCESSED_MASK); > > + if (!pdpe) { > > + goto restart; > > + } > > } > > if (pdpe & PG_PSE_MASK) { > > /* 1 GB page */ > > @@ -520,8 +572,10 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, > > int size, > > } > > /* 4 KB page */ > > if (!(pde & PG_ACCESSED_MASK)) { > > - pde |= PG_ACCESSED_MASK; > > - x86_stl_phys_notdirty(cs, pde_addr, pde); > > + pde = update_entry(cs, pde_addr, pde, PG_ACCESSED_MASK); > > + if (!pde) { > > + goto restart; > > + } > > } > > pte_addr = ((pde & PG_ADDRESS_MASK) + (((addr >> 12) & 0x1ff) << > > 3)) & > > a20_mask; > > @@ -563,8 +617,10 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, > > int size, > > } > > > > if (!(pde & PG_ACCESSED_MASK)) { > > - pde |= PG_ACCESSED_MASK; > > - x86_stl_phys_notdirty(cs, pde_addr, pde); > > + pde = update_entry(cs, pde_addr, pde, PG_ACCESSED_MASK); > > + if (!pde) { > > + goto restart; > > + } > > } > > > > /* page directory entry */ > > @@ -634,11 +690,11 @@ do_check_protect_pse36: > > /* yes, it can! */ > > is_dirty = is_write && !(pte & PG_DIRTY_MASK); > > if (!(pte & PG_ACCESSED_MASK) || is_dirty) { > > - pte |= PG_ACCESSED_MASK; > > - if (is_dirty) { > > - pte |= PG_DIRTY_MASK; > > + pte = update_entry(cs, pte_addr, pte, > > + PG_ACCESSED_MASK | (is_dirty ? PG_DIRTY_MASK : > > 0)); > > + if (!pte) { > > + goto restart; > > } > > - x86_stl_phys_notdirty(cs, pte_addr, pte); > > } > > > > if (!(pte & PG_DIRTY_MASK)) { > > > >