Note to RiscV folks: You may want to adapt your code to do the same as this, esp. afaik, what you do today is endian-broken if host and guest endian are different.
Cheers, Ben. On Fri, 2018-12-14 at 10:58 +1100, Benjamin Herrenschmidt wrote: > Afaik, this isn't well documented (at least it wasn't when I last looked) > but OSes such as Linux rely on this behaviour: > > The HW updates to the page tables need to be done atomically with the > checking of the present bit (and other permissions). > > This is what allows Linux to do simple xchg of PTEs with 0 and assume > the value read has "final" stable dirty and accessed bits (the TLB > invalidation is deferred). > > Signed-off-by: Benjamin Herrenschmidt <b...@kernel.crashing.org> > --- > target/i386/excp_helper.c | 104 +++++++++++++++++++++++++++++--------- > 1 file changed, 80 insertions(+), 24 deletions(-) > > diff --git a/target/i386/excp_helper.c b/target/i386/excp_helper.c > index 49231f6b69..93fc24c011 100644 > --- a/target/i386/excp_helper.c > +++ b/target/i386/excp_helper.c > @@ -157,11 +157,45 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, > int size, > > #else > > +static inline uint64_t update_entry(CPUState *cs, target_ulong addr, > + uint64_t orig_entry, uint32_t bits) > +{ > + uint64_t new_entry = orig_entry | bits; > + > + /* Write the updated bottom 32-bits */ > + if (qemu_tcg_mttcg_enabled()) { > + uint32_t old_le = cpu_to_le32(orig_entry); > + uint32_t new_le = cpu_to_le32(new_entry); > + MemTxResult result; > + uint32_t old_ret; > + > + old_ret = address_space_cmpxchgl_notdirty(cs->as, addr, > + old_le, new_le, > + MEMTXATTRS_UNSPECIFIED, > + &result); > + if (result == MEMTX_OK) { > + if (old_ret != old_le && old_ret != new_le) { > + new_entry = 0; > + } > + return new_entry; > + } > + > + /* Do we need to support this case where PTEs aren't in RAM ? > + * > + * For now fallback to non-atomic case > + */ > + } > + > + x86_stl_phys_notdirty(cs, addr, new_entry); > + > + return new_entry; > +} > + > static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType > access_type, > int *prot) > { > CPUX86State *env = &X86_CPU(cs)->env; > - uint64_t rsvd_mask = PG_HI_RSVD_MASK; > + uint64_t rsvd_mask; > uint64_t ptep, pte; > uint64_t exit_info_1 = 0; > target_ulong pde_addr, pte_addr; > @@ -172,6 +206,8 @@ static hwaddr get_hphys(CPUState *cs, hwaddr gphys, > MMUAccessType access_type, > return gphys; > } > > + restart: > + rsvd_mask = PG_HI_RSVD_MASK; > if (!(env->nested_pg_mode & SVM_NPT_NXE)) { > rsvd_mask |= PG_NX_MASK; > } > @@ -198,8 +234,10 @@ static hwaddr get_hphys(CPUState *cs, hwaddr gphys, > MMUAccessType access_type, > goto do_fault_rsvd; > } > if (!(pml4e & PG_ACCESSED_MASK)) { > - pml4e |= PG_ACCESSED_MASK; > - x86_stl_phys_notdirty(cs, pml4e_addr, pml4e); > + pml4e = update_entry(cs, pml4e_addr, pml4e, > PG_ACCESSED_MASK); > + if (!pml4e) { > + goto restart; > + } > } > ptep &= pml4e ^ PG_NX_MASK; > pdpe_addr = (pml4e & PG_ADDRESS_MASK) + > @@ -213,8 +251,10 @@ static hwaddr get_hphys(CPUState *cs, hwaddr gphys, > MMUAccessType access_type, > } > ptep &= pdpe ^ PG_NX_MASK; > if (!(pdpe & PG_ACCESSED_MASK)) { > - pdpe |= PG_ACCESSED_MASK; > - x86_stl_phys_notdirty(cs, pdpe_addr, pdpe); > + pdpe = update_entry(cs, pdpe_addr, pdpe, PG_ACCESSED_MASK); > + if (!pdpe) { > + goto restart; > + } > } > if (pdpe & PG_PSE_MASK) { > /* 1 GB page */ > @@ -256,8 +296,10 @@ static hwaddr get_hphys(CPUState *cs, hwaddr gphys, > MMUAccessType access_type, > } > /* 4 KB page */ > if (!(pde & PG_ACCESSED_MASK)) { > - pde |= PG_ACCESSED_MASK; > - x86_stl_phys_notdirty(cs, pde_addr, pde); > + pde = update_entry(cs, pde_addr, pde, PG_ACCESSED_MASK); > + if (!pde) { > + goto restart; > + } > } > pte_addr = (pde & PG_ADDRESS_MASK) + (((gphys >> 12) & 0x1ff) << 3); > pte = x86_ldq_phys(cs, pte_addr); > @@ -295,8 +337,10 @@ static hwaddr get_hphys(CPUState *cs, hwaddr gphys, > MMUAccessType access_type, > } > > if (!(pde & PG_ACCESSED_MASK)) { > - pde |= PG_ACCESSED_MASK; > - x86_stl_phys_notdirty(cs, pde_addr, pde); > + pde = update_entry(cs, pde_addr, pde, PG_ACCESSED_MASK); > + if (!pde) { > + goto restart; > + } > } > > /* page directory entry */ > @@ -376,7 +420,7 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, > int size, > int error_code = 0; > int is_dirty, prot, page_size, is_write, is_user; > hwaddr paddr; > - uint64_t rsvd_mask = PG_HI_RSVD_MASK; > + uint64_t rsvd_mask; > uint32_t page_offset; > target_ulong vaddr; > > @@ -401,6 +445,8 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, > int size, > goto do_mapping; > } > > + restart: > + rsvd_mask = PG_HI_RSVD_MASK; > if (!(env->efer & MSR_EFER_NXE)) { > rsvd_mask |= PG_NX_MASK; > } > @@ -436,8 +482,10 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, > int size, > goto do_fault_rsvd; > } > if (!(pml5e & PG_ACCESSED_MASK)) { > - pml5e |= PG_ACCESSED_MASK; > - x86_stl_phys_notdirty(cs, pml5e_addr, pml5e); > + pml5e = update_entry(cs, pml5e_addr, pml5e, > PG_ACCESSED_MASK); > + if (!pml5e) { > + goto restart; > + } > } > ptep = pml5e ^ PG_NX_MASK; > } else { > @@ -456,8 +504,10 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, > int size, > goto do_fault_rsvd; > } > if (!(pml4e & PG_ACCESSED_MASK)) { > - pml4e |= PG_ACCESSED_MASK; > - x86_stl_phys_notdirty(cs, pml4e_addr, pml4e); > + pml4e = update_entry(cs, pml4e_addr, pml4e, > PG_ACCESSED_MASK); > + if (!pml4e) { > + goto restart; > + } > } > ptep &= pml4e ^ PG_NX_MASK; > pdpe_addr = ((pml4e & PG_ADDRESS_MASK) + (((addr >> 30) & 0x1ff) > << 3)) & > @@ -472,8 +522,10 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, > int size, > } > ptep &= pdpe ^ PG_NX_MASK; > if (!(pdpe & PG_ACCESSED_MASK)) { > - pdpe |= PG_ACCESSED_MASK; > - x86_stl_phys_notdirty(cs, pdpe_addr, pdpe); > + pdpe = update_entry(cs, pdpe_addr, pdpe, PG_ACCESSED_MASK); > + if (!pdpe) { > + goto restart; > + } > } > if (pdpe & PG_PSE_MASK) { > /* 1 GB page */ > @@ -520,8 +572,10 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, > int size, > } > /* 4 KB page */ > if (!(pde & PG_ACCESSED_MASK)) { > - pde |= PG_ACCESSED_MASK; > - x86_stl_phys_notdirty(cs, pde_addr, pde); > + pde = update_entry(cs, pde_addr, pde, PG_ACCESSED_MASK); > + if (!pde) { > + goto restart; > + } > } > pte_addr = ((pde & PG_ADDRESS_MASK) + (((addr >> 12) & 0x1ff) << 3)) > & > a20_mask; > @@ -563,8 +617,10 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, > int size, > } > > if (!(pde & PG_ACCESSED_MASK)) { > - pde |= PG_ACCESSED_MASK; > - x86_stl_phys_notdirty(cs, pde_addr, pde); > + pde = update_entry(cs, pde_addr, pde, PG_ACCESSED_MASK); > + if (!pde) { > + goto restart; > + } > } > > /* page directory entry */ > @@ -634,11 +690,11 @@ do_check_protect_pse36: > /* yes, it can! */ > is_dirty = is_write && !(pte & PG_DIRTY_MASK); > if (!(pte & PG_ACCESSED_MASK) || is_dirty) { > - pte |= PG_ACCESSED_MASK; > - if (is_dirty) { > - pte |= PG_DIRTY_MASK; > + pte = update_entry(cs, pte_addr, pte, > + PG_ACCESSED_MASK | (is_dirty ? PG_DIRTY_MASK : > 0)); > + if (!pte) { > + goto restart; > } > - x86_stl_phys_notdirty(cs, pte_addr, pte); > } > > if (!(pte & PG_DIRTY_MASK)) {