On Tue, Dec 20, 2022 at 09:03:06PM -0800, Richard Henderson wrote: > Finish weaning user-only away from PageDesc. > > Using an interval tree to track page permissions means that > we can represent very large regions efficiently. > > Resolves: https://gitlab.com/qemu-project/qemu/-/issues/290 > Resolves: https://gitlab.com/qemu-project/qemu/-/issues/967 > Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1214 > Reviewed-by: Alex Bennée <alex.ben...@linaro.org> > Signed-off-by: Richard Henderson <richard.hender...@linaro.org> > --- > accel/tcg/internal.h | 4 +- > accel/tcg/tb-maint.c | 20 +- > accel/tcg/user-exec.c | 615 ++++++++++++++++++++++----------- > tests/tcg/multiarch/test-vma.c | 22 ++ > 4 files changed, 451 insertions(+), 210 deletions(-) > create mode 100644 tests/tcg/multiarch/test-vma.c
Hi, After staring at vma-pthread.c failures for some time, I finally spotted a few lines here that look suspicious. <skip> > int page_get_flags(target_ulong address) > { > - PageDesc *p; > + PageFlagsNode *p = pageflags_find(address, address); > > - p = page_find(address >> TARGET_PAGE_BITS); > - if (!p) { > + /* > + * See util/interval-tree.c re lockless lookups: no false positives but > + * there are false negatives. If we find nothing, retry with the mmap > + * lock acquired. > + */ > + if (p) { > + return p->flags; > + } > + if (have_mmap_lock()) { > return 0; > } > - return p->flags; > + > + mmap_lock(); > + p = pageflags_find(address, address); > + mmap_unlock(); How does the code ensure that p is not freed here? > + return p ? p->flags : 0; > +} <skip> > int page_check_range(target_ulong start, target_ulong len, int flags) > { > - PageDesc *p; > - target_ulong end; > - target_ulong addr; > - > - /* > - * This function should never be called with addresses outside the > - * guest address space. If this assert fires, it probably indicates > - * a missing call to h2g_valid. > - */ > - if (TARGET_ABI_BITS > L1_MAP_ADDR_SPACE_BITS) { > - assert(start < ((target_ulong)1 << L1_MAP_ADDR_SPACE_BITS)); > - } > + target_ulong last; > > if (len == 0) { > - return 0; > - } > - if (start + len - 1 < start) { > - /* We've wrapped around. */ > - return -1; > + return 0; /* trivial length */ > } > > - /* must do before we loose bits in the next step */ > - end = TARGET_PAGE_ALIGN(start + len); > - start = start & TARGET_PAGE_MASK; > + last = start + len - 1; > + if (last < start) { > + return -1; /* wrap around */ > + } > + > + while (true) { > + PageFlagsNode *p = pageflags_find(start, last); We can end up here without mmap_lock if we come from the syscall code. Do we need a retry like in page_get_flags()? Or would it make sense to just take mmap_lock in lock_user()? Speaking of which: does lock_user() actually guarantee that it's safe to access the respective pages until unlock_user()? If yes, doesn't this mean that mmap_lock must be held between the two? And if no, and the SEGV handler is already supposed to gracefully handle SEGVs in syscall.c, do we need to call access_ok_untagged() there at all? > + int missing; > > - for (addr = start, len = end - start; > - len != 0; > - len -= TARGET_PAGE_SIZE, addr += TARGET_PAGE_SIZE) { > - p = page_find(addr >> TARGET_PAGE_BITS); > if (!p) { > - return -1; > + return -1; /* entire region invalid */ > } > - if (!(p->flags & PAGE_VALID)) { > - return -1; > + if (start < p->itree.start) { > + return -1; /* initial bytes invalid */ > } > > - if ((flags & PAGE_READ) && !(p->flags & PAGE_READ)) { > - return -1; > + missing = flags & ~p->flags; > + if (missing & PAGE_READ) { > + return -1; /* page not readable */ > } > - if (flags & PAGE_WRITE) { > + if (missing & PAGE_WRITE) { > if (!(p->flags & PAGE_WRITE_ORG)) { > + return -1; /* page not writable */ > + } > + /* Asking about writable, but has been protected: undo. */ > + if (!page_unprotect(start, 0)) { > return -1; > } > - /* unprotect the page if it was put read-only because it > - contains translated code */ > - if (!(p->flags & PAGE_WRITE)) { > - if (!page_unprotect(addr, 0)) { > - return -1; > - } > + /* TODO: page_unprotect should take a range, not a single page. > */ > + if (last - start < TARGET_PAGE_SIZE) { > + return 0; /* ok */ > } > + start += TARGET_PAGE_SIZE; > + continue; > } > + > + if (last <= p->itree.last) { > + return 0; /* ok */ > + } > + start = p->itree.last + 1; > } > - return 0; > } <skip> > int page_unprotect(target_ulong address, uintptr_t pc) > { > - unsigned int prot; > + PageFlagsNode *p; > bool current_tb_invalidated; > - PageDesc *p; > - target_ulong host_start, host_end, addr; > > /* > * Technically this isn't safe inside a signal handler. However we > @@ -429,40 +627,54 @@ int page_unprotect(target_ulong address, uintptr_t pc) > */ > mmap_lock(); > > - p = page_find(address >> TARGET_PAGE_BITS); > - if (!p) { > + p = pageflags_find(address, address); > + > + /* If this address was not really writable, nothing to do. */ > + if (!p || !(p->flags & PAGE_WRITE_ORG)) { > mmap_unlock(); > return 0; > } > > - /* > - * If the page was really writable, then we change its > - * protection back to writable. > - */ > - if (p->flags & PAGE_WRITE_ORG) { > - current_tb_invalidated = false; > - if (p->flags & PAGE_WRITE) { > - /* > - * If the page is actually marked WRITE then assume this is > because > - * this thread raced with another one which got here first and > - * set the page to PAGE_WRITE and did the TB invalidate for us. > - */ > + current_tb_invalidated = false; > + if (p->flags & PAGE_WRITE) { > + /* > + * If the page is actually marked WRITE then assume this is because > + * this thread raced with another one which got here first and > + * set the page to PAGE_WRITE and did the TB invalidate for us. > + */ > #ifdef TARGET_HAS_PRECISE_SMC > - TranslationBlock *current_tb = tcg_tb_lookup(pc); > - if (current_tb) { > - current_tb_invalidated = tb_cflags(current_tb) & CF_INVALID; > - } > + TranslationBlock *current_tb = tcg_tb_lookup(pc); > + if (current_tb) { > + current_tb_invalidated = tb_cflags(current_tb) & CF_INVALID; > + } > #endif > + } else { > + target_ulong start, len, i; > + int prot; > + > + if (qemu_host_page_size <= TARGET_PAGE_SIZE) { > + start = address & TARGET_PAGE_MASK; > + len = TARGET_PAGE_SIZE; > + prot = p->flags | PAGE_WRITE; > + pageflags_set_clear(start, start + len - 1, PAGE_WRITE, 0); > + current_tb_invalidated = tb_invalidate_phys_page_unwind(start, > pc); When we come from page_check_range(), pc == 0 and the assertion in tb_invalidate_phys_page_unwind() fires. Should we pass current_cpu->cc->get_pc() to page_unprotect() instead of 0, so that current_tb is resolved to the TB that invoked the syscall? > } else { > - host_start = address & qemu_host_page_mask; > - host_end = host_start + qemu_host_page_size; > - > + start = address & qemu_host_page_mask; > + len = qemu_host_page_size; > prot = 0; > - for (addr = host_start; addr < host_end; addr += > TARGET_PAGE_SIZE) { > - p = page_find(addr >> TARGET_PAGE_BITS); > - p->flags |= PAGE_WRITE; > - prot |= p->flags; > > + for (i = 0; i < len; i += TARGET_PAGE_SIZE) { > + target_ulong addr = start + i; > + > + p = pageflags_find(addr, addr); > + if (p) { > + prot |= p->flags; > + if (p->flags & PAGE_WRITE_ORG) { > + prot |= PAGE_WRITE; > + pageflags_set_clear(addr, addr + TARGET_PAGE_SIZE - > 1, > + PAGE_WRITE, 0); > + } > + } > /* > * Since the content will be modified, we must invalidate > * the corresponding translated code. > @@ -470,15 +682,16 @@ int page_unprotect(target_ulong address, uintptr_t pc) > current_tb_invalidated |= > tb_invalidate_phys_page_unwind(addr, pc); > } > - mprotect((void *)g2h_untagged(host_start), qemu_host_page_size, > - prot & PAGE_BITS); > } > - mmap_unlock(); > - /* If current TB was invalidated return to main loop */ > - return current_tb_invalidated ? 2 : 1; > + if (prot & PAGE_EXEC) { > + prot = (prot & ~PAGE_EXEC) | PAGE_READ; > + } > + mprotect((void *)g2h_untagged(start), len, prot & PAGE_BITS); > } > mmap_unlock(); > - return 0; > + > + /* If current TB was invalidated return to main loop */ > + return current_tb_invalidated ? 2 : 1; > } Best regards, Ilya