On Thu, Sep 27, 2018 at 03:43:38PM +0800, Peter Xu wrote:
> On Fri, Sep 14, 2018 at 08:41:57PM -0400, Jerome Glisse wrote:
> > On Fri, Sep 14, 2018 at 03:16:11PM +0800, Peter Xu wrote:
> > > On Thu, Sep 13, 2018 at 08:42:39PM -0400, Jerome Glisse wrote:

[...]

> > 
> > From 83abd3f16950a0b5cb6870a04d89d4fcc06b8865 Mon Sep 17 00:00:00 2001
> > From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= <[email protected]>
> > Date: Thu, 13 Sep 2018 10:16:30 -0400
> > Subject: [PATCH] mm/mprotect: add a mkwrite parameter to change_protection()
> > MIME-Version: 1.0
> > Content-Type: text/plain; charset=UTF-8
> > Content-Transfer-Encoding: 8bit
> > 
> > The mkwrite parameter allows changing a read-only pte into a writable
> > one, which is needed by userfaultfd to un-write-protect after a fault
> > has been handled.
> > 
> > Signed-off-by: Jérôme Glisse <[email protected]>
> > ---
> >  include/linux/huge_mm.h  |  2 +-
> >  include/linux/mm.h       |  3 +-
> >  mm/huge_memory.c         | 32 +++++++++++++++++++--
> >  mm/mempolicy.c           |  2 +-
> >  mm/mprotect.c            | 61 +++++++++++++++++++++++++++++-----------
> >  mm/userfaultfd.c         |  2 +-
> >  tools/lib/str_error_r.c  |  9 ++++--
> >  tools/lib/subcmd/pager.c |  5 +++-
> >  8 files changed, 90 insertions(+), 26 deletions(-)
> > 
> > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> > index a8a126259bc4..b51ff7f8e65c 100644
> > --- a/include/linux/huge_mm.h
> > +++ b/include/linux/huge_mm.h
> > @@ -45,7 +45,7 @@ extern bool move_huge_pmd(struct vm_area_struct *vma, 
> > unsigned long old_addr,
> >                      pmd_t *old_pmd, pmd_t *new_pmd, bool *need_flush);
> >  extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
> >                     unsigned long addr, pgprot_t newprot,
> > -                   int prot_numa);
> > +                   int prot_numa, bool mkwrite);
> >  int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
> >                     pmd_t *pmd, pfn_t pfn, bool write);
> >  int vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index 5d5c7fd07dc0..2bbf3e33bf9e 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -1492,7 +1492,8 @@ extern unsigned long move_page_tables(struct 
> > vm_area_struct *vma,
> >             bool need_rmap_locks);
> >  extern unsigned long change_protection(struct vm_area_struct *vma, 
> > unsigned long start,
> >                           unsigned long end, pgprot_t newprot,
> > -                         int dirty_accountable, int prot_numa);
> > +                         int dirty_accountable, int prot_numa,
> > +                         bool mkwrite);
> >  extern int mprotect_fixup(struct vm_area_struct *vma,
> >                       struct vm_area_struct **pprev, unsigned long start,
> >                       unsigned long end, unsigned long newflags);
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index abf621aba672..7b848b84d80c 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -1842,12 +1842,13 @@ bool move_huge_pmd(struct vm_area_struct *vma, 
> > unsigned long old_addr,
> >   *  - HPAGE_PMD_NR is protections changed and TLB flush necessary
> >   */
> >  int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
> > -           unsigned long addr, pgprot_t newprot, int prot_numa)
> > +           unsigned long addr, pgprot_t newprot, int prot_numa,
> > +           bool mkwrite)
> >  {
> >     struct mm_struct *mm = vma->vm_mm;
> >     spinlock_t *ptl;
> >     pmd_t entry;
> > -   bool preserve_write;
> > +   bool preserve_write, do_mkwrite = false;
> >     int ret;
> >  
> >     ptl = __pmd_trans_huge_lock(pmd, vma);
> > @@ -1857,6 +1858,31 @@ int change_huge_pmd(struct vm_area_struct *vma, 
> > pmd_t *pmd,
> >     preserve_write = prot_numa && pmd_write(*pmd);
> >     ret = 1;
> >  
> > +   if (mkwrite && pmd_present(*pmd) && !pmd_write(*pmd)) {
> > +           pmd_t orig_pmd = READ_ONCE(*pmd);
> > +           struct page *page = pmd_page(orig_pmd);
> > +
> > +           VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
> > +           /*
> > +            * We can only allow mkwrite if nobody else maps the huge page
> > +            * or it's part.
> > +            */
> > +           if (!trylock_page(page)) {
> > +                   get_page(page);
> > +                   spin_unlock(ptl);
> > +                   lock_page(page);
> > +
> > +                   ptl = __pmd_trans_huge_lock(pmd, vma);
> > +                   if (!ptl)
> > +                           return 0;
> > +           }
> > +           if (pmd_same(*pmd, orig_pmd) && reuse_swap_page(page, NULL)) {
> > +                   do_mkwrite = true;
> > +           }
> > +           unlock_page(page);
> > +           put_page(page);
> > +   }
> > +
> >  #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
> >     if (is_swap_pmd(*pmd)) {
> >             swp_entry_t entry = pmd_to_swp_entry(*pmd);
> > @@ -1925,6 +1951,8 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t 
> > *pmd,
> >     entry = pmd_modify(entry, newprot);
> >     if (preserve_write)
> >             entry = pmd_mk_savedwrite(entry);
> > +   if (do_mkwrite)
> > +           entry = pmd_mkwrite(entry);
> >     ret = HPAGE_PMD_NR;
> >     set_pmd_at(mm, addr, pmd, entry);
> >     BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry));
> > diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> > index 4ce44d3ff03d..2d0ee09e6b26 100644
> > --- a/mm/mempolicy.c
> > +++ b/mm/mempolicy.c
> > @@ -579,7 +579,7 @@ unsigned long change_prot_numa(struct vm_area_struct 
> > *vma,
> >  {
> >     int nr_updated;
> >  
> > -   nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
> > +   nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1, false);
> >     if (nr_updated)
> >             count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
> >  
> > diff --git a/mm/mprotect.c b/mm/mprotect.c
> > index 58b629bb70de..2d0c7e39f075 100644
> > --- a/mm/mprotect.c
> > +++ b/mm/mprotect.c
> > @@ -36,7 +36,7 @@
> >  
> >  static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t 
> > *pmd,
> >             unsigned long addr, unsigned long end, pgprot_t newprot,
> > -           int dirty_accountable, int prot_numa)
> > +           int dirty_accountable, int prot_numa, bool mkwrite)
> >  {
> >     struct mm_struct *mm = vma->vm_mm;
> >     pte_t *pte, oldpte;
> > @@ -72,13 +72,15 @@ static unsigned long change_pte_range(struct 
> > vm_area_struct *vma, pmd_t *pmd,
> >             if (pte_present(oldpte)) {
> >                     pte_t ptent;
> >                     bool preserve_write = prot_numa && pte_write(oldpte);
> > +                   bool do_mkwrite = false;
> >  
> >                     /*
> >                      * Avoid trapping faults against the zero or KSM
> >                      * pages. See similar comment in change_huge_pmd.
> >                      */
> > -                   if (prot_numa) {
> > +                   if (prot_numa || mkwrite) {
> >                             struct page *page;
> > +                           int tmp;
> >  
> >                             page = vm_normal_page(vma, addr, oldpte);
> >                             if (!page || PageKsm(page))
> > @@ -94,6 +96,26 @@ static unsigned long change_pte_range(struct 
> > vm_area_struct *vma, pmd_t *pmd,
> >                              */
> >                             if (target_node == page_to_nid(page))
> >                                     continue;
> > +
> > +                           if (mkwrite) {
> > +                                   if (!trylock_page(page)) {
> > +                                           pte_t orig_pte = 
> > READ_ONCE(*pte);
> > +                                           get_page(page);
> > +                                           pte_unmap_unlock(pte, ptl);
> > +                                           lock_page(page);
> > +                                           pte = 
> > pte_offset_map_lock(vma->vm_mm, pmd,
> > +                                                                     addr, 
> > &ptl);
> > +                                           if (!pte_same(*pte, orig_pte)) {
> > +                                                   unlock_page(page);
> > +                                                   put_page(page);
> > +                                                   continue;
> > +                                           }
> > +                                   }
> > +                                   if (reuse_swap_page(page, &tmp))
> > +                                           do_mkwrite = true;
> > +                                   unlock_page(page);
> > +                                   put_page(page);
> > +                           }
> >                     }
> >  
> >                     ptent = ptep_modify_prot_start(mm, addr, pte);
> > @@ -102,9 +124,9 @@ static unsigned long change_pte_range(struct 
> > vm_area_struct *vma, pmd_t *pmd,
> >                             ptent = pte_mk_savedwrite(ptent);
> >  
> >                     /* Avoid taking write faults for known dirty pages */
> > -                   if (dirty_accountable && pte_dirty(ptent) &&
> > -                                   (pte_soft_dirty(ptent) ||
> > -                                    !(vma->vm_flags & VM_SOFTDIRTY))) {
> > +                   if (do_mkwrite || (dirty_accountable &&
> > +                       pte_dirty(ptent) && (pte_soft_dirty(ptent) ||
> > +                       !(vma->vm_flags & VM_SOFTDIRTY)))) {
> >                             ptent = pte_mkwrite(ptent);
> >                     }
> >                     ptep_modify_prot_commit(mm, addr, pte, ptent);
> > @@ -150,7 +172,8 @@ static unsigned long change_pte_range(struct 
> > vm_area_struct *vma, pmd_t *pmd,
> >  
> >  static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
> >             pud_t *pud, unsigned long addr, unsigned long end,
> > -           pgprot_t newprot, int dirty_accountable, int prot_numa)
> > +           pgprot_t newprot, int dirty_accountable, int prot_numa,
> > +           bool mkwrite)
> >  {
> >     pmd_t *pmd;
> >     struct mm_struct *mm = vma->vm_mm;
> > @@ -179,7 +202,7 @@ static inline unsigned long change_pmd_range(struct 
> > vm_area_struct *vma,
> >                             __split_huge_pmd(vma, pmd, addr, false, NULL);
> >                     } else {
> >                             int nr_ptes = change_huge_pmd(vma, pmd, addr,
> > -                                           newprot, prot_numa);
> > +                                           newprot, prot_numa, mkwrite);
> >  
> >                             if (nr_ptes) {
> >                                     if (nr_ptes == HPAGE_PMD_NR) {
> > @@ -194,7 +217,7 @@ static inline unsigned long change_pmd_range(struct 
> > vm_area_struct *vma,
> >                     /* fall through, the trans huge pmd just split */
> >             }
> >             this_pages = change_pte_range(vma, pmd, addr, next, newprot,
> > -                            dirty_accountable, prot_numa);
> > +                            dirty_accountable, prot_numa, mkwrite);
> >             pages += this_pages;
> >  next:
> >             cond_resched();
> > @@ -210,7 +233,8 @@ static inline unsigned long change_pmd_range(struct 
> > vm_area_struct *vma,
> >  
> >  static inline unsigned long change_pud_range(struct vm_area_struct *vma,
> >             p4d_t *p4d, unsigned long addr, unsigned long end,
> > -           pgprot_t newprot, int dirty_accountable, int prot_numa)
> > +           pgprot_t newprot, int dirty_accountable, int prot_numa,
> > +           bool mkwrite)
> >  {
> >     pud_t *pud;
> >     unsigned long next;
> > @@ -222,7 +246,7 @@ static inline unsigned long change_pud_range(struct 
> > vm_area_struct *vma,
> >             if (pud_none_or_clear_bad(pud))
> >                     continue;
> >             pages += change_pmd_range(vma, pud, addr, next, newprot,
> > -                            dirty_accountable, prot_numa);
> > +                            dirty_accountable, prot_numa, mkwrite);
> >     } while (pud++, addr = next, addr != end);
> >  
> >     return pages;
> > @@ -230,7 +254,8 @@ static inline unsigned long change_pud_range(struct 
> > vm_area_struct *vma,
> >  
> >  static inline unsigned long change_p4d_range(struct vm_area_struct *vma,
> >             pgd_t *pgd, unsigned long addr, unsigned long end,
> > -           pgprot_t newprot, int dirty_accountable, int prot_numa)
> > +           pgprot_t newprot, int dirty_accountable, int prot_numa,
> > +           bool mkwrite)
> >  {
> >     p4d_t *p4d;
> >     unsigned long next;
> > @@ -242,7 +267,7 @@ static inline unsigned long change_p4d_range(struct 
> > vm_area_struct *vma,
> >             if (p4d_none_or_clear_bad(p4d))
> >                     continue;
> >             pages += change_pud_range(vma, p4d, addr, next, newprot,
> > -                            dirty_accountable, prot_numa);
> > +                            dirty_accountable, prot_numa, mkwrite);
> >     } while (p4d++, addr = next, addr != end);
> >  
> >     return pages;
> > @@ -250,7 +275,7 @@ static inline unsigned long change_p4d_range(struct 
> > vm_area_struct *vma,
> >  
> >  static unsigned long change_protection_range(struct vm_area_struct *vma,
> >             unsigned long addr, unsigned long end, pgprot_t newprot,
> > -           int dirty_accountable, int prot_numa)
> > +           int dirty_accountable, int prot_numa, bool mkwrite)
> >  {
> >     struct mm_struct *mm = vma->vm_mm;
> >     pgd_t *pgd;
> > @@ -267,7 +292,7 @@ static unsigned long change_protection_range(struct 
> > vm_area_struct *vma,
> >             if (pgd_none_or_clear_bad(pgd))
> >                     continue;
> >             pages += change_p4d_range(vma, pgd, addr, next, newprot,
> > -                            dirty_accountable, prot_numa);
> > +                            dirty_accountable, prot_numa, mkwrite);
> >     } while (pgd++, addr = next, addr != end);
> >  
> >     /* Only flush the TLB if we actually modified any entries: */
> > @@ -280,14 +305,16 @@ static unsigned long change_protection_range(struct 
> > vm_area_struct *vma,
> >  
> >  unsigned long change_protection(struct vm_area_struct *vma, unsigned long 
> > start,
> >                    unsigned long end, pgprot_t newprot,
> > -                  int dirty_accountable, int prot_numa)
> > +                  int dirty_accountable, int prot_numa, bool mkwrite)
> >  {
> >     unsigned long pages;
> >  
> >     if (is_vm_hugetlb_page(vma))
> >             pages = hugetlb_change_protection(vma, start, end, newprot);
> >     else
> > -           pages = change_protection_range(vma, start, end, newprot, 
> > dirty_accountable, prot_numa);
> > +           pages = change_protection_range(vma, start, end, newprot,
> > +                                           dirty_accountable,
> > +                                           prot_numa, mkwrite);
> >  
> >     return pages;
> >  }
> > @@ -366,7 +393,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct 
> > vm_area_struct **pprev,
> >     vma_set_page_prot(vma);
> >  
> >     change_protection(vma, start, end, vma->vm_page_prot,
> > -                     dirty_accountable, 0);
> > +                     dirty_accountable, 0, false);
> >  
> >     /*
> >      * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
> > diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> > index a0379c5ffa7c..c745c5d87523 100644
> > --- a/mm/userfaultfd.c
> > +++ b/mm/userfaultfd.c
> > @@ -632,7 +632,7 @@ int mwriteprotect_range(struct mm_struct *dst_mm, 
> > unsigned long start,
> >             newprot = vm_get_page_prot(dst_vma->vm_flags);
> >  
> >     change_protection(dst_vma, start, start + len, newprot,
> > -                           !enable_wp, 0);
> > +                     0, 0, !enable_wp);
> >  
> >     err = 0;
> >  out_unlock:
> > diff --git a/tools/lib/str_error_r.c b/tools/lib/str_error_r.c
> > index d6d65537b0d9..11c3425f272b 100644
> > --- a/tools/lib/str_error_r.c
> > +++ b/tools/lib/str_error_r.c
> > @@ -21,7 +21,12 @@
> >  char *str_error_r(int errnum, char *buf, size_t buflen)
> >  {
> >     int err = strerror_r(errnum, buf, buflen);
> > -   if (err)
> > -           snprintf(buf, buflen, "INTERNAL ERROR: strerror_r(%d, %p, 
> > %zd)=%d", errnum, buf, buflen, err);
> > +   if (err) {
> > +           char *err_buf = buf;
> > +
> > +           snprintf(err_buf, buflen,
> > +                    "INTERNAL ERROR: strerror_r(%d, %p, %zd)=%d",
> > +                    errnum, buf, buflen, err);
> > +   }
> >     return buf;
> >  }
> > diff --git a/tools/lib/subcmd/pager.c b/tools/lib/subcmd/pager.c
> > index 5ba754d17952..e1895568edaf 100644
> > --- a/tools/lib/subcmd/pager.c
> > +++ b/tools/lib/subcmd/pager.c
> > @@ -25,6 +25,8 @@ void pager_init(const char *pager_env)
> >  
> >  static void pager_preexec(void)
> >  {
> > +   void *ptr;
> > +
> >     /*
> >      * Work around bug in "less" by not starting it until we
> >      * have real input
> > @@ -33,7 +35,8 @@ static void pager_preexec(void)
> >  
> >     FD_ZERO(&in);
> >     FD_SET(0, &in);
> > -   select(1, &in, NULL, &in, NULL);
> > +   ptr = &in;
> > +   select(1, &in, NULL, ptr, NULL);
> >  
> >     setenv("LESS", "FRSX", 0);
> >  }
> > -- 
> > 2.17.1
> > 
> 
> Hello, Jerome,
> 
> Sorry for a very late response.  Actually I tried this patch many days
> ago but it hung my remote host when I started my uffd-wp userspace
> test program (what I got was an ssh connection there)... so I found
> another day to reach the system and reboot it. It's reproducible 100%.
> 
> I wanted to capture some panic trace or things alike for you but I
> failed to do so.  I tried to install software watchdog plus kdump
> services (so that when panic happened kdump will capture more
> information) but unluckily the hang I encountered didn't really
> trigger either of them (so not only kdump is not triggered and also
> the software watchdog is not failing).  It just seems like a pure hang
> without panic, though the system is totally not responding so I cannot
> collect anything.
> 
> Let me know if you have any idea on how to debug this scenario.

Maybe run qemu with qemu -s (qemu-system-x86_64 -s -nographic ...)
then you can attach a gdb to the kernel run by your qemu:

gdb -ex 'set architecture i386:x86-64' -ex 'target remote localhost:1234' -ex 
"file $VMLINUX"

where $VMLINUX is the vmlinux kernel that is being run. Then
it is just regular gdb so you can stop, continue, break, set
watchpoint ...

> 
> (Btw, I'm not sure whether we'll need those reuse_swap_page() that you
>  added - AFAIU currently Andrea's uffd-wp tree does not support shmem,
>  so will any of the write protected page be shared by more than one
>  PTE?)

No, we do need reuse_swap_page() because of fork() and COW (copy on write):
we can only un-write-protect an anonymous page that is not mapped multiple
times.

I am traveling and on PTO next week, so I probably won't be able to follow
up for a couple of weeks.

Cheers,
Jérôme

Reply via email to