Jérôme Glisse <[email protected]> writes:

> +
> +     /* Try to fail early on. */
> +     if (unlikely(anon_vma_prepare(vma)))
> +             return -ENOMEM;
> +

What is this about?
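
My guess from reading further down: page_add_new_anon_rmap() needs
vma->anon_vma to be set up, and by the time we get there the PTEs have
already been replaced with locked HMM entries, so this has to be done
before the walk starts. It would mirror the early-fail step in
do_anonymous_page() (hedged sketch, from memory of that path):

        /* do_anonymous_page() fails the fault early the same way */
        if (unlikely(anon_vma_prepare(vma)))
                goto oom;

If that is the reason, a comment saying so would help.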

> +retry:
> +     lru_add_drain();
> +     tlb_gather_mmu(&tlb, mm, range.start, range.end);
> +     update_hiwater_rss(mm);
> +     mmu_notifier_invalidate_range_start_excluding(mm, &range,
> +                                                   mmu_notifier_exclude);
> +     tlb_start_vma(&tlb, vma);
> +     for (addr = range.start, i = 0; addr < end && !ret;) {
> +             unsigned long cstart, next, npages = 0;
> +             spinlock_t *ptl;
> +             pgd_t *pgdp;
> +             pud_t *pudp;
> +             pmd_t *pmdp;
> +             pte_t *ptep;
> +
> +             /*
> +              * Pretty much the exact same logic as __handle_mm_fault(),
> +              * exception being the handling of huge pmd.
> +              */
> +             pgdp = pgd_offset(mm, addr);
> +             pudp = pud_alloc(mm, pgdp, addr);
> +             if (!pudp) {
> +                     ret = -ENOMEM;
> +                     break;
> +             }
> +             pmdp = pmd_alloc(mm, pudp, addr);
> +             if (!pmdp) {
> +                     ret = -ENOMEM;
> +                     break;
> +             }
> +             if (unlikely(pte_alloc(mm, pmdp, addr))) {
> +                     ret = -ENOMEM;
> +                     break;
> +             }
> +
> +             /*
> +              * If a huge pmd materialized under us just retry later. Use
> +              * pmd_trans_unstable() instead of pmd_trans_huge() to ensure
> +              * the pmd didn't become pmd_trans_huge under us and then back
> +              * to pmd_none, as a result of MADV_DONTNEED running
> +              * immediately after a huge pmd fault in a different thread of
> +              * this mm, in turn leading to a misleading pmd_trans_huge()
> +              * retval. All we have to ensure is that it is a regular pmd
> +              * that we can walk with pte_offset_map() and we can do that
> +              * through an atomic read in C, which is what
> +              * pmd_trans_unstable() provides.
> +              */
> +             if (unlikely(pmd_trans_unstable(pmdp) || pmd_devmap(*pmdp))) {
> +                     ret = -EAGAIN;
> +                     break;
> +             }
> +
> +             /*
> +              * If a huge pmd materialized from under us split it and break
> +              * out of the loop to retry.
> +              */
> +             if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp)) {
> +                     split_huge_pmd(vma, addr, pmdp);
> +                     ret = -EAGAIN;
> +                     break;
> +             }
> +
> +             /*
> +              * A regular pmd is established and it can't morph into a huge
> +              * pmd from under us anymore at this point because we hold the
> +              * mmap_sem read mode and khugepaged takes it in write mode. So
> +              * now it's safe to run pte_offset_map().
> +              */
> +             ptep = pte_offset_map(pmdp, addr);
> +
> +             /*
> +              * A regular pmd is established and it can't morph into a huge
> +              * pmd from under us anymore at this point because we hold the
> +              * mmap_sem read mode and khugepaged takes it in write mode. So
> +              * now it's safe to run pte_offset_map().
> +              */
> +             ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);

Why is pte_offset_map() followed by pte_offset_map_lock()?
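
The usual idiom is a single locked lookup; a minimal sketch of what I
would have expected here:

        ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
        /* ... walk the ptes under ptl ... */
        pte_unmap_unlock(ptep, ptl);

As written the first mapping is never pte_unmap()ed, so on
CONFIG_HIGHPTE configurations the atomic kmap it took is leaked.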

> +             for (i = (addr - start) >> PAGE_SHIFT, cstart = addr,
> +                  next = min((addr + PMD_SIZE) & PMD_MASK, end);
> +                  addr < next; addr += PAGE_SIZE, ptep++, i++) {
> +                     save_pte[i] = ptep_get_and_clear(mm, addr, ptep);
> +                     tlb_remove_tlb_entry(&tlb, ptep, addr);
> +                     set_pte_at(mm, addr, ptep, hmm_entry);
> +
> +                     if (pte_present(save_pte[i]))
> +                             continue;
> +
> +                     if (!pte_none(save_pte[i])) {
> +                             set_pte_at(mm, addr, ptep, save_pte[i]);
> +                             ret = -ENOENT;
> +                             ptep++;
> +                             break;
> +                     }

What is special about pte_none()? Why break out of the loop? I guess
we are checking for a swap pte here? Why not use is_swap_pte()? Is it
because we already checked pte_present() above?
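
For reference, is_swap_pte() in include/linux/swapops.h is just:

        static inline int is_swap_pte(pte_t pte)
        {
                return !pte_none(pte) && !pte_present(pte);
        }

so after the pte_present() continue above, the !pte_none() test is
equivalent to "this is a swap entry"; it would read better spelled
as is_swap_pte().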

> +                     /*
> +                      * TODO: This mm_forbids_zeropage() really does not
> +                      * apply to us. First it seems only S390 has it set,
> +                      * second we are not even using the zero page entry
> +                      * to populate the CPU page table, though on error
> +                      * we might use the save_pte entry to set the CPU
> +                      * page table entry.
> +                      *
> +                      * Live with that oddity for now.
> +                      */
> +                     if (mm_forbids_zeropage(mm)) {
> +                             pte_clear(mm, addr, &save_pte[i]);
> +                             npages++;
> +                             continue;
> +                     }
> +                     save_pte[i] = pte_mkspecial(pfn_pte(my_zero_pfn(addr),
> +                                                 vma->vm_page_prot));
> +             }
> +             pte_unmap_unlock(ptep - 1, ptl);
> +
> +             /*
> +              * So we must allocate pages before checking for error, which
> +              * here indicates that one entry is a swap entry. We need to
> +              * allocate first because otherwise there is no easy way to
> +              * know on retry or in the error code path whether the CPU
> +              * page table locked HMM entry is ours or from some other
> +              * thread.
> +              */
> +
> +             if (!npages)
> +                     continue;
> +
> +             for (next = addr, addr = cstart,
> +                  i = (addr - start) >> PAGE_SHIFT;
> +                  addr < next; addr += PAGE_SIZE, i++) {
> +                     struct mem_cgroup *memcg;
> +                     struct page *page;
> +
> +                     if (pte_present(save_pte[i]) || !pte_none(save_pte[i]))
> +                             continue;
> +
> +                     page = alloc_zeroed_user_highpage_movable(vma, addr);
> +                     if (!page) {
> +                             ret = -ENOMEM;
> +                             break;
> +                     }
> +                     __SetPageUptodate(page);
> +                     if (mem_cgroup_try_charge(page, mm, GFP_KERNEL,
> +                                               &memcg, false)) {
> +                             page_cache_release(page);
> +                             ret = -ENOMEM;
> +                             break;
> +                     }
> +                     save_pte[i] = mk_pte(page, vma->vm_page_prot);
> +                     if (vma->vm_flags & VM_WRITE)
> +                             save_pte[i] = pte_mkwrite(save_pte[i]);

I guess this also needs to go?
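
For comparison, the equivalent step in do_anonymous_page() also marks
the pte dirty on writable vmas (sketch from memory of that path):

        entry = mk_pte(page, vma->vm_page_prot);
        if (vma->vm_flags & VM_WRITE)
                entry = pte_mkwrite(pte_mkdirty(entry));

If the write-enable stays, it probably wants a pte_mkdirty() too.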

> +                     inc_mm_counter_fast(mm, MM_ANONPAGES);
> +                     /*
> +                      * Because we set the page table entry to the special
> +                      * HMM locked entry we know no other process might do
> +                      * anything with it and thus we can safely account the
> +                      * page without holding any lock at this point.
> +                      */
> +                     page_add_new_anon_rmap(page, vma, addr, false);
> +                     mem_cgroup_commit_charge(page, memcg, false, false);
> +                     /*
> +                      * Add to active list so we know vmscan will not waste
> +                      * its time with that page while we are still using it.
> +                      */
> +                     lru_cache_add_active_or_unevictable(page, vma);
> +             }
> +     }
> +     tlb_end_vma(&tlb, vma);
> +     mmu_notifier_invalidate_range_end_excluding(mm, &range,
> +                                                 mmu_notifier_exclude);
> +     tlb_finish_mmu(&tlb, range.start, range.end);
> +
> +     if (backoff && *backoff) {
> +             /* Stick to the range we updated. */
> +             ret = -EAGAIN;
> +             end = addr;
> +             goto out;
> +     }
> +
> +     /* Check if something is missing or something went wrong. */
> +     if (ret == -ENOENT) {
> +             int flags = FAULT_FLAG_ALLOW_RETRY;
> +
> +             do {
> +                     /*
> +                      * Using __handle_mm_fault() as current->mm != mm,
> +                      * i.e. we might have been called from a kernel thread
> +                      * on behalf of a driver, and all the accounting
> +                      * handle_mm_fault() does is pointless in our case.
> +                      */
> +                     ret = __handle_mm_fault(mm, vma, addr, flags);
> +                     flags |= FAULT_FLAG_TRIED;
> +             } while ((ret & VM_FAULT_RETRY));
> +             if ((ret & VM_FAULT_ERROR)) {
> +                     /* Stick to the range we updated. */
> +                     end = addr;
> +                     ret = -EFAULT;
> +                     goto out;
> +             }
> +             range.start = addr;
> +             goto retry;
> +     }
> +     if (ret == -EAGAIN) {
> +             range.start = addr;
> +             goto retry;
> +     }
> +     if (ret)
> +             /* Stick to the range we updated. */
> +             end = addr;
> +
> +     /*
> +      * At this point no one else can take a reference on the page from
> +      * this process's CPU page table. So we can safely check whether we
> +      * can migrate the page or not.
> +      */
> +
> +out:
> +     for (addr = start, i = 0; addr < end;) {
> +             unsigned long next;
> +             spinlock_t *ptl;
> +             pgd_t *pgdp;
> +             pud_t *pudp;
> +             pmd_t *pmdp;
> +             pte_t *ptep;
> +
> +             /*
> +              * We know for certain that we did set a special swap entry
> +              * for the range, and HMM entries are marked as locked, so no
> +              * one besides us can modify them, which implies that all
> +              * levels of the CPU page table are valid.
> +              */
> +             pgdp = pgd_offset(mm, addr);
> +             pudp = pud_offset(pgdp, addr);
> +             VM_BUG_ON(!pudp);
> +             pmdp = pmd_offset(pudp, addr);
> +             VM_BUG_ON(!pmdp || pmd_bad(*pmdp) || pmd_none(*pmdp) ||
> +                       pmd_trans_huge(*pmdp));
> +
> +             ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
> +             for (next = min((addr + PMD_SIZE) & PMD_MASK, end),
> +                  i = (addr - start) >> PAGE_SHIFT; addr < next;
> +                  addr += PAGE_SIZE, ptep++, i++) {
> +                     struct page *page;
> +                     swp_entry_t entry;
> +                     int swapped;
> +
> +                     entry = pte_to_swp_entry(save_pte[i]);
> +                     if (is_hmm_entry(entry)) {
> +                             /*
> +                              * The logic here is pretty involved. If
> +                              * save_pte is an HMM special swap entry then
> +                              * it means that we failed to swap in that
> +                              * page, so error must be set.
> +                              *
> +                              * If that's not the case then it means we
> +                              * are seriously screwed.
> +                              */
> +                             VM_BUG_ON(!ret);
> +                             continue;
> +                     }
> +
> +                     /*
> +                      * This cannot happen: no one else can replace our
> +                      * special entry, and the range end is re-adjusted
> +                      * on error.
> +                      */
> +                     entry = pte_to_swp_entry(*ptep);
> +                     VM_BUG_ON(!is_hmm_entry_locked(entry));
> +
> +                     /* On error or backoff restore all the saved pte. */
> +                     if (ret)
> +                             goto restore;
> +
> +                     page = vm_normal_page(vma, addr, save_pte[i]);
> +                     /* The zero page is fine to migrate. */
> +                     if (!page)
> +                             continue;
> +
> +                     /*
> +                      * Check that only CPU mappings hold a reference on
> +                      * the page. To make things simpler we just bail out
> +                      * if page_mapcount() != page_count() (also accounting
> +                      * for swap cache).
> +                      *
> +                      * There is a small window here where wp_page_copy()
> +                      * might have decremented mapcount but have not yet
> +                      * decremented the page count. This is not an issue as
> +                      * we backoff in that case.
> +                      */
> +                     swapped = PageSwapCache(page);
> +                     if (page_mapcount(page) + swapped == page_count(page))
> +                             continue;
> +
> +restore:
> +                     /* Ok we have to restore that page. */
> +                     set_pte_at(mm, addr, ptep, save_pte[i]);
> +                     /*
> +                      * No need to invalidate - it was non-present
> +                      * before.
> +                      */
> +                     update_mmu_cache(vma, addr, ptep);
> +                     pte_clear(mm, addr, &save_pte[i]);
> +             }
> +             pte_unmap_unlock(ptep - 1, ptl);
> +     }
> +     return ret;
> +}
> +EXPORT_SYMBOL(mm_hmm_migrate);

-aneesh
