[kvm-devel] [PATCH 4 of 8] The conversion to a rwsem allows callbacks during rmap traversal

Andrea Arcangeli Wed, 02 Apr 2008 14:45:42 -0700

# HG changeset patch
# User Andrea Arcangeli <[EMAIL PROTECTED]>
# Date 1207159011 -7200
# Node ID 3c3787c496cab1fc590ba3f97e7904bdfaab5375
# Parent  d880c227ddf345f5d577839d36d150c37b653bfd
The conversion to a rwsem allows callbacks during rmap traversal
for files in a non-atomic context (i_mmap_lock, a spinlock, becomes
i_mmap_sem, a rw_semaphore). A rw-style lock also allows concurrent
walking of the reverse map. The conversion is fairly straightforward
once pieces of the resched checking are removed.


[Restarting unmapping is an issue to be discussed].

This slightly increases Aim9 performance results on an 8p.

Signed-off-by: Andrea Arcangeli <[EMAIL PROTECTED]>
Signed-off-by: Christoph Lameter <[EMAIL PROTECTED]>

diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -69,7 +69,7 @@
        if (!vma_shareable(vma, addr))
                return;
 
-       spin_lock(&mapping->i_mmap_lock);
+       down_read(&mapping->i_mmap_sem);
        vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) {
                if (svma == vma)
                        continue;
@@ -94,7 +94,7 @@
                put_page(virt_to_page(spte));
        spin_unlock(&mm->page_table_lock);
 out:
-       spin_unlock(&mapping->i_mmap_lock);
+       up_read(&mapping->i_mmap_sem);
 }
 
 /*
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -454,10 +454,10 @@
        pgoff = offset >> PAGE_SHIFT;
 
        i_size_write(inode, offset);
-       spin_lock(&mapping->i_mmap_lock);
+       down_read(&mapping->i_mmap_sem);
        if (!prio_tree_empty(&mapping->i_mmap))
                hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
-       spin_unlock(&mapping->i_mmap_lock);
+       up_read(&mapping->i_mmap_sem);
        truncate_hugepages(inode, offset);
        return 0;
 }
diff --git a/fs/inode.c b/fs/inode.c
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -210,7 +210,7 @@
        INIT_LIST_HEAD(&inode->i_devices);
        INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
        rwlock_init(&inode->i_data.tree_lock);
-       spin_lock_init(&inode->i_data.i_mmap_lock);
+       init_rwsem(&inode->i_data.i_mmap_sem);
        INIT_LIST_HEAD(&inode->i_data.private_list);
        spin_lock_init(&inode->i_data.private_lock);
        INIT_RAW_PRIO_TREE_ROOT(&inode->i_data.i_mmap);
diff --git a/include/linux/fs.h b/include/linux/fs.h
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -503,7 +503,7 @@
        unsigned int            i_mmap_writable;/* count VM_SHARED mappings */
        struct prio_tree_root   i_mmap;         /* tree of private and shared mappings */
        struct list_head        i_mmap_nonlinear;/*list VM_NONLINEAR mappings */
-       spinlock_t              i_mmap_lock;    /* protect tree, count, list */
+       struct rw_semaphore     i_mmap_sem;     /* protect tree, count, list */
        unsigned int            truncate_count; /* Cover race condition with truncate */
        unsigned long           nrpages;        /* number of total pages */
        pgoff_t                 writeback_index;/* writeback starts here */
diff --git a/include/linux/mm.h b/include/linux/mm.h
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -716,7 +716,7 @@
        struct address_space *check_mapping;    /* Check page->mapping if set */
        pgoff_t first_index;                    /* Lowest page->index to unmap */
        pgoff_t last_index;                     /* Highest page->index to unmap */
-       spinlock_t *i_mmap_lock;                /* For unmap_mapping_range: */
+       struct rw_semaphore *i_mmap_sem;        /* For unmap_mapping_range: */
        unsigned long truncate_count;           /* Compare vm_truncate_count */
 };
 
diff --git a/kernel/fork.c b/kernel/fork.c
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -274,12 +274,12 @@
                                atomic_dec(&inode->i_writecount);
 
                        /* insert tmp into the share list, just after mpnt */
-                       spin_lock(&file->f_mapping->i_mmap_lock);
+                       down_write(&file->f_mapping->i_mmap_sem);
                        tmp->vm_truncate_count = mpnt->vm_truncate_count;
                        flush_dcache_mmap_lock(file->f_mapping);
                        vma_prio_tree_add(tmp, mpnt);
                        flush_dcache_mmap_unlock(file->f_mapping);
-                       spin_unlock(&file->f_mapping->i_mmap_lock);
+                       up_write(&file->f_mapping->i_mmap_sem);
                }
 
                /*
diff --git a/mm/filemap.c b/mm/filemap.c
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -61,16 +61,16 @@
 /*
  * Lock ordering:
  *
- *  ->i_mmap_lock              (vmtruncate)
+ *  ->i_mmap_sem               (vmtruncate)
  *    ->private_lock           (__free_pte->__set_page_dirty_buffers)
  *      ->swap_lock            (exclusive_swap_page, others)
  *        ->mapping->tree_lock
  *
  *  ->i_mutex
- *    ->i_mmap_lock            (truncate->unmap_mapping_range)
+ *    ->i_mmap_sem             (truncate->unmap_mapping_range)
  *
  *  ->mmap_sem
- *    ->i_mmap_lock
+ *    ->i_mmap_sem
  *      ->page_table_lock or pte_lock  (various, mainly in memory.c)
  *        ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock)
  *
@@ -87,7 +87,7 @@
  *    ->sb_lock                        (fs/fs-writeback.c)
  *    ->mapping->tree_lock     (__sync_single_inode)
  *
- *  ->i_mmap_lock
+ *  ->i_mmap_sem
  *    ->anon_vma.lock          (vma_adjust)
  *
  *  ->anon_vma.lock
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -184,7 +184,7 @@
        if (!page)
                return;
 
-       spin_lock(&mapping->i_mmap_lock);
+       down_read(&mapping->i_mmap_sem);
        vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
                mm = vma->vm_mm;
                address = vma->vm_start +
@@ -204,7 +204,7 @@
                        page_cache_release(page);
                }
        }
-       spin_unlock(&mapping->i_mmap_lock);
+       up_read(&mapping->i_mmap_sem);
 }
 
 /*
diff --git a/mm/fremap.c b/mm/fremap.c
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -206,13 +206,13 @@
                        }
                        goto out;
                }
-               spin_lock(&mapping->i_mmap_lock);
+               down_write(&mapping->i_mmap_sem);
                flush_dcache_mmap_lock(mapping);
                vma->vm_flags |= VM_NONLINEAR;
                vma_prio_tree_remove(vma, &mapping->i_mmap);
                vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
                flush_dcache_mmap_unlock(mapping);
-               spin_unlock(&mapping->i_mmap_lock);
+               up_write(&mapping->i_mmap_sem);
        }
 
        mmu_notifier_invalidate_range_start(mm, start, start + size);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -790,7 +790,7 @@
        struct page *page;
        struct page *tmp;
        /*
-        * A page gathering list, protected by per file i_mmap_lock. The
+        * A page gathering list, protected by per file i_mmap_sem. The
         * lock is used to avoid list corruption from multiple unmapping
         * of the same page since we are using page->lru.
         */
@@ -840,9 +840,9 @@
         * do nothing in this case.
         */
        if (vma->vm_file) {
-               spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
+               down_write(&vma->vm_file->f_mapping->i_mmap_sem);
                __unmap_hugepage_range(vma, start, end);
-               spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
+               up_write(&vma->vm_file->f_mapping->i_mmap_sem);
        }
 }
 
@@ -1085,7 +1085,7 @@
        BUG_ON(address >= end);
        flush_cache_range(vma, address, end);
 
-       spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
+       down_write(&vma->vm_file->f_mapping->i_mmap_sem);
        spin_lock(&mm->page_table_lock);
        for (; address < end; address += HPAGE_SIZE) {
                ptep = huge_pte_offset(mm, address);
@@ -1100,7 +1100,7 @@
                }
        }
        spin_unlock(&mm->page_table_lock);
-       spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
+       up_write(&vma->vm_file->f_mapping->i_mmap_sem);
 
        flush_tlb_range(vma, start, end);
 }
diff --git a/mm/memory.c b/mm/memory.c
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -838,7 +838,6 @@
        unsigned long tlb_start = 0;    /* For tlb_finish_mmu */
        int tlb_start_valid = 0;
        unsigned long start = start_addr;
-       spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
        int fullmm = (*tlbp)->fullmm;
 
        for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
@@ -875,22 +874,12 @@
                        }
 
                        tlb_finish_mmu(*tlbp, tlb_start, start);
-
-                       if (need_resched() ||
-                               (i_mmap_lock && spin_needbreak(i_mmap_lock))) {
-                               if (i_mmap_lock) {
-                                       *tlbp = NULL;
-                                       goto out;
-                               }
-                               cond_resched();
-                       }
-
+                       cond_resched();
                        *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
                        tlb_start_valid = 0;
                        zap_work = ZAP_BLOCK_SIZE;
                }
        }
-out:
        return start;   /* which is now the end (or restart) address */
 }
 
@@ -1752,7 +1741,7 @@
 /*
  * Helper functions for unmap_mapping_range().
  *
- * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __
+ * __ Notes on dropping i_mmap_sem to reduce latency while unmapping __
  *
  * We have to restart searching the prio_tree whenever we drop the lock,
  * since the iterator is only valid while the lock is held, and anyway
@@ -1771,7 +1760,7 @@
  * can't efficiently keep all vmas in step with mapping->truncate_count:
  * so instead reset them all whenever it wraps back to 0 (then go to 1).
  * mapping->truncate_count and vma->vm_truncate_count are protected by
- * i_mmap_lock.
+ * i_mmap_sem.
  *
  * In order to make forward progress despite repeatedly restarting some
  * large vma, note the restart_addr from unmap_vmas when it breaks out:
@@ -1821,7 +1810,7 @@
 
        restart_addr = zap_page_range(vma, start_addr,
                                        end_addr - start_addr, details);
-       need_break = need_resched() || spin_needbreak(details->i_mmap_lock);
+       need_break = need_resched();
 
        if (restart_addr >= end_addr) {
                /* We have now completed this vma: mark it so */
@@ -1835,9 +1824,9 @@
                        goto again;
        }
 
-       spin_unlock(details->i_mmap_lock);
+       up_write(details->i_mmap_sem);
        cond_resched();
-       spin_lock(details->i_mmap_lock);
+       down_write(details->i_mmap_sem);
        return -EINTR;
 }
 
@@ -1931,9 +1920,9 @@
        details.last_index = hba + hlen - 1;
        if (details.last_index < details.first_index)
                details.last_index = ULONG_MAX;
-       details.i_mmap_lock = &mapping->i_mmap_lock;
+       details.i_mmap_sem = &mapping->i_mmap_sem;
 
-       spin_lock(&mapping->i_mmap_lock);
+       down_write(&mapping->i_mmap_sem);
 
        /* Protect against endless unmapping loops */
        mapping->truncate_count++;
@@ -1948,7 +1937,7 @@
                unmap_mapping_range_tree(&mapping->i_mmap, &details);
        if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
                unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
-       spin_unlock(&mapping->i_mmap_lock);
+       up_write(&mapping->i_mmap_sem);
 }
 EXPORT_SYMBOL(unmap_mapping_range);
 
diff --git a/mm/migrate.c b/mm/migrate.c
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -211,12 +211,12 @@
        if (!mapping)
                return;
 
-       spin_lock(&mapping->i_mmap_lock);
+       down_read(&mapping->i_mmap_sem);
 
        vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff)
                remove_migration_pte(vma, old, new);
 
-       spin_unlock(&mapping->i_mmap_lock);
+       up_read(&mapping->i_mmap_sem);
 }
 
 /*
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -187,7 +187,7 @@
 }
 
 /*
- * Requires inode->i_mapping->i_mmap_lock
+ * Requires inode->i_mapping->i_mmap_sem
  */
 static void __remove_shared_vm_struct(struct vm_area_struct *vma,
                struct file *file, struct address_space *mapping)
@@ -215,9 +215,9 @@
 
        if (file) {
                struct address_space *mapping = file->f_mapping;
-               spin_lock(&mapping->i_mmap_lock);
+               down_write(&mapping->i_mmap_sem);
                __remove_shared_vm_struct(vma, file, mapping);
-               spin_unlock(&mapping->i_mmap_lock);
+               up_write(&mapping->i_mmap_sem);
        }
 }
 
@@ -440,7 +440,7 @@
                mapping = vma->vm_file->f_mapping;
 
        if (mapping) {
-               spin_lock(&mapping->i_mmap_lock);
+               down_write(&mapping->i_mmap_sem);
                vma->vm_truncate_count = mapping->truncate_count;
        }
        anon_vma_lock(vma);
@@ -450,7 +450,7 @@
 
        anon_vma_unlock(vma);
        if (mapping)
-               spin_unlock(&mapping->i_mmap_lock);
+               up_write(&mapping->i_mmap_sem);
 
        mm->map_count++;
        validate_mm(mm);
@@ -537,7 +537,7 @@
                mapping = file->f_mapping;
                if (!(vma->vm_flags & VM_NONLINEAR))
                        root = &mapping->i_mmap;
-               spin_lock(&mapping->i_mmap_lock);
+               down_write(&mapping->i_mmap_sem);
                if (importer &&
                    vma->vm_truncate_count != next->vm_truncate_count) {
                        /*
@@ -621,7 +621,7 @@
        if (anon_vma)
                spin_unlock(&anon_vma->lock);
        if (mapping)
-               spin_unlock(&mapping->i_mmap_lock);
+               up_write(&mapping->i_mmap_sem);
 
        if (remove_next) {
                if (file)
@@ -2065,7 +2065,7 @@
 
 /* Insert vm structure into process list sorted by address
  * and into the inode's i_mmap tree.  If vm_file is non-NULL
- * then i_mmap_lock is taken here.
+ * then i_mmap_sem is taken here.
  */
 int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
 {
diff --git a/mm/mremap.c b/mm/mremap.c
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -88,7 +88,7 @@
                 * and we propagate stale pages into the dst afterward.
                 */
                mapping = vma->vm_file->f_mapping;
-               spin_lock(&mapping->i_mmap_lock);
+               down_write(&mapping->i_mmap_sem);
                if (new_vma->vm_truncate_count &&
                    new_vma->vm_truncate_count != vma->vm_truncate_count)
                        new_vma->vm_truncate_count = 0;
@@ -120,7 +120,7 @@
        pte_unmap_nested(new_pte - 1);
        pte_unmap_unlock(old_pte - 1, old_ptl);
        if (mapping)
-               spin_unlock(&mapping->i_mmap_lock);
+               up_write(&mapping->i_mmap_sem);
        mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end);
 }
 
diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -24,7 +24,7 @@
  *   inode->i_alloc_sem (vmtruncate_range)
  *   mm->mmap_sem
  *     page->flags PG_locked (lock_page)
- *       mapping->i_mmap_lock
+ *       mapping->i_mmap_sem
  *         anon_vma->lock
  *           mm->page_table_lock or pte_lock
  *             zone->lru_lock (in mark_page_accessed, isolate_lru_page)
@@ -373,14 +373,14 @@
         * The page lock not only makes sure that page->mapping cannot
         * suddenly be NULLified by truncation, it makes sure that the
         * structure at mapping cannot be freed and reused yet,
-        * so we can safely take mapping->i_mmap_lock.
+        * so we can safely take mapping->i_mmap_sem.
         */
        BUG_ON(!PageLocked(page));
 
-       spin_lock(&mapping->i_mmap_lock);
+       down_read(&mapping->i_mmap_sem);
 
        /*
-        * i_mmap_lock does not stabilize mapcount at all, but mapcount
+        * i_mmap_sem does not stabilize mapcount at all, but mapcount
         * is more likely to be accurate if we note it after spinning.
         */
        mapcount = page_mapcount(page);
@@ -403,7 +403,7 @@
                        break;
        }
 
-       spin_unlock(&mapping->i_mmap_lock);
+       up_read(&mapping->i_mmap_sem);
        return referenced;
 }
 
@@ -489,12 +489,12 @@
 
        BUG_ON(PageAnon(page));
 
-       spin_lock(&mapping->i_mmap_lock);
+       down_read(&mapping->i_mmap_sem);
        vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
                if (vma->vm_flags & VM_SHARED)
                        ret += page_mkclean_one(page, vma);
        }
-       spin_unlock(&mapping->i_mmap_lock);
+       up_read(&mapping->i_mmap_sem);
        return ret;
 }
 
@@ -930,7 +930,7 @@
        unsigned long max_nl_size = 0;
        unsigned int mapcount;
 
-       spin_lock(&mapping->i_mmap_lock);
+       down_read(&mapping->i_mmap_sem);
        vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
                ret = try_to_unmap_one(page, vma, migration);
                if (ret == SWAP_FAIL || !page_mapped(page))
@@ -967,7 +967,6 @@
        mapcount = page_mapcount(page);
        if (!mapcount)
                goto out;
-       cond_resched_lock(&mapping->i_mmap_lock);
 
        max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
        if (max_nl_cursor == 0)
@@ -989,7 +988,6 @@
                        }
                        vma->vm_private_data = (void *) max_nl_cursor;
                }
-               cond_resched_lock(&mapping->i_mmap_lock);
                max_nl_cursor += CLUSTER_SIZE;
        } while (max_nl_cursor <= max_nl_size);
 
@@ -1001,7 +999,7 @@
        list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
                vma->vm_private_data = NULL;
 out:
-       spin_unlock(&mapping->i_mmap_lock);
+       up_read(&mapping->i_mmap_sem);
        return ret;
 }
 

-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace
_______________________________________________
kvm-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/kvm-devel

[kvm-devel] [PATCH 4 of 8] The conversion to a rwsem allows callbacks during rmap traversal

Reply via email to