# HG changeset patch
# User Andrea Arcangeli <[EMAIL PROTECTED]>
# Date 1207666463 -7200
# Node ID 20e829e35dfeceeb55a816ef495afda10cd50b98
# Parent 2c2ed514f294dbbfc66157f771bc900789ac6005

The conversion to an rwsem allows callbacks during rmap traversal for files
in a non-atomic context. A read/write style lock also allows concurrent
walking of the reverse map. This is fairly straightforward once the resched
checking pieces are removed.

[Restarting unmapping is an issue to be discussed].

This slightly improves AIM9 performance results on an 8-processor machine.

Signed-off-by: Andrea Arcangeli <[EMAIL PROTECTED]>
Signed-off-by: Christoph Lameter <[EMAIL PROTECTED]>
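As a purely illustrative sketch of the locking model this change moves to (a user-space analogue built on POSIX rwlocks; the struct and the walk/modify helpers below are invented for the example and are not part of the patch), readers of the reverse map can hold the lock concurrently and are allowed to sleep, while writers that modify the prio_tree still get exclusive access:

/*
 * Illustrative user-space analogue only: pthread_rwlock_t stands in for
 * the kernel's struct rw_semaphore, and "struct mapping" with its helpers
 * is invented for this example.
 */
#include <pthread.h>
#include <stdio.h>

struct mapping {
	pthread_rwlock_t i_mmap_sem;	/* was a spinlock before the change */
	int nr_vmas;			/* stand-in for the i_mmap prio_tree */
};

/* rmap-style traversal: shared (read) lock, may sleep, many walkers at once. */
static void walk_reverse_map(struct mapping *m)
{
	pthread_rwlock_rdlock(&m->i_mmap_sem);
	printf("walking %d vmas\n", m->nr_vmas);
	pthread_rwlock_unlock(&m->i_mmap_sem);
}

/* fork/mmap/truncate-style update: exclusive (write) lock. */
static void modify_reverse_map(struct mapping *m)
{
	pthread_rwlock_wrlock(&m->i_mmap_sem);
	m->nr_vmas++;
	pthread_rwlock_unlock(&m->i_mmap_sem);
}

int main(void)
{
	struct mapping m = { .nr_vmas = 0 };

	pthread_rwlock_init(&m.i_mmap_sem, NULL);
	modify_reverse_map(&m);
	walk_reverse_map(&m);
	pthread_rwlock_destroy(&m.i_mmap_sem);
	return 0;
}

In the patch itself, traversal-only sites such as mm/rmap.c, mm/migrate.c and mm/filemap_xip.c map onto the read side (down_read/up_read), while sites that modify the tree, such as kernel/fork.c, mm/mmap.c, mm/fremap.c and the unmap_mapping_range() path in mm/memory.c, take the semaphore for write.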
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -69,7 +69,7 @@
 	if (!vma_shareable(vma, addr))
 		return;
 
-	spin_lock(&mapping->i_mmap_lock);
+	down_read(&mapping->i_mmap_sem);
 	vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) {
 		if (svma == vma)
 			continue;
@@ -94,7 +94,7 @@
 	put_page(virt_to_page(spte));
 	spin_unlock(&mm->page_table_lock);
 out:
-	spin_unlock(&mapping->i_mmap_lock);
+	up_read(&mapping->i_mmap_sem);
 }
 
 /*
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -454,10 +454,10 @@
 	pgoff = offset >> PAGE_SHIFT;
 
 	i_size_write(inode, offset);
-	spin_lock(&mapping->i_mmap_lock);
+	down_read(&mapping->i_mmap_sem);
 	if (!prio_tree_empty(&mapping->i_mmap))
 		hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
-	spin_unlock(&mapping->i_mmap_lock);
+	up_read(&mapping->i_mmap_sem);
 	truncate_hugepages(inode, offset);
 	return 0;
 }
diff --git a/fs/inode.c b/fs/inode.c
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -210,7 +210,7 @@
 	INIT_LIST_HEAD(&inode->i_devices);
 	INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
 	rwlock_init(&inode->i_data.tree_lock);
-	spin_lock_init(&inode->i_data.i_mmap_lock);
+	init_rwsem(&inode->i_data.i_mmap_sem);
 	INIT_LIST_HEAD(&inode->i_data.private_list);
 	spin_lock_init(&inode->i_data.private_lock);
 	INIT_RAW_PRIO_TREE_ROOT(&inode->i_data.i_mmap);
diff --git a/include/linux/fs.h b/include/linux/fs.h
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -503,7 +503,7 @@
 	unsigned int		i_mmap_writable;/* count VM_SHARED mappings */
 	struct prio_tree_root	i_mmap;		/* tree of private and shared mappings */
 	struct list_head	i_mmap_nonlinear;/*list VM_NONLINEAR mappings */
-	spinlock_t		i_mmap_lock;	/* protect tree, count, list */
+	struct rw_semaphore	i_mmap_sem;	/* protect tree, count, list */
 	unsigned int		truncate_count;	/* Cover race condition with truncate */
 	unsigned long		nrpages;	/* number of total pages */
 	pgoff_t			writeback_index;/* writeback starts here */
diff --git a/include/linux/mm.h b/include/linux/mm.h
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -716,7 +716,7 @@
 	struct address_space *check_mapping;	/* Check page->mapping if set */
 	pgoff_t first_index;			/* Lowest page->index to unmap */
 	pgoff_t last_index;			/* Highest page->index to unmap */
-	spinlock_t *i_mmap_lock;		/* For unmap_mapping_range: */
+	struct rw_semaphore *i_mmap_sem;	/* For unmap_mapping_range: */
 	unsigned long truncate_count;		/* Compare vm_truncate_count */
 };
 
@@ -1051,9 +1051,9 @@
 			unsigned long flags, struct page **pages);
 
 struct mm_lock_data {
-	spinlock_t **i_mmap_locks;
+	struct rw_semaphore **i_mmap_sems;
 	spinlock_t **anon_vma_locks;
-	unsigned long nr_i_mmap_locks;
+	unsigned long nr_i_mmap_sems;
 	unsigned long nr_anon_vma_locks;
 };
 extern struct mm_lock_data *mm_lock(struct mm_struct * mm);
diff --git a/kernel/fork.c b/kernel/fork.c
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -274,12 +274,12 @@
 				atomic_dec(&inode->i_writecount);
 
 			/* insert tmp into the share list, just after mpnt */
-			spin_lock(&file->f_mapping->i_mmap_lock);
+			down_write(&file->f_mapping->i_mmap_sem);
 			tmp->vm_truncate_count = mpnt->vm_truncate_count;
 			flush_dcache_mmap_lock(file->f_mapping);
 			vma_prio_tree_add(tmp, mpnt);
 			flush_dcache_mmap_unlock(file->f_mapping);
-			spin_unlock(&file->f_mapping->i_mmap_lock);
+			up_write(&file->f_mapping->i_mmap_sem);
 		}
 
 		/*
diff --git a/mm/filemap.c b/mm/filemap.c
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -61,16 +61,16 @@
 /*
  * Lock ordering:
  *
- *  ->i_mmap_lock		(vmtruncate)
+ *  ->i_mmap_sem		(vmtruncate)
  *    ->private_lock		(__free_pte->__set_page_dirty_buffers)
  *      ->swap_lock		(exclusive_swap_page, others)
  *        ->mapping->tree_lock
  *
  *  ->i_mutex
- *    ->i_mmap_lock		(truncate->unmap_mapping_range)
+ *    ->i_mmap_sem		(truncate->unmap_mapping_range)
 *
  *  ->mmap_sem
- *    ->i_mmap_lock
+ *    ->i_mmap_sem
  *      ->page_table_lock or pte_lock	(various, mainly in memory.c)
  *        ->mapping->tree_lock	(arch-dependent flush_dcache_mmap_lock)
  *
@@ -87,7 +87,7 @@
  *    ->sb_lock			(fs/fs-writeback.c)
  *    ->mapping->tree_lock	(__sync_single_inode)
 *
- *  ->i_mmap_lock
+ *  ->i_mmap_sem
  *    ->anon_vma.lock		(vma_adjust)
 *
 *  ->anon_vma.lock
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -184,7 +184,7 @@
 	if (!page)
 		return;
 
-	spin_lock(&mapping->i_mmap_lock);
+	down_read(&mapping->i_mmap_sem);
 	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
 		mm = vma->vm_mm;
 		address = vma->vm_start +
@@ -204,7 +204,7 @@
 			page_cache_release(page);
 		}
 	}
-	spin_unlock(&mapping->i_mmap_lock);
+	up_read(&mapping->i_mmap_sem);
 }
 
 /*
diff --git a/mm/fremap.c b/mm/fremap.c
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -206,13 +206,13 @@
 			}
 			goto out;
 		}
-		spin_lock(&mapping->i_mmap_lock);
+		down_write(&mapping->i_mmap_sem);
 		flush_dcache_mmap_lock(mapping);
 		vma->vm_flags |= VM_NONLINEAR;
 		vma_prio_tree_remove(vma, &mapping->i_mmap);
 		vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
 		flush_dcache_mmap_unlock(mapping);
-		spin_unlock(&mapping->i_mmap_lock);
+		up_write(&mapping->i_mmap_sem);
 	}
 
 	mmu_notifier_invalidate_range_start(mm, start, start + size);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -790,7 +790,7 @@
 	struct page *page;
 	struct page *tmp;
 	/*
-	 * A page gathering list, protected by per file i_mmap_lock. The
+	 * A page gathering list, protected by per file i_mmap_sem. The
 	 * lock is used to avoid list corruption from multiple unmapping
 	 * of the same page since we are using page->lru.
 	 */
@@ -840,9 +840,9 @@
 	 * do nothing in this case.
 	 */
 	if (vma->vm_file) {
-		spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
+		down_write(&vma->vm_file->f_mapping->i_mmap_sem);
 		__unmap_hugepage_range(vma, start, end);
-		spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
+		up_write(&vma->vm_file->f_mapping->i_mmap_sem);
 	}
 }
 
@@ -1085,7 +1085,7 @@
 	BUG_ON(address >= end);
 	flush_cache_range(vma, address, end);
 
-	spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
+	down_write(&vma->vm_file->f_mapping->i_mmap_sem);
 	spin_lock(&mm->page_table_lock);
 	for (; address < end; address += HPAGE_SIZE) {
 		ptep = huge_pte_offset(mm, address);
@@ -1100,7 +1100,7 @@
 		}
 	}
 	spin_unlock(&mm->page_table_lock);
-	spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
+	up_write(&vma->vm_file->f_mapping->i_mmap_sem);
 
 	flush_tlb_range(vma, start, end);
 }
diff --git a/mm/memory.c b/mm/memory.c
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -838,7 +838,6 @@
 	unsigned long tlb_start = 0;	/* For tlb_finish_mmu */
 	int tlb_start_valid = 0;
 	unsigned long start = start_addr;
-	spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
 	int fullmm = (*tlbp)->fullmm;
 
 	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
@@ -875,22 +874,12 @@
 			}
 
 			tlb_finish_mmu(*tlbp, tlb_start, start);
-
-			if (need_resched() ||
-				(i_mmap_lock && spin_needbreak(i_mmap_lock))) {
-				if (i_mmap_lock) {
-					*tlbp = NULL;
-					goto out;
-				}
-				cond_resched();
-			}
-
+			cond_resched();
 			*tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
 			tlb_start_valid = 0;
 			zap_work = ZAP_BLOCK_SIZE;
 		}
 	}
-out:
 	return start;	/* which is now the end (or restart) address */
 }
 
@@ -1752,7 +1741,7 @@
 /*
  * Helper functions for unmap_mapping_range().
  *
- * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __
+ * __ Notes on dropping i_mmap_sem to reduce latency while unmapping __
 *
  * We have to restart searching the prio_tree whenever we drop the lock,
  * since the iterator is only valid while the lock is held, and anyway
@@ -1771,7 +1760,7 @@
  * can't efficiently keep all vmas in step with mapping->truncate_count:
  * so instead reset them all whenever it wraps back to 0 (then go to 1).
  * mapping->truncate_count and vma->vm_truncate_count are protected by
- * i_mmap_lock.
+ * i_mmap_sem.
 *
  * In order to make forward progress despite repeatedly restarting some
  * large vma, note the restart_addr from unmap_vmas when it breaks out:
@@ -1821,7 +1810,7 @@
 	restart_addr = zap_page_range(vma, start_addr,
 					end_addr - start_addr, details);
-	need_break = need_resched() || spin_needbreak(details->i_mmap_lock);
+	need_break = need_resched();
 
 	if (restart_addr >= end_addr) {
 		/* We have now completed this vma: mark it so */
@@ -1835,9 +1824,9 @@
 		goto again;
 	}
 
-	spin_unlock(details->i_mmap_lock);
+	up_write(details->i_mmap_sem);
 	cond_resched();
-	spin_lock(details->i_mmap_lock);
+	down_write(details->i_mmap_sem);
 	return -EINTR;
 }
 
@@ -1931,9 +1920,9 @@
 	details.last_index = hba + hlen - 1;
 	if (details.last_index < details.first_index)
 		details.last_index = ULONG_MAX;
-	details.i_mmap_lock = &mapping->i_mmap_lock;
+	details.i_mmap_sem = &mapping->i_mmap_sem;
 
-	spin_lock(&mapping->i_mmap_lock);
+	down_write(&mapping->i_mmap_sem);
 
 	/* Protect against endless unmapping loops */
 	mapping->truncate_count++;
@@ -1948,7 +1937,7 @@
 		unmap_mapping_range_tree(&mapping->i_mmap, &details);
 	if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
 		unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
-	spin_unlock(&mapping->i_mmap_lock);
+	up_write(&mapping->i_mmap_sem);
 }
 EXPORT_SYMBOL(unmap_mapping_range);
diff --git a/mm/migrate.c b/mm/migrate.c
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -211,12 +211,12 @@
 	if (!mapping)
 		return;
 
-	spin_lock(&mapping->i_mmap_lock);
+	down_read(&mapping->i_mmap_sem);
 
 	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff)
 		remove_migration_pte(vma, old, new);
 
-	spin_unlock(&mapping->i_mmap_lock);
+	up_read(&mapping->i_mmap_sem);
 }
 
 /*
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -188,7 +188,7 @@
 }
 
 /*
- * Requires inode->i_mapping->i_mmap_lock
+ * Requires inode->i_mapping->i_mmap_sem
 */
 static void __remove_shared_vm_struct(struct vm_area_struct *vma,
 		struct file *file, struct address_space *mapping)
@@ -216,9 +216,9 @@
 
 	if (file) {
 		struct address_space *mapping = file->f_mapping;
-		spin_lock(&mapping->i_mmap_lock);
+		down_write(&mapping->i_mmap_sem);
 		__remove_shared_vm_struct(vma, file, mapping);
-		spin_unlock(&mapping->i_mmap_lock);
+		up_write(&mapping->i_mmap_sem);
 	}
 }
 
@@ -441,7 +441,7 @@
 		mapping = vma->vm_file->f_mapping;
 
 	if (mapping) {
-		spin_lock(&mapping->i_mmap_lock);
+		down_write(&mapping->i_mmap_sem);
 		vma->vm_truncate_count = mapping->truncate_count;
 	}
 	anon_vma_lock(vma);
@@ -451,7 +451,7 @@
 	anon_vma_unlock(vma);
 
 	if (mapping)
-		spin_unlock(&mapping->i_mmap_lock);
+		up_write(&mapping->i_mmap_sem);
 
 	mm->map_count++;
 	validate_mm(mm);
@@ -538,7 +538,7 @@
 		mapping = file->f_mapping;
 		if (!(vma->vm_flags & VM_NONLINEAR))
 			root = &mapping->i_mmap;
-		spin_lock(&mapping->i_mmap_lock);
+		down_write(&mapping->i_mmap_sem);
 		if (importer &&
 		    vma->vm_truncate_count != next->vm_truncate_count) {
 			/*
@@ -622,7 +622,7 @@
 	if (anon_vma)
 		spin_unlock(&anon_vma->lock);
 	if (mapping)
-		spin_unlock(&mapping->i_mmap_lock);
+		up_write(&mapping->i_mmap_sem);
 
 	if (remove_next) {
 		if (file)
@@ -2066,7 +2066,7 @@
 
 /* Insert vm structure into process list sorted by address
  * and into the inode's i_mmap tree.  If vm_file is non-NULL
- * then i_mmap_lock is taken here.
+ * then i_mmap_sem is taken here.
 */
 int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
 {
@@ -2258,22 +2258,23 @@
 struct mm_lock_data *mm_lock(struct mm_struct * mm)
 {
 	struct vm_area_struct *vma;
-	spinlock_t *i_mmap_lock_last, *anon_vma_lock_last;
-	unsigned long nr_i_mmap_locks, nr_anon_vma_locks, i;
+	struct rw_semaphore *i_mmap_sem_last;
+	spinlock_t *anon_vma_lock_last;
+	unsigned long nr_i_mmap_sems, nr_anon_vma_locks, i;
 	struct mm_lock_data *data;
 	int err;
 
 	down_write(&mm->mmap_sem);
 
 	err = -EINTR;
-	nr_i_mmap_locks = nr_anon_vma_locks = 0;
+	nr_i_mmap_sems = nr_anon_vma_locks = 0;
 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
 		cond_resched();
 		if (unlikely(signal_pending(current)))
 			goto out;
 
 		if (vma->vm_file && vma->vm_file->f_mapping)
-			nr_i_mmap_locks++;
+			nr_i_mmap_sems++;
 		if (vma->anon_vma)
 			nr_anon_vma_locks++;
 	}
@@ -2283,13 +2284,13 @@
 	if (!data)
 		goto out;
 
-	if (nr_i_mmap_locks) {
-		data->i_mmap_locks = vmalloc(nr_i_mmap_locks *
-					     sizeof(spinlock_t));
-		if (!data->i_mmap_locks)
+	if (nr_i_mmap_sems) {
+		data->i_mmap_sems = vmalloc(nr_i_mmap_sems *
+					    sizeof(struct rw_semaphore));
+		if (!data->i_mmap_sems)
 			goto out_kfree;
 	} else
-		data->i_mmap_locks = NULL;
+		data->i_mmap_sems = NULL;
 
 	if (nr_anon_vma_locks) {
 		data->anon_vma_locks = vmalloc(nr_anon_vma_locks *
@@ -2300,10 +2301,11 @@
 		data->anon_vma_locks = NULL;
 
 	err = -EINTR;
-	i_mmap_lock_last = NULL;
-	nr_i_mmap_locks = 0;
+	i_mmap_sem_last = NULL;
+	nr_i_mmap_sems = 0;
 	for (;;) {
-		spinlock_t *i_mmap_lock = (spinlock_t *) -1UL;
+		struct rw_semaphore *i_mmap_sem;
+		i_mmap_sem = (struct rw_semaphore *) -1UL;
 		for (vma = mm->mmap; vma; vma = vma->vm_next) {
 			cond_resched();
 			if (unlikely(signal_pending(current)))
@@ -2311,21 +2313,21 @@
 
 			if (!vma->vm_file || !vma->vm_file->f_mapping)
 				continue;
-			if ((unsigned long) i_mmap_lock >
+			if ((unsigned long) i_mmap_sem >
 			    (unsigned long)
-			    &vma->vm_file->f_mapping->i_mmap_lock &&
+			    &vma->vm_file->f_mapping->i_mmap_sem &&
 			    (unsigned long)
-			    &vma->vm_file->f_mapping->i_mmap_lock >
-			    (unsigned long) i_mmap_lock_last)
-				i_mmap_lock =
-					&vma->vm_file->f_mapping->i_mmap_lock;
+			    &vma->vm_file->f_mapping->i_mmap_sem >
+			    (unsigned long) i_mmap_sem_last)
+				i_mmap_sem =
+					&vma->vm_file->f_mapping->i_mmap_sem;
 		}
-		if (i_mmap_lock == (spinlock_t *) -1UL)
+		if (i_mmap_sem == (struct rw_semaphore *) -1UL)
 			break;
-		i_mmap_lock_last = i_mmap_lock;
-		data->i_mmap_locks[nr_i_mmap_locks++] = i_mmap_lock;
+		i_mmap_sem_last = i_mmap_sem;
+		data->i_mmap_sems[nr_i_mmap_sems++] = i_mmap_sem;
 	}
-	data->nr_i_mmap_locks = nr_i_mmap_locks;
+	data->nr_i_mmap_sems = nr_i_mmap_sems;
 
 	anon_vma_lock_last = NULL;
 	nr_anon_vma_locks = 0;
@@ -2351,8 +2353,8 @@
 	}
 	data->nr_anon_vma_locks = nr_anon_vma_locks;
 
-	for (i = 0; i < nr_i_mmap_locks; i++)
-		spin_lock(data->i_mmap_locks[i]);
+	for (i = 0; i < nr_i_mmap_sems; i++)
+		down_write(data->i_mmap_sems[i]);
 	for (i = 0; i < nr_anon_vma_locks; i++)
 		spin_lock(data->anon_vma_locks[i]);
 
@@ -2361,7 +2363,7 @@
 out_vfree_both:
 	vfree(data->anon_vma_locks);
 out_vfree:
-	vfree(data->i_mmap_locks);
+	vfree(data->i_mmap_sems);
 out_kfree:
 	kfree(data);
 out:
@@ -2373,14 +2375,14 @@
 {
 	unsigned long i;
 
-	for (i = 0; i < data->nr_i_mmap_locks; i++)
-		spin_unlock(data->i_mmap_locks[i]);
+	for (i = 0; i < data->nr_i_mmap_sems; i++)
+		up_write(data->i_mmap_sems[i]);
 	for (i = 0; i < data->nr_anon_vma_locks; i++)
 		spin_unlock(data->anon_vma_locks[i]);
 
 	up_write(&mm->mmap_sem);
 
-	vfree(data->i_mmap_locks);
+	vfree(data->i_mmap_sems);
 	vfree(data->anon_vma_locks);
 	kfree(data);
 }
diff --git a/mm/mremap.c b/mm/mremap.c
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -88,7 +88,7 @@
 		 * and we propagate stale pages into the dst afterward.
 		 */
 		mapping = vma->vm_file->f_mapping;
-		spin_lock(&mapping->i_mmap_lock);
+		down_write(&mapping->i_mmap_sem);
 		if (new_vma->vm_truncate_count &&
 		    new_vma->vm_truncate_count != vma->vm_truncate_count)
 			new_vma->vm_truncate_count = 0;
@@ -120,7 +120,7 @@
 	pte_unmap_nested(new_pte - 1);
 	pte_unmap_unlock(old_pte - 1, old_ptl);
 	if (mapping)
-		spin_unlock(&mapping->i_mmap_lock);
+		up_write(&mapping->i_mmap_sem);
 	mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end);
 }
diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -24,7 +24,7 @@
 *   inode->i_alloc_sem (vmtruncate_range)
 *   mm->mmap_sem
 *     page->flags PG_locked (lock_page)
- *      mapping->i_mmap_lock
+ *      mapping->i_mmap_sem
 *        anon_vma->lock
 *          mm->page_table_lock or pte_lock
 *            zone->lru_lock (in mark_page_accessed, isolate_lru_page)
@@ -373,14 +373,14 @@
 	 * The page lock not only makes sure that page->mapping cannot
 	 * suddenly be NULLified by truncation, it makes sure that the
 	 * structure at mapping cannot be freed and reused yet,
-	 * so we can safely take mapping->i_mmap_lock.
+	 * so we can safely take mapping->i_mmap_sem.
 	 */
 	BUG_ON(!PageLocked(page));
 
-	spin_lock(&mapping->i_mmap_lock);
+	down_read(&mapping->i_mmap_sem);
 
 	/*
-	 * i_mmap_lock does not stabilize mapcount at all, but mapcount
+	 * i_mmap_sem does not stabilize mapcount at all, but mapcount
 	 * is more likely to be accurate if we note it after spinning.
 	 */
 	mapcount = page_mapcount(page);
 
@@ -403,7 +403,7 @@
 			break;
 	}
 
-	spin_unlock(&mapping->i_mmap_lock);
+	up_read(&mapping->i_mmap_sem);
 	return referenced;
 }
 
@@ -489,12 +489,12 @@
 
 	BUG_ON(PageAnon(page));
 
-	spin_lock(&mapping->i_mmap_lock);
+	down_read(&mapping->i_mmap_sem);
 	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
 		if (vma->vm_flags & VM_SHARED)
 			ret += page_mkclean_one(page, vma);
 	}
-	spin_unlock(&mapping->i_mmap_lock);
+	up_read(&mapping->i_mmap_sem);
 	return ret;
 }
 
@@ -930,7 +930,7 @@
 	unsigned long max_nl_size = 0;
 	unsigned int mapcount;
 
-	spin_lock(&mapping->i_mmap_lock);
+	down_read(&mapping->i_mmap_sem);
 	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
 		ret = try_to_unmap_one(page, vma, migration);
 		if (ret == SWAP_FAIL || !page_mapped(page))
@@ -967,7 +967,6 @@
 	mapcount = page_mapcount(page);
 	if (!mapcount)
 		goto out;
-	cond_resched_lock(&mapping->i_mmap_lock);
 
 	max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
 	if (max_nl_cursor == 0)
@@ -989,7 +988,6 @@
 			}
 			vma->vm_private_data = (void *) max_nl_cursor;
 		}
-		cond_resched_lock(&mapping->i_mmap_lock);
 		max_nl_cursor += CLUSTER_SIZE;
 	} while (max_nl_cursor <= max_nl_size);
 
@@ -1001,7 +999,7 @@
 	list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
 						shared.vm_set.list)
 		vma->vm_private_data = NULL;
 out:
-	spin_unlock(&mapping->i_mmap_lock);
+	up_write(&mapping->i_mmap_sem);
 	return ret;
 }
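For context on the mm_lock() hunks in mm/mmap.c above: the loop there repeatedly picks the lowest semaphore address that is still greater than the last one taken, so every distinct i_mmap_sem is acquired exactly once and in a single global order even when many vmas share the same mapping. Below is a rough user-space sketch of that selection scan (POSIX rwlocks, the array contents and all names are invented for the illustration; this is not kernel code):

/*
 * Illustration of the ordering scan used by mm_lock(): pick the lowest
 * lock address strictly greater than the last one taken, until the
 * (pthread_rwlock_t *) -1UL sentinel survives a full pass.
 */
#include <pthread.h>
#include <stdio.h>

#define NR_VMAS 5

static pthread_rwlock_t sem_a = PTHREAD_RWLOCK_INITIALIZER;
static pthread_rwlock_t sem_b = PTHREAD_RWLOCK_INITIALIZER;

int main(void)
{
	/* Several "vmas" that map only two distinct files. */
	pthread_rwlock_t *vma_sem[NR_VMAS] = { &sem_a, &sem_b, &sem_a, &sem_b, &sem_a };
	pthread_rwlock_t *last = NULL;
	int taken = 0;

	for (;;) {
		pthread_rwlock_t *pick = (pthread_rwlock_t *) -1UL;
		int i;

		/* Smallest address strictly greater than the last one taken. */
		for (i = 0; i < NR_VMAS; i++)
			if ((unsigned long) vma_sem[i] > (unsigned long) last &&
			    (unsigned long) vma_sem[i] < (unsigned long) pick)
				pick = vma_sem[i];
		if (pick == (pthread_rwlock_t *) -1UL)
			break;
		pthread_rwlock_wrlock(pick);	/* down_write() in the patch */
		last = pick;
		taken++;
	}
	printf("took %d distinct locks\n", taken);	/* prints 2 */

	/* Release in any order; only the acquisition order matters. */
	pthread_rwlock_unlock(&sem_a);
	pthread_rwlock_unlock(&sem_b);
	return 0;
}

Acquiring the semaphores in one fixed address order like this is the usual way to keep two concurrent mm_lock() callers from deadlocking against each other; the release order in mm_unlock-style teardown does not matter.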