rw_semaphore is a sizable structure of 40 bytes and consumes
considerable space for each vm_area_struct. However vma_lock has
two important specifics which can be used to replace rw_semaphore
with a simpler structure:
1. Readers never wait. They try to take the vma_lock and fall back to
mmap_lock if that fails.
2. Only one writer at a time will ever try to write-lock a vma_lock
because writers first take mmap_lock in write mode.
Because of these requirements, full rw_semaphore functionality is not
needed and we can replace rw_semaphore with an atomic variable.
When a reader takes read lock, it increments the atomic unless the
value is negative. If that fails read-locking is aborted and mmap_lock
is used instead.
When writer takes write lock, it resets atomic value to -1 if the
current value is 0 (no readers). Since all writers take mmap_lock in
write mode first, there can be only one writer at a time. If there
are readers, writer will place itself into a wait queue using new
mm_struct.vma_writer_wait waitqueue head. The last reader to release
the vma_lock will signal the writer to wake up.
vm_lock_seq is also moved into vma_lock and along with atomic_t they
are nicely packed and consume 8 bytes, bringing the overhead from
vma_lock from 44 to 16 bytes:

    slabinfo before the changes:
     <name>            ... <objsize> <objperslab> <pagesperslab> : ...
    vm_area_struct    ...    152   53    2 : ...

    slabinfo with vma_lock:
     <name>            ... <objsize> <objperslab> <pagesperslab> : ...
    rw_semaphore      ...      8  512    1 : ...
    vm_area_struct    ...    160   51    2 : ...

Assuming 40000 vm_area_structs, memory consumption would be:
baseline: 6040kB
vma_lock (vm_area_structs+vma_lock): 6280kB+316kB=6596kB
Total increase: 556kB

atomic_t might overflow if there are many competing readers, therefore
vma_read_trylock() implements an overflow check and if that occurs it
restors the previous value and exits with a failure to lock.

Signed-off-by: Suren Baghdasaryan <sur...@google.com>
---
 include/linux/mm.h       | 37 +++++++++++++++++++++++++------------
 include/linux/mm_types.h | 10 ++++++++--
 kernel/fork.c            |  6 +++---
 mm/init-mm.c             |  2 ++
 4 files changed, 38 insertions(+), 17 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index d40bf8a5e19e..294dd44b2198 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -627,12 +627,16 @@ static inline void vma_write_lock(struct vm_area_struct 
*vma)
         * mm->mm_lock_seq can't be concurrently modified.
         */
        mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq);
-       if (vma->vm_lock_seq == mm_lock_seq)
+       if (vma->vm_lock->lock_seq == mm_lock_seq)
                return;
 
-       down_write(&vma->vm_lock->lock);
-       vma->vm_lock_seq = mm_lock_seq;
-       up_write(&vma->vm_lock->lock);
+       if (atomic_cmpxchg(&vma->vm_lock->count, 0, -1))
+               wait_event(vma->vm_mm->vma_writer_wait,
+                          atomic_cmpxchg(&vma->vm_lock->count, 0, -1) == 0);
+       vma->vm_lock->lock_seq = mm_lock_seq;
+       /* Write barrier to ensure lock_seq change is visible before count */
+       smp_wmb();
+       atomic_set(&vma->vm_lock->count, 0);
 }
 
 /*
@@ -643,20 +647,28 @@ static inline void vma_write_lock(struct vm_area_struct 
*vma)
 static inline bool vma_read_trylock(struct vm_area_struct *vma)
 {
        /* Check before locking. A race might cause false locked result. */
-       if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))
+       if (vma->vm_lock->lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))
                return false;
 
-       if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
+       if (unlikely(!atomic_inc_unless_negative(&vma->vm_lock->count)))
                return false;
 
+       /* If atomic_t overflows, restore and fail to lock. */
+       if (unlikely(atomic_read(&vma->vm_lock->count) < 0)) {
+               if (atomic_dec_and_test(&vma->vm_lock->count))
+                       wake_up(&vma->vm_mm->vma_writer_wait);
+               return false;
+       }
+
        /*
         * Overflow might produce false locked result.
         * False unlocked result is impossible because we modify and check
         * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq
         * modification invalidates all existing locks.
         */
-       if (unlikely(vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) {
-               up_read(&vma->vm_lock->lock);
+       if (unlikely(vma->vm_lock->lock_seq == 
READ_ONCE(vma->vm_mm->mm_lock_seq))) {
+               if (atomic_dec_and_test(&vma->vm_lock->count))
+                       wake_up(&vma->vm_mm->vma_writer_wait);
                return false;
        }
        return true;
@@ -664,7 +676,8 @@ static inline bool vma_read_trylock(struct vm_area_struct 
*vma)
 
 static inline void vma_read_unlock(struct vm_area_struct *vma)
 {
-       up_read(&vma->vm_lock->lock);
+       if (atomic_dec_and_test(&vma->vm_lock->count))
+               wake_up(&vma->vm_mm->vma_writer_wait);
 }
 
 static inline void vma_assert_write_locked(struct vm_area_struct *vma)
@@ -674,13 +687,13 @@ static inline void vma_assert_write_locked(struct 
vm_area_struct *vma)
         * current task is holding mmap_write_lock, both vma->vm_lock_seq and
         * mm->mm_lock_seq can't be concurrently modified.
         */
-       VM_BUG_ON_VMA(vma->vm_lock_seq != READ_ONCE(vma->vm_mm->mm_lock_seq), 
vma);
+       VM_BUG_ON_VMA(vma->vm_lock->lock_seq != 
READ_ONCE(vma->vm_mm->mm_lock_seq), vma);
 }
 
 static inline void vma_assert_no_reader(struct vm_area_struct *vma)
 {
-       VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock) &&
-                     vma->vm_lock_seq != READ_ONCE(vma->vm_mm->mm_lock_seq),
+       VM_BUG_ON_VMA(atomic_read(&vma->vm_lock->count) > 0 &&
+                     vma->vm_lock->lock_seq != 
READ_ONCE(vma->vm_mm->mm_lock_seq),
                      vma);
 }
 
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index faa61b400f9b..a6050c38ca2e 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -527,7 +527,13 @@ struct anon_vma_name {
 };
 
 struct vma_lock {
-       struct rw_semaphore lock;
+       /*
+        * count > 0 ==> read-locked with 'count' number of readers
+        * count < 0 ==> write-locked
+        * count = 0 ==> unlocked
+        */
+       atomic_t count;
+       int lock_seq;
 };
 
 /*
@@ -566,7 +572,6 @@ struct vm_area_struct {
        unsigned long vm_flags;
 
 #ifdef CONFIG_PER_VMA_LOCK
-       int vm_lock_seq;
        struct vma_lock *vm_lock;
 #endif
 
@@ -706,6 +711,7 @@ struct mm_struct {
                                          * by mmlist_lock
                                          */
 #ifdef CONFIG_PER_VMA_LOCK
+               struct wait_queue_head vma_writer_wait;
                int mm_lock_seq;
                struct {
                        struct list_head head;
diff --git a/kernel/fork.c b/kernel/fork.c
index 95db6a521cf1..b221ad182d98 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -461,9 +461,8 @@ static bool vma_init_lock(struct vm_area_struct *vma)
        vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL);
        if (!vma->vm_lock)
                return false;
-
-       init_rwsem(&vma->vm_lock->lock);
-       vma->vm_lock_seq = -1;
+       atomic_set(&vma->vm_lock->count, 0);
+       vma->vm_lock->lock_seq = -1;
 
        return true;
 }
@@ -1229,6 +1228,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, 
struct task_struct *p,
        mmap_init_lock(mm);
        INIT_LIST_HEAD(&mm->mmlist);
 #ifdef CONFIG_PER_VMA_LOCK
+       init_waitqueue_head(&mm->vma_writer_wait);
        WRITE_ONCE(mm->mm_lock_seq, 0);
        INIT_LIST_HEAD(&mm->vma_free_list.head);
        spin_lock_init(&mm->vma_free_list.lock);
diff --git a/mm/init-mm.c b/mm/init-mm.c
index b53d23c2d7a3..0088e31e5f7e 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -38,6 +38,8 @@ struct mm_struct init_mm = {
        .arg_lock       =  __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
        .mmlist         = LIST_HEAD_INIT(init_mm.mmlist),
 #ifdef CONFIG_PER_VMA_LOCK
+       .vma_writer_wait =
+               __WAIT_QUEUE_HEAD_INITIALIZER(init_mm.vma_writer_wait),
        .mm_lock_seq    = 0,
        .vma_free_list.head = LIST_HEAD_INIT(init_mm.vma_free_list.head),
        .vma_free_list.lock =  __SPIN_LOCK_UNLOCKED(init_mm.vma_free_list.lock),
-- 
2.39.0

Reply via email to