[PATCH 06/16] mm: RCU free VMAs

2017-08-08 Thread Laurent Dufour
From: Peter Zijlstra 

Manage the VMAs with SRCU such that we can do a lockless VMA lookup.

We put the fput(vma->vm_file) in the SRCU callback, this keeps files
valid during speculative faults, this is possible due to the delayed
fput work by Al Viro -- do we need srcu_barrier() in unmount
someplace?

We guard the mm_rb tree with a seqlock (this could be a seqcount but
we'd have to disable preemption around the write side in order to make
the retry loop in __read_seqcount_begin() work) such that we can know
if the rb tree walk was correct. We cannot trust the restult of a
lockless tree walk in the face of concurrent tree rotations; although
we can trust on the termination of such walks -- tree rotations
guarantee the end result is a tree again after all.

Furthermore, we rely on the WMB implied by the
write_seqlock/count_begin() to separate the VMA initialization and the
publishing stores, analogous to the RELEASE in rcu_assign_pointer().
We also rely on the RMB from read_seqretry() to separate the vma load
from further loads like the smp_read_barrier_depends() in regular
RCU.

We must not touch the vmacache while doing SRCU lookups as that is not
properly serialized against changes. We update gap information after
publishing the VMA, but A) we don't use that and B) the seqlock
read side would fix that anyhow.

We clear vma->vm_rb for nodes removed from the vma tree such that we
can easily detect such 'dead' nodes, we rely on the WMB from
write_sequnlock() to separate the tree removal and clearing the node.

Provide find_vma_srcu() which wraps the required magic.

Signed-off-by: Peter Zijlstra (Intel) 

[Remove the warnings in description about the SRCU global lock which
 has been removed now]
[Rename vma_is_dead() to vma_has_changed()]
[Pass vm_fault structure pointer instead of 2 arguments  to
 vmf_has_changed() ]
Signed-off-by: Laurent Dufour 
---
 include/linux/mm_types.h |   2 +
 kernel/fork.c|   1 +
 mm/init-mm.c |   1 +
 mm/internal.h|  19 +
 mm/mmap.c| 100 +++
 5 files changed, 97 insertions(+), 26 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index d7d6dae4c009..30c127b8a4d8 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -343,6 +343,7 @@ struct vm_area_struct {
 #endif
struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
seqcount_t vm_sequence;
+   struct rcu_head vm_rcu_head;
 } __randomize_layout;
 
 struct core_thread {
@@ -360,6 +361,7 @@ struct kioctx_table;
 struct mm_struct {
struct vm_area_struct *mmap;/* list of VMAs */
struct rb_root mm_rb;
+   seqlock_t mm_seq;
u32 vmacache_seqnum;   /* per-thread vmacache */
 #ifdef CONFIG_MMU
unsigned long (*get_unmapped_area) (struct file *filp,
diff --git a/kernel/fork.c b/kernel/fork.c
index 17921b0390b4..8d1223270fea 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -791,6 +791,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, 
struct task_struct *p,
mm->mmap = NULL;
mm->mm_rb = RB_ROOT;
mm->vmacache_seqnum = 0;
+   seqlock_init(>mm_seq);
atomic_set(>mm_users, 1);
atomic_set(>mm_count, 1);
init_rwsem(>mmap_sem);
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 975e49f00f34..2b1fa061684f 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -16,6 +16,7 @@
 
 struct mm_struct init_mm = {
.mm_rb  = RB_ROOT,
+   .mm_seq = __SEQLOCK_UNLOCKED(init_mm.mm_seq),
.pgd= swapper_pg_dir,
.mm_users   = ATOMIC_INIT(2),
.mm_count   = ATOMIC_INIT(1),
diff --git a/mm/internal.h b/mm/internal.h
index 4ef49fc55e58..9d6347e35747 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -40,6 +40,25 @@ void page_writeback_init(void);
 
 int do_swap_page(struct vm_fault *vmf);
 
+extern struct srcu_struct vma_srcu;
+
+extern struct vm_area_struct *find_vma_srcu(struct mm_struct *mm,
+   unsigned long addr);
+
+static inline bool vma_has_changed(struct vm_fault *vmf)
+{
+   int ret = RB_EMPTY_NODE(>vma->vm_rb);
+   unsigned seq = ACCESS_ONCE(vmf->vma->vm_sequence.sequence);
+
+   /*
+* Matches both the wmb in write_seqlock_{begin,end}() and
+* the wmb in vma_rb_erase().
+*/
+   smp_rmb();
+
+   return ret || seq != vmf->sequence;
+}
+
 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
 
diff --git a/mm/mmap.c b/mm/mmap.c
index 221b1f3e966a..73f5ffc03155 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -159,6 +159,23 @@ void unlink_file_vma(struct vm_area_struct *vma)
}
 }
 
+DEFINE_SRCU(vma_srcu);
+
+static void __free_vma(struct rcu_head *head)
+{
+   struct vm_area_struct 

[PATCH 06/16] mm: RCU free VMAs

2017-08-08 Thread Laurent Dufour
From: Peter Zijlstra 

Manage the VMAs with SRCU such that we can do a lockless VMA lookup.

We put the fput(vma->vm_file) in the SRCU callback, this keeps files
valid during speculative faults, this is possible due to the delayed
fput work by Al Viro -- do we need srcu_barrier() in unmount
someplace?

We guard the mm_rb tree with a seqlock (this could be a seqcount but
we'd have to disable preemption around the write side in order to make
the retry loop in __read_seqcount_begin() work) such that we can know
if the rb tree walk was correct. We cannot trust the restult of a
lockless tree walk in the face of concurrent tree rotations; although
we can trust on the termination of such walks -- tree rotations
guarantee the end result is a tree again after all.

Furthermore, we rely on the WMB implied by the
write_seqlock/count_begin() to separate the VMA initialization and the
publishing stores, analogous to the RELEASE in rcu_assign_pointer().
We also rely on the RMB from read_seqretry() to separate the vma load
from further loads like the smp_read_barrier_depends() in regular
RCU.

We must not touch the vmacache while doing SRCU lookups as that is not
properly serialized against changes. We update gap information after
publishing the VMA, but A) we don't use that and B) the seqlock
read side would fix that anyhow.

We clear vma->vm_rb for nodes removed from the vma tree such that we
can easily detect such 'dead' nodes, we rely on the WMB from
write_sequnlock() to separate the tree removal and clearing the node.

Provide find_vma_srcu() which wraps the required magic.

Signed-off-by: Peter Zijlstra (Intel) 

[Remove the warnings in description about the SRCU global lock which
 has been removed now]
[Rename vma_is_dead() to vma_has_changed()]
[Pass vm_fault structure pointer instead of 2 arguments  to
 vmf_has_changed() ]
Signed-off-by: Laurent Dufour 
---
 include/linux/mm_types.h |   2 +
 kernel/fork.c|   1 +
 mm/init-mm.c |   1 +
 mm/internal.h|  19 +
 mm/mmap.c| 100 +++
 5 files changed, 97 insertions(+), 26 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index d7d6dae4c009..30c127b8a4d8 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -343,6 +343,7 @@ struct vm_area_struct {
 #endif
struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
seqcount_t vm_sequence;
+   struct rcu_head vm_rcu_head;
 } __randomize_layout;
 
 struct core_thread {
@@ -360,6 +361,7 @@ struct kioctx_table;
 struct mm_struct {
struct vm_area_struct *mmap;/* list of VMAs */
struct rb_root mm_rb;
+   seqlock_t mm_seq;
u32 vmacache_seqnum;   /* per-thread vmacache */
 #ifdef CONFIG_MMU
unsigned long (*get_unmapped_area) (struct file *filp,
diff --git a/kernel/fork.c b/kernel/fork.c
index 17921b0390b4..8d1223270fea 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -791,6 +791,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, 
struct task_struct *p,
mm->mmap = NULL;
mm->mm_rb = RB_ROOT;
mm->vmacache_seqnum = 0;
+   seqlock_init(>mm_seq);
atomic_set(>mm_users, 1);
atomic_set(>mm_count, 1);
init_rwsem(>mmap_sem);
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 975e49f00f34..2b1fa061684f 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -16,6 +16,7 @@
 
 struct mm_struct init_mm = {
.mm_rb  = RB_ROOT,
+   .mm_seq = __SEQLOCK_UNLOCKED(init_mm.mm_seq),
.pgd= swapper_pg_dir,
.mm_users   = ATOMIC_INIT(2),
.mm_count   = ATOMIC_INIT(1),
diff --git a/mm/internal.h b/mm/internal.h
index 4ef49fc55e58..9d6347e35747 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -40,6 +40,25 @@ void page_writeback_init(void);
 
 int do_swap_page(struct vm_fault *vmf);
 
+extern struct srcu_struct vma_srcu;
+
+extern struct vm_area_struct *find_vma_srcu(struct mm_struct *mm,
+   unsigned long addr);
+
+static inline bool vma_has_changed(struct vm_fault *vmf)
+{
+   int ret = RB_EMPTY_NODE(>vma->vm_rb);
+   unsigned seq = ACCESS_ONCE(vmf->vma->vm_sequence.sequence);
+
+   /*
+* Matches both the wmb in write_seqlock_{begin,end}() and
+* the wmb in vma_rb_erase().
+*/
+   smp_rmb();
+
+   return ret || seq != vmf->sequence;
+}
+
 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
 
diff --git a/mm/mmap.c b/mm/mmap.c
index 221b1f3e966a..73f5ffc03155 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -159,6 +159,23 @@ void unlink_file_vma(struct vm_area_struct *vma)
}
 }
 
+DEFINE_SRCU(vma_srcu);
+
+static void __free_vma(struct rcu_head *head)
+{
+   struct vm_area_struct *vma =
+   container_of(head, struct vm_area_struct,