The current percpu-rwsem read side is entirely free of serializing
instructions at the cost of having a synchronize_sched() in the write
path.

The latency of the synchronize_sched() is too high for some users
(cgroups), so provide a __percpu_init_rwsem(.bias) argument to forgo
this synchronize_sched() at the cost of forcing all readers into the
slow path, which has serializing instructions.

Cc: Tejun Heo <[email protected]>
Cc: Oleg Nesterov <[email protected]>
Cc: Paul McKenney <[email protected]>
Reported-by: John Stultz <[email protected]>
Reported-by: Dmitry Shmidt <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
 fs/super.c                    |    3 ++-
 include/linux/percpu-rwsem.h  |   15 +++++++++++++--
 kernel/cgroup.c               |    2 +-
 kernel/locking/percpu-rwsem.c |   10 +++++++++-
 4 files changed, 25 insertions(+), 5 deletions(-)

--- a/fs/super.c
+++ b/fs/super.c
@@ -195,7 +195,8 @@ static struct super_block *alloc_super(s
        for (i = 0; i < SB_FREEZE_LEVELS; i++) {
                if (__percpu_init_rwsem(&s->s_writers.rw_sem[i],
                                        sb_writers_name[i],
-                                       &type->s_writers_key[i]))
+                                       &type->s_writers_key[i],
+                                       PERCPU_RWSEM_READER))
                        goto fail;
        }
        init_waitqueue_head(&s->s_writers.wait_unfrozen);
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -90,15 +90,26 @@ static inline void percpu_up_read(struct
 extern void percpu_down_write(struct percpu_rw_semaphore *);
 extern void percpu_up_write(struct percpu_rw_semaphore *);
 
+enum percpu_rwsem_bias { PERCPU_RWSEM_READER, PERCPU_RWSEM_WRITER };
+
 extern int __percpu_init_rwsem(struct percpu_rw_semaphore *,
-                               const char *, struct lock_class_key *);
+                               const char *, struct lock_class_key *,
+                               enum percpu_rwsem_bias bias);
 
 extern void percpu_free_rwsem(struct percpu_rw_semaphore *);
 
 #define percpu_init_rwsem(sem)                                 \
 ({                                                             \
        static struct lock_class_key rwsem_key;                 \
-       __percpu_init_rwsem(sem, #sem, &rwsem_key);             \
+       __percpu_init_rwsem(sem, #sem, &rwsem_key,              \
+                           PERCPU_RWSEM_READER);               \
+})
+
+#define percpu_init_rwsem_writer(sem)                          \
+({                                                             \
+       static struct lock_class_key rwsem_key;                 \
+       __percpu_init_rwsem(sem, #sem, &rwsem_key,              \
+                           PERCPU_RWSEM_WRITER);               \
 })
 
 #define percpu_rwsem_is_held(sem) lockdep_is_held(&(sem)->rw_sem)
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -5605,7 +5605,7 @@ int __init cgroup_init(void)
        int ssid;
 
        BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
-       BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
+       BUG_ON(percpu_init_rwsem_writer(&cgroup_threadgroup_rwsem));
        BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
        BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
 
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -11,7 +11,8 @@
 enum { readers_slow, readers_block };
 
 int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
-                       const char *name, struct lock_class_key *rwsem_key)
+                       const char *name, struct lock_class_key *rwsem_key,
+                       enum percpu_rwsem_bias bias)
 {
        sem->read_count = alloc_percpu(int);
        if (unlikely(!sem->read_count))
@@ -19,6 +20,13 @@ int __percpu_init_rwsem(struct percpu_rw
 
        /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
        rcu_sync_init(&sem->rss, RCU_SCHED_SYNC);
+       if (bias == PERCPU_RWSEM_WRITER) {
+               /*
+                * Disable rcu_sync() and force slow path.
+                */
+               sem->rss.gp_count++;
+               sem->rss.gp_state = !0;
+       }
        __init_rwsem(&sem->rw_sem, name, rwsem_key);
        init_waitqueue_head(&sem->writer);
        sem->state = readers_slow;


Reply via email to