Currently, a mutex is used to protect the pernet operations list. It makes
cleanup_net() execute the ->exit methods of the same set of operations
that was used at the time of ->init, even after the net namespace is
unlinked from net_namespace_list.

But the problem is that synchronize_rcu() is needed after the net is
removed from net_namespace_list:

Destroy net_ns:
cleanup_net()
  mutex_lock(&net_mutex)
  list_del_rcu(&net->list)
  synchronize_rcu()                                  <--- Sleep there for ages
  list_for_each_entry_reverse(ops, &pernet_list, list)
    ops_exit_list(ops, &net_exit_list)
  list_for_each_entry_reverse(ops, &pernet_list, list)
    ops_free_list(ops, &net_exit_list)
  mutex_unlock(&net_mutex)

This primitive is not fast, especially on systems with many processors
and/or when preemptible RCU is enabled in the config. So, for the whole time
cleanup_net() is waiting for the RCU grace period, creation of new net
namespaces is not possible; the tasks that attempt it sleep on the same mutex:

Create net_ns:
copy_net_ns()
  mutex_lock_killable(&net_mutex)                    <--- Sleep there for ages

I observed 20-30 second hangs of "unshare -n" on an ordinary 8-CPU laptop
with preemptible RCU enabled.

The solution is to convert net_mutex to an rw_semaphore and to add small
locks to the very small number of pernet_operations that actually need them.
Then the pernet_operations::init/::exit methods, which modify net-related
data, will require only down_read() locking, while down_write() will be used
for changing pernet_list.

This gives a significant performance increase, as you can see here:
https://www.spinics.net/lists/netdev/msg467095.html

That is a 4.6x performance increase on a single-threaded test.
The increase on multi-threaded tests may be close to 4.6 multiplied
by the number of threads.

This patch starts replacing net_mutex with net_sem. It adds the rw_semaphore,
describes the variables it protects, and uses it where appropriate.
net_mutex is still present; subsequent patches will remove it step by step.

Signed-off-by: Kirill Tkhai <ktk...@virtuozzo.com>
---
 include/linux/rtnetlink.h |    1 +
 net/core/net_namespace.c  |   37 +++++++++++++++++++++++++------------
 net/core/rtnetlink.c      |    4 ++--
 3 files changed, 28 insertions(+), 14 deletions(-)

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 2032ce2eb20b..f640fc87fe1d 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -35,6 +35,7 @@ extern int rtnl_is_locked(void);
 
 extern wait_queue_head_t netdev_unregistering_wq;
 extern struct mutex net_mutex;
+extern struct rw_semaphore net_sem;
 
 #ifdef CONFIG_PROVE_LOCKING
 extern bool lockdep_rtnl_is_held(void);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 2e512965bf42..2254b1639209 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -41,6 +41,11 @@ struct net init_net = {
 EXPORT_SYMBOL(init_net);
 
 static bool init_net_initialized;
+/*
+ * net_sem: protects: pernet_list, net_generic_ids,
+ * init_net_initialized and first_* pointers.
+ */
+DECLARE_RWSEM(net_sem);
 
 #define MIN_PERNET_OPS_ID      \
        ((sizeof(struct net_generic) + sizeof(void *) - 1) / sizeof(void *))
@@ -411,12 +416,16 @@ struct net *copy_net_ns(unsigned long flags,
        net->ucounts = ucounts;
        get_user_ns(user_ns);
 
-       rv = mutex_lock_killable(&net_mutex);
+       rv = down_read_killable(&net_sem);
        if (rv < 0)
                goto put_userns;
-
+       rv = mutex_lock_killable(&net_mutex);
+       if (rv < 0)
+               goto up_read;
        rv = setup_net(net, user_ns);
        mutex_unlock(&net_mutex);
+up_read:
+       up_read(&net_sem);
        if (rv < 0) {
 put_userns:
                put_user_ns(user_ns);
@@ -443,6 +452,7 @@ static void cleanup_net(struct work_struct *work)
        list_replace_init(&cleanup_list, &net_kill_list);
        spin_unlock_irq(&cleanup_list_lock);
 
+       down_read(&net_sem);
        mutex_lock(&net_mutex);
 
        /* Don't let anyone else find us. */
@@ -484,6 +494,7 @@ static void cleanup_net(struct work_struct *work)
                ops_free_list(ops, &net_exit_list);
 
        mutex_unlock(&net_mutex);
+       up_read(&net_sem);
 
        /* Ensure there are no outstanding rcu callbacks using this
         * network namespace.
@@ -510,8 +521,10 @@ static void cleanup_net(struct work_struct *work)
  */
 void net_ns_barrier(void)
 {
+       down_write(&net_sem);
        mutex_lock(&net_mutex);
        mutex_unlock(&net_mutex);
+       up_write(&net_sem);
 }
 EXPORT_SYMBOL(net_ns_barrier);
 
@@ -838,12 +851,12 @@ static int __init net_ns_init(void)
 
        rcu_assign_pointer(init_net.gen, ng);
 
-       mutex_lock(&net_mutex);
+       down_write(&net_sem);
        if (setup_net(&init_net, &init_user_ns))
                panic("Could not setup the initial network namespace");
 
        init_net_initialized = true;
-       mutex_unlock(&net_mutex);
+       up_write(&net_sem);
 
        register_pernet_subsys(&net_ns_ops);
 
@@ -983,9 +996,9 @@ static void unregister_pernet_operations(struct pernet_operations *ops)
 int register_pernet_subsys(struct pernet_operations *ops)
 {
        int error;
-       mutex_lock(&net_mutex);
+       down_write(&net_sem);
        error =  register_pernet_operations(first_device, ops);
-       mutex_unlock(&net_mutex);
+       up_write(&net_sem);
        return error;
 }
 EXPORT_SYMBOL_GPL(register_pernet_subsys);
@@ -1001,9 +1014,9 @@ EXPORT_SYMBOL_GPL(register_pernet_subsys);
  */
 void unregister_pernet_subsys(struct pernet_operations *ops)
 {
-       mutex_lock(&net_mutex);
+       down_write(&net_sem);
        unregister_pernet_operations(ops);
-       mutex_unlock(&net_mutex);
+       up_write(&net_sem);
 }
 EXPORT_SYMBOL_GPL(unregister_pernet_subsys);
 
@@ -1029,11 +1042,11 @@ EXPORT_SYMBOL_GPL(unregister_pernet_subsys);
 int register_pernet_device(struct pernet_operations *ops)
 {
        int error;
-       mutex_lock(&net_mutex);
+       down_write(&net_sem);
        error = register_pernet_operations(&pernet_list, ops);
        if (!error && (first_device == &pernet_list))
                first_device = &ops->list;
-       mutex_unlock(&net_mutex);
+       up_write(&net_sem);
        return error;
 }
 EXPORT_SYMBOL_GPL(register_pernet_device);
@@ -1049,11 +1062,11 @@ EXPORT_SYMBOL_GPL(register_pernet_device);
  */
 void unregister_pernet_device(struct pernet_operations *ops)
 {
-       mutex_lock(&net_mutex);
+       down_write(&net_sem);
        if (&ops->list == first_device)
                first_device = first_device->next;
        unregister_pernet_operations(ops);
-       mutex_unlock(&net_mutex);
+       up_write(&net_sem);
 }
 EXPORT_SYMBOL_GPL(unregister_pernet_device);
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index dabba2a91fc8..cb06d43c4230 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -390,11 +390,11 @@ static void rtnl_lock_unregistering_all(void)
 void rtnl_link_unregister(struct rtnl_link_ops *ops)
 {
        /* Close the race with cleanup_net() */
-       mutex_lock(&net_mutex);
+       down_write(&net_sem);
        rtnl_lock_unregistering_all();
        __rtnl_link_unregister(ops);
        rtnl_unlock();
-       mutex_unlock(&net_mutex);
+       up_write(&net_sem);
 }
 EXPORT_SYMBOL_GPL(rtnl_link_unregister);
 

Reply via email to