The following sysctls are global and can't be read or set from within a netns:

net.core.rmem_default
net.core.rmem_max
net.core.wmem_default
net.core.wmem_max

Make the sysctl parameters listed above available from within a network
namespace, allowing unique values to be set per network namespace.

My concern is about the initial values of these sysctls in a newly
created netns: I'm not sure whether it is better to copy them from the
init namespace or to set them to the default values.

Setting them to the default values has the advantage that a new namespace
behaves like a freshly booted system, while copying them from the init
netns has the advantage of keeping the current behaviour, since the values
from the init netns are the ones in use today.

Signed-off-by: Matteo Croce <mcr...@redhat.com>
---
 include/net/netns/core.h        |  5 +++
 include/net/sock.h              |  6 ----
 include/net/tcp.h               |  3 +-
 net/core/net_namespace.c        | 22 +++++++++++++
 net/core/sock.c                 | 31 +++++-------------
 net/core/sysctl_net_core.c      | 70 ++++++++++++++++++++++-------------------
 net/ipv4/ip_output.c            |  2 +-
 net/ipv4/syncookies.c           |  3 +-
 net/ipv4/tcp_minisocks.c        |  3 +-
 net/ipv4/tcp_output.c           | 12 ++++---
 net/ipv6/syncookies.c           |  3 +-
 net/netfilter/ipvs/ip_vs_sync.c |  4 +--
 12 files changed, 89 insertions(+), 75 deletions(-)

diff --git a/include/net/netns/core.h b/include/net/netns/core.h
index 78eb1ff75475..9b613162467d 100644
--- a/include/net/netns/core.h
+++ b/include/net/netns/core.h
@@ -9,6 +9,11 @@ struct netns_core {
        struct ctl_table_header *sysctl_hdr;
 
        int     sysctl_somaxconn;
+       u32     sysctl_wmem_max;
+       u32     sysctl_rmem_max;
+
+       u32     sysctl_wmem_default;
+       u32     sysctl_rmem_default;
 
        struct prot_inuse __percpu *inuse;
 };
diff --git a/include/net/sock.h b/include/net/sock.h
index 7c0632c7e870..e62a279e420f 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2363,13 +2363,7 @@ bool sk_net_capable(const struct sock *sk, int cap);
 
 void sk_get_meminfo(const struct sock *sk, u32 *meminfo);
 
-extern __u32 sysctl_wmem_max;
-extern __u32 sysctl_rmem_max;
-
 extern int sysctl_tstamp_allow_data;
 extern int sysctl_optmem_max;
 
-extern __u32 sysctl_wmem_default;
-extern __u32 sysctl_rmem_default;
-
 #endif /* _SOCK_H */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 70483296157f..460f4373d42a 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1300,7 +1300,8 @@ static inline void tcp_slow_start_after_idle_check(struct 
sock *sk)
 /* Determine a window scaling and initial window to offer. */
 void tcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd,
                               __u32 *window_clamp, int wscale_ok,
-                              __u8 *rcv_wscale, __u32 init_rcv_wnd);
+                              __u8 *rcv_wscale, __u32 init_rcv_wnd,
+                              __u32 rmem_max);
 
 static inline int tcp_win_from_space(int space)
 {
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 8726d051f31d..2d72b2bd6eab 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -23,6 +23,16 @@
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
 
+/* Take into consideration the size of the struct sk_buff overhead in the
+ * determination of these values, since that is non-constant across
+ * platforms.  This makes socket queueing behavior and performance
+ * not depend upon such differences.
+ */
+#define _SK_MEM_PACKETS                256
+#define _SK_MEM_OVERHEAD       SKB_TRUESIZE(256)
+#define SK_WMEM_MAX            (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
+#define SK_RMEM_MAX            (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
+
 /*
  *     Our network namespace constructor/destructor lists
  */
@@ -318,6 +328,18 @@ static __net_init int setup_net(struct net *net, struct 
user_namespace *user_ns)
 static int __net_init net_defaults_init_net(struct net *net)
 {
        net->core.sysctl_somaxconn = SOMAXCONN;
+       if (net_eq(net, &init_net)) {
+               init_net.core.sysctl_wmem_max = SK_WMEM_MAX;
+               init_net.core.sysctl_rmem_max = SK_RMEM_MAX;
+               init_net.core.sysctl_wmem_default = SK_WMEM_MAX;
+               init_net.core.sysctl_rmem_default = SK_RMEM_MAX;
+       } else {
+               net->core.sysctl_wmem_max = init_net.core.sysctl_wmem_max;
+               net->core.sysctl_rmem_max = init_net.core.sysctl_rmem_max;
+               net->core.sysctl_wmem_default = 
init_net.core.sysctl_wmem_default;
+               net->core.sysctl_rmem_default = 
init_net.core.sysctl_rmem_default;
+       }
+
        return 0;
 }
 
diff --git a/net/core/sock.c b/net/core/sock.c
index ac2a404c73eb..8086a660d75f 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -307,24 +307,6 @@ static struct lock_class_key af_wlock_keys[AF_MAX];
 static struct lock_class_key af_elock_keys[AF_MAX];
 static struct lock_class_key af_kern_callback_keys[AF_MAX];
 
-/* Take into consideration the size of the struct sk_buff overhead in the
- * determination of these values, since that is non-constant across
- * platforms.  This makes socket queueing behavior and performance
- * not depend upon such differences.
- */
-#define _SK_MEM_PACKETS                256
-#define _SK_MEM_OVERHEAD       SKB_TRUESIZE(256)
-#define SK_WMEM_MAX            (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
-#define SK_RMEM_MAX            (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
-
-/* Run time adjustable parameters. */
-__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
-EXPORT_SYMBOL(sysctl_wmem_max);
-__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
-EXPORT_SYMBOL(sysctl_rmem_max);
-__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
-__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
-
 /* Maximal space eaten by iovec or ancillary data plus some space */
 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 EXPORT_SYMBOL(sysctl_optmem_max);
@@ -702,6 +684,7 @@ int sock_setsockopt(struct socket *sock, int level, int 
optname,
                    char __user *optval, unsigned int optlen)
 {
        struct sock *sk = sock->sk;
+       struct net *net = sock_net(sk);
        int val;
        int valbool;
        struct linger ling;
@@ -755,7 +738,7 @@ int sock_setsockopt(struct socket *sock, int level, int 
optname,
                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
                 * are treated in BSD as hints
                 */
-               val = min_t(u32, val, sysctl_wmem_max);
+               val = min_t(u32, val, net->core.sysctl_wmem_max);
 set_sndbuf:
                sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
                sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
@@ -776,7 +759,7 @@ int sock_setsockopt(struct socket *sock, int level, int 
optname,
                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
                 * are treated in BSD as hints
                 */
-               val = min_t(u32, val, sysctl_rmem_max);
+               val = min_t(u32, val, net->core.sysctl_rmem_max);
 set_rcvbuf:
                sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
                /*
@@ -820,7 +803,7 @@ int sock_setsockopt(struct socket *sock, int level, int 
optname,
 
        case SO_PRIORITY:
                if ((val >= 0 && val <= 6) ||
-                   ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+                   ns_capable(net->user_ns, CAP_NET_ADMIN))
                        sk->sk_priority = val;
                else
                        ret = -EPERM;
@@ -994,7 +977,7 @@ int sock_setsockopt(struct socket *sock, int level, int 
optname,
                        clear_bit(SOCK_PASSSEC, &sock->flags);
                break;
        case SO_MARK:
-               if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+               if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                        ret = -EPERM;
                else
                        sk->sk_mark = val;
@@ -2626,8 +2609,8 @@ void sock_init_data(struct socket *sock, struct sock *sk)
        init_timer(&sk->sk_timer);
 
        sk->sk_allocation       =       GFP_KERNEL;
-       sk->sk_rcvbuf           =       sysctl_rmem_default;
-       sk->sk_sndbuf           =       sysctl_wmem_default;
+       sk->sk_rcvbuf           =       sock_net(sk)->core.sysctl_rmem_default;
+       sk->sk_sndbuf           =       sock_net(sk)->core.sysctl_wmem_default;
        sk->sk_state            =       TCP_CLOSE;
        sk_set_socket(sk, sock);
 
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index b7cd9aafe99e..01bb23ba4c86 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -252,38 +252,6 @@ static int proc_do_rss_key(struct ctl_table *table, int 
write,
 static struct ctl_table net_core_table[] = {
 #ifdef CONFIG_NET
        {
-               .procname       = "wmem_max",
-               .data           = &sysctl_wmem_max,
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_minmax,
-               .extra1         = &min_sndbuf,
-       },
-       {
-               .procname       = "rmem_max",
-               .data           = &sysctl_rmem_max,
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_minmax,
-               .extra1         = &min_rcvbuf,
-       },
-       {
-               .procname       = "wmem_default",
-               .data           = &sysctl_wmem_default,
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_minmax,
-               .extra1         = &min_sndbuf,
-       },
-       {
-               .procname       = "rmem_default",
-               .data           = &sysctl_rmem_default,
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_minmax,
-               .extra1         = &min_rcvbuf,
-       },
-       {
                .procname       = "dev_weight",
                .data           = &weight_p,
                .maxlen         = sizeof(int),
@@ -472,6 +440,38 @@ static struct ctl_table netns_core_table[] = {
                .extra1         = &zero,
                .proc_handler   = proc_dointvec_minmax
        },
+       {
+               .procname       = "wmem_max",
+               .data           = &init_net.core.sysctl_wmem_max,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = &min_sndbuf,
+       },
+       {
+               .procname       = "rmem_max",
+               .data           = &init_net.core.sysctl_rmem_max,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = &min_rcvbuf,
+       },
+       {
+               .procname       = "wmem_default",
+               .data           = &init_net.core.sysctl_wmem_default,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = &min_sndbuf,
+       },
+       {
+               .procname       = "rmem_default",
+               .data           = &init_net.core.sysctl_rmem_default,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = &min_rcvbuf,
+       },
        { }
 };
 
@@ -481,11 +481,15 @@ static __net_init int sysctl_core_net_init(struct net 
*net)
 
        tbl = netns_core_table;
        if (!net_eq(net, &init_net)) {
+               int i;
+
                tbl = kmemdup(tbl, sizeof(netns_core_table), GFP_KERNEL);
                if (tbl == NULL)
                        goto err_dup;
 
-               tbl[0].data = &net->core.sysctl_somaxconn;
+               /* Update the variables to point into the current struct net */
+               for (i = 0; i < ARRAY_SIZE(netns_core_table) - 1; i++)
+                       tbl[i].data += (void *)net - (void *)&init_net;
 
                /* Don't export any sysctls to unprivileged users */
                if (net->user_ns != &init_user_ns) {
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 50c74cd890bc..658927c673ee 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1639,7 +1639,7 @@ void ip_send_unicast_reply(struct sock *sk, struct 
sk_buff *skb,
        sk->sk_priority = skb->priority;
        sk->sk_protocol = ip_hdr(skb)->protocol;
        sk->sk_bound_dev_if = arg->bound_dev_if;
-       sk->sk_sndbuf = sysctl_wmem_default;
+       sk->sk_sndbuf = net->core.sysctl_wmem_default;
        sk->sk_mark = fl4.flowi4_mark;
        err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base,
                             len, 0, &ipc, &rt, MSG_DONTWAIT);
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 03ad8778c395..ee364e5976a4 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -388,7 +388,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct 
sk_buff *skb)
        tcp_select_initial_window(tcp_full_space(sk), req->mss,
                                  &req->rsk_rcv_wnd, &req->rsk_window_clamp,
                                  ireq->wscale_ok, &rcv_wscale,
-                                 dst_metric(&rt->dst, RTAX_INITRWND));
+                                 dst_metric(&rt->dst, RTAX_INITRWND),
+                                 sock_net(sk)->core.sysctl_rmem_max);
 
        ireq->rcv_wscale  = rcv_wscale;
        ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), &rt->dst);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 0ff83c1637d8..e5243ac2edd3 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -377,7 +377,8 @@ void tcp_openreq_init_rwin(struct request_sock *req,
                &req->rsk_window_clamp,
                ireq->wscale_ok,
                &rcv_wscale,
-               rcv_wnd);
+               rcv_wnd,
+               sock_net(sk_listener)->core.sysctl_rmem_max);
        ireq->rcv_wscale = rcv_wscale;
 }
 EXPORT_SYMBOL(tcp_openreq_init_rwin);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 4e985dea1dd2..9173d01e7d21 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -206,7 +206,7 @@ u32 tcp_default_init_rwnd(u32 mss)
 void tcp_select_initial_window(int __space, __u32 mss,
                               __u32 *rcv_wnd, __u32 *window_clamp,
                               int wscale_ok, __u8 *rcv_wscale,
-                              __u32 init_rcv_wnd)
+                              __u32 init_rcv_wnd, __u32 rmem_max)
 {
        unsigned int space = (__space < 0 ? 0 : __space);
 
@@ -236,7 +236,7 @@ void tcp_select_initial_window(int __space, __u32 mss,
        if (wscale_ok) {
                /* Set window scaling on max possible window */
                space = max_t(u32, space, sysctl_tcp_rmem[2]);
-               space = max_t(u32, space, sysctl_rmem_max);
+               space = max_t(u32, space, rmem_max);
                space = min_t(u32, space, *window_clamp);
                while (space > U16_MAX && (*rcv_wscale) < TCP_MAX_WSCALE) {
                        space >>= 1;
@@ -3268,6 +3268,7 @@ static void tcp_connect_init(struct sock *sk)
 {
        const struct dst_entry *dst = __sk_dst_get(sk);
        struct tcp_sock *tp = tcp_sk(sk);
+       struct net *net = sock_net(sk);
        __u8 rcv_wscale;
        u32 rcv_wnd;
 
@@ -3275,7 +3276,7 @@ static void tcp_connect_init(struct sock *sk)
         * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
         */
        tp->tcp_header_len = sizeof(struct tcphdr);
-       if (sock_net(sk)->ipv4.sysctl_tcp_timestamps)
+       if (net->ipv4.sysctl_tcp_timestamps)
                tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED;
 
 #ifdef CONFIG_TCP_MD5SIG
@@ -3311,9 +3312,10 @@ static void tcp_connect_init(struct sock *sk)
                                  tp->advmss - (tp->rx_opt.ts_recent_stamp ? 
tp->tcp_header_len - sizeof(struct tcphdr) : 0),
                                  &tp->rcv_wnd,
                                  &tp->window_clamp,
-                                 sock_net(sk)->ipv4.sysctl_tcp_window_scaling,
+                                 net->ipv4.sysctl_tcp_window_scaling,
                                  &rcv_wscale,
-                                 rcv_wnd);
+                                 rcv_wnd,
+                                 net->core.sysctl_rmem_max);
 
        tp->rx_opt.rcv_wscale = rcv_wscale;
        tp->rcv_ssthresh = tp->rcv_wnd;
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 4e7817abc0b9..bf38ee15766c 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -247,7 +247,8 @@ struct sock *cookie_v6_check(struct sock *sk, struct 
sk_buff *skb)
        tcp_select_initial_window(tcp_full_space(sk), req->mss,
                                  &req->rsk_rcv_wnd, &req->rsk_window_clamp,
                                  ireq->wscale_ok, &rcv_wscale,
-                                 dst_metric(dst, RTAX_INITRWND));
+                                 dst_metric(dst, RTAX_INITRWND),
+                                 sock_net(sk)->core.sysctl_rmem_max);
 
        ireq->rcv_wscale = rcv_wscale;
        ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), dst);
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 0e5b64a75da0..4ad447333379 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -1283,12 +1283,12 @@ static void set_sock_size(struct sock *sk, int mode, 
int val)
        lock_sock(sk);
        if (mode) {
                val = clamp_t(int, val, (SOCK_MIN_SNDBUF + 1) / 2,
-                             sysctl_wmem_max);
+                             sock_net(sk)->core.sysctl_wmem_max);
                sk->sk_sndbuf = val * 2;
                sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
        } else {
                val = clamp_t(int, val, (SOCK_MIN_RCVBUF + 1) / 2,
-                             sysctl_rmem_max);
+                             sock_net(sk)->core.sysctl_rmem_max);
                sk->sk_rcvbuf = val * 2;
                sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
        }
-- 
2.13.3

Reply via email to