For applications using SO_REUSEPORT listeners, there is
no clean way to switch traffic on/off or add/remove
listeners without dropping pending connections. With this
patch, applications can turn off queueing of new connections
for a specific listener socket, which enables implementation of
zero-downtime server applications.

For example, nginx, a popular web server, handles application
configuration changes by forking new processes (listeners)
and waiting for old processes (listeners) to finish up their
processing. However, this approach is disruptive, as removal
of a listener will drop pending connections for that listener.
Instead, with this patch, nginx can maintain two sets of listener
socket pools to be used by old/new processes and switch traffic off/on
using this socket option. Old processes set this socket option
to drain their existing queues.

Tested on an x86_64 kernel.

Signed-off-by: Tolga Ceylan <tolga.cey...@gmail.com>
---
 arch/alpha/include/uapi/asm/socket.h   | 2 ++
 arch/avr32/include/uapi/asm/socket.h   | 2 ++
 arch/frv/include/uapi/asm/socket.h     | 2 ++
 arch/ia64/include/uapi/asm/socket.h    | 2 ++
 arch/m32r/include/uapi/asm/socket.h    | 2 ++
 arch/mips/include/uapi/asm/socket.h    | 2 ++
 arch/mn10300/include/uapi/asm/socket.h | 2 ++
 arch/parisc/include/uapi/asm/socket.h  | 2 ++
 arch/powerpc/include/uapi/asm/socket.h | 2 ++
 arch/sparc/include/uapi/asm/socket.h   | 2 ++
 include/net/sock.h                     | 3 +++
 include/uapi/asm-generic/socket.h      | 2 ++
 net/core/sock.c                        | 3 +++
 net/ipv4/inet_hashtables.c             | 5 ++++-
 14 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/arch/alpha/include/uapi/asm/socket.h 
b/arch/alpha/include/uapi/asm/socket.h
index 9a20821..d2ad268 100644
--- a/arch/alpha/include/uapi/asm/socket.h
+++ b/arch/alpha/include/uapi/asm/socket.h
@@ -92,4 +92,6 @@
 #define SO_ATTACH_BPF          50
 #define SO_DETACH_BPF          SO_DETACH_FILTER
 
+#define SO_REUSEPORT_LISTEN_OFF 51
+
 #endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/avr32/include/uapi/asm/socket.h 
b/arch/avr32/include/uapi/asm/socket.h
index 2b65ed6..6b6d0af 100644
--- a/arch/avr32/include/uapi/asm/socket.h
+++ b/arch/avr32/include/uapi/asm/socket.h
@@ -85,4 +85,6 @@
 #define SO_ATTACH_BPF          50
 #define SO_DETACH_BPF          SO_DETACH_FILTER
 
+#define SO_REUSEPORT_LISTEN_OFF 51
+
 #endif /* _UAPI__ASM_AVR32_SOCKET_H */
diff --git a/arch/frv/include/uapi/asm/socket.h 
b/arch/frv/include/uapi/asm/socket.h
index 4823ad1..23d6b82 100644
--- a/arch/frv/include/uapi/asm/socket.h
+++ b/arch/frv/include/uapi/asm/socket.h
@@ -85,5 +85,7 @@
 #define SO_ATTACH_BPF          50
 #define SO_DETACH_BPF          SO_DETACH_FILTER
 
+#define SO_REUSEPORT_LISTEN_OFF 51
+
 #endif /* _ASM_SOCKET_H */
 
diff --git a/arch/ia64/include/uapi/asm/socket.h 
b/arch/ia64/include/uapi/asm/socket.h
index 59be3d8..c3d5ada 100644
--- a/arch/ia64/include/uapi/asm/socket.h
+++ b/arch/ia64/include/uapi/asm/socket.h
@@ -94,4 +94,6 @@
 #define SO_ATTACH_BPF          50
 #define SO_DETACH_BPF          SO_DETACH_FILTER
 
+#define SO_REUSEPORT_LISTEN_OFF 51
+
 #endif /* _ASM_IA64_SOCKET_H */
diff --git a/arch/m32r/include/uapi/asm/socket.h 
b/arch/m32r/include/uapi/asm/socket.h
index 7bc4cb2..602f4b4 100644
--- a/arch/m32r/include/uapi/asm/socket.h
+++ b/arch/m32r/include/uapi/asm/socket.h
@@ -85,4 +85,6 @@
 #define SO_ATTACH_BPF          50
 #define SO_DETACH_BPF          SO_DETACH_FILTER
 
+#define SO_REUSEPORT_LISTEN_OFF 51
+
 #endif /* _ASM_M32R_SOCKET_H */
diff --git a/arch/mips/include/uapi/asm/socket.h 
b/arch/mips/include/uapi/asm/socket.h
index dec3c85..e0880e2 100644
--- a/arch/mips/include/uapi/asm/socket.h
+++ b/arch/mips/include/uapi/asm/socket.h
@@ -103,4 +103,6 @@
 #define SO_ATTACH_BPF          50
 #define SO_DETACH_BPF          SO_DETACH_FILTER
 
+#define SO_REUSEPORT_LISTEN_OFF 51
+
 #endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/mn10300/include/uapi/asm/socket.h 
b/arch/mn10300/include/uapi/asm/socket.h
index cab7d6d..d60f747 100644
--- a/arch/mn10300/include/uapi/asm/socket.h
+++ b/arch/mn10300/include/uapi/asm/socket.h
@@ -85,4 +85,6 @@
 #define SO_ATTACH_BPF          50
 #define SO_DETACH_BPF          SO_DETACH_FILTER
 
+#define SO_REUSEPORT_LISTEN_OFF 51
+
 #endif /* _ASM_SOCKET_H */
diff --git a/arch/parisc/include/uapi/asm/socket.h 
b/arch/parisc/include/uapi/asm/socket.h
index a5cd40c..0ffa8de 100644
--- a/arch/parisc/include/uapi/asm/socket.h
+++ b/arch/parisc/include/uapi/asm/socket.h
@@ -84,4 +84,6 @@
 #define SO_ATTACH_BPF          0x402B
 #define SO_DETACH_BPF          SO_DETACH_FILTER
 
+#define SO_REUSEPORT_LISTEN_OFF 51
+
 #endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/powerpc/include/uapi/asm/socket.h 
b/arch/powerpc/include/uapi/asm/socket.h
index c046666..6935839 100644
--- a/arch/powerpc/include/uapi/asm/socket.h
+++ b/arch/powerpc/include/uapi/asm/socket.h
@@ -92,4 +92,6 @@
 #define SO_ATTACH_BPF          50
 #define SO_DETACH_BPF          SO_DETACH_FILTER
 
+#define SO_REUSEPORT_LISTEN_OFF 51
+
 #endif /* _ASM_POWERPC_SOCKET_H */
diff --git a/arch/sparc/include/uapi/asm/socket.h 
b/arch/sparc/include/uapi/asm/socket.h
index e6a16c4..e5ecf16 100644
--- a/arch/sparc/include/uapi/asm/socket.h
+++ b/arch/sparc/include/uapi/asm/socket.h
@@ -81,6 +81,8 @@
 #define SO_ATTACH_BPF          0x0034
 #define SO_DETACH_BPF          SO_DETACH_FILTER
 
+#define SO_REUSEPORT_LISTEN_OFF 0x0035
+
 /* Security levels - as per NRL IPv6 - don't actually do anything */
 #define SO_SECURITY_AUTHENTICATION             0x5001
 #define SO_SECURITY_ENCRYPTION_TRANSPORT       0x5002
diff --git a/include/net/sock.h b/include/net/sock.h
index 94dff7f..ebb3c08 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -142,6 +142,7 @@ typedef __u64 __bitwise __addrpair;
  *     @skc_state: Connection state
  *     @skc_reuse: %SO_REUSEADDR setting
  *     @skc_reuseport: %SO_REUSEPORT setting
+ *     @skc_reuseport_listen_off: %SO_REUSEPORT_LISTEN_OFF setting
  *     @skc_bound_dev_if: bound device index if != 0
  *     @skc_bind_node: bind hash linkage for various protocol lookup tables
  *     @skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol
@@ -183,6 +184,7 @@ struct sock_common {
        volatile unsigned char  skc_state;
        unsigned char           skc_reuse:4;
        unsigned char           skc_reuseport:1;
+       unsigned char       skc_reuseport_listen_off:1;
        unsigned char           skc_ipv6only:1;
        unsigned char           skc_net_refcnt:1;
        int                     skc_bound_dev_if;
@@ -322,6 +324,7 @@ struct sock {
 #define sk_state               __sk_common.skc_state
 #define sk_reuse               __sk_common.skc_reuse
 #define sk_reuseport           __sk_common.skc_reuseport
+#define sk_reuseport_listen_off        __sk_common.skc_reuseport_listen_off
 #define sk_ipv6only            __sk_common.skc_ipv6only
 #define sk_net_refcnt          __sk_common.skc_net_refcnt
 #define sk_bound_dev_if                __sk_common.skc_bound_dev_if
diff --git a/include/uapi/asm-generic/socket.h 
b/include/uapi/asm-generic/socket.h
index 5c15c2a..ed22ee4 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -87,4 +87,6 @@
 #define SO_ATTACH_BPF          50
 #define SO_DETACH_BPF          SO_DETACH_FILTER
 
+#define SO_REUSEPORT_LISTEN_OFF 51
+
 #endif /* __ASM_GENERIC_SOCKET_H */
diff --git a/net/core/sock.c b/net/core/sock.c
index 3307c02..5861513 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -714,6 +714,9 @@ int sock_setsockopt(struct socket *sock, int level, int 
optname,
        case SO_REUSEPORT:
                sk->sk_reuseport = valbool;
                break;
+       case SO_REUSEPORT_LISTEN_OFF:
+               sk->sk_reuseport_listen_off = valbool;
+               break;
        case SO_TYPE:
        case SO_PROTOCOL:
        case SO_DOMAIN:
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 8912019..59e8540 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -224,10 +224,13 @@ begin:
                                phash = inet_ehashfn(net, daddr, hnum,
                                                     saddr, sport);
                                matches = 1;
+                               if (sk->sk_reuseport_listen_off)
+                                       result = NULL;
                        }
                } else if (score == hiscore && reuseport) {
                        matches++;
-                       if (reciprocal_scale(phash, matches) == 0)
+                       if (reciprocal_scale(phash, matches) == 0 &&
+                           !sk->sk_reuseport_listen_off)
                                result = sk;
                        phash = next_pseudo_random32(phash);
                }
-- 
2.5.3

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to