Hi Dave et al, The patch below merges the wait queue lock and the socket spinlock into a single lock. This gains us ~100-150Mbit/s on netperf, mostly because, knowing how the spinlock is used, we can avoid the whole irq save, disable and re-enable sequence: the spinlock only needs bh protection. As a bonus, this also removes one atomic operation per wakeup.
[This is with x86-64 task switching tweaks as otherwise it is in the noise.] Before: 87380 16384 16384 10.01 9501.56 90.66 90.66 1.563 1.563 87380 16384 16384 10.01 9476.08 93.40 93.40 1.615 1.615 87380 16384 16384 10.00 9473.65 80.66 80.66 1.395 1.395 87380 16384 16384 10.00 9525.82 80.41 80.41 1.383 1.383 87380 16384 16384 10.00 9523.49 80.71 80.71 1.388 1.388 87380 16384 16384 10.00 9430.09 80.01 80.01 1.390 1.390 87380 16384 16384 10.00 9469.60 80.71 80.71 1.396 1.396 87380 16384 16384 10.01 9517.88 79.32 79.32 1.365 1.365 87380 16384 16384 10.01 9512.30 80.31 80.31 1.383 1.383 87380 16384 16384 10.00 9453.69 80.90 80.90 1.402 1.402 After: 87380 16384 16384 10.01 9629.42 92.01 92.01 1.565 1.565 87380 16384 16384 10.01 9641.69 90.16 90.16 1.532 1.532 87380 16384 16384 10.01 9650.40 90.16 90.16 1.531 1.531 87380 16384 16384 10.00 9638.69 90.60 90.60 1.540 1.540 87380 16384 16384 10.01 9667.15 89.36 89.36 1.514 1.514 87380 16384 16384 10.01 9684.13 89.86 89.86 1.520 1.520 87380 16384 16384 10.01 9642.38 90.31 90.31 1.534 1.534 87380 16384 16384 10.00 9669.24 90.90 90.90 1.540 1.540 87380 16384 16384 10.00 9676.82 90.25 90.25 1.528 1.528 87380 16384 16384 10.00 9711.26 90.80 90.80 1.532 1.532 -ben Signed-off-by: Benjamin LaHaise <[EMAIL PROTECTED]> diff --git a/include/net/sock.h b/include/net/sock.h index 57e5f6b..a864d32 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -76,13 +76,12 @@ */ struct sock_iocb; typedef struct { - spinlock_t slock; struct sock_iocb *owner; wait_queue_head_t wq; } socket_lock_t; #define sock_lock_init(__sk) \ -do { spin_lock_init(&((__sk)->sk_lock.slock)); \ +do { \ (__sk)->sk_lock.owner = NULL; \ init_waitqueue_head(&((__sk)->sk_lock.wq)); \ } while(0) @@ -733,8 +732,8 @@ extern void FASTCALL(lock_sock(struct so extern void FASTCALL(release_sock(struct sock *sk)); /* BH context may only use the following locking interface. 
*/ -#define bh_lock_sock(__sk) spin_lock(&((__sk)->sk_lock.slock)) -#define bh_unlock_sock(__sk) spin_unlock(&((__sk)->sk_lock.slock)) +#define bh_lock_sock(__sk) spin_lock(&((__sk)->sk_lock.wq.lock)) +#define bh_unlock_sock(__sk) spin_unlock(&((__sk)->sk_lock.wq.lock)) extern struct sock *sk_alloc(int family, gfp_t priority, diff --git a/net/core/filter.c b/net/core/filter.c index 93fbd01..69e4636 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -421,10 +421,10 @@ int sk_attach_filter(struct sock_fprog * if (!err) { struct sk_filter *old_fp; - spin_lock_bh(&sk->sk_lock.slock); + spin_lock_bh(&sk->sk_lock.wq.lock); old_fp = sk->sk_filter; sk->sk_filter = fp; - spin_unlock_bh(&sk->sk_lock.slock); + spin_unlock_bh(&sk->sk_lock.wq.lock); fp = old_fp; } diff --git a/net/core/sock.c b/net/core/sock.c index f152783..96008af 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -445,15 +445,15 @@ set_rcvbuf: break; case SO_DETACH_FILTER: - spin_lock_bh(&sk->sk_lock.slock); + spin_lock_bh(&sk->sk_lock.wq.lock); filter = sk->sk_filter; if (filter) { sk->sk_filter = NULL; - spin_unlock_bh(&sk->sk_lock.slock); + spin_unlock_bh(&sk->sk_lock.wq.lock); sk_filter_release(sk, filter); break; } - spin_unlock_bh(&sk->sk_lock.slock); + spin_unlock_bh(&sk->sk_lock.wq.lock); ret = -ENONET; break; @@ -1031,20 +1031,25 @@ struct sk_buff *sock_alloc_send_skb(stru return sock_alloc_send_pskb(sk, size, 0, noblock, errcode); } -static void __lock_sock(struct sock *sk) +/* We use noinline here as this is the slow path and allowing gcc to inline + * results in much poorer code for lock_sock. 
+ */ +static void noinline __lock_sock(struct sock *sk) { DEFINE_WAIT(wait); for(;;) { - prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait, - TASK_UNINTERRUPTIBLE); - spin_unlock_bh(&sk->sk_lock.slock); + set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue_exclusive_locked(&sk->sk_lock.wq, &wait); + spin_unlock_bh(&sk->sk_lock.wq.lock); schedule(); - spin_lock_bh(&sk->sk_lock.slock); + spin_lock_bh(&sk->sk_lock.wq.lock); if(!sock_owned_by_user(sk)) break; } - finish_wait(&sk->sk_lock.wq, &wait); + __set_current_state(TASK_RUNNING); + if (!list_empty_careful(&wait.task_list)) + list_del_init(&wait.task_list); } static void __release_sock(struct sock *sk) @@ -1331,24 +1336,28 @@ void sock_init_data(struct socket *sock, void fastcall lock_sock(struct sock *sk) { might_sleep(); - spin_lock_bh(&(sk->sk_lock.slock)); + spin_lock_bh(&(sk->sk_lock.wq.lock)); if (sk->sk_lock.owner) __lock_sock(sk); sk->sk_lock.owner = (void *)1; - spin_unlock_bh(&(sk->sk_lock.slock)); + spin_unlock_bh(&(sk->sk_lock.wq.lock)); } EXPORT_SYMBOL(lock_sock); void fastcall release_sock(struct sock *sk) { - spin_lock_bh(&(sk->sk_lock.slock)); + if (!sk->sk_backlog.tail && !waitqueue_active(&sk->sk_lock.wq)) { + sk->sk_lock.owner = NULL; + return; + } + spin_lock_bh(&(sk->sk_lock.wq.lock)); if (sk->sk_backlog.tail) __release_sock(sk); sk->sk_lock.owner = NULL; if (waitqueue_active(&(sk->sk_lock.wq))) - wake_up(&(sk->sk_lock.wq)); - spin_unlock_bh(&(sk->sk_lock.slock)); + wake_up_locked(&(sk->sk_lock.wq)); + spin_unlock_bh(&(sk->sk_lock.wq.lock)); } EXPORT_SYMBOL(release_sock); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index e7bbff4..d28c277 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -237,7 +237,7 @@ static __inline__ int icmp_xmit_lock(voi { local_bh_disable(); - if (unlikely(!spin_trylock(&icmp_socket->sk->sk_lock.slock))) { + if (unlikely(!spin_trylock(&icmp_socket->sk->sk_lock.wq.lock))) { /* This can happen if the output path signals a * dst_link_failure() for an outgoing 
ICMP packet. */ @@ -249,7 +249,7 @@ static __inline__ int icmp_xmit_lock(voi static void icmp_xmit_unlock(void) { - spin_unlock_bh(&icmp_socket->sk->sk_lock.slock); + spin_unlock_bh(&icmp_socket->sk->sk_lock.wq.lock); } /* diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 21eb725..fabd268 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -91,7 +91,7 @@ static __inline__ int icmpv6_xmit_lock(v { local_bh_disable(); - if (unlikely(!spin_trylock(&icmpv6_socket->sk->sk_lock.slock))) { + if (unlikely(!spin_trylock(&icmpv6_socket->sk->sk_lock.wq.lock))) { /* This can happen if the output path (f.e. SIT or * ip6ip6 tunnel) signals dst_link_failure() for an * outgoing ICMP6 packet. @@ -104,7 +104,7 @@ static __inline__ int icmpv6_xmit_lock(v static __inline__ void icmpv6_xmit_unlock(void) { - spin_unlock_bh(&icmpv6_socket->sk->sk_lock.slock); + spin_unlock_bh(&icmpv6_socket->sk->sk_lock.wq.lock); } /* diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 0ea947e..95beba0 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -5602,12 +5602,12 @@ static void sctp_sock_migrate(struct soc */ newsp->type = type; - spin_lock_bh(&oldsk->sk_lock.slock); + spin_lock_bh(&oldsk->sk_lock.wq.lock); /* Migrate the backlog from oldsk to newsk. */ sctp_backlog_migrate(assoc, oldsk, newsk); /* Migrate the association to the new socket. */ sctp_assoc_migrate(assoc, newsk); - spin_unlock_bh(&oldsk->sk_lock.slock); + spin_unlock_bh(&oldsk->sk_lock.wq.lock); /* If the association on the newsk is already closed before accept() * is called, set RCV_SHUTDOWN flag. ----- End forwarded message ----- - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html