Re: use-after-free in sock_wake_async

2015-11-26 Thread Hannes Frederic Sowa
Hannes Frederic Sowa  writes:


> I have seen filesystems already doing so in .destroy_inode, that's why I
> am asking. The allocation happens the same way as we do with sock_alloc,
> e.g. shmem. I actually thought that struct inode already provides an
> rcu_head for exactly that reason.

E.g.:

diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 54036ae..e15c49f 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -35,7 +35,6 @@
 struct macvtap_queue {
struct sock sk;
struct socket sock;
-   struct socket_wq wq;
int vnet_hdr_sz;
struct macvlan_dev __rcu *vlan;
struct file *file;
@@ -529,8 +528,7 @@ static int macvtap_open(struct inode *inode, struct file *file)
if (!q)
goto out;
 
-   RCU_INIT_POINTER(q->sock.wq, &q->wq);
-   init_waitqueue_head(&q->wq.wait);
+   init_waitqueue_head(&q->sock.wq.wait);
q->sock.type = SOCK_RAW;
q->sock.state = SS_CONNECTED;
q->sock.file = file;
@@ -579,7 +577,7 @@ static unsigned int macvtap_poll(struct file *file, poll_table * wait)
goto out;
 
mask = 0;
-   poll_wait(file, &q->wq.wait, wait);
+   poll_wait(file, &q->sock.wq.wait, wait);
 
if (!skb_queue_empty(&q->sk.sk_receive_queue))
mask |= POLLIN | POLLRDNORM;
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index b1878fa..20c5d34 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -145,7 +145,6 @@ struct tap_filter {
 struct tun_file {
struct sock sk;
struct socket socket;
-   struct socket_wq wq;
struct tun_struct __rcu *tun;
struct fasync_struct *fasync;
/* only used for fasnyc */
@@ -2219,8 +2218,7 @@ static int tun_chr_open(struct inode *inode, struct file * file)
tfile->flags = 0;
tfile->ifindex = 0;
 
-   init_waitqueue_head(&tfile->wq.wait);
-   RCU_INIT_POINTER(tfile->socket.wq, &tfile->wq);
+   init_waitqueue_head(&tfile->socket.wq.wait);
 
tfile->socket.file = file;
tfile->socket.ops = &tun_socket_ops;
diff --git a/include/linux/net.h b/include/linux/net.h
index 70ac5e2..3a7a4d1 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -89,8 +89,7 @@ struct socket_wq {
/* Note: wait MUST be first field of socket_wq */
wait_queue_head_t   wait;
struct fasync_struct*fasync_list;
-   struct rcu_head rcu;
-} ____cacheline_aligned_in_smp;
+};
 
 /**
  *  struct socket - general BSD socket
@@ -111,7 +110,7 @@ struct socket {
 
unsigned long   flags;
 
-   struct socket_wq __rcu  *wq;
+   struct socket_wqwq;
 
struct file *file;
struct sock *sk;
diff --git a/include/net/sock.h b/include/net/sock.h
index 7f89e4b..ae34da1 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1674,7 +1674,7 @@ static inline void sock_orphan(struct sock *sk)
 static inline void sock_graft(struct sock *sk, struct socket *parent)
 {
write_lock_bh(&sk->sk_callback_lock);
-   sk->sk_wq = parent->wq;
+   sk->sk_wq = &parent->wq;
parent->sk = sk;
sk_set_socket(sk, parent);
security_sock_graft(sk, parent);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 630c197..c125881 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -657,7 +657,7 @@ static void rcu_preempt_do_callbacks(void)
 /*
  * Queue a preemptible-RCU callback for invocation after a grace period.
  */
-void call_rcu(struct rcu_head *head, rcu_callback_t func)
+static void call_rcu(struct rcu_head *head, rcu_callback_t func)
 {
__call_rcu(head, func, rcu_state_p, -1, 0);
 }
diff --git a/net/core/sock.c b/net/core/sock.c
index 1e4dd54..314ab6a 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2383,7 +2383,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 
if (sock) {
sk->sk_type =   sock->type;
-   sk->sk_wq   =   sock->wq;
+   sk->sk_wq   =   &sock->wq;
sock->sk=   sk;
} else
sk->sk_wq   =   NULL;
diff --git a/net/socket.c b/net/socket.c
index dd2c247..495485e 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -245,19 +245,12 @@ static struct kmem_cache *sock_inode_cachep __read_mostly;
 static struct inode *sock_alloc_inode(struct super_block *sb)
 {
struct socket_alloc *ei;
-   struct socket_wq *wq;
 
ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
-   wq = kmalloc(sizeof(*wq), GFP_KERNEL);
-   if (!wq) {
-   kmem_cache_free(sock_inode_cachep, ei);
-   return NULL;
-   }
-   init_waitqueue_head(&wq->wait);
-   wq->fasync_list = NULL;
-   RCU_INIT_POINTER(ei->socket.wq, wq);
+   init_waitqueue_head(&ei->socket.wq.wait);
+   ei->socket.wq.fasync_list = NULL;
 

Re: use-after-free in sock_wake_async

2015-11-26 Thread Hannes Frederic Sowa


On Thu, Nov 26, 2015, at 14:32, Hannes Frederic Sowa wrote:
> diff --git a/include/net/sock.h b/include/net/sock.h
> index 7f89e4b..ae34da1 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -1674,7 +1674,7 @@ static inline void sock_orphan(struct sock *sk)
>  static inline void sock_graft(struct sock *sk, struct socket *parent)
>  {
>   write_lock_bh(&sk->sk_callback_lock);
> -   sk->sk_wq = parent->wq;
> +   sk->sk_wq = &parent->wq;

RCU_INIT_POINTER(sk->sk_wq, &parent->wq);

>   parent->sk = sk;
>   sk_set_socket(sk, parent);
>   security_sock_graft(sk, parent);
> diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
> index 630c197..c125881 100644
> --- a/kernel/rcu/tree_plugin.h
> +++ b/kernel/rcu/tree_plugin.h
> @@ -657,7 +657,7 @@ static void rcu_preempt_do_callbacks(void)
>  /*
>   * Queue a preemptible-RCU callback for invocation after a grace period.
>   */
> -void call_rcu(struct rcu_head *head, rcu_callback_t func)
> +static void call_rcu(struct rcu_head *head, rcu_callback_t func)
>  {
>   __call_rcu(head, func, rcu_state_p, -1, 0);
>  }
> diff --git a/net/core/sock.c b/net/core/sock.c
> index 1e4dd54..314ab6a 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -2383,7 +2383,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
>  
>   if (sock) {
>   sk->sk_type =   sock->type;
> -   sk->sk_wq   =   sock->wq;
> +   sk->sk_wq   =   &sock->wq;

RCU_INIT_POINTER()

>   sock->sk=   sk;
>   } else
>   sk->sk_wq   =   NULL;
> diff --git a/net/socket.c b/net/socket.c
> index dd2c247..495485e 100644
> --- a/net/socket.c
> +++ b/net/socket.c
> @@ -245,19 +245,12 @@ static struct kmem_cache *sock_inode_cachep __read_mostly;
>  static struct inode *sock_alloc_inode(struct super_block *sb)
>  {
>   struct socket_alloc *ei;
> -   struct socket_wq *wq;
>  
>   ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);
>   if (!ei)
>   return NULL;
> -   wq = kmalloc(sizeof(*wq), GFP_KERNEL);
> -   if (!wq) {
> -   kmem_cache_free(sock_inode_cachep, ei);
> -   return NULL;
> -   }
> -   init_waitqueue_head(&wq->wait);
> -   wq->fasync_list = NULL;
> -   RCU_INIT_POINTER(ei->socket.wq, wq);
> +   init_waitqueue_head(&ei->socket.wq.wait);
> +   ei->socket.wq.fasync_list = NULL;
>  
>   ei->socket.state = SS_UNCONNECTED;
>   ei->socket.flags = 0;
> @@ -268,17 +261,18 @@ static struct inode *sock_alloc_inode(struct super_block *sb)
>   return &ei->vfs_inode;
>  }
>  
> -static void sock_destroy_inode(struct inode *inode)
> +static void sock_cache_free_rcu(struct rcu_head *rcu)
>  {
> -   struct socket_alloc *ei;
> -   struct socket_wq *wq;
> -
> -   ei = container_of(inode, struct socket_alloc, vfs_inode);
> -   wq = rcu_dereference_protected(ei->socket.wq, 1);
> -   kfree_rcu(wq, rcu);
> +   struct socket_alloc *ei =
> +   container_of(rcu, struct socket_alloc, vfs_inode.i_rcu);
>   kmem_cache_free(sock_inode_cachep, ei);
>  }
>  
> +static void sock_destroy_inode(struct inode *inode)
> +{
> +   call_rcu(&inode->i_rcu, sock_cache_free_rcu);
> +}
> +
>  static void init_once(void *foo)
>  {
>   struct socket_alloc *ei = (struct socket_alloc *)foo;
> @@ -573,7 +567,7 @@ void sock_release(struct socket *sock)
>   module_put(owner);
>   }
>  
> -   if (rcu_dereference_protected(sock->wq, 1)->fasync_list)
> +   if (sock->wq.fasync_list)
>   pr_err("%s: fasync list not empty!\n", __func__);
>  
>   this_cpu_sub(sockets_in_use, 1);
> @@ -1044,7 +1038,7 @@ static int sock_fasync(int fd, struct file *filp, int on)
>   return -EINVAL;
>  
>   lock_sock(sk);
> -   wq = rcu_dereference_protected(sock->wq, sock_owned_by_user(sk));
> +   wq = &sock->wq;
>   fasync_helper(fd, filp, on, &wq->fasync_list);
>  
>   if (!wq->fasync_list)
> @@ -1065,7 +1059,7 @@ int sock_wake_async(struct socket *sock, int how, int band)
>   if (!sock)
>   return -1;
>   rcu_read_lock();
> -   wq = rcu_dereference(sock->wq);
> +   wq = &sock->wq;
>   if (!wq || !wq->fasync_list) {
>   rcu_read_unlock();
>   return -1;


Re: use-after-free in sock_wake_async

2015-11-26 Thread Eric Dumazet
On Thu, 2015-11-26 at 14:32 +0100, Hannes Frederic Sowa wrote:
> Hannes Frederic Sowa  writes:
> 
> 
> > I have seen filesystems already doing so in .destroy_inode, that's why I
> > am asking. The allocation happens the same way as we do with sock_alloc,
> > e.g. shmem. I actually thought that struct inode already provides an
> > rcu_head for exactly that reason.
> 
> E.g.:

> +static void sock_destroy_inode(struct inode *inode)
> +{
> + call_rcu(&inode->i_rcu, sock_cache_free_rcu);
> +}

I guess you missed few years back why we had to implement
SLAB_DESTROY_BY_RCU for TCP sockets to not destroy performance.

By adding RCU grace period before reuse of this inode (about 640 bytes
today), you are asking the CPU to evict from its cache precious content,
and slow down some workloads, adding lot of ram pressure, as the cpu
allocating a TCP socket will have to populate its cache for a cold
inode.

The reason we put in a small object the RCU protected fields should be
pretty clear.

Do not copy code that people wrote in other layers without understanding
the performance implications.

Thanks.




Re: use-after-free in sock_wake_async

2015-11-26 Thread Hannes Frederic Sowa
On Thu, Nov 26, 2015, at 16:51, Eric Dumazet wrote:
> On Thu, 2015-11-26 at 14:32 +0100, Hannes Frederic Sowa wrote:
> > Hannes Frederic Sowa  writes:
> > 
> > 
> > > I have seen filesystems already doing so in .destroy_inode, that's why I
> > > am asking. The allocation happens the same way as we do with sock_alloc,
> > > e.g. shmem. I actually thought that struct inode already provides an
> > > rcu_head for exactly that reason.
> > 
> > E.g.:
> 
> > +static void sock_destroy_inode(struct inode *inode)
> > +{
> > +   call_rcu(&inode->i_rcu, sock_cache_free_rcu);
> > +}
> 
> I guess you missed few years back why we had to implement
> SLAB_DESTROY_BY_RCU for TCP sockets to not destroy performance.

I think I wasn't even subscribed to netdev@ at that time, so I probably
missed it. Few years back is 7. :}

> By adding RCU grace period before reuse of this inode (about 640 bytes
> today), you are asking the CPU to evict from its cache precious content,
> and slow down some workloads, adding lot of ram pressure, as the cpu
> allocating a TCP socket will have to populate its cache for a cold
> inode.

My rationale was like this: we already have rcu to free the wq, so we
don't add any more callbacks as current code. sock_alloc is right now
1136 bytes, which is huge, like 18 cachelines. I wouldn't think it does
matter a lot as we thrash anyway. tcp_sock is like 45 cachelines right
now, hui.

Also, isn't the reason SLUB exists that it can track memory regions
per-cpu?

Anyway, I am only speculating why it could be tried. I probably need to
do some performance experiments.

> The reason we put in a small object the RCU protected fields should be
> pretty clear.

Yes, I thought about that.

> Do not copy code that people wrote in other layers without understanding
> the performance implications.

Duuh. :)

Bye,
Hannes


Re: use-after-free in sock_wake_async

2015-11-26 Thread Eric Dumazet
On Thu, 2015-11-26 at 18:03 +0100, Hannes Frederic Sowa wrote:

> My rationale was like this: we already have rcu to free the wq, so we
> don't add any more callbacks as current code. sock_alloc is right now
> 1136 bytes, which is huge, like 18 cachelines. I wouldn't think it does
> matter a lot as we thrash anyway. tcp_sock is like 45 cachelines right
> now, hui.

Again, these speculations are really killing me.

Can you wait I send my patches ?

They work just great, and there is no risk of perf regression with added
RCU freeing of inodes :(

It is Thanksgiving in the US, vacation time.

Thanks.




Re: use-after-free in sock_wake_async

2015-11-26 Thread Hannes Frederic Sowa
On Thu, Nov 26, 2015, at 18:09, Eric Dumazet wrote:
> On Thu, 2015-11-26 at 18:03 +0100, Hannes Frederic Sowa wrote:
> 
> > My rationale was like this: we already have rcu to free the wq, so we
> > don't add any more callbacks as current code. sock_alloc is right now
> > 1136 bytes, which is huge, like 18 cachelines. I wouldn't think it does
> > matter a lot as we thrash anyway. tcp_sock is like 45 cachelines right
> > now, hui.
> 
> Again, these speculations are really killing me.
>
> Can you wait I send my patches ?

Eric, I wasn't targeting upstream. I just had this idea and wanted to
show you that it is possible. I didn't send an official patch and
wouldn't do that without proper performance analysis.

> They work just great, and there is no risk of perf regression with added
> RCU freeing of inodes :(

I agree. Your patch is certainly the best thing for net while this
complete rcuification should never ever hit net.

> It is Thanksgiving in the US, vacation time.

Enjoy the time! :)

Bye,
Hannes


Re: use-after-free in sock_wake_async

2015-11-26 Thread Eric Dumazet
On Thu, 2015-11-26 at 18:03 +0100, Hannes Frederic Sowa wrote:

> Also, isn't the reason SLUB exists that it can track memory regions
> per-cpu?

call_rcu() and kfree_rcu() will add a grace period (multiple ms) where
the CPU will likely evict from its caches the data contained in the
'about to be freed' objects, defeating the SLUB/SLAB ability to quickly
reuse a freed and hot object (LIFO).

This is one of the major RCU drawbacks: it forces FIFO behavior in object
reuse, while LIFO is much better for data locality, especially with
per-cpu lists.

Another problem is a slightly bigger working set size, which can hurt
some workloads that used to exactly fit cpu caches.
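For reference, a minimal sketch of the two freeing strategies being
contrasted here; the "my_obj"/"my_cache" names are made up purely for
illustration:

#include <linux/slab.h>
#include <linux/rcupdate.h>

struct my_obj {
	int payload;
	struct rcu_head rcu;	/* only needed for the kfree_rcu() variant */
};

/*
 * Variant 1: SLAB_DESTROY_BY_RCU cache.  kmem_cache_free() makes the
 * object reusable immediately (LIFO, still cache hot); only whole slab
 * pages are held back for a grace period, so RCU readers must
 * revalidate whatever object they looked up.
 */
static struct kmem_cache *my_cache;

static int my_cache_init(void)
{
	my_cache = kmem_cache_create("my_cache", sizeof(struct my_obj),
				     0, SLAB_DESTROY_BY_RCU, NULL);
	return my_cache ? 0 : -ENOMEM;
}

/*
 * Variant 2: per-object RCU free of a kmalloc()ed object.  Every free
 * waits out a full grace period before the memory can be reused, which
 * is the cache-eviction / working-set cost described above.
 */
static void my_obj_free(struct my_obj *obj)
{
	kfree_rcu(obj, rcu);
}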




Re: use-after-free in sock_wake_async

2015-11-25 Thread Rainer Weikusat
Eric Dumazet  writes:
> On Tue, Nov 24, 2015 at 5:10 PM, Rainer Weikusat
>  wrote:

[...]

>> It's also easy to verify: Swap the unix_state_lock and
>> other->sk_data_ready and see if the issue still occurs. Right now (this
>> may change after I had some sleep as it's pretty late for me), I don't
>> think there's another local fix: The ->sk_data_ready accesses a
>> pointer after the lock taken by the code which will clear and
>> then later free it was released.
>
> It seems that :
>
> int sock_wake_async(struct socket *sock, int how, int band)
>
> should really be changed to
>
> int sock_wake_async(struct socket_wq *wq, int how, int band)
>
> So that RCU rules (already present) apply safely.
>
> sk->sk_socket is inherently racy (that is : racy without using
> sk_callback_lock rwlock )

The comment above sock_wake_async states that

/* This function may be called only under socket lock or callback_lock or 
rcu_lock */

In this case, it's called via sk_wake_async (include/net/sock.h) which
is - in turn - called via sock_def_readable (the 'default' data ready
routine/ net/core/sock.c) which looks like this:

static void sock_def_readable(struct sock *sk)
{
struct socket_wq *wq;

rcu_read_lock();
wq = rcu_dereference(sk->sk_wq);
if (wq_has_sleeper(wq))
wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
POLLRDNORM | POLLRDBAND);
sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
rcu_read_unlock();
}

and should thus satisfy the constraint documented by the comment (I
didn't verify if the comment is actually correct, though).

Further - sorry about that - I think changing code in "half of the
network stack" in order to avoid calling a certain routine which will
only ever do something in case someone's using signal-driven I/O with an
already acquired lock held is a terrifying idea. Because of this, I
propose the following alternate patch which should also solve the
problem by ensuring that the ->sk_data_ready activity happens before
unix_release_sock/ sock_release get a chance to clear or free anything
which will be needed.

In case this demonstrably causes other issues, a more complicated
alternate idea (still restricting itself to changes to the af_unix code)
would be to move the socket_wq structure to a dummy struct socket
allocated by unix_release_sock and freed by the destructor.

---
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 4e95bdf..5c87ea6 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1754,8 +1754,8 @@ restart_locked:
skb_queue_tail(&other->sk_receive_queue, skb);
if (max_level > unix_sk(other)->recursion_level)
unix_sk(other)->recursion_level = max_level;
-   unix_state_unlock(other);
other->sk_data_ready(other);
+   unix_state_unlock(other);
sock_put(other);
scm_destroy(&scm);
return len;
@@ -1860,8 +1860,8 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
skb_queue_tail(&other->sk_receive_queue, skb);
if (max_level > unix_sk(other)->recursion_level)
unix_sk(other)->recursion_level = max_level;
-   unix_state_unlock(other);
other->sk_data_ready(other);
+   unix_state_unlock(other);
sent += size;
}
 


Re: use-after-free in sock_wake_async

2015-11-25 Thread Eric Dumazet
On Wed, 2015-11-25 at 16:43 +, Rainer Weikusat wrote:
> Eric Dumazet  writes:
> > On Tue, Nov 24, 2015 at 5:10 PM, Rainer Weikusat
> >  wrote:
> 
> [...]
> 
> >> It's also easy to verify: Swap the unix_state_lock and
> >> other->sk_data_ready and see if the issue still occurs. Right now (this
> >> may change after I had some sleep as it's pretty late for me), I don't
> >> think there's another local fix: The ->sk_data_ready accesses a
> >> pointer after the lock taken by the code which will clear and
> >> then later free it was released.
> >
> > It seems that :
> >
> > int sock_wake_async(struct socket *sock, int how, int band)
> >
> > should really be changed to
> >
> > int sock_wake_async(struct socket_wq *wq, int how, int band)
> >
> > So that RCU rules (already present) apply safely.
> >
> > sk->sk_socket is inherently racy (that is : racy without using
> > sk_callback_lock rwlock )
> 
> The comment above sock_wake_async states that
> 
> /* This function may be called only under socket lock or callback_lock or 
> rcu_lock */
> 
> In this case, it's called via sk_wake_async (include/net/sock.h) which
> is - in turn - called via sock_def_readable (the 'default' data ready
> routine/ net/core/sock.c) which looks like this:
> 
> static void sock_def_readable(struct sock *sk)
> {
>   struct socket_wq *wq;
> 
>   rcu_read_lock();
>   wq = rcu_dereference(sk->sk_wq);
>   if (wq_has_sleeper(wq))
>   wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
>   POLLRDNORM | POLLRDBAND);
>   sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
>   rcu_read_unlock();
> }
> 
> and should thus satisfy the constraint documented by the comment (I
> didn't verify if the comment is actually correct, though).
> 
> Further - sorry about that - I think changing code in "half of the
> network stack" in order to avoid calling a certain routine which will
> only ever do something in case someone's using signal-driven I/O with an
> already acquired lock held is a terrifying idea. Because of this, I
> propose the following alternate patch which should also solve the
> problem by ensuring that the ->sk_data_ready activity happens before
> unix_release_sock/ sock_release get a chance to clear or free anything
> which will be needed.
> 
> In case this demonstrably causes other issues, a more complicated
> alternate idea (still restricting itself to changes to the af_unix code)
> would be to move the socket_wq structure to a dummy struct socket
> allocated by unix_release_sock and freed by the destructor.
> 
> ---
> diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
> index 4e95bdf..5c87ea6 100644
> --- a/net/unix/af_unix.c
> +++ b/net/unix/af_unix.c
> @@ -1754,8 +1754,8 @@ restart_locked:
> skb_queue_tail(&other->sk_receive_queue, skb);
> if (max_level > unix_sk(other)->recursion_level)
> unix_sk(other)->recursion_level = max_level;
> -   unix_state_unlock(other);
> other->sk_data_ready(other);
> +   unix_state_unlock(other);
> sock_put(other);
> scm_destroy(&scm);
> return len;
> @@ -1860,8 +1860,8 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
> skb_queue_tail(&other->sk_receive_queue, skb);
> if (max_level > unix_sk(other)->recursion_level)
> unix_sk(other)->recursion_level = max_level;
> -   unix_state_unlock(other);
> other->sk_data_ready(other);
> +   unix_state_unlock(other);
> sent += size;
> }
>  


The issue is way more complex than that.

We cannot prevent inode from disappearing.
We can not safely dereference "(struct socket *)->flags"

locking the 'struct sock' won't help at all.

Here is my current work/patch :

It ran for ~2 hours under stress without warning, but I want it to run
24 hours before official submission.


Note that moving flags into sk_wq will actually avoid one cache line
miss in fast path, so might give performance improvement.

This minimal patch only moves SOCK_ASYNC_NOSPACE and SOCK_ASYNC_WAITDATA
but we can move other flags later.

sock_wake_async() must not even attempt to deref a struct socket.

-> sock_wake_async(struct socket_wq *wq, int how, int band);

Thanks.

diff --git a/crypto/algif_aead.c b/crypto/algif_aead.c
index 0aa6fdfb448a..6d4d4569447e 100644
--- a/crypto/algif_aead.c
+++ b/crypto/algif_aead.c
@@ -125,7 +125,7 @@ static int aead_wait_for_data(struct sock *sk, unsigned flags)
if (flags & MSG_DONTWAIT)
return -EAGAIN;
 
-   set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+   sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
 
for (;;) {
if (signal_pending(current))
@@ -139,7 +139,7 @@ static int aead_wait_for_data(struct sock *sk, unsigned flags)
}
finish_wait(sk_sleep(sk), &wait);
 
- 
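For orientation, the direction described above -- sock_wake_async()
taking the RCU-protected socket_wq directly, with the renamed
SOCKWQ_ASYNC_* bits assumed to live in a new wq->flags word -- would
look roughly like this (a sketch, not the patch as submitted):

int sock_wake_async(struct socket_wq *wq, int how, int band)
{
	/* Caller is under rcu_read_lock(); wq may already be NULL. */
	if (!wq || !wq->fasync_list)
		return -1;

	switch (how) {
	case SOCK_WAKE_WAITD:
		if (test_bit(SOCKWQ_ASYNC_WAITDATA, &wq->flags))
			break;
		goto call_kill;
	case SOCK_WAKE_SPACE:
		if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags))
			break;
		/* fall through */
	case SOCK_WAKE_IO:
call_kill:
		kill_fasync(&wq->fasync_list, SIGIO, band);
		break;
	case SOCK_WAKE_URG:
		kill_fasync(&wq->fasync_list, SIGURG, band);
	}
	return 0;
}

static inline void sk_wake_async(const struct sock *sk, int how, int band)
{
	if (sock_flag(sk, SOCK_FASYNC)) {
		rcu_read_lock();
		/* dereference sk->sk_wq, never sk->sk_socket, on the wakeup path */
		sock_wake_async(rcu_dereference(sk->sk_wq), how, band);
		rcu_read_unlock();
	}
}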

Re: use-after-free in sock_wake_async

2015-11-25 Thread Rainer Weikusat
Eric Dumazet  writes:
> On Wed, 2015-11-25 at 16:43 +, Rainer Weikusat wrote:
>> Eric Dumazet  writes:
>> > On Tue, Nov 24, 2015 at 5:10 PM, Rainer Weikusat
>> >  wrote:
>> 
>> [...]
>> 
>> >> It's also easy to verify: Swap the unix_state_lock and
>> >> other->sk_data_ready and see if the issue still occurs. Right now (this
>> >> may change after I had some sleep as it's pretty late for me), I don't
>> >> think there's another local fix: The ->sk_data_ready accesses a
>> >> pointer after the lock taken by the code which will clear and
>> >> then later free it was released.
>> >
>> > It seems that :
>> >
>> > int sock_wake_async(struct socket *sock, int how, int band)
>> >
>> > should really be changed to
>> >
>> > int sock_wake_async(struct socket_wq *wq, int how, int band)
>> >
>> > So that RCU rules (already present) apply safely.
>> >
>> > sk->sk_socket is inherently racy (that is : racy without using
>> > sk_callback_lock rwlock )
>> 
>> The comment above sock_wake_async states that
>> 
>> /* This function may be called only under socket lock or callback_lock or 
>> rcu_lock */
>> 
>> In this case, it's called via sk_wake_async (include/net/sock.h) which
>> is - in turn - called via sock_def_readable (the 'default' data ready
>> routine/ net/core/sock.c) which looks like this:
>> 
>> static void sock_def_readable(struct sock *sk)
>> {
>>  struct socket_wq *wq;
>> 
>>  rcu_read_lock();
>>  wq = rcu_dereference(sk->sk_wq);
>>  if (wq_has_sleeper(wq))
>>  wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
>>  POLLRDNORM | POLLRDBAND);
>>  sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
>>  rcu_read_unlock();
>> }
>> 
>> and should thus satisfy the constraint documented by the comment (I
>> didn't verify if the comment is actually correct, though).
>> 
>> Further - sorry about that - I think changing code in "half of the
>> network stack" in order to avoid calling a certain routine which will
>> only ever do something in case someone's using signal-driven I/O with an
>> already acquired lock held is a terrifying idea. Because of this, I
>> propose the following alternate patch which should also solve the
>> problem by ensuring that the ->sk_data_ready activity happens before
>> unix_release_sock/ sock_release get a chance to clear or free anything
>> which will be needed.
>> 
>> In case this demonstrably causes other issues, a more complicated
>> alternate idea (still restricting itself to changes to the af_unix code)
>> would be to move the socket_wq structure to a dummy struct socket
>> allocated by unix_release_sock and freed by the destructor.
>> 
>> ---
>> diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
>> index 4e95bdf..5c87ea6 100644
>> --- a/net/unix/af_unix.c
>> +++ b/net/unix/af_unix.c
>> @@ -1754,8 +1754,8 @@ restart_locked:
>> skb_queue_tail(&other->sk_receive_queue, skb);
>> if (max_level > unix_sk(other)->recursion_level)
>> unix_sk(other)->recursion_level = max_level;
>> -   unix_state_unlock(other);
>> other->sk_data_ready(other);
>> +   unix_state_unlock(other);
>> sock_put(other);
>> scm_destroy(&scm);
>> return len;
>> @@ -1860,8 +1860,8 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
>> skb_queue_tail(&other->sk_receive_queue, skb);
>> if (max_level > unix_sk(other)->recursion_level)
>> unix_sk(other)->recursion_level = max_level;
>> -   unix_state_unlock(other);
>> other->sk_data_ready(other);
>> +   unix_state_unlock(other);
>> sent += size;
>> }
>>  
>
>
> The issue is way more complex than that.
>
> We cannot prevent inode from disappearing.
> We can not safely dereference "(struct socket *)->flags"
>
> locking the 'struct sock' won't help at all.

The inode can't disappear unless it's freed which is done by
sock_release,

void sock_release(struct socket *sock)
{
if (sock->ops) {
struct module *owner = sock->ops->owner;

sock->ops->release(sock);
sock->ops = NULL;
module_put(owner);
}

if (rcu_dereference_protected(sock->wq, 1)->fasync_list)
pr_err("%s: fasync list not empty!\n", __func__);

this_cpu_sub(sockets_in_use, 1);
if (!sock->file) {
iput(SOCK_INODE(sock));
return;
}
sock->file = NULL;
}

after calling the 'protocol release routine' (unix_release_sock) and
unix_release_sock will either be blocked waiting for the
unix_state_lock of the socket that's to be closed or it will set
SOCK_DEAD while holding this lock which will then be detected by either
of the unix _sendmsg routine before executing any of the receive code
(including the call to 

Re: use-after-free in sock_wake_async

2015-11-25 Thread Hannes Frederic Sowa
On Wed, Nov 25, 2015, at 23:09, Eric Dumazet wrote:
> On Wed, 2015-11-25 at 20:57 +, Rainer Weikusat wrote:
> 
> > I do agree that keeping the ->sk_data_ready outside of the lock will
> > very likely have performance advantages. That's just something I
> > wouldn't have undertaken because I'd be reluctant to make a fairly
> > complicated change to a lot of code.
> 
> All I am saying is that we can keep current performance.
> 
> We already have the core infrastructure, we only need to properly use
> it.
> 
> I will split my changes in two parts.
> 
> One part doing a very boring change of
> 
> rename SOCK_ASYNC_NOSPACE and SOCK_ASYNC_WAITDATA
> for X in SOCK_ASYNC_NOSPACE SOCK_ASYNC_WAITDATA 
> 
>  set_bit(X, &sk->sk_socket->flags) -> sk_set_bit(X, sk)
>  clear_bit(X, &sk->sk_socket->flags) -> sk_clear_bit(X, sk)

sk_set_bit and sk_clear_bit will forward the set_bit and clear_bit into
the socket_wq like you explained above?

> The rename will help backports to catch code that might have been
> removed in recent kernels.
> 
> Then the second patch will do the actual changes, and they will look
> very sensible for people wanting to review them, and or familiar with
> the stack, do not worry ;)

Do you see a chance to inline socket_wq into struct socket and discard
struct socket_alloc in one go by rcu in socket_destroy_inode?

Thanks,
Hannes


Re: use-after-free in sock_wake_async

2015-11-25 Thread Eric Dumazet
On Wed, 2015-11-25 at 17:30 +, Rainer Weikusat wrote:

> In case this is wrong, it obviously implies that sk_sleep(sk) must not
> be used anywhere as it accesses the same struct sock, hence, when that
> can "suddenly" disappear despite locks are used in the way indicated
> above, there is no safe way to invoke that, either, as it just does a
> rcu_dereference_raw based on the assumption that the caller knows that
> the i-node (and the corresponding wait queue) still exist.
> 

Oh well.

sk_sleep() is not used if the return is NULL

This is exactly why we have such code in critical functions :

wqueue = sk_sleep(sk);
if (wqueue && waitqueue_active(wqueue))
wake_up_interruptible_poll(wqueue, 
   POLLOUT | POLLWRNORM | POLLWRBAND);


We already took care of this problem years ago, but missed the ASYNC
case (that almost nobody really uses these days)





Re: use-after-free in sock_wake_async

2015-11-25 Thread Eric Dumazet
On Wed, 2015-11-25 at 23:32 +0100, Hannes Frederic Sowa wrote:
> On Wed, Nov 25, 2015, at 23:09, Eric Dumazet wrote:
> > On Wed, 2015-11-25 at 20:57 +, Rainer Weikusat wrote:
> > 
> > > I do agree that keeping the ->sk_data_ready outside of the lock will
> > > very likely have performance advantages. That's just something I
> > > wouldn't have undertaken because I'd be reluctant to make a fairly
> > > complicated change to a lot of code.
> > 
> > All I am saying is that we can keep current performance.
> > 
> > We already have the core infrastructure, we only need to properly use
> > it.
> > 
> > I will split my changes in two parts.
> > 
> > One part doing a very boring change of
> > 
> > rename SOCK_ASYNC_NOSPACE and SOCK_ASYNC_WAITDATA
> > for X in SOCK_ASYNC_NOSPACE SOCK_ASYNC_WAITDATA 
> > 
> >  set_bit(X, &sk->sk_socket->flags) -> sk_set_bit(X, sk)
> >  clear_bit(X, &sk->sk_socket->flags) -> sk_clear_bit(X, sk)
> 
> sk_set_bit and sk_clear_bit will forward the set_bit and clear_bit into
> the socket_wq like you explained above?

In the first patch (no functional change), the helpers will look like

static void inline sk_set_bit(int nr, struct sock *sk)
{
set_bit(nr, &sk->sk_socket->flags);
}


Then the second patch will change the helper to :

static void inline sk_set_bit(int nr, struct sock *sk)
{
set_bit(nr, &sk->sk_wq_raw->flags);
}
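At a call site such as unix_stream_data_wait() (quoted further down in
the thread), the two-step conversion would then read roughly as below;
SOCKWQ_ASYNC_WAITDATA is the renamed bit and sk_clear_bit() the
counterpart helper from the mapping above:

/* today: the flag lives behind sk->sk_socket */
set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
unix_state_unlock(sk);
timeo = freezable_schedule_timeout(timeo);
unix_state_lock(sk);
clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);

/* after patch 1 (rename + helpers) and patch 2 (flags reached via sk_wq_raw) */
sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
unix_state_unlock(sk);
timeo = freezable_schedule_timeout(timeo);
unix_state_lock(sk);
sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);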


> 
> > The rename will help backports to catch code that might have been
> > removed in recent kernels.
> > 
> > Then the second patch will do the actual changes, and they will look
> > very sensible for people wanting to review them, and or familiar with
> > the stack, do not worry ;)
> 
> Do you see a chance to inline socket_wq into struct socket and discard
> struct socket_alloc in one go by rcu in socket_destroy_inode?

???

I guess you missed the whole point to have socket_wq allocated outside
of the inode :(

inodes are not rcu protected (yet). I certainly don't want to mess with
VFS, we have enough problems in net/ directory already.






Re: use-after-free in sock_wake_async

2015-11-25 Thread Hannes Frederic Sowa
On Wed, Nov 25, 2015, at 23:43, Eric Dumazet wrote:
> On Wed, 2015-11-25 at 23:32 +0100, Hannes Frederic Sowa wrote:
> > On Wed, Nov 25, 2015, at 23:09, Eric Dumazet wrote:
> > > On Wed, 2015-11-25 at 20:57 +, Rainer Weikusat wrote:
> > > 
> > > > I do agree that keeping the ->sk_data_ready outside of the lock will
> > > > very likely have performance advantages. That's just something I
> > > > wouldn't have undertaken because I'd be reluctant to make a fairly
> > > > complicated change to a lot of code.
> > > 
> > > All I am saying is that we can keep current performance.
> > > 
> > > We already have the core infrastructure, we only need to properly use
> > > it.
> > > 
> > > I will split my changes in two parts.
> > > 
> > > One part doing a very boring change of
> > > 
> > > rename SOCK_ASYNC_NOSPACE and SOCK_ASYNC_WAITDATA
> > > for X in SOCK_ASYNC_NOSPACE SOCK_ASYNC_WAITDATA 
> > > 
> > >  set_bit(X, &sk->sk_socket->flags) -> sk_set_bit(X, sk)
> > >  clear_bit(X, &sk->sk_socket->flags) -> sk_clear_bit(X, sk)
> > 
> > sk_set_bit and sk_clear_bit will forward the set_bit and clear_bit into
> > the socket_wq like you explained above?
> 
> In the first patch (no functional change), the helpers will look like
> 
> static void inline sk_set_bit(int nr, struct sock *sk)
> {
>   set_bit(nr, &sk->sk_socket->flags);
> }
> 
> 
> Then the second patch will change the helper to :
> 
> static void inline sk_set_bit(int nr, struct sock *sk)
> {
>   set_bit(nr, &sk->sk_wq_raw->flags);
> }

Yep, that looks sensible.

> > > The rename will help backports to catch code that might have been
> > > removed in recent kernels.
> > > 
> > > Then the second patch will do the actual changes, and they will look
> > > very sensible for people wanting to review them, and or familiar with
> > > the stack, do not worry ;)
> > 
> > Do you see a chance to inline socket_wq into struct socket and discard
> > struct socket_alloc in one go by rcu in socket_destroy_inode?
> 
> ???
> 
> I guess you missed the whole point to have socket_wq allocated outside
> of the inode :(

Yep, sure, so inode could be torn down while wq is freed by rcu
callback.

> inodes are not rcu protected (yet). I certainly don't want to mess with
> VFS, we have enough problems in net/ directory already.

I have seen filesystems already doing so in .destroy_inode, that's why I
am asking. The allocation happens the same way as we do with sock_alloc,
e.g. shmem. I actually thought that struct inode already provides an
rcu_head for exactly that reason.

Bye,
Hannes


Re: use-after-free in sock_wake_async

2015-11-25 Thread Rainer Weikusat
Eric Dumazet  writes:
> On Wed, 2015-11-25 at 18:24 +, Rainer Weikusat wrote:
>> Eric Dumazet  writes:
>> > On Wed, 2015-11-25 at 17:30 +, Rainer Weikusat wrote:
>> >
>> >> In case this is wrong, it obviously implies that sk_sleep(sk) must not
>> >> be used anywhere as it accesses the same struct sock, hence, when that
>> >> can "suddenly" disappear despite locks are used in the way indicated
>> >> above, there is no safe way to invoke that, either, as it just does a
>> >> rcu_dereference_raw based on the assumption that the caller knows that
>> >> the i-node (and the corresponding wait queue) still exist.
>> >> 
>> >
>> > Oh well.
>> >
>> > sk_sleep() is not used if the return is NULL

[...]

>>  finish_wait(sk_sleep(sk), &wait);
>>  unix_state_unlock(sk);
>>  return timeo;
>> }
>> 
>> Neither prepare_to_wait nor finish_wait check if the pointer is
>> null.

[...]

> You are looking at the wrong side.
>
> Of course, the thread 'owning' a socket has a reference on it, so it
> knows sk->sk_socket and sk->sk_wq is not NULL.

I could continue arguing about this but IMHO, it just leads away from
the actual issue. Taking a couple of steps back, therefore,


diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 4e95bdf..5c87ea6 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1754,8 +1754,8 @@ restart_locked:
skb_queue_tail(&other->sk_receive_queue, skb);
if (max_level > unix_sk(other)->recursion_level)
unix_sk(other)->recursion_level = max_level;
-   unix_state_unlock(other);
other->sk_data_ready(other);
+   unix_state_unlock(other);
sock_put(other);
scm_destroy(&scm);
return len;
@@ -1860,8 +1860,8 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
skb_queue_tail(&other->sk_receive_queue, skb);
if (max_level > unix_sk(other)->recursion_level)
unix_sk(other)->recursion_level = max_level;
-   unix_state_unlock(other);
other->sk_data_ready(other);
+   unix_state_unlock(other);
sent += size;
}
 
-

I'm convinced this will work for the given problem (I don't claim that
it's technically superior to the larger change in any aspect except that
it's simpler and localized) because

1) The use-after-free occurred while accessing the struct sock allocated
   together with the socket inode.

2) This happened because a close was racing with a write.   

3) The socket inode, struct sock and struct socket_wq are freed by
   sock_destroy_inode.

4) sock_destroy_inode can only be called as consequence of the iput in
   sock_release.

5) sock_release invokes the per-protocol/ family release function before
   doing the iput.

6) unix_sock_release has to acquire the unix_state_lock on the socket
   referred to as other in the code above before it can do anything, in
   particular, before it calls sock_orphan which resets the struct sock and wq
   pointers and also sets the SOCK_DEAD flag.

7) the unix_stream_sendmsg code acquires the unix_state_lock on other
   and then checks the SOCK_DEAD flag. The code above only runs if it
   was not set, hence, the iput in sock_release can't have happened yet
   because a concurrent unix_sock_release must still be blocked on the
   unix_state_lock of other.

If there's an error in this reasoning, I'd very much like to know where
and what it is, not the least because the unix_dgram_peer_wake_relay
function I wrote also relies on it being correct (wrt to using the
result of sk_sleep outside of rcu_read_lock/ rcu_read_unlock).
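For reference, the pieces of the close() path that points 3-7 rely on
look roughly like this (abridged from the contemporary sources):

/* include/net/sock.h -- clears sk->sk_socket and sk->sk_wq and marks
 * the sock dead; this is the reset referred to in point 6. */
static inline void sock_orphan(struct sock *sk)
{
	write_lock_bh(&sk->sk_callback_lock);
	sock_set_flag(sk, SOCK_DEAD);
	sk_set_socket(sk, NULL);
	sk->sk_wq = NULL;
	write_unlock_bh(&sk->sk_callback_lock);
}

/* net/unix/af_unix.c (abridged) -- the orphaning happens under
 * unix_state_lock(sk), so a sender that holds that lock on 'other' and
 * does not see SOCK_DEAD knows the iput() in sock_release() cannot
 * have run yet. */
static void unix_release_sock(struct sock *sk, int embrion)
{
	unix_state_lock(sk);
	sock_orphan(sk);
	sk->sk_shutdown = SHUTDOWN_MASK;
	unix_state_unlock(sk);
	/* ... receive-queue purging, peer wakeup, unix_gc(), etc. ... */
}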


Re: use-after-free in sock_wake_async

2015-11-25 Thread Eric Dumazet
On Wed, 2015-11-25 at 19:38 +, Rainer Weikusat wrote:
> Eric Dumazet  writes:
> > On Wed, 2015-11-25 at 18:24 +, Rainer Weikusat wrote:
> >> Eric Dumazet  writes:
> >> > On Wed, 2015-11-25 at 17:30 +, Rainer Weikusat wrote:
> >> >
> >> >> In case this is wrong, it obviously implies that sk_sleep(sk) must not
> >> >> be used anywhere as it accesses the same struct sock, hence, when that
> >> >> can "suddenly" disappear despite locks are used in the way indicated
> >> >> above, there is no safe way to invoke that, either, as it just does a
> >> >> rcu_dereference_raw based on the assumption that the caller knows that
> >> >> the i-node (and the corresponding wait queue) still exist.
> >> >> 
> >> >
> >> > Oh well.
> >> >
> >> > sk_sleep() is not used if the return is NULL
> 
> [...]
> 
> >>finish_wait(sk_sleep(sk), &wait);
> >>unix_state_unlock(sk);
> >>return timeo;
> >> }
> >> 
> >> Neither prepare_to_wait nor finish_wait check if the pointer is
> >> null.
> 
> [...]
> 
> > You are looking at the wrong side.
> >
> > Of course, the thread 'owning' a socket has a reference on it, so it
> > knows sk->sk_socket and sk->sk_wq is not NULL.
> 
> I could continue arguing about this but IMHO, it just leads away from
> the actual issue. Taking a couple of steps back, therefore,
> 
> 
> diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
> index 4e95bdf..5c87ea6 100644
> --- a/net/unix/af_unix.c
> +++ b/net/unix/af_unix.c
> @@ -1754,8 +1754,8 @@ restart_locked:
> skb_queue_tail(&other->sk_receive_queue, skb);
> if (max_level > unix_sk(other)->recursion_level)
> unix_sk(other)->recursion_level = max_level;
> -   unix_state_unlock(other);
> other->sk_data_ready(other);
> +   unix_state_unlock(other);
> sock_put(other);
> scm_destroy(&scm);
> return len;
> @@ -1860,8 +1860,8 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
> skb_queue_tail(&other->sk_receive_queue, skb);
> if (max_level > unix_sk(other)->recursion_level)
> unix_sk(other)->recursion_level = max_level;
> -   unix_state_unlock(other);
> other->sk_data_ready(other);
> +   unix_state_unlock(other);
> sent += size;
> }
>  
> -
> 
> I'm convinced this will work for the given problem (I don't claim that
> it's technically superior to the larger change in any aspect except that
> it's simpler and localized) because
> 
> 1) The use-after-free occurred while accessing the struct sock allocated
>together with the socket inode.
> 
> 2) This happened because a close was racing with a write.   
> 
> 3) The socket inode, struct sock and struct socket_wq are freed by
>sock_destroy_inode.
> 
> 4) sock_destroy_inode can only be called as consequence of the iput in
>sock_release.
> 
> 5) sock_release invokes the per-protocol/ family release function before
>doing the iput.
> 
> 6) unix_sock_release has to acquire the unix_state_lock on the socket
>referred to as other in the code above before it can do anything, in
>particular, before it calls sock_orphan which resets the struct sock and wq
>pointers and also sets the SOCK_DEAD flag.
> 
> 7) the unix_stream_sendmsg code acquires the unix_state_lock on other
>and then checks the SOCK_DEAD flag. The code above only runs if it
>was not set, hence, the iput in sock_release can't have happened yet
>because a concurrent unix_sock_release must still be blocked on the
>unix_state_lock of other.
> 
> If there's an error in this reasoning, I'd very much like to know where
> and what it is, not the least because the unix_dgram_peer_wake_relay
> function I wrote also relies on it being correct (wrt to using the
> result of sk_sleep outside of rcu_read_lock/ rcu_read_unlock).

Yeah, we can either continue to patch particular buggy code, with hard
to review and debug stuff (this particular issue is not new)

OR, we add core infrastructure and we have less headaches, because it
works for all sockets, including af_unix.

My choice is pretty clear, having to maintain this code at Google, I can
tell you that solid core networking stack is much much better by all
means.

This af_unix code needs serious rewrite, we have absolute nightmares
with it (like the bugs added in recent sendpage() support).

I believe anything we can do to avoid having to make sure the points you
listed still hold after the next patch or code refactoring is a big
win.






Re: use-after-free in sock_wake_async

2015-11-25 Thread Rainer Weikusat
Eric Dumazet  writes:
> On Wed, 2015-11-25 at 17:30 +, Rainer Weikusat wrote:
>
>> In case this is wrong, it obviously implies that sk_sleep(sk) must not
>> be used anywhere as it accesses the same struct sock, hence, when that
>> can "suddenly" disappear despite locks are used in the way indicated
>> above, there is no safe way to invoke that, either, as it just does a
>> rcu_dereference_raw based on the assumption that the caller knows that
>> the i-node (and the corresponding wait queue) still exist.
>> 
>
> Oh well.
>
> sk_sleep() is not used if the return is NULL

static long unix_stream_data_wait(struct sock *sk, long timeo,
  struct sk_buff *last, unsigned int last_len)
{
struct sk_buff *tail;
DEFINE_WAIT(wait);

unix_state_lock(sk);

for (;;) {
prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);

tail = skb_peek_tail(&sk->sk_receive_queue);
if (tail != last ||
(tail && tail->len != last_len) ||
sk->sk_err ||
(sk->sk_shutdown & RCV_SHUTDOWN) ||
signal_pending(current) ||
!timeo)
break;

set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
unix_state_unlock(sk);
timeo = freezable_schedule_timeout(timeo);
unix_state_lock(sk);

if (sock_flag(sk, SOCK_DEAD))
break;

clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
}

finish_wait(sk_sleep(sk), &wait);
unix_state_unlock(sk);
return timeo;
}

Neither prepare_to_wait nor finish_wait check if the pointer is
null. For the finish_wait case, it shouldn't be null because if
SOCK_DEAD is not found to be set after the unix_state_lock was acquired,
unix_release_sock didn't execute the corresponding code yet, hence,
inode etc will remain available until after the corresponding unlock.

But this isn't true anymore if the inode can go away despite
sock_release couldn't complete yet.


Re: use-after-free in sock_wake_async

2015-11-25 Thread Eric Dumazet
On Wed, 2015-11-25 at 18:24 +, Rainer Weikusat wrote:
> Eric Dumazet  writes:
> > On Wed, 2015-11-25 at 17:30 +, Rainer Weikusat wrote:
> >
> >> In case this is wrong, it obviously implies that sk_sleep(sk) must not
> >> be used anywhere as it accesses the same struct sock, hence, when that
> >> can "suddenly" disappear despite locks are used in the way indicated
> >> above, there is no safe way to invoke that, either, as it just does a
> >> rcu_dereference_raw based on the assumption that the caller knows that
> >> the i-node (and the corresponding wait queue) still exist.
> >> 
> >
> > Oh well.
> >
> > sk_sleep() is not used if the return is NULL
> 
> static long unix_stream_data_wait(struct sock *sk, long timeo,
> struct sk_buff *last, unsigned int last_len)
> {
>   struct sk_buff *tail;
>   DEFINE_WAIT(wait);
> 
>   unix_state_lock(sk);
> 
>   for (;;) {
>   prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
> 
>   tail = skb_peek_tail(&sk->sk_receive_queue);
>   if (tail != last ||
>   (tail && tail->len != last_len) ||
>   sk->sk_err ||
>   (sk->sk_shutdown & RCV_SHUTDOWN) ||
>   signal_pending(current) ||
>   !timeo)
>   break;
> 
>   set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
>   unix_state_unlock(sk);
>   timeo = freezable_schedule_timeout(timeo);
>   unix_state_lock(sk);
> 
>   if (sock_flag(sk, SOCK_DEAD))
>   break;
> 
>   clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
>   }
> 
>   finish_wait(sk_sleep(sk), &wait);
>   unix_state_unlock(sk);
>   return timeo;
> }
> 
> Neither prepare_to_wait nor finish_wait check if the pointer is
> null. For the finish_wait case, it shouldn't be null because if
> SOCK_DEAD is not found to be set after the unix_state_lock was acquired,
> unix_release_sock didn't execute the corresponding code yet, hence,
> inode etc will remain available until after the corresponding unlock.


> 
> But this isn't true anymore if the inode can go away despite
> sock_release couldn't complete yet.


You are looking at the wrong side.

Of course, the thread 'owning' a socket has a reference on it, so it
knows sk->sk_socket and sk->sk_wq is not NULL.

The problem is that at the time a wakeup is done, it can be done by a
process or softirq having no ref on the 'struct socket', as
sk->sk_socket can become NULL at anytime.

This is why we have sk_wq , and RCU protection, so that we do not have
to use expensive atomic operations in this fast path.






Re: use-after-free in sock_wake_async

2015-11-25 Thread Eric Dumazet
On Wed, 2015-11-25 at 20:57 +, Rainer Weikusat wrote:

> I do agree that keeping the ->sk_data_ready outside of the lock will
> very likely have performance advantages. That's just something I
> wouldn't have undertaken because I'd be reluctant to make a fairly
> complicated change to a lot of code.

All I am saying is that we can keep current performance.

We already have the core infrastructure, we only need to properly use
it.

I will split my changes in two parts.

One part doing a very boring change of

rename SOCK_ASYNC_NOSPACE and SOCK_ASYNC_WAITDATA
for X in SOCK_ASYNC_NOSPACE SOCK_ASYNC_WAITDATA 

 set_bit(X, &sk->sk_socket->flags) -> sk_set_bit(X, sk)
 clear_bit(X, &sk->sk_socket->flags) -> sk_clear_bit(X, sk)

The rename will help backports to catch code that might have been
removed in recent kernels.

Then the second patch will do the actual changes, and they will look
very sensible for people wanting to review them, and or familiar with
the stack, do not worry ;)





Re: use-after-free in sock_wake_async

2015-11-25 Thread Eric Dumazet
On Wed, 2015-11-25 at 11:50 -0800, Eric Dumazet wrote:

> > other->sk_data_ready(other);
> > +   unix_state_unlock(other);


Also, problem with such construct is that we wakeup a thread that will
block on the lock we hold.

Beauty of sk_data_ready() is to call it once we hold no lock any more,
to enable another cpu to immediately proceed.

In this case, 'other' can not disappear, so it should be safe.




Re: use-after-free in sock_wake_async

2015-11-25 Thread Rainer Weikusat
Eric Dumazet  writes:
> On Wed, 2015-11-25 at 11:50 -0800, Eric Dumazet wrote:
>
>> > other->sk_data_ready(other);
>> > +   unix_state_unlock(other);
>
>
> Also, problem with such construct is that we wakeup a thread that will
> block on the lock we hold.
>
> Beauty of sk_data_ready() is to call it once we hold no lock any more,
> to enable another cpu to immediately proceed.
>
> In this case, 'other' can not disappear, so it should be safe.

I do agree that keeping the ->sk_data_ready outside of the lock will
very likely have performance advantages. That's just something I
wouldn't have undertaken because I'd be reluctant to make a fairly
complicated change to a lot of code in order to improve performance
unless performance was actually found to be lacking and because it would
step onto to many different people's turf.


Re: use-after-free in sock_wake_async

2015-11-25 Thread Eric Dumazet
On Tue, 2015-11-24 at 21:43 -0800, Eric Dumazet wrote:
> On Tue, 2015-11-24 at 18:28 -0800, Eric Dumazet wrote:
> > Dmitry, could you test following patch with your setup ?
> > 
> > ( I tried to reproduce the error you reported but could not )
> > 
> > Inode can be freed (without RCU grace period), but not the socket or
> > sk_wq
> > 
> > By using sk_wq in the critical paths, we do not dereference the inode,
> > 
> > 
> 
> I finally was able to reproduce the warning (with more instances running
> in parallel), and apparently this patch solves the problem.

OK I got another one :

This means we also need to move SOCK_ASYNC_WAITDATA and
SOCK_ASYNC_NOSPACE from sock->flags into storage protected by RCU (in
struct socket_wq)

Patch will be much bigger, as we have many 

set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags)
clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
and similar calls.

[ 7040.574519] 
==============
[ 7040.581838] AddressSanitizer: heap-use-after-free in sock_wake_async
[ 7040.588229] Read of size 8 by thread T977831:
[ 7040.592647]  [] sock_wake_async+0xd1/0xf0
[ 7040.598264]  [] sock_def_readable+0xa4/0xb0
[ 7040.604071]  [] unix_stream_sendmsg+0x326/0x730
[ 7040.611561]  [] sock_aio_write+0x253/0x280
[ 7040.617412]  [] do_sync_write+0xe3/0x130
[ 7040.622945]  [] vfs_write+0x2dd/0x300
[ 7040.628267]  [] SyS_write+0x72/0xd0
[ 7040.633380]  [] system_call_fastpath+0x16/0x1b
[ 7040.639442] 
[ 7040.640969] Freed by thread T977830:
[ 7040.644587]  [] sock_destroy_inode+0x4a/0x50
[ 7040.650472]  [] destroy_inode+0x64/0xa0
[ 7040.655921]  [] evict+0x1cd/0x2a0
[ 7040.660850]  [] iput+0x2ae/0x340
[ 7040.665689]  [] dput.part.20+0x3f0/0x660
[ 7040.671235]  [] dput+0x19/0x20
[ 7040.675915]  [] __fput+0x219/0x350
[ 7040.680940]  [] fput+0xe/0x10
[ 7040.685881]  [] task_work_run+0xee/0x100
[ 7040.691424]  [] do_notify_resume+0x12d/0x140
[ 7040.697314]  [] int_signal+0x12/0x17
[ 7040.702507] 
[ 7040.704039] Allocated by thread T977733:
[ 7040.708008]  [] sock_alloc_inode+0x2b/0x140
[ 7040.713796]  [] alloc_inode+0x32/0xf0
[ 7040.719080]  [] new_inode_pseudo+0x11/0x80
[ 7040.724803]  [] sock_alloc+0x37/0x110
[ 7040.730102]  [] __sock_create+0x95/0x340
[ 7040.735652]  [] sock_create+0x88/0xc0
[ 7040.740935]  [] SyS_socketpair+0x51/0x2c0
[ 7040.746574]  [] system_call_fastpath+0x16/0x1b
[ 7040.752626] 
[ 7040.754156] The buggy address 880987ed2c08 is located 8 bytes inside
[ 7040.754156]  of 624-byte region [880987ed2c00, 880987ed2e70)
[ 7040.767219] 
[ 7040.768771] Memory state around the buggy address:
[ 7040.773640]  880987ed2700:    ffrr
[ 7040.779867]  880987ed2800:    
[ 7040.786127]  880987ed2900:    
[ 7040.792380]  880987ed2a00:    
[ 7040.798631]  880987ed2b00: ffrr   
[ 7040.804890] >880987ed2c00:    
[ 7040.84] ^
[ 7040.814519]  880987ed2d00:    
[ 7040.820786]  880987ed2e00:  ffrr  
[ 7040.827053]  880987ed2f00:    
[ 7040.833315]  880987ed3000:    
[ 7040.839579]  880987ed3100:    
[ 7040.845803] Legend:
[ 7040.847937]  f - 8 freed bytes
[ 7040.851026]  r - 8 redzone bytes
[ 7040.854298]  . - 8 allocated bytes
[ 7040.857752]  x=1..7 - x allocated bytes + (8-x) redzone bytes
[ 7040.863537] 
==





Re: use-after-free in sock_wake_async

2015-11-24 Thread Benjamin LaHaise
On Tue, Nov 24, 2015 at 04:30:01PM -0500, Jason Baron wrote:
> So looking at this trace I think its the other->sk_socket that gets
> freed and then we call sk_wake_async() on it.
> 
> We could I think grab the socket reference there with unix_state_lock(),
> since that is held by unix_release_sock() before the final iput() is called.
> 
> So something like below might work (compile tested only):

That just adds the performance regression back in.  It should be possible 
to protect the other socket dereference using RCU.  I haven't had time to 
look at this yet today, but will try to find some time this evening to come 
up with a suggested patch.

-ben

> diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
> index aaa0b58..2b014f1 100644
> --- a/net/unix/af_unix.c
> +++ b/net/unix/af_unix.c
> @@ -196,6 +196,19 @@ static inline int unix_recvq_full(struct sock const *sk)
>   return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
>  }
> 
> +struct socket *unix_peer_get_socket(struct sock *s)
> +{
> + struct socket *peer;
> +
> + unix_state_lock(s);
> + peer = s->sk_socket;
> + if (peer)
> + __iget(SOCK_INODE(s->sk_socket));
> + unix_state_unlock(s);
> +
> + return peer;
> +}
> +
>  struct sock *unix_peer_get(struct sock *s)
>  {
>   struct sock *peer;
> @@ -1639,6 +1652,7 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
>  {
>   struct sock *sk = sock->sk;
>   struct sock *other = NULL;
> + struct socket *other_socket = NULL;
>   int err, size;
>   struct sk_buff *skb;
>   int sent = 0;
> @@ -1662,7 +1676,10 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
>   } else {
>   err = -ENOTCONN;
>   other = unix_peer(sk);
> - if (!other)
> + if (other)
> + other_socket = unix_peer_get_socket(other);
> +
> + if (!other_socket)
>   goto out_err;
>   }
> 
> @@ -1721,6 +1738,9 @@ static int unix_stream_sendmsg(struct socket
> *sock, struct msghdr *msg,
>   sent += size;
>   }
> 
> + if (other_socket)
> + iput(SOCK_INODE(other_socket));
> +
>   scm_destroy(&scm);
> 
>   return sent;
> @@ -1733,6 +1753,8 @@ pipe_err:
>   send_sig(SIGPIPE, current, 0);
>   err = -EPIPE;
>  out_err:
> + if (other_socket)
> + iput(SOCK_INODE(other_socket));
>   scm_destroy(&scm);
>   return sent ? : err;
>  }

-- 
"Thought is the essence of where you are now."


Re: use-after-free in sock_wake_async

2015-11-24 Thread Eric Dumazet
On Tue, Nov 24, 2015 at 3:34 PM, Rainer Weikusat
<rweiku...@mobileactivedefense.com> wrote:
> Eric Dumazet <eduma...@google.com> writes:
>> On Tue, Nov 24, 2015 at 6:18 AM, Dmitry Vyukov <dvyu...@google.com> wrote:
>>> Hello,
>>>
>>> The following program triggers use-after-free in sock_wake_async:
>
> [...]
>
>>> void *thr1(void *arg)
>>> {
>>> syscall(SYS_close, r2, 0, 0, 0, 0, 0);
>>> return 0;
>>> }
>>>
>>> void *thr2(void *arg)
>>> {
>>> syscall(SYS_write, r3, 0x20003000ul, 0xe7ul, 0, 0, 0);
>>> return 0;
>>> }
>
> [...]
>
>>> pthread_t th[3];
>>> pthread_create(&th[0], 0, thr0, 0);
>>> pthread_create(&th[1], 0, thr1, 0);
>>> pthread_create(&th[2], 0, thr2, 0);
>>> pthread_join(th[0], 0);
>>> pthread_join(th[1], 0);
>>> pthread_join(th[2], 0);
>>> return 0;
>>> }
>
> [...]
>
>> Looks like commit 830a1e5c212fb3fdc83b66359c780c3b3a294897 should be 
>> reverted ?
>>
>> commit 830a1e5c212fb3fdc83b66359c780c3b3a294897
>> Author: Benjamin LaHaise <benjamin.c.laha...@intel.com>
>> Date:   Tue Dec 13 23:22:32 2005 -0800
>>
>> [AF_UNIX]: Remove superfluous reference counting in unix_stream_sendmsg
>>
>> AF_UNIX stream socket performance on P4 CPUs tends to suffer due to a
>> lot of pipeline flushes from atomic operations.  The patch below
>> removes the sock_hold() and sock_put() in unix_stream_sendmsg().  This
>> should be safe as the socket still holds a reference to its peer which
>> is only released after the file descriptor's final user invokes
>> unix_release_sock().  The only consideration is that we must add a
>> memory barrier before setting the peer initially.
>>
>> Signed-off-by: Benjamin LaHaise <benjamin.c.laha...@intel.com>
>> Signed-off-by: David S. Miller <da...@davemloft.net>
>
> JFTR: This seems to be unrelated. (As far as I understand this), the
> problem is that sk_wake_async accesses sk->sk_socket. That's invoked via
> the
>
> other->sk_data_ready(other)
>
> in unix_stream_sendmsg after an
>
> unix_state_unlock(other);
>
> because of this, it can race with the code in unix_release_sock clearing
> this pointer (via sock_orphan). The structure this pointer points to is
> freed via iput in sock_release (net/socket.c) after the af_unix release
> routine returned (it's really one part of a "twin structure" with the
> socket inode being the other).
>
> A quick way to test if this was true would be to swap the
>
> unix_state_unlock(other);
> other->sk_data_ready(other);
>
> in unix_stream_sendmsg and in case it is, a very 'hacky' fix could be to
> put a pointer to the socket inode into the struct unix_sock, do an iget
> on that in unix_create1 and a corresponding iput in
> unix_sock_destructor.

This is interesting, but is not the problem or/and the fix.

We are supposed to own a reference on the 'other' socket or make sure
it cannot disappear under us.

Otherwise, no matter what you do, it is racy to even access other->any_field

In particular, you can trap doing unix_state_lock(other), way before
the code you want to change.

Please do not propose hacky things like iget or anything inode
related, this is clearly af_unix bug.
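Owning a reference here would mean pinning the peer 'struct sock' across the
whole send path. A minimal sketch of that shape (roughly what reverting
830a1e5c212f would re-introduce in unix_stream_sendmsg(); not a submitted
patch, and the thread converges on an RCU-based fix instead):

	other = unix_peer(sk);
	if (!other)
		goto out_err;
	sock_hold(other);	/* pin the peer so it cannot be freed under us */

	/* ... queue skbs, unix_state_unlock(other), other->sk_data_ready(other) ... */

	sock_put(other);	/* drop the extra reference on every exit path */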


Re: use-after-free in sock_wake_async

2015-11-24 Thread Rainer Weikusat
Rainer Weikusat  writes:

[...]

> Swap the unix_state_lock and

s/lock/unlock/ :-(


Re: use-after-free in sock_wake_async

2015-11-24 Thread Eric Dumazet
On Tue, Nov 24, 2015 at 5:10 PM, Rainer Weikusat
 wrote:
>
> The af_unix part of this, yes, ie, what gets allocated in
> unix_create1. But neither the socket inode nor the struct sock
> originally passed to unix_create. Since these are part of the same
> umbrella structure, they'll both be freed as consequence of the
> sock_release iput. As far as I can tell (I don't claim that I'm
> necessarily right on this, this is just the result of spending ca 2h
> reading the code with the problem report in mind and looking for
> something which could cause it), doing a sock_hold on the unix peer of
> the socket in unix_stream_sendmsg is indeed not needed, however, there's
> no additional reference to the inode or the struct sock accompanying it,
> ie, both of these will be freed by unix_release_sock. This also affects
> unix_dgram_sendmsg.
>
> It's also easy to verify: Swap the unix_state_lock and
> other->sk_data_ready and see if the issue still occurs. Right now (this
> may change after I had some sleep as it's pretty late for me), I don't
> think there's another local fix: The ->sk_data_ready accesses a
> pointer after the lock taken by the code which will clear and
> then later free it was released.

It seems that :

int sock_wake_async(struct socket *sock, int how, int band)

should really be changed to

int sock_wake_async(struct socket_wq *wq, int how, int band)

So that RCU rules (already present) apply safely.

sk->sk_socket is inherently racy (that is: racy without using the
sk_callback_lock rwlock).

Other possibility would be _not_ calling sock_orphan() from unix_release_sock()
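For context, sock_orphan() is the clearing side of this race. Abridged from
include/net/sock.h of this era, it detaches the sock from its parent struct
socket under sk_callback_lock, which is why a reader holding neither that lock
nor an RCU reference on sk_wq can see sk->sk_socket vanish between the test
and the dereference:

static inline void sock_orphan(struct sock *sk)
{
	write_lock_bh(&sk->sk_callback_lock);
	sock_set_flag(sk, SOCK_DEAD);
	sk_set_socket(sk, NULL);	/* sk->sk_socket = NULL */
	sk->sk_wq = NULL;
	write_unlock_bh(&sk->sk_callback_lock);
}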


Re: use-after-free in sock_wake_async

2015-11-24 Thread Rainer Weikusat
Eric Dumazet <eduma...@google.com> writes:
> On Tue, Nov 24, 2015 at 3:34 PM, Rainer Weikusat
> <rweiku...@mobileactivedefense.com> wrote:
>> Eric Dumazet <eduma...@google.com> writes:
>>> On Tue, Nov 24, 2015 at 6:18 AM, Dmitry Vyukov <dvyu...@google.com> wrote:
>>>> Hello,
>>>>
>>>> The following program triggers use-after-free in sock_wake_async:
>>
>> [...]
>>
>>>> void *thr1(void *arg)
>>>> {
>>>> syscall(SYS_close, r2, 0, 0, 0, 0, 0);
>>>> return 0;
>>>> }
>>>>
>>>> void *thr2(void *arg)
>>>> {
>>>> syscall(SYS_write, r3, 0x20003000ul, 0xe7ul, 0, 0, 0);
>>>> return 0;
>>>> }
>>
>> [...]
>>
>>>> pthread_t th[3];
>>>> pthread_create(&th[0], 0, thr0, 0);
>>>> pthread_create(&th[1], 0, thr1, 0);
>>>> pthread_create(&th[2], 0, thr2, 0);
>>>> pthread_join(th[0], 0);
>>>> pthread_join(th[1], 0);
>>>> pthread_join(th[2], 0);
>>>> return 0;
>>>> }
>>
>> [...]
>>
>>> Looks like commit 830a1e5c212fb3fdc83b66359c780c3b3a294897 should be 
>>> reverted ?
>>>
>>> commit 830a1e5c212fb3fdc83b66359c780c3b3a294897
>>> Author: Benjamin LaHaise <benjamin.c.laha...@intel.com>
>>> Date:   Tue Dec 13 23:22:32 2005 -0800
>>>
>>> [AF_UNIX]: Remove superfluous reference counting in unix_stream_sendmsg
>>>
>>> AF_UNIX stream socket performance on P4 CPUs tends to suffer due to a
>>> lot of pipeline flushes from atomic operations.  The patch below
>>> removes the sock_hold() and sock_put() in unix_stream_sendmsg().  This
>>> should be safe as the socket still holds a reference to its peer which
>>> is only released after the file descriptor's final user invokes
>>> unix_release_sock().  The only consideration is that we must add a
>>> memory barrier before setting the peer initially.
>>>
>>> Signed-off-by: Benjamin LaHaise <benjamin.c.laha...@intel.com>
>>> Signed-off-by: David S. Miller <da...@davemloft.net>
>>
>> JFTR: This seems to be unrelated. (As far as I understand this), the
>> problem is that sk_wake_async accesses sk->sk_socket. That's invoked via
>> the
>>
>> other->sk_data_ready(other)
>>
>> in unix_stream_sendmsg after an
>>
>> unix_state_unlock(other);
>>
>> because of this, it can race with the code in unix_release_sock clearing
>> this pointer (via sock_orphan). The structure this pointer points to is
>> freed via iput in sock_release (net/socket.c) after the af_unix release
>> routine returned (it's really one part of a "twin structure" with the
>> socket inode being the other).
>>
>> A quick way to test if this was true would be to swap the
>>
>> unix_state_unlock(other);
>> other->sk_data_ready(other);
>>
>> in unix_stream_sendmsg and in case it is, a very 'hacky' fix could be to
>> put a pointer to the socket inode into the struct unix_sock, do an iget
>> on that in unix_create1 and a corresponding iput in
>> unix_sock_destructor.
>
> This is interesting, but is not the problem or/and the fix.
>
> We are supposed to own a reference on the 'other' socket or make sure
> it cannot disappear under us.

The af_unix part of this, yes, ie, what gets allocated in
unix_create1. But neither the socket inode nor the struct sock
originally passed to unix_create. Since these are part of the same
umbrella structure, they'll both be freed as consequence of the
sock_release iput. As far as I can tell (I don't claim that I'm
necessarily right on this, this is just the result of spending ca 2h
reading the code with the problem report in mind and looking for
something which could cause it), doing a sock_hold on the unix peer of
the socket in unix_stream_sendmsg is indeed not needed, however, there's
no additional reference to the inode or the struct sock accompanying it,
ie, both of these will be freed by unix_release_sock. This also affects
unix_dgram_sendmsg.

It's also easy to verify: Swap the unix_state_lock and
other->sk_data_ready and see if the issue still occurs. Right now (this
may change after I had some sleep as it's pretty late for me), I don't
think there's another local fix: The ->sk_data_ready accesses a
pointer after the lock taken by the code which will clear and
then later free it was released.


Re: use-after-free in sock_wake_async

2015-11-24 Thread Eric Dumazet
Dmitry, could you test following patch with your setup ?

( I tried to reproduce the error you reported but could not )

Inode can be freed (without RCU grace period), but not the socket or
sk_wq.

By using sk_wq in the critical paths, we do not dereference the inode.
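The asymmetry is visible in sock_destroy_inode() (abridged from net/socket.c
around this kernel version): the inode, and the struct socket embedded
alongside it, go back to the slab immediately, while the separately allocated
socket_wq is only freed after an RCU grace period, so an RCU reader that
sticks to the wq never touches freed memory:

static void sock_destroy_inode(struct inode *inode)
{
	struct socket_alloc *ei;
	struct socket_wq *wq;

	ei = container_of(inode, struct socket_alloc, vfs_inode);
	wq = rcu_dereference_protected(ei->socket.wq, 1);
	kfree_rcu(wq, rcu);			/* wq outlives readers in an RCU section */
	kmem_cache_free(sock_inode_cachep, ei);	/* inode + socket freed right away */
}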



Thanks !

 include/linux/net.h |2 +-
 include/net/sock.h  |8 ++--
 net/core/stream.c   |2 +-
 net/sctp/socket.c   |6 +-
 net/socket.c|   16 +---
 5 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/include/linux/net.h b/include/linux/net.h
index 70ac5e28e6b7..6b93ec234ce8 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -202,7 +202,7 @@ enum {
SOCK_WAKE_URG,
 };
 
-int sock_wake_async(struct socket *sk, int how, int band);
+int sock_wake_async(struct socket *sock, struct socket_wq *wq, int how, int band);
 int sock_register(const struct net_proto_family *fam);
 void sock_unregister(int family);
 int __sock_create(struct net *net, int family, int type, int proto,
diff --git a/include/net/sock.h b/include/net/sock.h
index 7f89e4ba18d1..af78f9e7a218 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2007,8 +2007,12 @@ static inline unsigned long sock_wspace(struct sock *sk)
 
 static inline void sk_wake_async(struct sock *sk, int how, int band)
 {
-   if (sock_flag(sk, SOCK_FASYNC))
-   sock_wake_async(sk->sk_socket, how, band);
+   if (sock_flag(sk, SOCK_FASYNC)) {
+   rcu_read_lock();
+   sock_wake_async(sk->sk_socket, rcu_dereference(sk->sk_wq),
+   how, band);
+   rcu_read_unlock();
+   }
 }
 
 /* Since sk_{r,w}mem_alloc sums skb->truesize, even a small frame might
diff --git a/net/core/stream.c b/net/core/stream.c
index d70f77a0c889..92682228919d 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -39,7 +39,7 @@ void sk_stream_write_space(struct sock *sk)
wake_up_interruptible_poll(&wq->wait, POLLOUT |
POLLWRNORM | POLLWRBAND);
if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
-   sock_wake_async(sock, SOCK_WAKE_SPACE, POLL_OUT);
+   sock_wake_async(sock, wq, SOCK_WAKE_SPACE, POLL_OUT);
rcu_read_unlock();
}
 }
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 897c01c029ca..6ab04866a1e7 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -6817,9 +6817,13 @@ static void __sctp_write_space(struct sctp_association 
*asoc)
 * here by modeling from the current TCP/UDP code.
 * We have not tested with it yet.
 */
-   if (!(sk->sk_shutdown & SEND_SHUTDOWN))
+   if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
+   rcu_read_lock();
sock_wake_async(sock,
+   rcu_dereference(sk->sk_wq),
SOCK_WAKE_SPACE, POLL_OUT);
+   rcu_read_unlock();
+   }
}
}
 }
diff --git a/net/socket.c b/net/socket.c
index dd2c247c99e3..8df62c8bef90 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1058,18 +1058,12 @@ static int sock_fasync(int fd, struct file *filp, int 
on)
 
 /* This function may be called only under socket lock or callback_lock or 
rcu_lock */
 
-int sock_wake_async(struct socket *sock, int how, int band)
+int sock_wake_async(struct socket *sock, struct socket_wq *wq,
+   int how, int band)
 {
-   struct socket_wq *wq;
-
-   if (!sock)
-   return -1;
-   rcu_read_lock();
-   wq = rcu_dereference(sock->wq);
-   if (!wq || !wq->fasync_list) {
-   rcu_read_unlock();
+   if (!sock || !wq || !wq->fasync_list)
return -1;
-   }
+
switch (how) {
case SOCK_WAKE_WAITD:
if (test_bit(SOCK_ASYNC_WAITDATA, &sock->flags))
@@ -1086,7 +1080,7 @@ call_kill:
case SOCK_WAKE_URG:
kill_fasync(&wq->fasync_list, SIGURG, band);
}
-   rcu_read_unlock();
+
return 0;
 }
 EXPORT_SYMBOL(sock_wake_async);




Re: use-after-free in sock_wake_async

2015-11-24 Thread Eric Dumazet
On Tue, 2015-11-24 at 18:28 -0800, Eric Dumazet wrote:
> Dmitry, could you test following patch with your setup ?
> 
> ( I tried to reproduce the error you reported but could not )
> 
> Inode can be freed (without RCU grace period), but not the socket or
> sk_wq
> 
> By using sk_wq in the critical paths, we do not dereference the inode,
> 
> 

I finally was able to reproduce the warning (with more instances running
in parallel), and apparently this patch solves the problem.

> 
> Thanks !
> 
>  include/linux/net.h |2 +-
>  include/net/sock.h  |8 ++--
>  net/core/stream.c   |2 +-
>  net/sctp/socket.c   |6 +-
>  net/socket.c|   16 +---
>  5 files changed, 18 insertions(+), 16 deletions(-)
> 
> diff --git a/include/linux/net.h b/include/linux/net.h
> index 70ac5e28e6b7..6b93ec234ce8 100644
> --- a/include/linux/net.h
> +++ b/include/linux/net.h
> @@ -202,7 +202,7 @@ enum {
>   SOCK_WAKE_URG,
>  };
>  
> -int sock_wake_async(struct socket *sk, int how, int band);
> +int sock_wake_async(struct socket *sock, struct socket_wq *wq, int how, int band);
>  int sock_register(const struct net_proto_family *fam);
>  void sock_unregister(int family);
>  int __sock_create(struct net *net, int family, int type, int proto,
> diff --git a/include/net/sock.h b/include/net/sock.h
> index 7f89e4ba18d1..af78f9e7a218 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -2007,8 +2007,12 @@ static inline unsigned long sock_wspace(struct sock 
> *sk)
>  
>  static inline void sk_wake_async(struct sock *sk, int how, int band)
>  {
> - if (sock_flag(sk, SOCK_FASYNC))
> - sock_wake_async(sk->sk_socket, how, band);
> + if (sock_flag(sk, SOCK_FASYNC)) {
> + rcu_read_lock();
> + sock_wake_async(sk->sk_socket, rcu_dereference(sk->sk_wq),
> + how, band);
> + rcu_read_unlock();
> + }
>  }
>  
>  /* Since sk_{r,w}mem_alloc sums skb->truesize, even a small frame might
> diff --git a/net/core/stream.c b/net/core/stream.c
> index d70f77a0c889..92682228919d 100644
> --- a/net/core/stream.c
> +++ b/net/core/stream.c
> @@ -39,7 +39,7 @@ void sk_stream_write_space(struct sock *sk)
>   wake_up_interruptible_poll(&wq->wait, POLLOUT |
>   POLLWRNORM | POLLWRBAND);
>   if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
> - sock_wake_async(sock, SOCK_WAKE_SPACE, POLL_OUT);
> + sock_wake_async(sock, wq, SOCK_WAKE_SPACE, POLL_OUT);
>   rcu_read_unlock();
>   }
>  }
> diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> index 897c01c029ca..6ab04866a1e7 100644
> --- a/net/sctp/socket.c
> +++ b/net/sctp/socket.c
> @@ -6817,9 +6817,13 @@ static void __sctp_write_space(struct sctp_association 
> *asoc)
>* here by modeling from the current TCP/UDP code.
>* We have not tested with it yet.
>*/
> - if (!(sk->sk_shutdown & SEND_SHUTDOWN))
> + if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
> + rcu_read_lock();
>   sock_wake_async(sock,
> + rcu_dereference(sk->sk_wq),
>   SOCK_WAKE_SPACE, POLL_OUT);
> + rcu_read_unlock();
> + }
>   }
>   }
>  }
> diff --git a/net/socket.c b/net/socket.c
> index dd2c247c99e3..8df62c8bef90 100644
> --- a/net/socket.c
> +++ b/net/socket.c
> @@ -1058,18 +1058,12 @@ static int sock_fasync(int fd, struct file *filp, int 
> on)
>  
>  /* This function may be called only under socket lock or callback_lock or 
> rcu_lock */
>  
> -int sock_wake_async(struct socket *sock, int how, int band)
> +int sock_wake_async(struct socket *sock, struct socket_wq *wq,
> + int how, int band)
>  {
> - struct socket_wq *wq;
> -
> - if (!sock)
> - return -1;
> - rcu_read_lock();
> - wq = rcu_dereference(sock->wq);
> - if (!wq || !wq->fasync_list) {
> - rcu_read_unlock();
> + if (!sock || !wq || !wq->fasync_list)
>   return -1;
> - }
> +
>   switch (how) {
>   case SOCK_WAKE_WAITD:
>   if (test_bit(SOCK_ASYNC_WAITDATA, &sock->flags))
> @@ -1086,7 +1080,7 @@ call_kill:
>   case SOCK_WAKE_URG:
>   kill_fasync(&wq->fasync_list, SIGURG, band);
>   }
> - rcu_read_unlock();
> +
>   return 0;
>  }
>  EXPORT_SYMBOL(sock_wake_async);
> 




Re: use-after-free in sock_wake_async

2015-11-24 Thread Eric Dumazet
On Tue, Nov 24, 2015 at 2:03 PM, Eric Dumazet  wrote:

>
> This might be a data race in sk_wake_async() if inlined by compiler
> (see https://lkml.org/lkml/2015/11/24/680 for another example)
>
> KASAN adds register pressure and compiler can then do 'stupid' things :(
>
> diff --git a/include/net/sock.h b/include/net/sock.h
> index 7f89e4ba18d1..2af6222ccc67 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -2008,7 +2008,7 @@ static inline unsigned long sock_wspace(struct sock *sk)
>  static inline void sk_wake_async(struct sock *sk, int how, int band)
>  {
> if (sock_flag(sk, SOCK_FASYNC))
> -   sock_wake_async(sk->sk_socket, how, band);
> +   sock_wake_async(READ_ONCE(sk->sk_socket), how, band);
>  }
>
>  /* Since sk_{r,w}mem_alloc sums skb->truesize, even a small frame might

Oh well, sock_wake_async() can not be inlined, scratch this.


Re: use-after-free in sock_wake_async

2015-11-24 Thread Jason Baron


On 11/24/2015 10:21 AM, Eric Dumazet wrote:
> On Tue, Nov 24, 2015 at 6:18 AM, Dmitry Vyukov <dvyu...@google.com> wrote:
>> Hello,
>>
>> The following program triggers use-after-free in sock_wake_async:
>>
>> // autogenerated by syzkaller (http://github.com/google/syzkaller)
>> #include <pthread.h>
>> #include <stdint.h>
>> #include <sys/syscall.h>
>> #include <unistd.h>
>>
>> long r2 = -1;
>> long r3 = -1;
>> long r7 = -1;
>>
>> void *thr0(void *arg)
>> {
>> syscall(SYS_splice, r2, 0x0ul, r7, 0x0ul, 0x4ul, 0x8ul);
>> return 0;
>> }
>>
>> void *thr1(void *arg)
>> {
>> syscall(SYS_close, r2, 0, 0, 0, 0, 0);
>> return 0;
>> }
>>
>> void *thr2(void *arg)
>> {
>> syscall(SYS_write, r3, 0x20003000ul, 0xe7ul, 0, 0, 0);
>> return 0;
>> }
>>
>> int main()
>> {
>> long r0 = syscall(SYS_mmap, 0x2000ul, 0x1ul, 0x3ul,
>> 0x32ul, 0xul, 0x0ul);
>> long r1 = syscall(SYS_socketpair, 0x1ul, 0x1ul, 0x0ul,
>> 0x2000ul, 0, 0);
>> r2 = *(uint32_t*)0x2000;
>> r3 = *(uint32_t*)0x2004;
>>
>> *(uint64_t*)0x20001000 = 0x4;
>> long r5 = syscall(SYS_ioctl, r2, 0x5452ul, 0x20001000ul, 0, 0, 0);
>>
>> long r6 = syscall(SYS_pipe2, 0x20002000ul, 0x80800ul, 0, 0, 0, 0);
>> r7 = *(uint32_t*)0x20002004;
>>
>> pthread_t th[3];
>> pthread_create(&th[0], 0, thr0, 0);
>> pthread_create(&th[1], 0, thr1, 0);
>> pthread_create(&th[2], 0, thr2, 0);
>> pthread_join(th[0], 0);
>> pthread_join(th[1], 0);
>> pthread_join(th[2], 0);
>>     return 0;
>> }
>>
>>
>> The use-after-free fires after a minute of running it in a tight
>> parallel loop. I use the stress utility for this:
>>
>> $ go get golang.org/x/tools/cmd/stress
>> $ stress -p 128 -failure "ignore" ./a.out
>>
>>
>> ==
>> BUG: KASAN: use-after-free in sock_wake_async+0x325/0x340 at addr
>> 880061d1ad10
>> Read of size 8 by task a.out/23178
>> =
>> BUG sock_inode_cache (Not tainted): kasan: bad access detected
>> -
>>
>> Disabling lock debugging due to kernel taint
>> INFO: Allocated in sock_alloc_inode+0x1d/0x220 age=0 cpu=2 pid=23183
>> [<  none  >] kmem_cache_alloc+0x1a6/0x1f0 mm/slub.c:2514
>> [<  none  >] sock_alloc_inode+0x1d/0x220 net/socket.c:250
>> [<  none  >] alloc_inode+0x61/0x180 fs/inode.c:198
>> [<  none  >] new_inode_pseudo+0x17/0xe0 fs/inode.c:878
>> [<  none  >] sock_alloc+0x3d/0x260 net/socket.c:540
>> [<  none  >] __sock_create+0xa7/0x620 net/socket.c:1133
>> [< inline >] sock_create net/socket.c:1209
>> [< inline >] SYSC_socketpair net/socket.c:1281
>> [<  none  >] SyS_socketpair+0x112/0x4e0 net/socket.c:1260
>> [<  none  >] entry_SYSCALL_64_fastpath+0x16/0x7a
>> arch/x86/entry/entry_64.S:185
>>
>> INFO: Freed in sock_destroy_inode+0x56/0x70 age=0 cpu=2 pid=23185
>> [<  none  >] kmem_cache_free+0x24e/0x260 mm/slub.c:2742
>> [<  none  >] sock_destroy_inode+0x56/0x70 net/socket.c:279
>> [<  none  >] destroy_inode+0xc4/0x120 fs/inode.c:255
>> [<  none  >] evict+0x36b/0x580 fs/inode.c:559
>> [< inline >] iput_final fs/inode.c:1477
>> [<  none  >] iput+0x4a0/0x790 fs/inode.c:1504
>> [< inline >] dentry_iput fs/dcache.c:358
>> [<  none  >] __dentry_kill+0x4fe/0x700 fs/dcache.c:543
>> [< inline >] dentry_kill fs/dcache.c:587
>> [<  none  >] dput+0x6ab/0x7a0 fs/dcache.c:796
>> [<  none  >] __fput+0x3fb/0x6e0 fs/file_table.c:226
>> [<  none  >] fput+0x15/0x20 fs/file_table.c:244
>> [<  none  >] task_work_run+0x163/0x1f0 kernel/task_work.c:115
>> (discriminator 1)
>> [< inline >] tracehook_notify_resume include/linux/tracehook.h:191
>> [<  none  >] exit_to_usermode_loop+0x180/0x1a0
>> arch/x86/entry/common.c:251
>> [< inline >] prepare_exit_to_usermode arch/x86/entry/common.c:282
>> [<

Re: use-after-free in sock_wake_async

2015-11-24 Thread Al Viro
On Tue, Nov 24, 2015 at 04:30:01PM -0500, Jason Baron wrote:

> So looking at this trace I think its the other->sk_socket that gets
> freed and then we call sk_wake_async() on it.
> 
> We could I think grab the socket reference there with unix_state_lock(),
> since that is held by unix_release_sock() before the final iput() is called.
> 
> So something like below might work (compile tested only):

Ewww...

> +struct socket *unix_peer_get_socket(struct sock *s)
> +{
> + struct socket *peer;
> +
> + unix_state_lock(s);
> + peer = s->sk_socket;
> + if (peer)
> + __iget(SOCK_INODE(s->sk_socket));
> + unix_state_unlock(s);
> +
> + return peer;

>  out_err:
> + if (other_socket)
> + iput(SOCK_INODE(other_socket));
>   scm_destroy();
>   return sent ? : err;
>  }

Interplay between socket, file and inode lifetimes is already too convoluted,
and this just makes it nastier.  I don't have a ready solution at the moment,
but this one is too ugly to live.

Al, digging through RTFS(net/unix/af_unix.c) right now...


Re: use-after-free in sock_wake_async

2015-11-24 Thread Eric Dumazet
On Tue, Nov 24, 2015 at 1:45 PM, Benjamin LaHaise  wrote:
> On Tue, Nov 24, 2015 at 04:30:01PM -0500, Jason Baron wrote:
>> So looking at this trace I think its the other->sk_socket that gets
>> freed and then we call sk_wake_async() on it.
>>
>> We could I think grab the socket reference there with unix_state_lock(),
>> since that is held by unix_release_sock() before the final iput() is called.
>>
>> So something like below might work (compile tested only):
>
> That just adds the performance regression back in.  It should be possible
> to protect the other socket dereference using RCU.  I haven't had time to
> look at this yet today, but will try to find some time this evening to come
> up with a suggested patch.
>
> -ben
>
>> diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
>> index aaa0b58..2b014f1 100644
>> --- a/net/unix/af_unix.c
>> +++ b/net/unix/af_unix.c
>> @@ -196,6 +196,19 @@ static inline int unix_recvq_full(struct sock const
>> *sk)
>>   return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
>>  }
>>
>> +struct socket *unix_peer_get_socket(struct sock *s)
>> +{
>> + struct socket *peer;
>> +
>> + unix_state_lock(s);
>> + peer = s->sk_socket;
>> + if (peer)
>> + __iget(SOCK_INODE(s->sk_socket));
>> + unix_state_unlock(s);
>> +
>> + return peer;
>> +}
>> +
>>  struct sock *unix_peer_get(struct sock *s)
>>  {
>>   struct sock *peer;
>> @@ -1639,6 +1652,7 @@ static int unix_stream_sendmsg(struct socket
>> *sock, struct msghdr *msg,
>>  {
>>   struct sock *sk = sock->sk;
>>   struct sock *other = NULL;
>> + struct socket *other_socket = NULL;
>>   int err, size;
>>   struct sk_buff *skb;
>>   int sent = 0;
>> @@ -1662,7 +1676,10 @@ static int unix_stream_sendmsg(struct socket
>> *sock, struct msghdr *msg,
>>   } else {
>>   err = -ENOTCONN;
>>   other = unix_peer(sk);
>> - if (!other)
>> + if (other)
>> + other_socket = unix_peer_get_socket(other);
>> +
>> + if (!other_socket)
>>   goto out_err;
>>   }
>>
>> @@ -1721,6 +1738,9 @@ static int unix_stream_sendmsg(struct socket
>> *sock, struct msghdr *msg,
>>   sent += size;
>>   }
>>
>> + if (other_socket)
>> + iput(SOCK_INODE(other_socket));
>> +
>>   scm_destroy(&scm);
>>
>>   return sent;
>> @@ -1733,6 +1753,8 @@ pipe_err:
>>   send_sig(SIGPIPE, current, 0);
>>   err = -EPIPE;
>>  out_err:
>> + if (other_socket)
>> + iput(SOCK_INODE(other_socket));
>>   scm_destroy(&scm);
>>   return sent ? : err;
>>  }
>
> --
> "Thought is the essence of where you are now."


This might be a data race in sk_wake_async() if inlined by compiler
(see https://lkml.org/lkml/2015/11/24/680 for another example)

KASAN adds register pressure and compiler can then do 'stupid' things :(

diff --git a/include/net/sock.h b/include/net/sock.h
index 7f89e4ba18d1..2af6222ccc67 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2008,7 +2008,7 @@ static inline unsigned long sock_wspace(struct sock *sk)
 static inline void sk_wake_async(struct sock *sk, int how, int band)
 {
if (sock_flag(sk, SOCK_FASYNC))
-   sock_wake_async(sk->sk_socket, how, band);
+   sock_wake_async(READ_ONCE(sk->sk_socket), how, band);
 }

 /* Since sk_{r,w}mem_alloc sums skb->truesize, even a small frame might


Re: use-after-free in sock_wake_async

2015-11-24 Thread Rainer Weikusat
Eric Dumazet <eduma...@google.com> writes:
> On Tue, Nov 24, 2015 at 6:18 AM, Dmitry Vyukov <dvyu...@google.com> wrote:
>> Hello,
>>
>> The following program triggers use-after-free in sock_wake_async:

[...]

>> void *thr1(void *arg)
>> {
>> syscall(SYS_close, r2, 0, 0, 0, 0, 0);
>> return 0;
>> }
>>
>> void *thr2(void *arg)
>> {
>> syscall(SYS_write, r3, 0x20003000ul, 0xe7ul, 0, 0, 0);
>> return 0;
>> }

[...]

>> pthread_t th[3];
>> pthread_create(&th[0], 0, thr0, 0);
>> pthread_create(&th[1], 0, thr1, 0);
>> pthread_create(&th[2], 0, thr2, 0);
>> pthread_join(th[0], 0);
>> pthread_join(th[1], 0);
>> pthread_join(th[2], 0);
>> return 0;
>> }

[...]

> Looks like commit 830a1e5c212fb3fdc83b66359c780c3b3a294897 should be reverted 
> ?
>
> commit 830a1e5c212fb3fdc83b66359c780c3b3a294897
> Author: Benjamin LaHaise <benjamin.c.laha...@intel.com>
> Date:   Tue Dec 13 23:22:32 2005 -0800
>
> [AF_UNIX]: Remove superfluous reference counting in unix_stream_sendmsg
>
> AF_UNIX stream socket performance on P4 CPUs tends to suffer due to a
> lot of pipeline flushes from atomic operations.  The patch below
> removes the sock_hold() and sock_put() in unix_stream_sendmsg().  This
> should be safe as the socket still holds a reference to its peer which
> is only released after the file descriptor's final user invokes
> unix_release_sock().  The only consideration is that we must add a
> memory barrier before setting the peer initially.
>
> Signed-off-by: Benjamin LaHaise <benjamin.c.laha...@intel.com>
> Signed-off-by: David S. Miller <da...@davemloft.net>

JFTR: This seems to be unrelated. (As far as I understand this), the
problem is that sk_wake_async accesses sk->sk_socket. That's invoked via
the

other->sk_data_ready(other)

in unix_stream_sendmsg after an

unix_state_unlock(other);

because of this, it can race with the code in unix_release_sock clearing
this pointer (via sock_orphan). The structure this pointer points to is
freed via iput in sock_release (net/socket.c) after the af_unix release
routine returned (it's really one part of a "twin structure" with the
socket inode being the other).

A quick way to test if this was true would be to swap the

unix_state_unlock(other);
other->sk_data_ready(other);

in unix_stream_sendmsg and in case it is, a very 'hacky' fix could be to
put a pointer to the socket inode into the struct unix_sock, do an iget
on that in unix_create1 and a corresponding iput in
unix_sock_destructor.
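A minimal sketch of that experiment, assuming the tail of the send loop in
net/unix/af_unix.c's unix_stream_sendmsg() (the wakeup would then run while
the peer's state lock still excludes unix_release_sock() and its sock_orphan()
call):

	/* current ordering: the wakeup runs after the lock is dropped */
	skb_queue_tail(&other->sk_receive_queue, skb);
	unix_state_unlock(other);
	other->sk_data_ready(other);

	/* proposed test: wake the reader before dropping the lock */
	skb_queue_tail(&other->sk_receive_queue, skb);
	other->sk_data_ready(other);
	unix_state_unlock(other);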


use-after-free in sock_wake_async

2015-11-24 Thread Dmitry Vyukov
Hello,

The following program triggers use-after-free in sock_wake_async:

// autogenerated by syzkaller (http://github.com/google/syzkaller)
#include <pthread.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

long r2 = -1;
long r3 = -1;
long r7 = -1;

void *thr0(void *arg)
{
syscall(SYS_splice, r2, 0x0ul, r7, 0x0ul, 0x4ul, 0x8ul);
return 0;
}

void *thr1(void *arg)
{
syscall(SYS_close, r2, 0, 0, 0, 0, 0);
return 0;
}

void *thr2(void *arg)
{
syscall(SYS_write, r3, 0x20003000ul, 0xe7ul, 0, 0, 0);
return 0;
}

int main()
{
long r0 = syscall(SYS_mmap, 0x2000ul, 0x1ul, 0x3ul,
0x32ul, 0xul, 0x0ul);
long r1 = syscall(SYS_socketpair, 0x1ul, 0x1ul, 0x0ul,
0x2000ul, 0, 0);
r2 = *(uint32_t*)0x2000;
r3 = *(uint32_t*)0x2004;

*(uint64_t*)0x20001000 = 0x4;
long r5 = syscall(SYS_ioctl, r2, 0x5452ul, 0x20001000ul, 0, 0, 0);

long r6 = syscall(SYS_pipe2, 0x20002000ul, 0x80800ul, 0, 0, 0, 0);
r7 = *(uint32_t*)0x20002004;

pthread_t th[3];
pthread_create(&th[0], 0, thr0, 0);
pthread_create(&th[1], 0, thr1, 0);
pthread_create(&th[2], 0, thr2, 0);
pthread_join(th[0], 0);
pthread_join(th[1], 0);
pthread_join(th[2], 0);
return 0;
}


The use-after-free fires after a minute of running it in a tight
parallel loop. I use the stress utility for this:

$ go get golang.org/x/tools/cmd/stress
$ stress -p 128 -failure "ignore" ./a.out


==
BUG: KASAN: use-after-free in sock_wake_async+0x325/0x340 at addr
880061d1ad10
Read of size 8 by task a.out/23178
=
BUG sock_inode_cache (Not tainted): kasan: bad access detected
-

Disabling lock debugging due to kernel taint
INFO: Allocated in sock_alloc_inode+0x1d/0x220 age=0 cpu=2 pid=23183
[<  none  >] kmem_cache_alloc+0x1a6/0x1f0 mm/slub.c:2514
[<  none  >] sock_alloc_inode+0x1d/0x220 net/socket.c:250
[<  none  >] alloc_inode+0x61/0x180 fs/inode.c:198
[<  none  >] new_inode_pseudo+0x17/0xe0 fs/inode.c:878
[<  none  >] sock_alloc+0x3d/0x260 net/socket.c:540
[<  none  >] __sock_create+0xa7/0x620 net/socket.c:1133
[< inline >] sock_create net/socket.c:1209
[< inline >] SYSC_socketpair net/socket.c:1281
[<  none  >] SyS_socketpair+0x112/0x4e0 net/socket.c:1260
[<  none  >] entry_SYSCALL_64_fastpath+0x16/0x7a
arch/x86/entry/entry_64.S:185

INFO: Freed in sock_destroy_inode+0x56/0x70 age=0 cpu=2 pid=23185
[<  none  >] kmem_cache_free+0x24e/0x260 mm/slub.c:2742
[<  none  >] sock_destroy_inode+0x56/0x70 net/socket.c:279
[<  none  >] destroy_inode+0xc4/0x120 fs/inode.c:255
[<  none  >] evict+0x36b/0x580 fs/inode.c:559
[< inline >] iput_final fs/inode.c:1477
[<  none  >] iput+0x4a0/0x790 fs/inode.c:1504
[< inline >] dentry_iput fs/dcache.c:358
[<  none  >] __dentry_kill+0x4fe/0x700 fs/dcache.c:543
[< inline >] dentry_kill fs/dcache.c:587
[<  none  >] dput+0x6ab/0x7a0 fs/dcache.c:796
[<  none  >] __fput+0x3fb/0x6e0 fs/file_table.c:226
[<  none  >] fput+0x15/0x20 fs/file_table.c:244
[<  none  >] task_work_run+0x163/0x1f0 kernel/task_work.c:115
(discriminator 1)
[< inline >] tracehook_notify_resume include/linux/tracehook.h:191
[<  none  >] exit_to_usermode_loop+0x180/0x1a0
arch/x86/entry/common.c:251
[< inline >] prepare_exit_to_usermode arch/x86/entry/common.c:282
[<  none  >] syscall_return_slowpath+0x19f/0x210
arch/x86/entry/common.c:344
[<  none  >] int_ret_from_sys_call+0x25/0x9f
arch/x86/entry/entry_64.S:281

INFO: Slab 0xea0001874600 objects=25 used=2 fp=0x880061d1c100
flags=0x5004080
INFO: Object 0x880061d1ad00 @offset=11520 fp=0x880061d1a300
CPU: 3 PID: 23178 Comm: a.out Tainted: GB   4.4.0-rc1+ #84
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
  880061baf8f0 825d3336 88003e0dc280
 880061d1ad00 880061d18000 880061baf920 81618784
 88003e0dc280 ea0001874600 880061d1ad00 00e7

Call Trace:
 [] __asan_report_load8_noabort+0x3e/0x40
mm/kasan/report.c:280
 [< inline >] __read_once_size include/linux/compiler.h:218
 [] sock_wake_async+0x325/0x340 net/socket.c:1068
 [< inline >] sk_wake_async include/net/sock.h:2011
 [] sock_def_readable+0x1e4/0x290 net/core/sock.c:2312
 [] unix_stream_sendmsg+0x4db/0x930 net/unix/af_unix.c:1864
 [< inline 

Re: use-after-free in sock_wake_async

2015-11-24 Thread Eric Dumazet
On Tue, Nov 24, 2015 at 6:18 AM, Dmitry Vyukov <dvyu...@google.com> wrote:
> Hello,
>
> The following program triggers use-after-free in sock_wake_async:
>
> // autogenerated by syzkaller (http://github.com/google/syzkaller)
> #include <pthread.h>
> #include <stdint.h>
> #include <sys/syscall.h>
> #include <unistd.h>
>
> long r2 = -1;
> long r3 = -1;
> long r7 = -1;
>
> void *thr0(void *arg)
> {
> syscall(SYS_splice, r2, 0x0ul, r7, 0x0ul, 0x4ul, 0x8ul);
> return 0;
> }
>
> void *thr1(void *arg)
> {
> syscall(SYS_close, r2, 0, 0, 0, 0, 0);
> return 0;
> }
>
> void *thr2(void *arg)
> {
> syscall(SYS_write, r3, 0x20003000ul, 0xe7ul, 0, 0, 0);
> return 0;
> }
>
> int main()
> {
> long r0 = syscall(SYS_mmap, 0x2000ul, 0x1ul, 0x3ul,
> 0x32ul, 0xul, 0x0ul);
> long r1 = syscall(SYS_socketpair, 0x1ul, 0x1ul, 0x0ul,
> 0x2000ul, 0, 0);
> r2 = *(uint32_t*)0x2000;
> r3 = *(uint32_t*)0x2004;
>
> *(uint64_t*)0x20001000 = 0x4;
> long r5 = syscall(SYS_ioctl, r2, 0x5452ul, 0x20001000ul, 0, 0, 0);
>
> long r6 = syscall(SYS_pipe2, 0x20002000ul, 0x80800ul, 0, 0, 0, 0);
> r7 = *(uint32_t*)0x20002004;
>
> pthread_t th[3];
> pthread_create(&th[0], 0, thr0, 0);
> pthread_create(&th[1], 0, thr1, 0);
> pthread_create(&th[2], 0, thr2, 0);
> pthread_join(th[0], 0);
> pthread_join(th[1], 0);
> pthread_join(th[2], 0);
> return 0;
> }
>
>
> The use-after-free fires after a minute of running it in a tight
> parallel loop. I use the stress utility for this:
>
> $ go get golang.org/x/tools/cmd/stress
> $ stress -p 128 -failure "ignore" ./a.out
>
>
> ==
> BUG: KASAN: use-after-free in sock_wake_async+0x325/0x340 at addr
> 880061d1ad10
> Read of size 8 by task a.out/23178
> =
> BUG sock_inode_cache (Not tainted): kasan: bad access detected
> -
>
> Disabling lock debugging due to kernel taint
> INFO: Allocated in sock_alloc_inode+0x1d/0x220 age=0 cpu=2 pid=23183
> [<  none  >] kmem_cache_alloc+0x1a6/0x1f0 mm/slub.c:2514
> [<  none  >] sock_alloc_inode+0x1d/0x220 net/socket.c:250
> [<  none  >] alloc_inode+0x61/0x180 fs/inode.c:198
> [<  none  >] new_inode_pseudo+0x17/0xe0 fs/inode.c:878
> [<  none  >] sock_alloc+0x3d/0x260 net/socket.c:540
> [<  none  >] __sock_create+0xa7/0x620 net/socket.c:1133
> [< inline >] sock_create net/socket.c:1209
> [< inline >] SYSC_socketpair net/socket.c:1281
> [<  none  >] SyS_socketpair+0x112/0x4e0 net/socket.c:1260
> [<  none  >] entry_SYSCALL_64_fastpath+0x16/0x7a
> arch/x86/entry/entry_64.S:185
>
> INFO: Freed in sock_destroy_inode+0x56/0x70 age=0 cpu=2 pid=23185
> [<  none  >] kmem_cache_free+0x24e/0x260 mm/slub.c:2742
> [<  none  >] sock_destroy_inode+0x56/0x70 net/socket.c:279
> [<  none  >] destroy_inode+0xc4/0x120 fs/inode.c:255
> [<  none  >] evict+0x36b/0x580 fs/inode.c:559
> [< inline >] iput_final fs/inode.c:1477
> [<  none  >] iput+0x4a0/0x790 fs/inode.c:1504
> [< inline >] dentry_iput fs/dcache.c:358
> [<  none  >] __dentry_kill+0x4fe/0x700 fs/dcache.c:543
> [< inline >] dentry_kill fs/dcache.c:587
> [<  none  >] dput+0x6ab/0x7a0 fs/dcache.c:796
> [<  none  >] __fput+0x3fb/0x6e0 fs/file_table.c:226
> [<  none  >] fput+0x15/0x20 fs/file_table.c:244
> [<  none  >] task_work_run+0x163/0x1f0 kernel/task_work.c:115
> (discriminator 1)
> [< inline >] tracehook_notify_resume include/linux/tracehook.h:191
> [<  none  >] exit_to_usermode_loop+0x180/0x1a0
> arch/x86/entry/common.c:251
> [< inline >] prepare_exit_to_usermode arch/x86/entry/common.c:282
> [<  none  >] syscall_return_slowpath+0x19f/0x210
> arch/x86/entry/common.c:344
> [<  none  >] int_ret_from_sys_call+0x25/0x9f
> arch/x86/entry/entry_64.S:281
>
> INFO: Slab 0xea0001874600 objects=25 used=2 fp=0x880061d1c100
> flags=0x5004080
> INFO: Object 0x880061d1ad00 @offset=11520 fp=0x880061d1a300
> CPU: 3 PID: 23178 Comm: a.out Tainted: GB   4.4.0-rc1+ #84
> Hardware name: QEMU Stand

Re: use-after-free in sock_wake_async

2015-11-24 Thread Eric Dumazet
On Tue, 2015-11-24 at 07:21 -0800, Eric Dumazet wrote:
> On Tue, Nov 24, 2015 at 6:18 AM, Dmitry Vyukov <dvyu...@google.com> wrote:
> > Hello,
> >
> > The following program triggers use-after-free in sock_wake_async:
> >
> > // autogenerated by syzkaller (http://github.com/google/syzkaller)
> > #include <pthread.h>
> > #include <stdint.h>
> > #include <sys/syscall.h>
> > #include <unistd.h>
> >
> > long r2 = -1;
> > long r3 = -1;
> > long r7 = -1;
> >
> > void *thr0(void *arg)
> > {
> > syscall(SYS_splice, r2, 0x0ul, r7, 0x0ul, 0x4ul, 0x8ul);
> > return 0;
> > }
> >
> > void *thr1(void *arg)
> > {
> > syscall(SYS_close, r2, 0, 0, 0, 0, 0);
> > return 0;
> > }
> >
> > void *thr2(void *arg)
> > {
> > syscall(SYS_write, r3, 0x20003000ul, 0xe7ul, 0, 0, 0);
> > return 0;
> > }
> >
> > int main()
> > {
> > long r0 = syscall(SYS_mmap, 0x2000ul, 0x1ul, 0x3ul,
> > 0x32ul, 0xul, 0x0ul);
> > long r1 = syscall(SYS_socketpair, 0x1ul, 0x1ul, 0x0ul,
> > 0x2000ul, 0, 0);
> > r2 = *(uint32_t*)0x2000;
> > r3 = *(uint32_t*)0x2004;
> >
> > *(uint64_t*)0x20001000 = 0x4;
> > long r5 = syscall(SYS_ioctl, r2, 0x5452ul, 0x20001000ul, 0, 0, 0);
> >
> > long r6 = syscall(SYS_pipe2, 0x20002000ul, 0x80800ul, 0, 0, 0, 0);
> > r7 = *(uint32_t*)0x20002004;
> >
> > pthread_t th[3];
> > pthread_create(&th[0], 0, thr0, 0);
> > pthread_create(&th[1], 0, thr1, 0);
> > pthread_create(&th[2], 0, thr2, 0);
> > pthread_join(th[0], 0);
> > pthread_join(th[1], 0);
> > pthread_join(th[2], 0);
> >     return 0;
> > }
> >
> >
> > The use-after-free fires after a minute of running it in a tight
> > parallel loop. I use the stress utility for this:
> >
> > $ go get golang.org/x/tools/cmd/stress
> > $ stress -p 128 -failure "ignore" ./a.out
> >
> >
> > ==
> > BUG: KASAN: use-after-free in sock_wake_async+0x325/0x340 at addr
> > 880061d1ad10
> > Read of size 8 by task a.out/23178
> > =
> > BUG sock_inode_cache (Not tainted): kasan: bad access detected
> > -
> >
> > Disabling lock debugging due to kernel taint
> > INFO: Allocated in sock_alloc_inode+0x1d/0x220 age=0 cpu=2 pid=23183
> > [<  none  >] kmem_cache_alloc+0x1a6/0x1f0 mm/slub.c:2514
> > [<  none  >] sock_alloc_inode+0x1d/0x220 net/socket.c:250
> > [<  none  >] alloc_inode+0x61/0x180 fs/inode.c:198
> > [<  none  >] new_inode_pseudo+0x17/0xe0 fs/inode.c:878
> > [<  none  >] sock_alloc+0x3d/0x260 net/socket.c:540
> > [<  none  >] __sock_create+0xa7/0x620 net/socket.c:1133
> > [< inline >] sock_create net/socket.c:1209
> > [< inline >] SYSC_socketpair net/socket.c:1281
> > [<  none  >] SyS_socketpair+0x112/0x4e0 net/socket.c:1260
> > [<  none  >] entry_SYSCALL_64_fastpath+0x16/0x7a
> > arch/x86/entry/entry_64.S:185
> >
> > INFO: Freed in sock_destroy_inode+0x56/0x70 age=0 cpu=2 pid=23185
> > [<  none  >] kmem_cache_free+0x24e/0x260 mm/slub.c:2742
> > [<  none  >] sock_destroy_inode+0x56/0x70 net/socket.c:279
> > [<  none  >] destroy_inode+0xc4/0x120 fs/inode.c:255
> > [<  none  >] evict+0x36b/0x580 fs/inode.c:559
> > [< inline >] iput_final fs/inode.c:1477
> > [<  none  >] iput+0x4a0/0x790 fs/inode.c:1504
> > [< inline >] dentry_iput fs/dcache.c:358
> > [<  none  >] __dentry_kill+0x4fe/0x700 fs/dcache.c:543
> > [< inline >] dentry_kill fs/dcache.c:587
> > [<  none  >] dput+0x6ab/0x7a0 fs/dcache.c:796
> > [<  none  >] __fput+0x3fb/0x6e0 fs/file_table.c:226
> > [<  none  >] fput+0x15/0x20 fs/file_table.c:244
> > [<  none  >] task_work_run+0x163/0x1f0 kernel/task_work.c:115
> > (discriminator 1)
> > [< inline >] tracehook_notify_resume include/linux/tracehook.h:191
> > [<  none  >] exit_to_usermode_loop+0x180/0x1a0
> > arch/x86/entry/common.c:25