[patch] netlink.7: srcfix Change buffer size in example code about reading netlink message.

2016-11-14 Thread dwilder
The example code in netlink(7) (for reading netlink message) suggests 
using
a 4k read buffer with recvmsg.  This can cause truncated messages on 
systems

using a page size is >4096.  Please see:
linux/include/linux/netlink.h (in the kernel source)


/*
 *  skb should fit one page. This choice is good for headerless 
malloc.

 *  But we should limit to 8K so that userspace does not have to
 *  use enormous buffer sizes on recvmsg() calls just to avoid
 *  MSG_TRUNC when PAGE_SIZE is very large.
 */
#if PAGE_SIZE < 8192UL
#define NLMSG_GOODSIZE  SKB_WITH_OVERHEAD(PAGE_SIZE)
#else
#define NLMSG_GOODSIZE  SKB_WITH_OVERHEAD(8192UL)
#endif

#define NLMSG_DEFAULT_SIZE (NLMSG_GOODSIZE - NLMSG_HDRLEN)


I was troubleshooting some up-stream code on a ppc64le system
(page:size of 64k) This code had duplicated the example from netlink(7) 
and
was using a 4k buffer.  On x86-64 with a 4k page size this is not a 
problem,
however on the 64k page system some messages were truncated.  Using an 
8k buffer

as implied in netlink.h prevents problems with any page size.

Lets change the example so others don't propagate the problem further.

Signed-off-by David Wilder 

--- man7/netlink.7.orig 2016-11-14 13:30:36.522101156 -0800
+++ man7/netlink.7  2016-11-14 13:30:51.002086354 -0800
@@ -511,7 +511,7 @@
 .in +4n
 .nf
 int len;
-char buf[4096];
+char buf[8192];
 struct iovec iov = { buf, sizeof(buf) };
 struct sockaddr_nl sa;
 struct msghdr msg;



Re: Double free of dst_entry in ipv4_dst_destroy()

2015-12-14 Thread dwilder

Eric -
With this patch applied the test ran clean for 2 days.

Thanks for your help.

Quoting Eric Dumazet :


On Fri, 2015-12-11 at 07:48 -0800, Eric Dumazet wrote:

On Fri, 2015-12-11 at 06:23 -0800, Eric Dumazet wrote:
> On Sun, 2015-12-06 at 17:58 -0800, Eric Dumazet wrote:
> > On Sun, 2015-12-06 at 13:03 -0800, Eric Dumazet wrote:
> >
> > > But then when later we promote a skb->dst to a refctounted one
> > > (skb_dst_force(), we might make sure we abort the operation  
if __refcnt

> > > == 0 ( and DST_NOCACHE is in dst->flags)
> > >
> >
> > Minimum patch would be :
> >
>
> Here is a more complete patch, it should fix the issue I think :

Hmm, I'll send a v3, I forgot to test DST_NOCACHE properly.



David, please test the following patch, thanks !

 include/net/dst.h   |   33 +
 include/net/sock.h  |2 +-
 net/ipv4/tcp_ipv4.c |5 ++---
 net/ipv6/tcp_ipv6.c |3 +--
 4 files changed, 37 insertions(+), 6 deletions(-)

diff --git a/include/net/dst.h b/include/net/dst.h
index 1279f9b09791..c7329dcd90cc 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -322,6 +322,39 @@ static inline void skb_dst_force(struct sk_buff *skb)
}
 }

+/**
+ * dst_hold_safe - Take a reference on a dst if possible
+ * @dst: pointer to dst entry
+ *
+ * This helper returns false if it could not safely
+ * take a reference on a dst.
+ */
+static inline bool dst_hold_safe(struct dst_entry *dst)
+{
+   if (dst->flags & DST_NOCACHE)
+   return atomic_inc_not_zero(>__refcnt);
+   dst_hold(dst);
+   return true;
+}
+
+/**
+ * skb_dst_force_safe - makes sure skb dst is refcounted
+ * @skb: buffer
+ *
+ * If dst is not yet refcounted and not destroyed, grab a ref on it.
+ */
+static inline void skb_dst_force_safe(struct sk_buff *skb)
+{
+   if (skb_dst_is_noref(skb)) {
+   struct dst_entry *dst = skb_dst(skb);
+
+   if (!dst_hold_safe(dst))
+   dst = NULL;
+
+   skb->_skb_refdst = (unsigned long)dst;
+   }
+}
+

 /**
  * __skb_tunnel_rx - prepare skb for rx reinsert
diff --git a/include/net/sock.h b/include/net/sock.h
index eaef41433d7a..18322bded064 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -816,7 +816,7 @@ void sk_stream_write_space(struct sock *sk);
 static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb)
 {
/* dont let skb dst not refcounted, we are going to leave rcu lock */
-   skb_dst_force(skb);
+   skb_dst_force_safe(skb);

if (!sk->sk_backlog.tail)
sk->sk_backlog.head = skb;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index db003438aaf5..d8841a2f1569 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1493,7 +1493,7 @@ bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
if (likely(sk->sk_rx_dst))
skb_dst_drop(skb);
else
-   skb_dst_force(skb);
+   skb_dst_force_safe(skb);

__skb_queue_tail(>ucopy.prequeue, skb);
tp->ucopy.memory += skb->truesize;
@@ -1721,8 +1721,7 @@ void inet_sk_rx_dst_set(struct sock *sk, const  
struct sk_buff *skb)

 {
struct dst_entry *dst = skb_dst(skb);

-   if (dst) {
-   dst_hold(dst);
+   if (dst && dst_hold_safe(dst)) {
sk->sk_rx_dst = dst;
inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
}
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index e7aab561b7b4..6b8a8a9091fa 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -93,10 +93,9 @@ static void inet6_sk_rx_dst_set(struct sock *sk,  
const struct sk_buff *skb)

 {
struct dst_entry *dst = skb_dst(skb);

-   if (dst) {
+   if (dst && dst_hold_safe(dst)) {
const struct rt6_info *rt = (const struct rt6_info *)dst;

-   dst_hold(dst);
sk->sk_rx_dst = dst;
inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
inet6_sk(sk)->rx_dst_cookie = rt6_get_cookie(rt);


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Double free of dst_entry in ipv4_dst_destroy()

2015-12-06 Thread dwilder

Hi-

I am seeing a crash on a distro V4.2.3 kernel caused by a double  
release of a dst_entry.  In ipv4_dst_destroy() the call to  
list_empty() finds a poisoned next pointer, indicating the dst_entry  
has already been removed from the list and freed.   The crash occurs  
18 to 24 hours into a run of a network stress exerciser.


[172135.963496] Unable to handle kernel paging request for data at  
address 0x00100108  << poison value

[172135.963913] Faulting instruction address: 0xc097f5ac
[172135.964184] Oops: Kernel access of bad area, sig: 11 [#1]
[172135.964327] SMP NR_CPUS=2048 NUMA PowerNV
[172135.964649] Modules linked in: iptable_filter ip_tables x_tables  
bnx2x cxlflash cxl mdio ses libcrc32c enclosure uio_pdrv_genirq uio  
powernv_rng sunrpc autofs4 ipr
[172135.965403] CPU: 51 PID: 65452 Comm: hxecpu Tainted: GW 
   4.2.3 #2
[172135.965482] task: c01d8a370ff0 ti: c01e2d57c000 task.ti:  
c01e2d57c000
[172135.965564] NIP: c097f5ac LR: c097f59c CTR:  

[172135.965664] REGS: c01e2d57f8e0 TRAP: 0300   Tainted: G 
W(4.2.3)
[172135.965782] MSR: 90009033   CR:  
22322442  XER: 
[172135.966033] CFAR: c0008468 DAR: 00100108 DSISR:  
4200 SOFTE: 1
GPR00: c097f59c c01e2d57fb60  
c151ad00 c01e504de300
GPR04: 0001   
c01fff8af370 c141ad00
GPR08: c16aad00 00200200  
00100100 3d7473696c5f6465
GPR12: 6531303030303063 c7b5e480  
0012 0001
GPR16:  c1431280  
c0ad38f0 7fff
GPR20:  c01e28caf000  
c01e2d57c000 c1429b80
GPR24: 000a   
c01e504ddb30 
GPR28:  c01e2d57c000  
c01e504de300 c01e28caf000  <<< pointer to dst

[172135.967772] NIP [c097f5ac] ipv4_dst_destroy+0x8c/0xc0
[172135.967921] LR [c097f59c] ipv4_dst_destroy+0x7c/0xc0
[172135.968076] Call Trace:
[172135.968133] [c01e2d57fb60] [c097f59c]  
ipv4_dst_destroy+0x7c/0xc0 (unreliable)

[172135.968306] [c01e2d57fbd0] [c093b8e0] dst_destroy+0xf0/0x1a0
[172135.968452] [c01e2d57fc10] [c093bc68]  
dst_destroy_rcu+0x28/0x50
[172135.968600] [c01e2d57fc40] [c01397a0]  
rcu_process_callbacks+0x340/0x6f0

[172135.968768] [c01e2d57fcf0] [c00ba7d8] __do_softirq+0x188/0x3a0
[172135.968913] [c01e2d57fde0] [c00bac68] irq_exit+0xc8/0x100
[172135.969056] [c01e2d57fe00] [c001f734]  
timer_interrupt+0xa4/0xe0
[172135.969223] [c01e2d57fe30] [c0002714]  
decrementer_common+0x114/0x180

[172135.969395] Instruction dump:
[172135.969492] 7fe5fb78 38842968 7fc6f378 3863e580 4811c989 6000  
7fc3f378 481154c1
[172135.969748] 6000 e93f00b8 e95f00b0 7fc3f378   
f949 3d200010 61290100

[172135.970009] ---[ end trace 34f3693ddc2d5aea ]---
--

I added a call to dump_stack() into dst_release() in an attempt to  
catch the two calls to dst_release().


-- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -307,6 +307,12 @@ void dst_release(struct dst_entry *dst)
net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
 __func__, dst, newrefcnt);
if (!newrefcnt && unlikely(dst->flags & DST_NOCACHE))
+
+   if (!list_empty(>rt_uncached)) {
+   printk("%s: dst=%p\n",__func__,dst);
+   dump_stack();
+   }
+
call_rcu(>rcu_head, dst_destroy_rcu);
}
 }

I got lucky and caught the following stack traces on my next run.

[26211.699357] tcp_v4_do_rcv: sk=c01d10a0 skb=c01d22c61d00  
dst=c01d22c62500 sk->sk_rx_dst=c01d22c62500

[26211.699527] dst_release: dst=c01d22c62500
[26211.699626] CPU: 51 PID: 23317 Comm: hxecom Tainted: GW  
  4.2.3 #4

[26211.699632] Call Trace:
[26211.699678] [c01cf0387440] [c0a9dcd4]  
dump_stack+0x90/0xbc (unreliable)

[26211.699829] [c01cf0387470] [c093bf80] dst_release+0x110/0x120
[26211.699875] [c01cf03874e0] [c09b4024] tcp_v4_do_rcv+0x4d4/0x4f0
[26211.699979] [c01cf0387580] [c09b7834] tcp_v4_rcv+0xb74/0xb90
[26211.700027] [c01cf0387660] [c0984bb8]  
ip_local_deliver_finish+0x178/0x350
[26211.700123] [c01cf03876b0] [c09853bc]  
ip_local_deliver+0x4c/0x120

[26211.700181] [c01cf0387720] [c0984eb4] ip_rcv_finish+0x124/0x420
[26211.700255] [c01cf03877a0] [c09857a4] ip_rcv+0x314/0x440
[26211.700312]