Re: [PATCH iproute2-next 8/9] utils: Introduce and use print_name_and_link() to print name@link

2018-02-06 Thread Serhey Popovych
David Ahern wrote:
> On 2/5/18 12:49 PM, Serhey Popovych wrote:
>> There are at least three places implementing the same thing: two in
>> ipaddress.c print_linkinfo() & print_linkinfo_brief() and one in
>> bridge/link.c.
>>
>> They diverge from each other very little: bridge/link.c does not
>> support JSON output at the moment and print_linkinfo_brief() does not
>> handle IFLA_LINK_NETNS case.
>>
>> Introduce and use print_name_and_link() routine to handle name@link
>> output in all possible variations; respect IFLA_LINK_NETNS attribute to
>> handle case when link is in different namespace; use ll_idx_n2a() for
>> interface name instead of "" to share logic with other code (e.g.
>> ll_name_to_index() and ll_index_to_name()) supporting such template.
>>
>> Signed-off-by: Serhey Popovych 
>> ---
>>  bridge/link.c   |   13 +++--
>>  include/utils.h |4 
>>  ip/ipaddress.c  |   44 ++--
>>  lib/utils.c |   49 +
>>  4 files changed, 58 insertions(+), 52 deletions(-)
>>
> 
> This patch is causing a diff on my system:
> 
> # ip  -br add sh > /tmp/1
> # ip/ip  -br add sh > /tmp/2
> # diff /tmp/1 /tmp/2
> 8c8
> < veth-out@br3 UP fe80::18a8:89ff:fee7:55c5/64
> ---
>> veth-out@if7 UP fe80::18a8:89ff:fee7:55c5/64
> 
> So the current ip resolves ifindex 7 to br3:
> 
> # ip li sh dev br3
> 7: br3:  mtu 1500 qdisc noqueue master
> vrf3 state UP mode DEFAULT group default qlen 1000
> 
> where your patch causes if%d to be printed.
> 

That's interesting. I guess the output comes from ll_idx_n2a() in this
change when both IFLA_LINK and IFLA_LINK_NETNS are seen.

My guess about this case is the following:

  1) veth-out is of "veth" rtnl kind. (ip -d li sh dev veth-out).

  2) according to drivers/net/veth.c veth_get_iflink() and
 veth_get_link_net() IFLA_LINK and IFLA_LINK_NETNS are taken
 from peer device.

  3) seeing @br3 in the current ip output looks confusing according to (2),
 as veth does not link to anything other than its peer, which is in
 a different network namespace.

From (3) I guess @br3 is an incorrect value caused by the missing
IFLA_LINK_NETNS handling in the old print_linkinfo_brief(): it always
calls ll_index_to_name().

Could you provide some more details about your setup if the above guess is
wrong?

Especially the following ones:

  1) ip -d li sh dev veth-out  (get the rtnl kind)

  2) ip -d li sh dev br3 (get the rtnl kind)

  3) uname -r or cat /proc/version



signature.asc
Description: OpenPGP digital signature


RE: Re: [Android][Kernel][TCP/IP] report of packet discarding during tcp handshaking

2018-02-06 Thread 배석진
Dear Dumazet,


even with your changes, the problem is still there.
own_req couldn't update the lost_race as you wanted.
maybe it needs an additional locking method.

and, i agree with your thought about the condition.
low probability, and recovery by retransmission.
(except DF, fragmented packet is over MTU)

but as in our case,
when the problem happens not so rarely (2~3 times in 60 trials),
and if the function of the dropped packet is sensitive to time delay,
i think it could be a problem.

because we have to find the solution in time,
we blocked the parallel processing of two packets for the same request sock.
it is a rough way, but it works well. :p

we want a more delicate method too,
but we don't yet know the tcp stack and /kernel/net/... perfectly.
in fact, this is why we contacted you :)


best regards,
bae



< our patch >

--- /a/include/uapi/linux/snmp.h
+++ /b/include/uapi/linux/snmp.h

@@ -280,6 +280,8 @@
LINUX_MIB_TCPKEEPALIVE, /* TCPKeepAlive */
LINUX_MIB_TCPMTUPFAIL,  /* TCPMTUPFail */
LINUX_MIB_TCPMTUPSUCCESS,   /* TCPMTUPSuccess */
+   LINUX_MIB_TCPRACECNDREQSK,  /* TCPRaceCondInReqsk */
+   LINUX_MIB_TCPRACECNDREQSKDROP,  /* TCPRaceCondInReqskDrop */
__LINUX_MIB_MAX
 };
 

--- /a/net/ipv4/proc.c
+++ /b/net/ipv4/proc.c

@@ -302,6 +302,8 @@
SNMP_MIB_ITEM("TCPKeepAlive", LINUX_MIB_TCPKEEPALIVE),
SNMP_MIB_ITEM("TCPMTUPFail", LINUX_MIB_TCPMTUPFAIL),
SNMP_MIB_ITEM("TCPMTUPSuccess", LINUX_MIB_TCPMTUPSUCCESS),
+   SNMP_MIB_ITEM("TCPRaceCondInReqsk", LINUX_MIB_TCPRACECNDREQSK),
+   SNMP_MIB_ITEM("TCPRaceCondInReqskDrop", LINUX_MIB_TCPRACECNDREQSKDROP),
SNMP_MIB_SENTINEL
 };
 

--- /a/net/ipv4/tcp_ipv4.c
+++ /b/net/ipv4/tcp_ipv4.c

@@ -1673,6 +1673,7 @@
  * From tcp_input.c
  */
 
+#define RC_RETRY_CNT 3
 int tcp_v4_rcv(struct sk_buff *skb)
 {
const struct iphdr *iph;
@@ -1683,6 +1684,7 @@
 #endif
int ret;
struct net *net = dev_net(skb->dev);
+   unsigned int retry_cnt = RC_RETRY_CNT;
 
if (skb->pkt_type != PACKET_HOST)
goto discard_it;
@@ -1749,6 +1751,33 @@
if (!sk)
goto no_tcp_socket;
 #endif
+   /* 
+* FIXME: SEC patch
+* If ACK packets for three-way handshake are received at the same time 
by multi core,
+* each core will try to access request socket and create new socket to 
establish TCP connection.
+* But, there is no synchronization scheme to avoid race condition for 
request socket,
+* 2nd attempt that create new socket will be fail, it caused 2nd ACK 
packet discard.
+* 
+* For that reason,
+* If 2nd ACK packet contained meaningful data, it caused unintended 
packet drop.
+* so, 2nd core should wait at this point until new socket was created 
by 1st core.
+* */
+   if (sk->sk_state == TCP_NEW_SYN_RECV) {
+   struct request_sock *req = inet_reqsk(sk);
+   if (atomic_read(>rsk_refcnt) > (2+1) && retry_cnt > 0) {
+   reqsk_put(req);
+   if (retry_cnt == RC_RETRY_CNT)
+   NET_INC_STATS_BH(net, 
LINUX_MIB_TCPRACECNDREQSK);
+   retry_cnt--;
+   udelay(500);
+
+   goto lookup;
+   }
+
+   if (!retry_cnt)
+   NET_INC_STATS_BH(net, LINUX_MIB_TCPRACECNDREQSKDROP);
+   }
+
if (sk->sk_state == TCP_NEW_SYN_RECV) {
struct request_sock *req = inet_reqsk(sk);
struct sock *nsk;

--- /a/net/ipv6/tcp_ipv6.c
+++ /b/net/ipv6/tcp_ipv6.c

@@ -1539,6 +1539,7 @@
sizeof(struct inet6_skb_parm));
 }
 
+#define RC_RETRY_CNT 3
 static int tcp_v6_rcv(struct sk_buff *skb)
 {
const struct tcphdr *th;
@@ -1549,6 +1550,7 @@
 #endif
int ret;
struct net *net = dev_net(skb->dev);
+   unsigned int retry_cnt = RC_RETRY_CNT;
 
if (skb->pkt_type != PACKET_HOST)
goto discard_it;
@@ -1594,6 +1596,22 @@
if (!sk)
goto no_tcp_socket;
 #endif
+   /* FIXME: SEC patch */
+   if (sk->sk_state == TCP_NEW_SYN_RECV) {
+   struct request_sock *req = inet_reqsk(sk);
+   if (atomic_read(>rsk_refcnt) > (2+1) && retry_cnt > 0) {
+   reqsk_put(req);
+   if (retry_cnt == RC_RETRY_CNT)
+   NET_INC_STATS_BH(net, 
LINUX_MIB_TCPRACECNDREQSK);
+   retry_cnt--;
+   udelay(500);
+
+   goto lookup;
+   }
+
+   if (!retry_cnt)
+   NET_INC_STATS_BH(net, LINUX_MIB_TCPRACECNDREQSKDROP);
+   }
 
if (sk->sk_state == TCP_NEW_SYN_RECV) {
struct request_sock *req = inet_reqsk(sk);





- Original 

[vhost:vhost 19/20] Warning: arch/x86/tools/test_get_len found difference at :ffffffff811aa5f0

2018-02-06 Thread kbuild test robot
tree:   https://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git vhost
head:   96bcd04462b99e2c80e09f6537770a0ca6b288d0
commit: cc1d1dc07885803981520a5303ef5b130f2ca2e8 [19/20] mm: support reporting 
free page blocks
config: x86_64-rhel
compiler: gcc-7 (Debian 7.3.0-1) 7.3.0
reproduce:
git checkout cc1d1dc07885803981520a5303ef5b130f2ca2e8
# save the attached .config to linux build tree
make ARCH=x86_64 

All warnings (new ones prefixed by >>):

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


Re: Fwd: u32 ht filters

2018-02-06 Thread Jiri Pirko
Wed, Feb 07, 2018 at 06:09:15AM CET, xiyou.wangc...@gmail.com wrote:
>Hi, Jiri
>
>Your  commit 7fa9d974f3c2a016b9accb18f4ee2ed2a738585c
>breaks the tc script by Paweł. Please find below for details.

Did you do the bisection?
The commit just uses the block struct instead of q, but since they
are in a 1:1 relation, that should be equivalent. So basically you still
have per-qdisc hashtables for u32.


>
>
>commit 7fa9d974f3c2a016b9accb18f4ee2ed2a738585c
>Author: Jiri Pirko 
>Date:   Fri Oct 13 14:01:02 2017 +0200
>
>net: sched: cls_u32: use block instead of q in tc_u_common
>
>tc_u_common is now per-q. With blocks, it has to be converted to be
>per-block.
>
>Signed-off-by: Jiri Pirko 
>Signed-off-by: David S. Miller 
>
>Before this commit, u32 hashtables are per-qdisc, after this commit
>it becomes per-block or per-class... this is why the script below is broken.
>
>
>-- Forwarded message --
>From: Paweł Staszewski 
>Date: Tue, Feb 6, 2018 at 8:05 AM
>Subject: u32 ht filters
>To: Cong Wang 
>
>
>Hi
>
>
>Is there something changed in kernel 4.15 that makes problem with old
>configuration of tc filters with hashing filters ?
>
>for example this :
>
>tc qdisc del root dev ifb1
>
>tc qdisc add dev ifb1 root handle 1:0 hfsc default 8000
>tc filter add dev ifb1 parent 1:0 protocol ip u32
>tc class add dev ifb1 parent 1:0 classid 1:1 hfsc ls m2 1Mbit ul
>m2 1Mbit
>tc class add dev ifb1 parent 1:1 classid 1:2 hfsc ls m2 1Mbit ul
>m2 1Mbit
>tc class add dev ifb1 parent 1:1 classid 1:3 hfsc ls m2 5000Mbit ul m2 5000Mbit
>tc class add dev ifb1 parent 1:2 classid 1:8000 hfsc ls m2 1Mbit
>ul m2 1Mbit
>tc qdisc add dev ifb1 parent 1:8000 handle 8000: sfq perturb 60
>tc qdisc add dev ifb1 parent 1:3 handle 3: pfifo limit 1
>
>
>tc filter add dev ifb1 protocol ip parent 1:0 handle 9: u32 divisor 256
>tc filter add dev ifb1 protocol ip parent 1:0 u32 ht 800:: match ip
>dst 192.168.0.0/24 hashkey mask 0x00ff at 16 link 9:
>tc class add dev ifb1 parent 1:2 classid 1:60 hfsc ls m2 8kbit ul m2 51200kbit
>echo 1
>tc filter add dev ifb1 parent 1:2 protocol ip u32 ht 9:22 match ip dst
>192.168.0.34 flowid 1:60
>echo 2
>tc qdisc add dev ifb1 parent 1:60 handle 60: pfifo limit 8192
>
>
>Is working with 4.13
>
>
>But it is not working with 4.15
>
>error is when adding:
>
>tc filter add dev ifb1 protocol ip parent 1:2 prio 4 u32 ht 9:0x22
>match ip dst 192.168.0.34 flowid 1:60
>RTNETLINK answers: Invalid argument
>We have an error talking to the kernel
>
>
>
>
>Thanks
>
>Paweł Staszewski


Re: WARNING: kmalloc bug in tun_device_event

2018-02-06 Thread Jason Wang



On 2018年02月07日 06:58, syzbot wrote:

Hello,

syzbot hit the following crash on net-next commit
617aebe6a97efa539cc4b8a52adccd89596e6be0 (Sun Feb 4 00:25:42 2018 +)
Merge tag 'usercopy-v4.16-rc1' of 
git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux


So far this crash happened 5 times on net-next, upstream.
C reproducer is attached.
syzkaller reproducer is attached.
Raw console output is attached.
compiler: gcc (GCC) 7.1.1 20170620
.config is attached.

IMPORTANT: if you fix the bug, please add the following tag to the 
commit:

Reported-by: syzbot+e4d4f9ddd42955397...@syzkaller.appspotmail.com
It will help syzbot understand when the bug is fixed. See footer for 
details.

If you forward the report, please keep this part and the footer.

WARNING: CPU: 1 PID: 4134 at mm/slab_common.c:1012 
kmalloc_slab+0x5d/0x70 mm/slab_common.c:1012

Kernel panic - not syncing: panic_on_warn set ...

CPU: 1 PID: 4134 Comm: syzkaller993072 Not tainted 4.15.0+ #221
Hardware name: Google Google Compute Engine/Google Compute Engine, 
BIOS Google 01/01/2011

Call Trace:
 __dump_stack lib/dump_stack.c:17 [inline]
 dump_stack+0x194/0x257 lib/dump_stack.c:53
 panic+0x1e4/0x41c kernel/panic.c:183
 __warn+0x1dc/0x200 kernel/panic.c:547
 report_bug+0x211/0x2d0 lib/bug.c:184
 fixup_bug.part.11+0x37/0x80 arch/x86/kernel/traps.c:178
 fixup_bug arch/x86/kernel/traps.c:247 [inline]
 do_error_trap+0x2d7/0x3e0 arch/x86/kernel/traps.c:296
 do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:315
 invalid_op+0x22/0x40 arch/x86/entry/entry_64.S:1097
RIP: 0010:kmalloc_slab+0x5d/0x70 mm/slab_common.c:1012
RSP: 0018:8801ba7ceb20 EFLAGS: 00010246
RAX:  RBX:  RCX: 83b88bed
RDX:  RSI:  RDI: 00040008
RBP: 8801ba7ceb20 R08: 1100374f9cd7 R09: 
R10:  R11:  R12: 00040008
R13: dc00 R14: 014080c0 R15: 8801b5d52080
 __do_kmalloc mm/slab.c:3700 [inline]
 __kmalloc+0x25/0x760 mm/slab.c:3714
 kmalloc_array include/linux/slab.h:631 [inline]
 kcalloc include/linux/slab.h:642 [inline]
 __ptr_ring_init_queue_alloc include/linux/ptr_ring.h:469 [inline]
 ptr_ring_resize_multiple include/linux/ptr_ring.h:629 [inline]
 tun_queue_resize drivers/net/tun.c:3319 [inline]
 tun_device_event+0x471/0xec0 drivers/net/tun.c:3338
 notifier_call_chain+0x136/0x2c0 kernel/notifier.c:93
 __raw_notifier_call_chain kernel/notifier.c:394 [inline]
 raw_notifier_call_chain+0x2d/0x40 kernel/notifier.c:401
 call_netdevice_notifiers_info+0x32/0x70 net/core/dev.c:1707
 call_netdevice_notifiers net/core/dev.c:1725 [inline]
 dev_change_tx_queue_len+0x117/0x220 net/core/dev.c:7065
 do_setlink+0xba7/0x3bb0 net/core/rtnetlink.c:2341
 rtnl_newlink+0xf1c/0x1a20 net/core/rtnetlink.c:2915
 rtnetlink_rcv_msg+0x57f/0xb10 net/core/rtnetlink.c:4587
 netlink_rcv_skb+0x14b/0x380 net/netlink/af_netlink.c:2442
 rtnetlink_rcv+0x1c/0x20 net/core/rtnetlink.c:4605
 netlink_unicast_kernel net/netlink/af_netlink.c:1308 [inline]
 netlink_unicast+0x4c4/0x6b0 net/netlink/af_netlink.c:1334
 netlink_sendmsg+0xa4a/0xe60 net/netlink/af_netlink.c:1897
 sock_sendmsg_nosec net/socket.c:630 [inline]
 sock_sendmsg+0xca/0x110 net/socket.c:640
 ___sys_sendmsg+0x767/0x8b0 net/socket.c:2046
 __sys_sendmsg+0xe5/0x210 net/socket.c:2080
 SYSC_sendmsg net/socket.c:2091 [inline]
 SyS_sendmsg+0x2d/0x50 net/socket.c:2087
 entry_SYSCALL_64_fastpath+0x29/0xa0
RIP: 0033:0x4463c9
RSP: 002b:7ffe63916e68 EFLAGS: 0246 ORIG_RAX: 002e
RAX: ffda RBX: 004a7af2 RCX: 004463c9
RDX:  RSI: 20504000 RDI: 0004
RBP: 7ffe63916f08 R08:  R09: 004a7af2
R10:  R11: 0246 R12: 7ffe63916f08
R13: 00403890 R14:  R15: 
Dumping ftrace buffer:
   (ftrace buffer empty)
Kernel Offset: disabled
Rebooting in 86400 seconds..


---
This bug is generated by a dumb bot. It may contain errors.
See https://goo.gl/tpsmEJ for details.
Direct all questions to syzkal...@googlegroups.com.

syzbot will keep track of this bug report.
If you forgot to add the Reported-by tag, once the fix for this bug is 
merged

into any tree, please reply to this email with:
#syz fix: exact-commit-title
If you want to test a patch for this bug, please reply with:
#syz test: git://repo/address.git branch
and provide the patch inline or as an attachment.
To mark this as a duplicate of another syzbot report, please reply with:
#syz dup: exact-subject-of-another-report
If it's a one-off invalid bug report, please reply with:
#syz invalid
Note: if the crash happens again, it will cause creation of a new bug 
report.
Note: all commands must start from beginning of the line in the email 
body.


Looks like we need to cap the maximum size that ptr_ring could allocate.

Will post a patch soon.

Thanks


Re: WARNING: proc registration bug in clusterip_tg_check

2018-02-06 Thread Cong Wang
On Tue, Feb 6, 2018 at 6:27 AM, syzbot
 wrote:
> Hello,
>
> syzbot hit the following crash on net-next commit
> 617aebe6a97efa539cc4b8a52adccd89596e6be0 (Sun Feb 4 00:25:42 2018 +)
> Merge tag 'usercopy-v4.16-rc1' of
> git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux
>
> So far this crash happened 5 times on net-next, upstream.
> C reproducer is attached.
> syzkaller reproducer is attached.
> Raw console output is attached.
> compiler: gcc (GCC) 7.1.1 20170620
> .config is attached.
>
> IMPORTANT: if you fix the bug, please add the following tag to the commit:
> Reported-by: syzbot+03218bcdba6aa7644...@syzkaller.appspotmail.com
> It will help syzbot understand when the bug is fixed. See footer for
> details.
> If you forward the report, please keep this part and the footer.
>
> x_tables: ip_tables: osf match: only valid for protocol 6
> x_tables: ip_tables: osf match: only valid for protocol 6
> x_tables: ip_tables: osf match: only valid for protocol 6
> [ cut here ]
> proc_dir_entry 'ipt_CLUSTERIP/172.20.0.170' already registered
> WARNING: CPU: 1 PID: 4152 at fs/proc/generic.c:330 proc_register+0x2a4/0x370
> fs/proc/generic.c:329
> Kernel panic - not syncing: panic_on_warn set ...
>
> CPU: 1 PID: 4152 Comm: syzkaller851476 Not tainted 4.15.0+ #221
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
> Google 01/01/2011
> Call Trace:
>  __dump_stack lib/dump_stack.c:17 [inline]
>  dump_stack+0x194/0x257 lib/dump_stack.c:53
>  panic+0x1e4/0x41c kernel/panic.c:183
>  __warn+0x1dc/0x200 kernel/panic.c:547
>  report_bug+0x211/0x2d0 lib/bug.c:184
>  fixup_bug.part.11+0x37/0x80 arch/x86/kernel/traps.c:178
>  fixup_bug arch/x86/kernel/traps.c:247 [inline]
>  do_error_trap+0x2d7/0x3e0 arch/x86/kernel/traps.c:296
>  do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:315
>  invalid_op+0x22/0x40 arch/x86/entry/entry_64.S:1097
> RIP: 0010:proc_register+0x2a4/0x370 fs/proc/generic.c:329
> RSP: 0018:8801cbd6ee20 EFLAGS: 00010286
> RAX: dc08 RBX: 8801d2181038 RCX: 815a57ae
> RDX:  RSI: 1100397add74 RDI: 1100397add49
> RBP: 8801cbd6ee70 R08: 1100397add0b R09: 
> R10: 8801cbd6ecd8 R11:  R12: 8801b2bb1cc0
> R13: dc00 R14: 8801b0d8dbc8 R15: 8801b2bb1d81
>  proc_create_data+0xf8/0x180 fs/proc/generic.c:494
>  clusterip_config_init net/ipv4/netfilter/ipt_CLUSTERIP.c:250 [inline]

I think there is probably a race condition between clusterip_config_entry_put()
and clusterip_config_init(): after we release the spinlock, a new proc entry
with the same IP could be created, which triggers this warning.

I am not sure if it is enough to just move the proc_remove() under
spinlock...


diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c
b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 3a84a60f6b39..1ff72b87a066 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -107,12 +107,6 @@ clusterip_config_entry_put(struct net *net,
struct clusterip_config *c)

local_bh_disable();
if (refcount_dec_and_lock(>entries, >lock)) {
-   list_del_rcu(>list);
-   spin_unlock(>lock);
-   local_bh_enable();
-
-   unregister_netdevice_notifier(>notifier);
-
/* In case anyone still accesses the file, the open/close
 * functions are also incrementing the refcount on their own,
 * so it's safe to remove the entry even if it's in use. */
@@ -120,6 +114,12 @@ clusterip_config_entry_put(struct net *net,
struct clusterip_config *c)
if (cn->procdir)
proc_remove(c->pde);
 #endif
+   list_del_rcu(>list);
+   spin_unlock(>lock);
+   local_bh_enable();
+
+   unregister_netdevice_notifier(>notifier);
+
return;
}
local_bh_enable();


>  clusterip_tg_check+0xf9c/0x16d0 net/ipv4/netfilter/ipt_CLUSTERIP.c:488
>  xt_check_target+0x22c/0x7d0 net/netfilter/x_tables.c:850
>  check_target net/ipv4/netfilter/ip_tables.c:513 [inline]
>  find_check_entry.isra.8+0x8c8/0xcb0 net/ipv4/netfilter/ip_tables.c:554
>  translate_table+0xed1/0x1610 net/ipv4/netfilter/ip_tables.c:725
>  do_replace net/ipv4/netfilter/ip_tables.c:1141 [inline]
>  do_ipt_set_ctl+0x370/0x5f0 net/ipv4/netfilter/ip_tables.c:1675
>  nf_sockopt net/netfilter/nf_sockopt.c:106 [inline]
>  nf_setsockopt+0x67/0xc0 net/netfilter/nf_sockopt.c:115
>  ip_setsockopt+0x97/0xa0 net/ipv4/ip_sockglue.c:1259
>  sctp_setsockopt+0x2b6/0x61d0 net/sctp/socket.c:4104
>  sock_common_setsockopt+0x95/0xd0 net/core/sock.c:2975
>  SYSC_setsockopt net/socket.c:1849 [inline]
>  SyS_setsockopt+0x189/0x360 net/socket.c:1828
>  entry_SYSCALL_64_fastpath+0x29/0xa0
> RIP: 0033:0x446839
> RSP: 002b:7f0309d0fdb8 EFLAGS: 0246 ORIG_RAX: 0036
> RAX: 

[PATCH iproute2-next v2 1/6] ipaddress: Unify print_link_stats() and print_link_stats64()

2018-02-06 Thread Serhey Popovych
To show the real differences between these two variants, adjust whitespace
indentation and use print_uint() instead of print_int(), as all members
in both @struct rtnl_link_stats and @struct rtnl_link_stats64 are
unsigned.

Signed-off-by: Serhey Popovych 
---
 ip/ipaddress.c |   30 ++
 1 file changed, 10 insertions(+), 20 deletions(-)

diff --git a/ip/ipaddress.c b/ip/ipaddress.c
index 4707c2b..a80a385 100644
--- a/ip/ipaddress.c
+++ b/ip/ipaddress.c
@@ -609,8 +609,7 @@ static void print_link_stats64(FILE *fp, const struct 
rtnl_link_stats64 *s,
print_uint(PRINT_JSON, "multicast", NULL, s->multicast);
if (s->rx_compressed)
print_uint(PRINT_JSON,
-  "compressed",
-  NULL, s->rx_compressed);
+  "compressed", NULL, s->rx_compressed);
 
/* RX error stats */
if (show_stats > 1) {
@@ -647,8 +646,7 @@ static void print_link_stats64(FILE *fp, const struct 
rtnl_link_stats64 *s,
print_uint(PRINT_JSON, "collisions", NULL, s->collisions);
if (s->tx_compressed)
print_uint(PRINT_JSON,
-  "compressed",
-  NULL, s->tx_compressed);
+  "compressed", NULL, s->tx_compressed);
 
/* TX error stats */
if (show_stats > 1) {
@@ -668,9 +666,9 @@ static void print_link_stats64(FILE *fp, const struct 
rtnl_link_stats64 *s,
print_uint(PRINT_JSON, "carrier_changes", NULL,
   rta_getattr_u32(carrier_changes));
}
+
close_json_object();
close_json_object();
-
} else {
/* RX stats */
fprintf(fp, "RX: bytes  packets  errors  dropped overrun 
mcast   %s%s",
@@ -691,7 +689,6 @@ static void print_link_stats64(FILE *fp, const struct 
rtnl_link_stats64 *s,
fprintf(fp, "%s", _SL_);
fprintf(fp, "RX errors: length   crc frame   
fifomissed%s%s",
s->rx_nohandler ? "   nohandler" : "", _SL_);
-
fprintf(fp, "   ");
print_num(fp, 8, s->rx_length_errors);
print_num(fp, 7, s->rx_crc_errors);
@@ -700,7 +697,6 @@ static void print_link_stats64(FILE *fp, const struct 
rtnl_link_stats64 *s,
print_num(fp, 7, s->rx_missed_errors);
if (s->rx_nohandler)
print_num(fp, 7, s->rx_nohandler);
-
}
fprintf(fp, "%s", _SL_);
 
@@ -753,9 +749,8 @@ static void print_link_stats32(FILE *fp, const struct 
rtnl_link_stats *s,
print_uint(PRINT_JSON, "over_errors", NULL, s->rx_over_errors);
print_uint(PRINT_JSON, "multicast", NULL, s->multicast);
if (s->rx_compressed)
-   print_int(PRINT_JSON,
- "compressed",
- NULL, s->rx_compressed);
+   print_uint(PRINT_JSON,
+  "compressed", NULL, s->rx_compressed);
 
/* RX error stats */
if (show_stats > 1) {
@@ -775,9 +770,8 @@ static void print_link_stats32(FILE *fp, const struct 
rtnl_link_stats *s,
   "missed_errors",
   NULL, s->rx_missed_errors);
if (s->rx_nohandler)
-   print_int(PRINT_JSON,
- "nohandler",
- NULL, s->rx_nohandler);
+   print_uint(PRINT_JSON,
+  "nohandler", NULL, s->rx_nohandler);
}
close_json_object();
 
@@ -792,9 +786,8 @@ static void print_link_stats32(FILE *fp, const struct 
rtnl_link_stats *s,
   NULL, s->tx_carrier_errors);
print_uint(PRINT_JSON, "collisions", NULL, s->collisions);
if (s->tx_compressed)
-   print_int(PRINT_JSON,
- "compressed",
- NULL, s->tx_compressed);
+   print_uint(PRINT_JSON,
+  "compressed", NULL, s->tx_compressed);
 
/* TX error stats */
if (show_stats > 1) {
@@ -811,9 +804,7 @@ static void print_link_stats32(FILE *fp, const struct 
rtnl_link_stats *s,
   "heartbeat_errors",
   NULL, s->tx_heartbeat_errors);
if 

[PATCH iproute2-next v2 3/6] tunnel: Split statistic getting and printing

2018-02-06 Thread Serhey Popovych
This is the first step to move tunnel code to use rtnl dump interface
instead of /proc/net/dev read.

Make tnl_print_stats() to accept @struct rtnl_link_stats64 parameter,
introduce tnl_get_stats() that will parse line from /proc/net/dev into
@struct rtnl_link_stats64.

Signed-off-by: Serhey Popovych 
---
 ip/ip6tunnel.c |8 ++--
 ip/iptunnel.c  |8 ++--
 ip/tunnel.c|   57 +++-
 ip/tunnel.h|5 -
 4 files changed, 52 insertions(+), 26 deletions(-)

diff --git a/ip/ip6tunnel.c b/ip/ip6tunnel.c
index 783e28a..3e99559 100644
--- a/ip/ip6tunnel.c
+++ b/ip/ip6tunnel.c
@@ -390,8 +390,12 @@ static int do_tunnels_list(struct ip6_tnl_parm2 *p)
if (!ip6_tnl_parm_match(p, ))
continue;
print_tunnel();
-   if (show_stats)
-   tnl_print_stats(ptr);
+   if (show_stats) {
+   struct rtnl_link_stats64 s;
+
+   if (!tnl_get_stats(ptr, ))
+   tnl_print_stats();
+   }
printf("\n");
}
err = 0;
diff --git a/ip/iptunnel.c b/ip/iptunnel.c
index 0aa3b33..6639055 100644
--- a/ip/iptunnel.c
+++ b/ip/iptunnel.c
@@ -425,8 +425,12 @@ static int do_tunnels_list(struct ip_tunnel_parm *p)
(p->i_key && p1.i_key != p->i_key))
continue;
print_tunnel();
-   if (show_stats)
-   tnl_print_stats(ptr);
+   if (show_stats) {
+   struct rtnl_link_stats64 s;
+
+   if (!tnl_get_stats(ptr, ))
+   tnl_print_stats();
+   }
printf("\n");
}
err = 0;
diff --git a/ip/tunnel.c b/ip/tunnel.c
index 948d5f7..06533cf 100644
--- a/ip/tunnel.c
+++ b/ip/tunnel.c
@@ -307,30 +307,45 @@ void tnl_print_endpoint(const char *name, const struct 
rtattr *rta, int family)
}
 }
 
-/* tnl_print_stats - print tunnel statistics
- *
- * @buf - tunnel interface's line in /proc/net/dev,
- *starting past the interface name and following colon
- */
-void tnl_print_stats(const char *buf)
+int tnl_get_stats(const char *buf, struct rtnl_link_stats64 *s)
 {
-   unsigned long rx_bytes, rx_packets, rx_errs, rx_drops,
- rx_fifo, rx_frame,
- tx_bytes, tx_packets, tx_errs, tx_drops,
- tx_fifo, tx_colls, tx_carrier, rx_multi;
-
-   if (sscanf(buf, "%lu%lu%lu%lu%lu%lu%lu%*d%lu%lu%lu%lu%lu%lu%lu",
-  _bytes, _packets, _errs, _drops,
-  _fifo, _frame, _multi,
-  _bytes, _packets, _errs, _drops,
-  _fifo, _colls, _carrier) != 14)
-   return;
+   /* rx */
+   __u64 *rx_bytes   = >rx_bytes;
+   __u64 *rx_packets = >rx_packets;
+   __u64 *rx_errs= >rx_errors;
+   __u64 *rx_drops   = >rx_dropped;
+   __u64 *rx_fifo= >rx_fifo_errors;
+   __u64 *rx_frame   = >rx_frame_errors;
+   __u64 *rx_multi   = >multicast;
+   /* tx */
+   __u64 *tx_bytes   = >tx_bytes;
+   __u64 *tx_packets = >tx_packets;
+   __u64 *tx_errs= >tx_errors;
+   __u64 *tx_drops   = >tx_dropped;
+   __u64 *tx_fifo= >tx_fifo_errors;
+   __u64 *tx_carrier = >tx_carrier_errors;
+   __u64 *tx_colls   = >collisions;
+
+   if (sscanf(buf,
+  
"%llu%llu%llu%llu%llu%llu%llu%*d%llu%llu%llu%llu%llu%llu%llu",
+  rx_bytes, rx_packets, rx_errs, rx_drops,
+  rx_fifo, rx_frame, rx_multi,
+  tx_bytes, tx_packets, tx_errs, tx_drops,
+  tx_fifo, tx_colls, tx_carrier) != 14)
+   return -1;
 
+   return 0;
+}
+
+void tnl_print_stats(const struct rtnl_link_stats64 *s)
+{
printf("%s", _SL_);
printf("RX: PacketsBytesErrors CsumErrs OutOfSeq Mcasts%s", 
_SL_);
-   printf("%-10ld %-12ld %-6ld %-8ld %-8ld %-8ld%s",
-  rx_packets, rx_bytes, rx_errs, rx_frame, rx_fifo, rx_multi, 
_SL_);
+   printf("%-10lld %-12lld %-6lld %-8lld %-8lld %-8lld%s",
+  s->rx_packets, s->rx_bytes, s->rx_errors, s->rx_frame_errors,
+  s->rx_fifo_errors, s->multicast, _SL_);
printf("TX: PacketsBytesErrors DeadLoop NoRoute  NoBufs%s", 
_SL_);
-   printf("%-10ld %-12ld %-6ld %-8ld %-8ld %-6ld",
-  tx_packets, tx_bytes, tx_errs, tx_colls, tx_carrier, tx_drops);
+   printf("%-10lld %-12lld %-6lld %-8lld %-8lld %-6lld",
+  s->tx_packets, s->tx_bytes, s->tx_errors, s->collisions,
+  s->tx_carrier_errors, s->tx_dropped);
 }
diff --git a/ip/tunnel.h b/ip/tunnel.h
index 5bd27c3..5fe488b 100644
--- a/ip/tunnel.h
+++ b/ip/tunnel.h
@@ -24,6 +24,7 @@
 #include 
 
 struct rtattr;
+struct 

[PATCH iproute2-next v2 5/6] iptunnel/ip6tunnel: Use netlink to walk through tunnels list

2018-02-06 Thread Serhey Popovych
Both tunnels use the legacy /proc/net/dev interface to get the tunnel device and
its statistics. This may cause problems for cases when procfs is either
not mounted or not unshare(2)d for the given network namespace.

Use netlink to walk through list of tunnel devices which is network
namespace aware and provides additional information such as statistics
in the dump message.

Since both address family specific variants of do_tunnels_list() are nearly
the same, except for tunnel parameters structure initialization,
matching and printing, we can introduce a common one in tunnel.c.

To implement address family specific parts, introduce a new data structure
@struct tnl_print_nlmsg_info that contains all necessary information as
well as pointers to ->init(), ->match() and ->print() callbacks.

Annotate data structures by const where appropriate.

Signed-off-by: Serhey Popovych 
---
 ip/ip6tunnel.c |  114 +++---
 ip/iptunnel.c  |  106 +-
 ip/tunnel.c|  117 +---
 ip/tunnel.h|   20 --
 4 files changed, 159 insertions(+), 198 deletions(-)

diff --git a/ip/ip6tunnel.c b/ip/ip6tunnel.c
index 2e6f513..c7fa082 100644
--- a/ip/ip6tunnel.c
+++ b/ip/ip6tunnel.c
@@ -67,8 +67,9 @@ static void usage(void)
exit(-1);
 }
 
-static void print_tunnel(struct ip6_tnl_parm2 *p)
+static void print_tunnel(const void *t)
 {
+   const struct ip6_tnl_parm2 *p = t;
char s1[1024];
char s2[1024];
 
@@ -313,13 +314,24 @@ static void ip6_tnl_parm_init(struct ip6_tnl_parm2 *p, 
int apply_default)
}
 }
 
-/*
- * @p1: user specified parameter
- * @p2: database entry
- */
-static int ip6_tnl_parm_match(const struct ip6_tnl_parm2 *p1,
- const struct ip6_tnl_parm2 *p2)
+static void ip6_tnl_parm_initialize(const struct tnl_print_nlmsg_info *info)
+{
+   const struct ifinfomsg *ifi = info->ifi;
+   const struct ip6_tnl_parm2 *p1 = info->p1;
+   struct ip6_tnl_parm2 *p2 = info->p2;
+
+   ip6_tnl_parm_init(p2, 0);
+   if (ifi->ifi_type == ARPHRD_IP6GRE)
+   p2->proto = IPPROTO_GRE;
+   p2->link = ifi->ifi_index;
+   strcpy(p2->name, p1->name);
+}
+
+static bool ip6_tnl_parm_match(const struct tnl_print_nlmsg_info *info)
 {
+   const struct ip6_tnl_parm2 *p1 = info->p1;
+   const struct ip6_tnl_parm2 *p2 = info->p2;
+
return ((!p1->link || p1->link == p2->link) &&
(!p1->name[0] || strcmp(p1->name, p2->name) == 0) &&
(IN6_IS_ADDR_UNSPECIFIED(>laddr) ||
@@ -336,91 +348,27 @@ static int ip6_tnl_parm_match(const struct ip6_tnl_parm2 
*p1,
(!p1->flags || (p1->flags & p2->flags)));
 }
 
-static int do_tunnels_list(struct ip6_tnl_parm2 *p)
-{
-   char buf[512];
-   int err = -1;
-   FILE *fp = fopen("/proc/net/dev", "r");
-
-   if (fp == NULL) {
-   perror("fopen");
-   return -1;
-   }
-
-   /* skip two lines at the begenning of the file */
-   if (!fgets(buf, sizeof(buf), fp) ||
-   !fgets(buf, sizeof(buf), fp)) {
-   fprintf(stderr, "/proc/net/dev read error\n");
-   goto end;
-   }
-
-   while (fgets(buf, sizeof(buf), fp) != NULL) {
-   char name[IFNAMSIZ];
-   int index, type;
-   struct ip6_tnl_parm2 p1;
-   char *ptr;
-
-   buf[sizeof(buf) - 1] = '\0';
-   ptr = strchr(buf, ':');
-   if (ptr == NULL ||
-   (*ptr++ = 0, sscanf(buf, "%s", name) != 1)) {
-   fprintf(stderr, "Wrong format for /proc/net/dev. Giving 
up.\n");
-   goto end;
-   }
-   if (p->name[0] && strcmp(p->name, name))
-   continue;
-   index = ll_name_to_index(name);
-   if (index == 0)
-   continue;
-   type = ll_index_to_type(index);
-   if (type == -1) {
-   fprintf(stderr, "Failed to get type of \"%s\"\n", name);
-   continue;
-   }
-   switch (type) {
-   case ARPHRD_TUNNEL6:
-   case ARPHRD_IP6GRE:
-   break;
-   default:
-   continue;
-   }
-   ip6_tnl_parm_init(, 0);
-   if (type == ARPHRD_IP6GRE)
-   p1.proto = IPPROTO_GRE;
-   p1.link = index;
-   strcpy(p1.name, name);
-   if (tnl_get_ioctl(name, ))
-   continue;
-   if (!ip6_tnl_parm_match(p, ))
-   continue;
-   print_tunnel();
-   if (show_stats) {
-   struct rtnl_link_stats64 s;
-
-   if (!tnl_get_stats(ptr, ))
-

[PATCH iproute2-next v2 0/6] ip: Use netlink to walk through network device list

2018-02-06 Thread Serhey Popovych
In this series I replace /proc/net/dev and /sys/class/net usage to walk
through the network device list in iptunnel/ip6tunnel and iptuntap with
a netlink dump.

The following changed since the RFC was sent:

  1) Treat @struct rtnl_link_stats and @struct rtnl_link_stats64 as
 array with __u32 and __u64 elements respectively in
 copy_rtnl_link_stats64() as suggested by Stephen Hemminger.

  2) Remove @name and @size parameters from @struct tnl_print_nlmsg_info
 since we can get them easily from other data.

Testing.


The following script is used to ensure I didn't break things too much:

\#!/bin/bash

iproute2_dir="$1"
iface='gre1'

pushd "$iproute2_dir" &>/dev/null

for i in new old; do
DIR="/tmp/$i"
mkdir -p "$DIR"

ln -snf ip.$i ip/ip

for o in '' -s -d; do
ip/ip $o tunnel show   >"$DIR/ip${o}-tunnel-show"
ip/ip -4 $o tunnel show>"$DIR/ip-4${o}-tunnel-show"
ip/ip -6 $o tunnel show>"$DIR/ip-6${o}-tunnel-show"
ip/ip $o tunnel show dev "$iface" \
>"$DIR/ip${o}-tunnel-show-$iface"
ip/ip $o tuntap show   >"$DIR/ip${o}-tuntap-show"
done
done
rm -f ip/ip

diff -urN /tmp/{old,new} |sed -n -Ee'/^(-{3}|\+{3})[[:space:]]+/!p'
rc=$?

popd &>/dev/null
exit $rc

Results:


...
fopen /sys/class/net/ipip1/tun_flags: No such file or directory
fopen /sys/class/net/ipip2/tun_flags: No such file or directory
fopen /sys/class/net/gre10/tun_flags: No such file or directory
^^^
note that this comes from ip.old
...
diff -urN /tmp/old/ip-d-tuntap-show /tmp/new/ip-d-tuntap-show
@@ -1,4 +1,4 @@
-tun1: tap user 1004 group 27
-   Attached to processes:
 tun0: tun user 1000 group 27
Attached to processes:
+tun1: tap user 1004 group 27
+   Attached to processes:
diff -urN /tmp/old/ip-s-tuntap-show /tmp/new/ip-s-tuntap-show
@@ -1,2 +1,2 @@
-tun1: tap user 1004 group 27
 tun0: tun user 1000 group 27
+tun1: tap user 1004 group 27
diff -urN /tmp/old/ip-tuntap-show /tmp/new/ip-tuntap-show
@@ -1,2 +1,2 @@
-tun1: tap user 1004 group 27
 tun0: tun user 1000 group 27
+tun1: tap user 1004 group 27

So basically only the print order for ip tuntap changes. The rest is intact.

v2
  Fix build failure in 4/6 patch ("iptunnel/ip6tunnel: Code cleanups")
  and update its description showing why this cleanup is necessary.

  Update cover letter to explain origins of fopen /sys/class/net/...
  error message sources.

Thanks,
Serhii

Serhey Popovych (6):
  ipaddress: Unify print_link_stats() and print_link_stats64()
  ip: Introduce get_rtnl_link_stats_rta() to get link statistics
  tunnel: Split statistic getting and printing
  iptunnel/ip6tunnel: Code cleanups
  iptunnel/ip6tunnel: Use netlink to walk through tunnels list
  tuntap: Use netlink to walk through tuntap list

 include/utils.h |3 +
 ip/ip6tunnel.c  |  115 +++--
 ip/ipaddress.c  |  189 ---
 ip/iptunnel.c   |   93 +--
 ip/iptuntap.c   |  121 ++-
 ip/tunnel.c |  114 ++---
 ip/tunnel.h |   17 -
 lib/utils.c |   45 +
 8 files changed, 324 insertions(+), 373 deletions(-)

-- 
1.7.10.4



[PATCH iproute2-next v2 2/6] ip: Introduce get_rtnl_link_stats_rta() to get link statistics

2018-02-06 Thread Serhey Popovych
Assume all statistics in ip(8) represented either by IFLA_STATS64 or
IFLA_STATS are 64 bit. It is clear that we can store the __u32 counters of
@struct rtnl_link_stats in the __u64 counters of @struct rtnl_link_stats64.

New get_rtnl_link_stats_rta() follows __print_link_stats() behaviour on
handling of stats attribute: copy no more than size of data structure
and no less than attribute length zeroing rest.

Drop print_link_stats32() as its functionality can be handled by the 64bit
variant. Move code from __print_link_stats() to print_link_stats64() and
finally rename print_link_stats64() to __print_link_stats().

More users of introduced function will come in future.

Signed-off-by: Serhey Popovych 
---
 include/utils.h |3 +
 ip/ipaddress.c  |  171 +++
 lib/utils.c |   45 +++
 3 files changed, 56 insertions(+), 163 deletions(-)

diff --git a/include/utils.h b/include/utils.h
index f81928a..8b8ee2e 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -284,6 +284,9 @@ int make_path(const char *path, mode_t mode);
 char *find_cgroup2_mount(void);
 int get_command_name(const char *pid, char *comm, size_t len);
 
+int get_rtnl_link_stats_rta(struct rtnl_link_stats64 *stats64,
+   struct rtattr *tb[]);
+
 #ifdef NEED_STRLCPY
 size_t strlcpy(char *dst, const char *src, size_t size);
 size_t strlcat(char *dst, const char *src, size_t size);
diff --git a/ip/ipaddress.c b/ip/ipaddress.c
index a80a385..6990b81 100644
--- a/ip/ipaddress.c
+++ b/ip/ipaddress.c
@@ -593,152 +593,18 @@ static void print_vf_stats64(FILE *fp, struct rtattr 
*vfstats)
}
 }
 
-static void print_link_stats64(FILE *fp, const struct rtnl_link_stats64 *s,
-  const struct rtattr *carrier_changes)
+static void __print_link_stats(FILE *fp, struct rtattr *tb[])
 {
-   if (is_json_context()) {
-   open_json_object("stats64");
-
-   /* RX stats */
-   open_json_object("rx");
-   print_uint(PRINT_JSON, "bytes", NULL, s->rx_bytes);
-   print_uint(PRINT_JSON, "packets", NULL, s->rx_packets);
-   print_uint(PRINT_JSON, "errors", NULL, s->rx_errors);
-   print_uint(PRINT_JSON, "dropped", NULL, s->rx_dropped);
-   print_uint(PRINT_JSON, "over_errors", NULL, s->rx_over_errors);
-   print_uint(PRINT_JSON, "multicast", NULL, s->multicast);
-   if (s->rx_compressed)
-   print_uint(PRINT_JSON,
-  "compressed", NULL, s->rx_compressed);
-
-   /* RX error stats */
-   if (show_stats > 1) {
-   print_uint(PRINT_JSON,
-  "length_errors",
-  NULL, s->rx_length_errors);
-   print_uint(PRINT_JSON,
-  "crc_errors",
-  NULL, s->rx_crc_errors);
-   print_uint(PRINT_JSON,
-  "frame_errors",
-  NULL, s->rx_frame_errors);
-   print_uint(PRINT_JSON,
-  "fifo_errors",
-  NULL, s->rx_fifo_errors);
-   print_uint(PRINT_JSON,
-  "missed_errors",
-  NULL, s->rx_missed_errors);
-   if (s->rx_nohandler)
-   print_uint(PRINT_JSON,
-  "nohandler", NULL, s->rx_nohandler);
-   }
-   close_json_object();
-
-   /* TX stats */
-   open_json_object("tx");
-   print_uint(PRINT_JSON, "bytes", NULL, s->tx_bytes);
-   print_uint(PRINT_JSON, "packets", NULL, s->tx_packets);
-   print_uint(PRINT_JSON, "errors", NULL, s->tx_errors);
-   print_uint(PRINT_JSON, "dropped", NULL, s->tx_dropped);
-   print_uint(PRINT_JSON,
-  "carrier_errors",
-  NULL, s->tx_carrier_errors);
-   print_uint(PRINT_JSON, "collisions", NULL, s->collisions);
-   if (s->tx_compressed)
-   print_uint(PRINT_JSON,
-  "compressed", NULL, s->tx_compressed);
-
-   /* TX error stats */
-   if (show_stats > 1) {
-   print_uint(PRINT_JSON,
-  "aborted_errors",
-  NULL, s->tx_aborted_errors);
-   print_uint(PRINT_JSON,
-  "fifo_errors",
-  NULL, s->tx_fifo_errors);
-   print_uint(PRINT_JSON,
-  "window_errors",
-  

[PATCH iproute2-next v2 6/6] tuntap: Use netlink to walk through tuntap list

2018-02-06 Thread Serhey Popovych
It seems a bad idea to depend on sysfs being mounted and reflected to the
current network namespace. The same applies to procfs.

Instead netlink should be used to talk to the kernel and get the list of
specific network devices along with their parameters.

Support kernel netlink message filtering by passing IFLA_INFO_KIND in the
RTM_GETLINK request: if the kernel does not support filtering by kind
we will check it in the reply anyway. Check for ifi->ifi_type to be either
ARPHRD_NONE or ARPHRD_ETHER to speed things up a bit without kernel level
filtering.

Unfortunately the tun driver does not implement dumping its configuration
via netlink and we still need to use read_prop() which depends on sysfs
to get additional tun device information.

Signed-off-by: Serhey Popovych 
---
 ip/iptuntap.c |  121 +++--
 1 file changed, 91 insertions(+), 30 deletions(-)

diff --git a/ip/iptuntap.c b/ip/iptuntap.c
index 09f2be2..4628db2 100644
--- a/ip/iptuntap.c
+++ b/ip/iptuntap.c
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -31,6 +32,8 @@
 #include "utils.h"
 #include "ip_common.h"
 
+static const char drv_name[] = "tun";
+
 #define TUNDEV "/dev/net/tun"
 
 static void usage(void) __attribute__((noreturn));
@@ -348,43 +351,101 @@ next:
globfree();
 }
 
+static int tuntap_filter_req(struct nlmsghdr *nlh, int reqlen)
+{
+   struct rtattr *linkinfo;
+   int err;
 
-static int do_show(int argc, char **argv)
+   linkinfo = addattr_nest(nlh, reqlen, IFLA_LINKINFO);
+
+   err = addattr_l(nlh, reqlen, IFLA_INFO_KIND,
+   drv_name, sizeof(drv_name) - 1);
+   if (err)
+   return err;
+
+   addattr_nest_end(nlh, linkinfo);
+
+   return 0;
+}
+
+static int print_tuntap(const struct sockaddr_nl *who,
+   struct nlmsghdr *n, void *arg)
 {
-   DIR *dir;
-   struct dirent *d;
+   struct ifinfomsg *ifi = NLMSG_DATA(n);
+   struct rtattr *tb[IFLA_MAX+1];
+   struct rtattr *linkinfo[IFLA_INFO_MAX+1];
+   const char *name, *kind;
long flags, owner = -1, group = -1;
 
-   dir = opendir("/sys/class/net");
-   if (!dir) {
-   perror("opendir");
+   if (n->nlmsg_type != RTM_NEWLINK && n->nlmsg_type != RTM_DELLINK)
+   return 0;
+
+   if (n->nlmsg_len < NLMSG_LENGTH(sizeof(*ifi)))
return -1;
+
+   switch (ifi->ifi_type) {
+   case ARPHRD_NONE:
+   case ARPHRD_ETHER:
+   break;
+   default:
+   return 0;
}
-   while ((d = readdir(dir))) {
-   if (d->d_name[0] == '.' &&
-   (d->d_name[1] == 0 || d->d_name[1] == '.'))
-   continue;
-
-   if (read_prop(d->d_name, "tun_flags", ))
-   continue;
-
-   read_prop(d->d_name, "owner", );
-   read_prop(d->d_name, "group", );
-
-   printf("%s:", d->d_name);
-   print_flags(flags);
-   if (owner != -1)
-   printf(" user %ld", owner);
-   if (group != -1)
-   printf(" group %ld", group);
-   printf("\n");
-   if (show_details) {
-   printf("\tAttached to processes:");
-   show_processes(d->d_name);
-   printf("\n");
-   }
+
+   parse_rtattr(tb, IFLA_MAX, IFLA_RTA(ifi), IFLA_PAYLOAD(n));
+
+   if (!tb[IFLA_IFNAME])
+   return 0;
+
+   if (!tb[IFLA_LINKINFO])
+   return 0;
+
+   parse_rtattr_nested(linkinfo, IFLA_INFO_MAX, tb[IFLA_LINKINFO]);
+
+   if (!linkinfo[IFLA_INFO_KIND])
+   return 0;
+
+   kind = rta_getattr_str(linkinfo[IFLA_INFO_KIND]);
+   if (strcmp(kind, drv_name))
+   return 0;
+
+   name = rta_getattr_str(tb[IFLA_IFNAME]);
+
+   if (read_prop(name, "tun_flags", ))
+   return 0;
+   if (read_prop(name, "owner", ))
+   return 0;
+   if (read_prop(name, "group", ))
+   return 0;
+
+   printf("%s:", name);
+   print_flags(flags);
+   if (owner != -1)
+   printf(" user %ld", owner);
+   if (group != -1)
+   printf(" group %ld", group);
+   fputc('\n', stdout);
+   if (show_details) {
+   printf("\tAttached to processes:");
+   show_processes(name);
+   fputc('\n', stdout);
}
-   closedir(dir);
+
+   return 0;
+}
+
+static int do_show(int argc, char **argv)
+{
+   if (rtnl_wilddump_req_filter_fn(, AF_UNSPEC, RTM_GETLINK,
+   tuntap_filter_req) < 0) {
+   perror("Cannot send dump request\n");
+   return -1;
+   }
+
+   if (rtnl_dump_filter(, print_tuntap, NULL) < 0) {
+   

[PATCH iproute2-next v2 4/6] iptunnel/ip6tunnel: Code cleanups

2018-02-06 Thread Serhey Popovych
Use switch () instead of if () to compare tunnel type to fit into 80
columns and make code more readable. Print "\n" using fputc().

In iptunnel.c abstract the tunnel parameter matching code into the
ip_tunnel_parm_match() helper to conform with ip6tunnel.c. Use
memset() to initialize @p1.

In ip6tunnel.c there is no need to call ll_name_to_index() with name twice:
just use the previously found index. Do not initialize @p1: this is done in
ip6_tnl_parm_init().

This is to show real differences between ip and ipv6 do_tunnels_list()
implementations and prepare for upcoming unification of them.

Signed-off-by: Serhey Popovych 
---
 ip/ip6tunnel.c |   33 ++---
 ip/iptunnel.c  |   39 ---
 2 files changed, 46 insertions(+), 26 deletions(-)

diff --git a/ip/ip6tunnel.c b/ip/ip6tunnel.c
index 3e99559..2e6f513 100644
--- a/ip/ip6tunnel.c
+++ b/ip/ip6tunnel.c
@@ -357,11 +357,12 @@ static int do_tunnels_list(struct ip6_tnl_parm2 *p)
while (fgets(buf, sizeof(buf), fp) != NULL) {
char name[IFNAMSIZ];
int index, type;
-   struct ip6_tnl_parm2 p1 = {};
+   struct ip6_tnl_parm2 p1;
char *ptr;
 
buf[sizeof(buf) - 1] = '\0';
-   if ((ptr = strchr(buf, ':')) == NULL ||
+   ptr = strchr(buf, ':');
+   if (ptr == NULL ||
(*ptr++ = 0, sscanf(buf, "%s", name) != 1)) {
fprintf(stderr, "Wrong format for /proc/net/dev. Giving 
up.\n");
goto end;
@@ -376,16 +377,19 @@ static int do_tunnels_list(struct ip6_tnl_parm2 *p)
fprintf(stderr, "Failed to get type of \"%s\"\n", name);
continue;
}
-   if (type != ARPHRD_TUNNEL6 && type != ARPHRD_IP6GRE)
+   switch (type) {
+   case ARPHRD_TUNNEL6:
+   case ARPHRD_IP6GRE:
+   break;
+   default:
continue;
+   }
ip6_tnl_parm_init(, 0);
if (type == ARPHRD_IP6GRE)
p1.proto = IPPROTO_GRE;
+   p1.link = index;
strcpy(p1.name, name);
-   p1.link = ll_name_to_index(p1.name);
-   if (p1.link == 0)
-   continue;
-   if (tnl_get_ioctl(p1.name, ))
+   if (tnl_get_ioctl(name, ))
continue;
if (!ip6_tnl_parm_match(p, ))
continue;
@@ -396,7 +400,7 @@ static int do_tunnels_list(struct ip6_tnl_parm2 *p)
if (!tnl_get_stats(ptr, ))
tnl_print_stats();
}
-   printf("\n");
+   fputc('\n', stdout);
}
err = 0;
  end:
@@ -416,14 +420,13 @@ static int do_show(int argc, char **argv)
return -1;
 
if (!p.name[0] || show_stats)
-   do_tunnels_list();
-   else {
-   if (tnl_get_ioctl(p.name, ))
-   return -1;
-   print_tunnel();
-   printf("\n");
-   }
+   return do_tunnels_list();
+
+   if (tnl_get_ioctl(p.name, ))
+   return -1;
 
+   print_tunnel();
+   fputc('\n', stdout);
return 0;
 }
 
diff --git a/ip/iptunnel.c b/ip/iptunnel.c
index 6639055..ff201a7 100644
--- a/ip/iptunnel.c
+++ b/ip/iptunnel.c
@@ -373,6 +373,20 @@ static void print_tunnel(struct ip_tunnel_parm *p)
printf("%s  Checksum output packets.", _SL_);
 }
 
+/*
+ * @p1: user specified parameter
+ * @p2: database entry
+ */
+static int ip_tunnel_parm_match(const struct ip_tunnel_parm *p1,
+   const struct ip_tunnel_parm *p2)
+{
+   return ((!p1->link || p1->link == p2->link) &&
+   (!p1->name[0] || strcmp(p1->name, p2->name) == 0) &&
+   (!p1->iph.daddr || p1->iph.daddr == p2->iph.daddr) &&
+   (!p1->iph.saddr || p1->iph.saddr == p2->iph.saddr) &&
+   (!p1->i_key || p1->i_key == p2->i_key));
+}
+
 static int do_tunnels_list(struct ip_tunnel_parm *p)
 {
char buf[512];
@@ -384,7 +398,7 @@ static int do_tunnels_list(struct ip_tunnel_parm *p)
return -1;
}
 
-   /* skip header lines */
+   /* skip two lines at the begenning of the file */
if (!fgets(buf, sizeof(buf), fp) ||
!fgets(buf, sizeof(buf), fp)) {
fprintf(stderr, "/proc/net/dev read error\n");
@@ -394,10 +408,10 @@ static int do_tunnels_list(struct ip_tunnel_parm *p)
while (fgets(buf, sizeof(buf), fp) != NULL) {
char name[IFNAMSIZ];
int index, type;
-   struct ip_tunnel_parm p1 = {};
+   struct ip_tunnel_parm p1;
char *ptr;
 
-   

Re: [PATCH iproute2-next 4/6] iptunnel/ip6tunnel: Code cleanups

2018-02-06 Thread Serhey Popovych
David Ahern wrote:
> On 2/6/18 7:16 PM, David Ahern wrote:
>> On 2/2/18 6:10 AM, Serhey Popovych wrote:
>>> @@ -414,15 +428,18 @@ static int do_tunnels_list(struct ip_tunnel_parm *p)
>>> fprintf(stderr, "Failed to get type of \"%s\"\n", name);
>>> continue;
>>> }
>>> -   if (type != ARPHRD_TUNNEL && type != ARPHRD_IPGRE && type != 
>>> ARPHRD_SIT)
>>> +   switch (type) {
>>> +   case ARPHRD_TUNNEL:
>>> +   case ARPHRD_IPGRE:
>>> +   case ARPHRD_SIT:
>>> +   break;
>>> +   default:
>>> continue;
>>> +   }
>>> +   memset(p1, 0, sizeof(p1));
>>
>> Shouldn't that be &p1 for the first arg? I get a compile failure:
>>
>> ip
>> CC   iptunnel.o
>> CC   ip6tunnel.o
>> iptunnel.c: In function ‘do_tunnels_list’:
>> iptunnel.c:439:10: error: incompatible type for argument 1 of ‘memset’
>>memset(p1, 0, sizeof(p1));
>>   ^~
>> In file included from iptunnel.c:15:0:
>> /usr/include/string.h:62:14: note: expected ‘void *’ but argument is of
>> type ‘struct ip_tunnel_parm’
>>  extern void *memset (void *__s, int __c, size_t __n) __THROW __nonnull
>> ((1));
>>   ^~
>> ../config.mk:48: recipe for target 'iptunnel.o' failed
>>
> 
> Fixed by patch 5 which deletes do_tunnels_list. So why have a cleanup
> patch that changes code you then delete?
> 

There are at least two reasons:

  1) Abstract tunnel matching code into a function that will be used as
 callback ->match() in upcoming change.

  2) Make the diff between the ip and ipv6 variants of do_tunnels_list()
     contain only real differences, to show that the upcoming change
     introducing a common do_tunnels_list() is correct.

Will update comment for this patch in v2.



signature.asc
Description: OpenPGP digital signature


Re: [PATCH iproute2-next 4/6] iptunnel/ip6tunnel: Code cleanups

2018-02-06 Thread Serhey Popovych
David Ahern wrote:
> On 2/2/18 6:10 AM, Serhey Popovych wrote:
>> @@ -414,15 +428,18 @@ static int do_tunnels_list(struct ip_tunnel_parm *p)
>>  fprintf(stderr, "Failed to get type of \"%s\"\n", name);
>>  continue;
>>  }
>> -if (type != ARPHRD_TUNNEL && type != ARPHRD_IPGRE && type != 
>> ARPHRD_SIT)
>> +switch (type) {
>> +case ARPHRD_TUNNEL:
>> +case ARPHRD_IPGRE:
>> +case ARPHRD_SIT:
>> +break;
>> +default:
>>  continue;
>> +}
>> +memset(p1, 0, sizeof(p1));
> 
> Shouldn't that be &p1 for the first arg? I get a compile failure:

Yes, definitely, sorry for that. Will fix in v2.

> 
> ip
> CC   iptunnel.o
> CC   ip6tunnel.o
> iptunnel.c: In function ‘do_tunnels_list’:
> iptunnel.c:439:10: error: incompatible type for argument 1 of ‘memset’
>memset(p1, 0, sizeof(p1));
>   ^~
> In file included from iptunnel.c:15:0:
> /usr/include/string.h:62:14: note: expected ‘void *’ but argument is of
> type ‘struct ip_tunnel_parm’
>  extern void *memset (void *__s, int __c, size_t __n) __THROW __nonnull
> ((1));
>   ^~
> ../config.mk:48: recipe for target 'iptunnel.o' failed
> 
>>  if (tnl_get_ioctl(name, ))
>>  continue;
>> -if ((p->link && p1.link != p->link) ||
>> -(p->name[0] && strcmp(p1.name, p->name)) ||
>> -(p->iph.daddr && p1.iph.daddr != p->iph.daddr) ||
>> -(p->iph.saddr && p1.iph.saddr != p->iph.saddr) ||
>> -(p->i_key && p1.i_key != p->i_key))
>> +if (!ip_tunnel_parm_match(p, ))
>>  continue;
>>  print_tunnel();
>>  if (show_stats) {




signature.asc
Description: OpenPGP digital signature


Re: [PATCH net-next] sun: Add SPDX license tags to Sun network drivers

2018-02-06 Thread zhuyj
 Reviewed-by: Zhu Yanjun 

On Wed, Feb 7, 2018 at 3:34 AM, Shannon Nelson
 wrote:
> Add the appropriate SPDX license tags to the Sun network drivers
> as outlined in Documentation/process/license-rules.rst.
>
> Signed-off-by: Shannon Nelson 
> ---
>  drivers/net/ethernet/sun/Kconfig  | 1 +
>  drivers/net/ethernet/sun/cassini.c| 1 +
>  drivers/net/ethernet/sun/cassini.h| 1 +
>  drivers/net/ethernet/sun/ldmvsw.c | 1 +
>  drivers/net/ethernet/sun/niu.c| 1 +
>  drivers/net/ethernet/sun/sunbmac.c| 1 +
>  drivers/net/ethernet/sun/sungem.c | 1 +
>  drivers/net/ethernet/sun/sunhme.c | 1 +
>  drivers/net/ethernet/sun/sunqe.c  | 1 +
>  drivers/net/ethernet/sun/sunvnet.c| 1 +
>  drivers/net/ethernet/sun/sunvnet_common.c | 1 +
>  11 files changed, 11 insertions(+)
>
> diff --git a/drivers/net/ethernet/sun/Kconfig 
> b/drivers/net/ethernet/sun/Kconfig
> index b2caf51..7b982e0 100644
> --- a/drivers/net/ethernet/sun/Kconfig
> +++ b/drivers/net/ethernet/sun/Kconfig
> @@ -1,3 +1,4 @@
> +# SPDX-License-Identifier: GPL-2.0
>  #
>  # Sun network device configuration
>  #
> diff --git a/drivers/net/ethernet/sun/cassini.c 
> b/drivers/net/ethernet/sun/cassini.c
> index 113bd57..9020b08 100644
> --- a/drivers/net/ethernet/sun/cassini.c
> +++ b/drivers/net/ethernet/sun/cassini.c
> @@ -1,3 +1,4 @@
> +// SPDX-License-Identifier: GPL-2.0
>  /* cassini.c: Sun Microsystems Cassini(+) ethernet driver.
>   *
>   * Copyright (C) 2004 Sun Microsystems Inc.
> diff --git a/drivers/net/ethernet/sun/cassini.h 
> b/drivers/net/ethernet/sun/cassini.h
> index 882ce16..13f3860 100644
> --- a/drivers/net/ethernet/sun/cassini.h
> +++ b/drivers/net/ethernet/sun/cassini.h
> @@ -1,3 +1,4 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
>  /* $Id: cassini.h,v 1.16 2004/08/17 21:15:16 zaumen Exp $
>   * cassini.h: Definitions for Sun Microsystems Cassini(+) ethernet driver.
>   *
> diff --git a/drivers/net/ethernet/sun/ldmvsw.c 
> b/drivers/net/ethernet/sun/ldmvsw.c
> index 5ea0376..a5dd627 100644
> --- a/drivers/net/ethernet/sun/ldmvsw.c
> +++ b/drivers/net/ethernet/sun/ldmvsw.c
> @@ -1,3 +1,4 @@
> +// SPDX-License-Identifier: GPL-2.0
>  /* ldmvsw.c: Sun4v LDOM Virtual Switch Driver.
>   *
>   * Copyright (C) 2016-2017 Oracle. All rights reserved.
> diff --git a/drivers/net/ethernet/sun/niu.c b/drivers/net/ethernet/sun/niu.c
> index 06001ba..8dd545f 100644
> --- a/drivers/net/ethernet/sun/niu.c
> +++ b/drivers/net/ethernet/sun/niu.c
> @@ -1,3 +1,4 @@
> +// SPDX-License-Identifier: GPL-2.0
>  /* niu.c: Neptune ethernet driver.
>   *
>   * Copyright (C) 2007, 2008 David S. Miller (da...@davemloft.net)
> diff --git a/drivers/net/ethernet/sun/sunbmac.c 
> b/drivers/net/ethernet/sun/sunbmac.c
> index 0b1f41f..f047b27 100644
> --- a/drivers/net/ethernet/sun/sunbmac.c
> +++ b/drivers/net/ethernet/sun/sunbmac.c
> @@ -1,3 +1,4 @@
> +// SPDX-License-Identifier: GPL-2.0
>  /* sunbmac.c: Driver for Sparc BigMAC 100baseT ethernet adapters.
>   *
>   * Copyright (C) 1997, 1998, 1999, 2003, 2008 David S. Miller 
> (da...@davemloft.net)
> diff --git a/drivers/net/ethernet/sun/sungem.c 
> b/drivers/net/ethernet/sun/sungem.c
> index a7afcee..7a16d40 100644
> --- a/drivers/net/ethernet/sun/sungem.c
> +++ b/drivers/net/ethernet/sun/sungem.c
> @@ -1,3 +1,4 @@
> +// SPDX-License-Identifier: GPL-2.0
>  /* $Id: sungem.c,v 1.44.2.22 2002/03/13 01:18:12 davem Exp $
>   * sungem.c: Sun GEM ethernet driver.
>   *
> diff --git a/drivers/net/ethernet/sun/sunhme.c 
> b/drivers/net/ethernet/sun/sunhme.c
> index 0431f1e..06da2f5 100644
> --- a/drivers/net/ethernet/sun/sunhme.c
> +++ b/drivers/net/ethernet/sun/sunhme.c
> @@ -1,3 +1,4 @@
> +// SPDX-License-Identifier: GPL-2.0
>  /* sunhme.c: Sparc HME/BigMac 10/100baseT half/full duplex auto switching,
>   *   auto carrier detecting ethernet driver.  Also known as the
>   *   "Happy Meal Ethernet" found on SunSwift SBUS cards.
> diff --git a/drivers/net/ethernet/sun/sunqe.c 
> b/drivers/net/ethernet/sun/sunqe.c
> index a6bcdcd..7fe0d5e 100644
> --- a/drivers/net/ethernet/sun/sunqe.c
> +++ b/drivers/net/ethernet/sun/sunqe.c
> @@ -1,3 +1,4 @@
> +// SPDX-License-Identifier: GPL-2.0
>  /* sunqe.c: Sparc QuadEthernet 10baseT SBUS card driver.
>   *  Once again I am out to prove that every ethernet
>   *  controller out there can be most efficiently programmed
> diff --git a/drivers/net/ethernet/sun/sunvnet.c 
> b/drivers/net/ethernet/sun/sunvnet.c
> index 27fb226..63d3d6b 100644
> --- a/drivers/net/ethernet/sun/sunvnet.c
> +++ b/drivers/net/ethernet/sun/sunvnet.c
> @@ -1,3 +1,4 @@
> +// SPDX-License-Identifier: GPL-2.0
>  /* sunvnet.c: Sun LDOM Virtual Network Driver.
>   *
>   * Copyright (C) 2007, 2008 David S. Miller 
> diff --git a/drivers/net/ethernet/sun/sunvnet_common.c 
> b/drivers/net/ethernet/sun/sunvnet_common.c
> index 

Re: [PATCH iproute2-next 8/9] utils: Introduce and use print_name_and_link() to print name@link

2018-02-06 Thread David Ahern
On 2/5/18 12:49 PM, Serhey Popovych wrote:
> There is at least three places implementing same things: two in
> ipaddress.c print_linkinfo() & print_linkinfo_brief() and one in
> bridge/link.c.
> 
> They are diverge from each other very little: bridge/link.c does not
> support JSON output at the moment and print_linkinfo_brief() does not
> handle IFLA_LINK_NETNS case.
> 
> Introduce and use print_name_and_link() routine to handle name@link
> output in all possible variations; respect IFLA_LINK_NETNS attribute to
> handle case when link is in different namespace; use ll_idx_n2a() for
> interface name instead of "" to share logic with other code (e.g.
> ll_name_to_index() and ll_index_to_name()) supporting such template.
> 
> Signed-off-by: Serhey Popovych 
> ---
>  bridge/link.c   |   13 +++--
>  include/utils.h |4 
>  ip/ipaddress.c  |   44 ++--
>  lib/utils.c |   49 +
>  4 files changed, 58 insertions(+), 52 deletions(-)
> 

This patch is causing a diff on my system:

# ip  -br add sh > /tmp/1
# ip/ip  -br add sh > /tmp/2
# diff /tmp/1 /tmp/2
8c8
< veth-out@br3 UP fe80::18a8:89ff:fee7:55c5/64
---
> veth-out@if7 UP fe80::18a8:89ff:fee7:55c5/64

So the current ip resolves ifindex 7 to br3:

# ip li sh dev br3
7: br3:  mtu 1500 qdisc noqueue master
vrf3 state UP mode DEFAULT group default qlen 1000

where your patch causes if%d to be printed.


Re: Two net_sched fixes for stable

2018-02-06 Thread Cong Wang
On Tue, Feb 6, 2018 at 12:20 PM, David Miller  wrote:
> From: Cong Wang 
>>
>> Please let me know how you want to handle this for 4.14.
>
> Ok, I sent this off for 4.15 -stable but I need you to do the
> 4.14 backport.
>

OK. I assume you mean I should send the backports directly
to stable. I will do it.

Thanks.


Fwd: u32 ht filters

2018-02-06 Thread Cong Wang
Hi, Jiri

Your  commit 7fa9d974f3c2a016b9accb18f4ee2ed2a738585c
breaks the tc script by Paweł. Please find below for details.


commit 7fa9d974f3c2a016b9accb18f4ee2ed2a738585c
Author: Jiri Pirko 
Date:   Fri Oct 13 14:01:02 2017 +0200

net: sched: cls_u32: use block instead of q in tc_u_common

tc_u_common is now per-q. With blocks, it has to be converted to be
per-block.

Signed-off-by: Jiri Pirko 
Signed-off-by: David S. Miller 

Before this commit, u32 hashtables are per-qdisc, after this commit
it becomes per-block or per-class... this is why the script below is broken.


-- Forwarded message --
From: Paweł Staszewski 
Date: Tue, Feb 6, 2018 at 8:05 AM
Subject: u32 ht filters
To: Cong Wang 


Hi


Is there something changed in kernel 4.15 that causes problems with the old
configuration of tc filters with hashing filters?

for example this :

tc qdisc del root dev ifb1

tc qdisc add dev ifb1 root handle 1:0 hfsc default 8000
tc filter add dev ifb1 parent 1:0 protocol ip u32
tc class add dev ifb1 parent 1:0 classid 1:1 hfsc ls m2 1Mbit ul
m2 1Mbit
tc class add dev ifb1 parent 1:1 classid 1:2 hfsc ls m2 1Mbit ul
m2 1Mbit
tc class add dev ifb1 parent 1:1 classid 1:3 hfsc ls m2 5000Mbit ul m2 5000Mbit
tc class add dev ifb1 parent 1:2 classid 1:8000 hfsc ls m2 1Mbit
ul m2 1Mbit
tc qdisc add dev ifb1 parent 1:8000 handle 8000: sfq perturb 60
tc qdisc add dev ifb1 parent 1:3 handle 3: pfifo limit 1


tc filter add dev ifb1 protocol ip parent 1:0 handle 9: u32 divisor 256
tc filter add dev ifb1 protocol ip parent 1:0 u32 ht 800:: match ip
dst 192.168.0.0/24 hashkey mask 0x00ff at 16 link 9:
tc class add dev ifb1 parent 1:2 classid 1:60 hfsc ls m2 8kbit ul m2 51200kbit
echo 1
tc filter add dev ifb1 parent 1:2 protocol ip u32 ht 9:22 match ip dst
192.168.0.34 flowid 1:60
echo 2
tc qdisc add dev ifb1 parent 1:60 handle 60: pfifo limit 8192


Is working with 4.13


But it is not working with 4.15

error is when adding:

tc filter add dev ifb1 protocol ip parent 1:2 prio 4 u32 ht 9:0x22
match ip dst 192.168.0.34 flowid 1:60
RTNETLINK answers: Invalid argument
We have an error talking to the kernel




Thanks

Paweł Staszewski


Re: [PATCH net 1/1 v2] rtnetlink: require unique netns identifier

2018-02-06 Thread kbuild test robot
Hi Christian,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on net/master]

url:
https://github.com/0day-ci/linux/commits/Christian-Brauner/rtnetlink-require-unique-netns-identifier/20180207-064207
config: x86_64-rhel (attached as .config)
compiler: gcc-7 (Debian 7.3.0-1) 7.3.0
reproduce:
# save the attached .config to linux build tree
make ARCH=x86_64 

All warnings (new ones prefixed by >>):

   arch/x86/tools/insn_decoder_test: warning: Found an x86 instruction decoder 
bug, please report this.
   arch/x86/tools/insn_decoder_test: warning: 817de851: 0f ff e9
ud0%ecx,%ebp
   arch/x86/tools/insn_decoder_test: warning: objdump says 3 bytes, but 
insn_get_length() says 2
   arch/x86/tools/insn_decoder_test: warning: Found an x86 instruction decoder 
bug, please report this.
   arch/x86/tools/insn_decoder_test: warning: 817de85f: 0f ff e9
ud0%ecx,%ebp
   arch/x86/tools/insn_decoder_test: warning: objdump says 3 bytes, but 
insn_get_length() says 2
   arch/x86/tools/insn_decoder_test: warning: Found an x86 instruction decoder 
bug, please report this.
   arch/x86/tools/insn_decoder_test: warning: 817decc2: 0f ff e9
ud0%ecx,%ebp
   arch/x86/tools/insn_decoder_test: warning: objdump says 3 bytes, but 
insn_get_length() says 2
   arch/x86/tools/insn_decoder_test: warning: Found an x86 instruction decoder 
bug, please report this.
   arch/x86/tools/insn_decoder_test: warning: 817decf1: 0f ff c3
ud0%ebx,%eax
   arch/x86/tools/insn_decoder_test: warning: objdump says 3 bytes, but 
insn_get_length() says 2
   arch/x86/tools/insn_decoder_test: warning: Found an x86 instruction decoder 
bug, please report this.
   arch/x86/tools/insn_decoder_test: warning: 817def6c: 0f ff e9
ud0%ecx,%ebp
   arch/x86/tools/insn_decoder_test: warning: objdump says 3 bytes, but 
insn_get_length() says 2
   arch/x86/tools/insn_decoder_test: warning: Found an x86 instruction decoder 
bug, please report this.
   arch/x86/tools/insn_decoder_test: warning: 817df332: 0f ff e9
ud0%ecx,%ebp
   arch/x86/tools/insn_decoder_test: warning: objdump says 3 bytes, but 
insn_get_length() says 2
   arch/x86/tools/insn_decoder_test: warning: Found an x86 instruction decoder 
bug, please report this.
   arch/x86/tools/insn_decoder_test: warning: 817e1947: 0f ff 44 8b ad  
ud0-0x53(%rbx,%rcx,4),%eax
   arch/x86/tools/insn_decoder_test: warning: objdump says 5 bytes, but 
insn_get_length() says 2
   arch/x86/tools/insn_decoder_test: warning: Found an x86 instruction decoder 
bug, please report this.
   arch/x86/tools/insn_decoder_test: warning: 817e2552: 0f ff e9
ud0%ecx,%ebp
   arch/x86/tools/insn_decoder_test: warning: objdump says 3 bytes, but 
insn_get_length() says 2
   arch/x86/tools/insn_decoder_test: warning: Found an x86 instruction decoder 
bug, please report this.
   arch/x86/tools/insn_decoder_test: warning: 817e2585: 0f ff e9
ud0%ecx,%ebp
   arch/x86/tools/insn_decoder_test: warning: objdump says 3 bytes, but 
insn_get_length() says 2
   arch/x86/tools/insn_decoder_test: warning: Found an x86 instruction decoder 
bug, please report this.
   arch/x86/tools/insn_decoder_test: warning: 817e26d8: 0f ff e9
ud0%ecx,%ebp
   arch/x86/tools/insn_decoder_test: warning: objdump says 3 bytes, but 
insn_get_length() says 2
   arch/x86/tools/insn_decoder_test: warning: Found an x86 instruction decoder 
bug, please report this.
   arch/x86/tools/insn_decoder_test: warning: 817e2752: 0f ff 48 8d 
ud0-0x73(%rax),%ecx
   arch/x86/tools/insn_decoder_test: warning: objdump says 4 bytes, but 
insn_get_length() says 2
   arch/x86/tools/insn_decoder_test: warning: Found an x86 instruction decoder 
bug, please report this.
   arch/x86/tools/insn_decoder_test: warning: 817e2801: 0f ff e9
ud0%ecx,%ebp
   arch/x86/tools/insn_decoder_test: warning: objdump says 3 bytes, but 
insn_get_length() says 2
   arch/x86/tools/insn_decoder_test: warning: Found an x86 instruction decoder 
bug, please report this.
   arch/x86/tools/insn_decoder_test: warning: 817e305e: 0f ff eb
ud0%ebx,%ebp
   arch/x86/tools/insn_decoder_test: warning: objdump says 3 bytes, but 
insn_get_length() says 2
   arch/x86/tools/insn_decoder_test: warning: Found an x86 instruction decoder 
bug, please report this.
   arch/x86/tools/insn_decoder_test: warning: 817e3559: 0f ff e9
ud0%ecx,%ebp
   arch/x86/tools/insn_decoder_test: warning: objdump says 3 bytes, but 
insn_get_length() says 2
   arch/x86/tools/insn_decoder_test: warning: Found an x86 instruction decoder 
bug, please report this.
   arch/x86/tools/insn_decoder_test: warning: 817e3fd8: 0f ff 48 8b 
ud0 

Re: qdisc_pkt_len_init: SCTP/GSO_BY_FRAGS and robustness questions

2018-02-06 Thread Eric Dumazet
On Wed, 2018-02-07 at 12:15 +1100, Daniel Axtens wrote:
> Hi Marcelo and Eric,
> 
> I'm working on checking code that might be impacted by GSO_BY_FRAGS -
> after finding that the token bucket filter qdisc code doesn't handle it
> properly, DaveM said I should look for other places where this might be
> an issue [0].
> 
> I'm currently looking at qdisc_pkt_len_init in net/core/dev.c. This is
> called by __dev_queue_xmit, before validate_xmit_skb, so before an SCTP
> skb would be segmented if the hardware doesn't support SCTP offload.
> 
> There are two things I was hoping you two could offer some advice on:
> 
> 1) Eric, in 7c68d1a6b4db ("net: qdisc_pkt_len_init() should be more
>robust") you replaced a chunk of code that is similar to the code
>found in skb_gso_transport_seglen() and replaced it with more robust
>code. Do we need to change skb_gso_transport_seglen() in a similar way?

I would prefer we get rid of DODGY ability to provide buggy packets.

It would be silly to 'fix' all the places in the kernel (like hundreds
of drivers I guess), while we can simply validate packets at the time
they are provided by malicious sources (user space using
af_packet/tun/virtio_net ...)


> 
> 2) Marcelo, unlike skb_gso_transport_seglen(), where you added a case
>for SCTP in 90017accff61 ("sctp: Add GSO support"), there doesn't
>seem to be a GSO_BY_FRAGS or SCTP check in qdisc_pkt_len_init, so I
>think the accounting is probably wrong for SCTP. I'm not 100% sure
>how to fix this as it's now quite different from the calcuations in
>skb_gso_transport_seglen() - so I was hoping that you might have an
>idea.
> 
> Thanks in advance!
> 
> [0]: https://patchwork.ozlabs.org/patch/869145/#1852414
> 
> Regards,
> Daniel
> 
> 


[PATCH net] tcp: tracepoint: only call trace_tcp_send_reset with full socket

2018-02-06 Thread Song Liu
tracepoint tcp_send_reset requires a full socket to work. However, it
may be called when in TCP_TIME_WAIT:

case TCP_TW_RST:
tcp_v6_send_reset(sk, skb);
inet_twsk_deschedule_put(inet_twsk(sk));
goto discard_it;

To avoid this problem, this patch checks the socket with sk_fullsock()
before calling trace_tcp_send_reset().

Fixes: c24b14c46bb8 ("tcp: add tracepoint trace_tcp_send_reset")
Signed-off-by: Song Liu 
Reviewed-by: Lawrence Brakmo 
---
 net/ipv4/tcp_ipv4.c | 3 ++-
 net/ipv6/tcp_ipv6.c | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 95738aa..f8ad397 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -705,7 +705,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct 
sk_buff *skb)
 */
if (sk) {
arg.bound_dev_if = sk->sk_bound_dev_if;
-   trace_tcp_send_reset(sk, skb);
+   if (sk_fullsock(sk))
+   trace_tcp_send_reset(sk, skb);
}
 
BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index a1ab29e..412139f 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -942,7 +942,8 @@ static void tcp_v6_send_reset(const struct sock *sk, struct 
sk_buff *skb)
 
if (sk) {
oif = sk->sk_bound_dev_if;
-   trace_tcp_send_reset(sk, skb);
+   if (sk_fullsock(sk))
+   trace_tcp_send_reset(sk, skb);
}
 
tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, 0);
-- 
2.9.5



Re: [Android][Kernel][TCP/IP] report of packet discarding during tcp handshaking

2018-02-06 Thread Eric Dumazet
On Wed, 2018-02-07 at 11:16 +0900, 배석진 wrote:
> Hello, 
> this is bae working on samsung elec. 
> 

Hi Bae,  thanks for this detailed report and analysis.

> we have a problem that packet discarded during 3-way handshaking on TCP. 
> already looks like that Mr Dumazet try to fix the similar issue on this 
> patch, 
> https://android.googlesource.com/kernel/common/+/5e0724d027f0548511a2165a209572d48fe7a4c8
>  

This patch was fixing a more serious bug, since this was possibly
causing corruptions and kernel crashes.

What you describe is simply a packet being dropped, on a very low
probability.

Which is not a huge deal for TCP since this packet will eventually be
re-transmitted.

Also most TCP stacks/flows use DF bit set to not allow packets being
fragmented...

Anyway see my answer at the end of this mail.


> but we are still facing the another corner case.
> 
> it needs preconditions for this problem.
> (1) last ack packet of 3-way handshaking and next packet have been arrived at 
> almost same time 
> (2) next packet, the first data packet was fragmented 
> (3) enable rps
> 
> 
> [tcp dump]
> No. A-Time Source Destination  Len   Seq  Info 
>  1  08:35:18.115259  193.81.6.70  10.217.0.47  84 0   [SYN] Seq=0 
> Win=21504 Len=0 MSS=1460 
>  2  08:35:18.115888  10.217.0.47  193.81.6.70  84 0   6100 → 5063 [SYN, 
> ACK] Seq=0 Ack=1 Win=29200 Len=0 MSS=1460 
>  3  08:35:18.142385  193.81.6.70  10.217.0.47  80 1   5063 → 6100 [ACK] 
> Seq=1 Ack=1 Win=21504 Len=0 
>  4  08:35:18.142425  193.81.6.70  10.217.0.47  1516   Fragmented IP 
> protocol (proto=Encap Security Payload 50, off=0, ID=6e24) [Reassembled in 
> #5] 
>  5  08:35:18.142449  193.81.6.70  10.217.0.47  60 1   5063 → 6100 [ACK] 
> Seq=1 Ack=1 Win=21504 Len=1460 [TCP segment of a reassembled PDU] 
>  6  08:35:21.227070  193.81.6.70  10.217.0.47  1516   Fragmented IP 
> protocol (proto=Encap Security Payload 50, off=0, ID=71e9) [Reassembled in 
> #7] 
>  7  08:35:21.227191  193.81.6.70  10.217.0.47  60 1   [TCP 
> Retransmission] 5063 → 6100 [ACK] Seq=1 Ack=1 Win=21504 Len=1460 
>  8  08:35:21.228822  10.217.0.47  193.81.6.70  80 1   6100 → 5063 [ACK] 
> Seq=1 Ack=1461 Win=32120 Len=0
> 
> - last ack packet of handshaking(No.3) and next data packet(No4,5) were 
> arrived with just 40us time gap.
> 
> 
> [kernel log]
> - stage 1 
> <3>[ 1037.669229] I[0:  system_server: 3778] get_rps_cpu: skb(64), check hash 
> value:3412396090 
> <3>[ 1037.669261] I[0:  system_server: 3778] get_rps_cpu: skb(1500), check 
> hash value:158575680 
> <3>[ 1037.669285] I[0:  system_server: 3778] get_rps_cpu: skb(44), check hash 
> value:158575680 
> - stage 2 
> <3>[ 1037.669541] I[1: Binder:3778_13: 8391] tcp_v4_rcv: Enter! 
> skb(seq:A93E087B, len:1480) 
> <3>[ 1037.669552] I[2:Jit thread pool:12990] tcp_v4_rcv: Enter! 
> skb(seq:A93E087B, len:20) 
> <3>[ 1037.669564] I[2:Jit thread pool:12990] tcp_v4_rcv: check sk_state:12 
> skb(seq:A93E087B, len:20) 
> <3>[ 1037.669585] I[2:Jit thread pool:12990] tcp_check_req, Enter!: 
> skb(seq:A93E087B, len:20) 
> <3>[ 1037.669612] I[1: Binder:3778_13: 8391] tcp_v4_rcv: check sk_state:12 
> skb(seq:A93E087B, len:1480) 
> <3>[ 1037.669625] I[1: Binder:3778_13: 8391] tcp_check_req, Enter!: 
> skb(seq:A93E087B, len:1480) 
> <3>[ 1037.669653] I[2:Jit thread pool:12990] tcp_check_req, skb(seq:A93E087B, 
> len:20), own_req:1 
> <3>[ 1037.669668] I[1: Binder:3778_13: 8391] tcp_check_req, skb(seq:A93E087B, 
> len:1480), own_req:0 
> <3>[ 1037.669708] I[2:Jit thread pool:12990] tcp_rcv_state_process, 
> Established: skb(seq:A93E087B, len:20) 
> <3>[ 1037.669724] I[1: Binder:3778_13: 8391] tcp_v4_rcv: discard_relse 
> skb(seq:A93E087B, len:1480)
> 
> - stage 1 
> because of the data packet has been fragmented(No.4 & 5), 
> it was hashed to another core(cpu1) which was different from last ack 
> packet(cpu2), by rps. 
> so last ack and data packet handled in different core almost simultaneously, 
> at NEW_SYN_RECV state.
> 
> - stage 2, cpu2 
> one of them will be treated in tcp_check_req() function a little more 
> earlier, 
> then it got the true value for own_req from tcp_v4_syn_recv_sock(), and 
> return valid nsk. 
> finally going to ESTABLISHED state.
> 
> - stage 2, cpu1 
> but another, later one is got the false value for own_req, 
> and return null for nsk, because of own_req value is false in 
> inet_csk_complete_hashdance(). 
> so earlier packet was handled successfully but later one has gone to discard.
> 
> at this time, one of the ack or data packet could be discarded, by schedule 
> timing. (we saw both of them) 
> if the ack was discarded, that's ok. 
> tcp state goes to ESTABLISHED by piggyback on data packet, and payload will 
> be delivered to upper layer. 
> but if the data packet was discarded, client can't receive the payload it 
> have to. 
> this is the problem we faced.
> 
> 
> although server retransmitted the dropped packet(No6,7), but it takes few 
> seconds delay. 
> 

Re: [vhost:vhost 20/20] ERROR: "page_poisoning_enabled" [drivers/virtio/virtio_balloon.ko] undefined!

2018-02-06 Thread Michael S. Tsirkin
On Wed, Feb 07, 2018 at 03:26:31AM +0000, Wang, Wei W wrote:
> On Wednesday, February 7, 2018 10:52 AM, Michael S. Tsirkin wrote:
> > On Wed, Feb 07, 2018 at 10:25:35AM +0800, Wei Wang wrote:
> > > On 02/07/2018 09:26 AM, kbuild test robot wrote:
> > > > tree:   https://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git
> > vhost
> > > > head:   96bcd04462b99e2c80e09f6537770a0ca6b288d0
> > > > commit: 96bcd04462b99e2c80e09f6537770a0ca6b288d0 [20/20]
> > > > virtio-balloon: VIRTIO_BALLOON_F_FREE_PAGE_HINT
> > > > config: ia64-allmodconfig (attached as .config)
> > > > compiler: ia64-linux-gcc (GCC) 7.2.0
> > > > reproduce:
> > > >  wget https://raw.githubusercontent.com/intel/lkp-
> > tests/master/sbin/make.cross -O ~/bin/make.cross
> > > >  chmod +x ~/bin/make.cross
> > > >  git checkout 96bcd04462b99e2c80e09f6537770a0ca6b288d0
> > > >  # save the attached .config to linux build tree
> > > >  make.cross ARCH=ia64
> > > >
> > > > All errors (new ones prefixed by >>):
> > > >
> > > > WARNING: modpost: missing MODULE_LICENSE() in
> > drivers/auxdisplay/img-ascii-lcd.o
> > > > see include/linux/module.h for more information
> > > > WARNING: modpost: missing MODULE_LICENSE() in drivers/gpio/gpio-
> > ath79.o
> > > > see include/linux/module.h for more information
> > > > WARNING: modpost: missing MODULE_LICENSE() in drivers/gpio/gpio-
> > iop.o
> > > > see include/linux/module.h for more information
> > > > WARNING: modpost: missing MODULE_LICENSE() in
> > drivers/iio/accel/kxsd9-i2c.o
> > > > see include/linux/module.h for more information
> > > > WARNING: modpost: missing MODULE_LICENSE() in
> > drivers/iio/adc/qcom-vadc-common.o
> > > > see include/linux/module.h for more information
> > > > WARNING: modpost: missing MODULE_LICENSE() in
> > drivers/media/platform/mtk-vcodec/mtk-vcodec-common.o
> > > > see include/linux/module.h for more information
> > > > WARNING: modpost: missing MODULE_LICENSE() in
> > drivers/media/platform/tegra-cec/tegra_cec.o
> > > > see include/linux/module.h for more information
> > > > WARNING: modpost: missing MODULE_LICENSE() in
> > drivers/mtd/nand/denali_pci.o
> > > > see include/linux/module.h for more information
> > > > WARNING: modpost: missing MODULE_LICENSE() in
> > drivers/pinctrl/pxa/pinctrl-pxa2xx.o
> > > > see include/linux/module.h for more information
> > > > WARNING: modpost: missing MODULE_LICENSE() in
> > drivers/power/reset/zx-reboot.o
> > > > see include/linux/module.h for more information
> > > > > > ERROR: "page_poisoning_enabled" [drivers/virtio/virtio_balloon.ko]
> > undefined!
> > >
> > > page_poisoning_enabled needs to be exposed. I'll send a small patch to
> > > add EXPORT_SYMBOL_GPL(page_poisoning_enabled).
> > >
> > >
> > > Best,
> > > Wei
> > 
> > This will probably miss this release cycle.
> 
> OK if it's too difficult.  My bad, didn't capture that, too sad :(

There will always be the next release. There's work to do on the
qemu side meanwhile.

> I just resent that patch with the fix.
> 
> Best,
> Wei

I can park it on the vhost branch once the merge window closes.

-- 
MST


[PATCH net-next] sch_netem: Bug fixing in calculating Netem interval

2018-02-06 Thread Md. Islam
In Kernel 4.15.0+, Netem does not work properly.

Netem setup:

tc qdisc add dev h1-eth0 root handle 1: netem delay 10ms 2ms

Result:

PING 172.16.101.2 (172.16.101.2) 56(84) bytes of data.
64 bytes from 172.16.101.2: icmp_seq=1 ttl=64 time=22.8 ms
64 bytes from 172.16.101.2: icmp_seq=2 ttl=64 time=10.9 ms
64 bytes from 172.16.101.2: icmp_seq=3 ttl=64 time=10.9 ms
64 bytes from 172.16.101.2: icmp_seq=5 ttl=64 time=11.4 ms
64 bytes from 172.16.101.2: icmp_seq=6 ttl=64 time=11.8 ms
64 bytes from 172.16.101.2: icmp_seq=4 ttl=64 time=4303 ms
64 bytes from 172.16.101.2: icmp_seq=10 ttl=64 time=11.2 ms
64 bytes from 172.16.101.2: icmp_seq=11 ttl=64 time=10.3 ms
64 bytes from 172.16.101.2: icmp_seq=7 ttl=64 time=4304 ms
64 bytes from 172.16.101.2: icmp_seq=8 ttl=64 time=4303 ms

Patch:

diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 7bbc13b..7c179ad 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -327,7 +327,7 @@ static s64 tabledist(s64 mu, s32 sigma,

 /* default uniform distribution */
 if (dist == NULL)
-return (rnd % (2 * sigma)) - sigma + mu;
+return ((rnd % (2 * sigma)) + mu) - sigma;

 t = dist->table[rnd % dist->size];
 x = (sigma % NETEM_DIST_SCALE) * t;


(rnd % (2 * sigma)) - sigma was overflowing s32. After applying the
patch, I found following output which is desirable.

PING 172.16.101.2 (172.16.101.2) 56(84) bytes of data.
64 bytes from 172.16.101.2: icmp_seq=1 ttl=64 time=21.1 ms
64 bytes from 172.16.101.2: icmp_seq=2 ttl=64 time=8.46 ms
64 bytes from 172.16.101.2: icmp_seq=3 ttl=64 time=9.00 ms
64 bytes from 172.16.101.2: icmp_seq=4 ttl=64 time=11.8 ms
64 bytes from 172.16.101.2: icmp_seq=5 ttl=64 time=8.36 ms
64 bytes from 172.16.101.2: icmp_seq=6 ttl=64 time=11.8 ms
64 bytes from 172.16.101.2: icmp_seq=7 ttl=64 time=8.11 ms
64 bytes from 172.16.101.2: icmp_seq=8 ttl=64 time=10.0 ms
64 bytes from 172.16.101.2: icmp_seq=9 ttl=64 time=11.3 ms
64 bytes from 172.16.101.2: icmp_seq=10 ttl=64 time=11.5 ms
64 bytes from 172.16.101.2: icmp_seq=11 ttl=64 time=10.2 ms

Many thanks!
Tamim


RE: [vhost:vhost 20/20] ERROR: "page_poisoning_enabled" [drivers/virtio/virtio_balloon.ko] undefined!

2018-02-06 Thread Wang, Wei W
On Wednesday, February 7, 2018 10:52 AM, Michael S. Tsirkin wrote:
> On Wed, Feb 07, 2018 at 10:25:35AM +0800, Wei Wang wrote:
> > On 02/07/2018 09:26 AM, kbuild test robot wrote:
> > > tree:   https://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git
> vhost
> > > head:   96bcd04462b99e2c80e09f6537770a0ca6b288d0
> > > commit: 96bcd04462b99e2c80e09f6537770a0ca6b288d0 [20/20]
> > > virtio-balloon: VIRTIO_BALLOON_F_FREE_PAGE_HINT
> > > config: ia64-allmodconfig (attached as .config)
> > > compiler: ia64-linux-gcc (GCC) 7.2.0
> > > reproduce:
> > >  wget https://raw.githubusercontent.com/intel/lkp-
> tests/master/sbin/make.cross -O ~/bin/make.cross
> > >  chmod +x ~/bin/make.cross
> > >  git checkout 96bcd04462b99e2c80e09f6537770a0ca6b288d0
> > >  # save the attached .config to linux build tree
> > >  make.cross ARCH=ia64
> > >
> > > All errors (new ones prefixed by >>):
> > >
> > > WARNING: modpost: missing MODULE_LICENSE() in
> drivers/auxdisplay/img-ascii-lcd.o
> > > see include/linux/module.h for more information
> > > WARNING: modpost: missing MODULE_LICENSE() in drivers/gpio/gpio-
> ath79.o
> > > see include/linux/module.h for more information
> > > WARNING: modpost: missing MODULE_LICENSE() in drivers/gpio/gpio-
> iop.o
> > > see include/linux/module.h for more information
> > > WARNING: modpost: missing MODULE_LICENSE() in
> drivers/iio/accel/kxsd9-i2c.o
> > > see include/linux/module.h for more information
> > > WARNING: modpost: missing MODULE_LICENSE() in
> drivers/iio/adc/qcom-vadc-common.o
> > > see include/linux/module.h for more information
> > > WARNING: modpost: missing MODULE_LICENSE() in
> drivers/media/platform/mtk-vcodec/mtk-vcodec-common.o
> > > see include/linux/module.h for more information
> > > WARNING: modpost: missing MODULE_LICENSE() in
> drivers/media/platform/tegra-cec/tegra_cec.o
> > > see include/linux/module.h for more information
> > > WARNING: modpost: missing MODULE_LICENSE() in
> drivers/mtd/nand/denali_pci.o
> > > see include/linux/module.h for more information
> > > WARNING: modpost: missing MODULE_LICENSE() in
> drivers/pinctrl/pxa/pinctrl-pxa2xx.o
> > > see include/linux/module.h for more information
> > > WARNING: modpost: missing MODULE_LICENSE() in
> drivers/power/reset/zx-reboot.o
> > > see include/linux/module.h for more information
> > > > > ERROR: "page_poisoning_enabled" [drivers/virtio/virtio_balloon.ko]
> undefined!
> >
> > page_poisoning_enabled needs to be exposed. I'll send a small patch to
> > add EXPORT_SYMBOL_GPL(page_poisoning_enabled).
> >
> >
> > Best,
> > Wei
> 
> This will probably miss this release cycle.

OK if it's too difficult.  My bad, didn't capture that, too sad :(

I just resent that patch with the fix.

Best,
Wei


Re: [PATCH] net: ethernet: ti: cpsw: fix net watchdog timeout

2018-02-06 Thread Ivan Khoronzhuk
On Tue, Feb 06, 2018 at 07:17:06PM -0600, Grygorii Strashko wrote:
> It was discovered that simple program which indefinitely sends 200b UDP
> packets and runs on TI AM574x SoC (SMP) under RT Kernel triggers network
> watchdog timeout in TI CPSW driver (<6 hours run). The network watchdog
> timeout is triggered due to race between cpsw_ndo_start_xmit() and
> cpsw_tx_handler() [NAPI]
> 
> cpsw_ndo_start_xmit()
>   if (unlikely(!cpdma_check_free_tx_desc(txch))) {
>   txq = netdev_get_tx_queue(ndev, q_idx);
>   netif_tx_stop_queue(txq);
> 
> ^^ as per [1] barrier has to be used after set_bit() otherwise new value
> might not be visible to other cpus
>   }
> 
> cpsw_tx_handler()
>   if (unlikely(netif_tx_queue_stopped(txq)))
>   netif_tx_wake_queue(txq);
> 
> and when it happens ndev TX queue became disabled forever while driver's HW
> TX queue is empty.
I'm sure it fixes test case somehow but there is some strangeness.
(I've thought about this some X months ago):
1. If no free desc, then there is bunch of descs on the queue ready to be sent
2. If one of this desc while this process was missed then next will wake queue,
because there is bunch of them on the fly. So, if desc on top of the sent queue
missed to enable the queue, then next one more likely will enable it anyway..
then how it could happen? The described race is possible only on last
descriptor, yes, packets are small the speed is high, possibility is very small,
but then next situation is also possible:
- packets are sent fast
- all packets were sent, but no any descriptors are freed now by sw interrupt 
(NAPI)
- when interrupt had started NAPI, the queue was enabled, all other next 
interrupts are throttled once NAPI not finished it's work yet.
- when new packet submitted, no free descs are present yet (NAPI has not freed
any yet), but all packets are sent, so no one can awake tx queue, as interrupt 
will not arise when NAPI is started to free first descriptor interrupts are 
disabled, because h/w queue to be sent is empty...
- how it can happen as submitting packet and handling packet operations is 
under 
channel lock? Not exactly, a period between handling and freeing the descriptor
to the pool is not under channel lock, here:

spin_unlock_irqrestore(&chan->lock, flags);
if (unlikely(status & CPDMA_DESC_TD_COMPLETE))
cb_status = -ENOSYS;
else
cb_status = status;

__cpdma_chan_free(chan, desc, outlen, cb_status);
return status;

unlock_ret:
spin_unlock_irqrestore(&chan->lock, flags);
return status;

And:
__cpdma_chan_free(chan, desc, outlen, cb_status);
-> cpdma_desc_free(pool, desc, 1);

As result, queue deadlock as you've described.
Just thought, not checked, but theoretically possible.
What do you think?

> 
> Fix this, by adding smp_mb__after_atomic() after netif_tx_stop_queue()
> calls and double check for free TX descriptors after stopping ndev TX queue
> - if there are free TX descriptors wake up ndev TX queue.
> 
> [1] https://www.kernel.org/doc/html/latest/core-api/atomic_ops.html
> Signed-off-by: Grygorii Strashko 
> ---
>  drivers/net/ethernet/ti/cpsw.c | 16 ++--
>  1 file changed, 14 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
> index 10d7cbe..3805b13 100644
> --- a/drivers/net/ethernet/ti/cpsw.c
> +++ b/drivers/net/ethernet/ti/cpsw.c
> @@ -1638,6 +1638,7 @@ static netdev_tx_t cpsw_ndo_start_xmit(struct sk_buff 
> *skb,
>   q_idx = q_idx % cpsw->tx_ch_num;
>  
>   txch = cpsw->txv[q_idx].ch;
> + txq = netdev_get_tx_queue(ndev, q_idx);
>   ret = cpsw_tx_packet_submit(priv, skb, txch);
>   if (unlikely(ret != 0)) {
>   cpsw_err(priv, tx_err, "desc submit failed\n");
> @@ -1648,15 +1649,26 @@ static netdev_tx_t cpsw_ndo_start_xmit(struct sk_buff 
> *skb,
>* tell the kernel to stop sending us tx frames.
>*/
>   if (unlikely(!cpdma_check_free_tx_desc(txch))) {
> - txq = netdev_get_tx_queue(ndev, q_idx);
>   netif_tx_stop_queue(txq);
> +
> + /* Barrier, so that stop_queue visible to other cpus */
> + smp_mb__after_atomic();
> +
> + if (cpdma_check_free_tx_desc(txch))
> + netif_tx_wake_queue(txq);
>   }
>  
>   return NETDEV_TX_OK;
>  fail:
>   ndev->stats.tx_dropped++;
> - txq = netdev_get_tx_queue(ndev, skb_get_queue_mapping(skb));
>   netif_tx_stop_queue(txq);
> +
> + /* Barrier, so that stop_queue visible to other cpus */
> + smp_mb__after_atomic();
> +
> + if (cpdma_check_free_tx_desc(txch))
> + netif_tx_wake_queue(txq);
> +
>   return NETDEV_TX_BUSY;
>  }
>  
> -- 
> 2.10.5
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-omap" in
> the body of a message to majord...@vger.kernel.org
> 

Re: [PATCH] atm: idt77252: Replace mdelay with usleep_range in idt77252_preset

2018-02-06 Thread Maciej W. Rozycki
On Fri, 26 Jan 2018, Jia-Ju Bai wrote:

> diff --git a/drivers/atm/idt77252.c b/drivers/atm/idt77252.c
> index 0277f36..cea4bf2 100644
> --- a/drivers/atm/idt77252.c
> +++ b/drivers/atm/idt77252.c
> @@ -3563,7 +3563,7 @@ static int idt77252_preset(struct idt77252_dev *card)
>  
>   /* Software reset */
>   writel(SAR_CFG_SWRST, SAR_REG_CFG);
> - mdelay(1);
> + usleep_range(500, 1000);
>   writel(0, SAR_REG_CFG);
>  
>   IPRINTK("%s: Software resetted.\n", card->name);

 This is only called from the driver's ->probe method, so it looks to me 
indeed safe to sleep here.  A similar, more extensive clean-up seems due 
for 77252 older brother's driver nicstar.c.

 Out of curiosity I have looked up the SAR manual and it requires the 
SWRST bit to be asserted for at least 2 PCI clock cycles for the reset to 
be valid, so having the lower bound of .5ms still looks completely safe if 
not an overkill to me for real world applications where PCI is driven in 
the MHz clock range.

Reviewed-by: Maciej W. Rozycki 

  Maciej


Re: [vhost:vhost 20/20] ERROR: "page_poisoning_enabled" [drivers/virtio/virtio_balloon.ko] undefined!

2018-02-06 Thread Michael S. Tsirkin
On Wed, Feb 07, 2018 at 10:25:35AM +0800, Wei Wang wrote:
> On 02/07/2018 09:26 AM, kbuild test robot wrote:
> > tree:   https://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git vhost
> > head:   96bcd04462b99e2c80e09f6537770a0ca6b288d0
> > commit: 96bcd04462b99e2c80e09f6537770a0ca6b288d0 [20/20] virtio-balloon: 
> > VIRTIO_BALLOON_F_FREE_PAGE_HINT
> > config: ia64-allmodconfig (attached as .config)
> > compiler: ia64-linux-gcc (GCC) 7.2.0
> > reproduce:
> >  wget 
> > https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
> > ~/bin/make.cross
> >  chmod +x ~/bin/make.cross
> >  git checkout 96bcd04462b99e2c80e09f6537770a0ca6b288d0
> >  # save the attached .config to linux build tree
> >  make.cross ARCH=ia64
> > 
> > All errors (new ones prefixed by >>):
> > 
> > WARNING: modpost: missing MODULE_LICENSE() in 
> > drivers/auxdisplay/img-ascii-lcd.o
> > see include/linux/module.h for more information
> > WARNING: modpost: missing MODULE_LICENSE() in drivers/gpio/gpio-ath79.o
> > see include/linux/module.h for more information
> > WARNING: modpost: missing MODULE_LICENSE() in drivers/gpio/gpio-iop.o
> > see include/linux/module.h for more information
> > WARNING: modpost: missing MODULE_LICENSE() in 
> > drivers/iio/accel/kxsd9-i2c.o
> > see include/linux/module.h for more information
> > WARNING: modpost: missing MODULE_LICENSE() in 
> > drivers/iio/adc/qcom-vadc-common.o
> > see include/linux/module.h for more information
> > WARNING: modpost: missing MODULE_LICENSE() in 
> > drivers/media/platform/mtk-vcodec/mtk-vcodec-common.o
> > see include/linux/module.h for more information
> > WARNING: modpost: missing MODULE_LICENSE() in 
> > drivers/media/platform/tegra-cec/tegra_cec.o
> > see include/linux/module.h for more information
> > WARNING: modpost: missing MODULE_LICENSE() in 
> > drivers/mtd/nand/denali_pci.o
> > see include/linux/module.h for more information
> > WARNING: modpost: missing MODULE_LICENSE() in 
> > drivers/pinctrl/pxa/pinctrl-pxa2xx.o
> > see include/linux/module.h for more information
> > WARNING: modpost: missing MODULE_LICENSE() in 
> > drivers/power/reset/zx-reboot.o
> > see include/linux/module.h for more information
> > > > ERROR: "page_poisoning_enabled" [drivers/virtio/virtio_balloon.ko] 
> > > > undefined!
> 
> page_poisoning_enabled needs to be exposed. I'll send a small patch to add
> EXPORT_SYMBOL_GPL(page_poisoning_enabled).
> 
> 
> Best,
> Wei

This will probably miss this release cycle.

-- 
MST


Re: [vhost:vhost 20/20] ERROR: "page_poisoning_enabled" [drivers/virtio/virtio_balloon.ko] undefined!

2018-02-06 Thread Wei Wang

On 02/07/2018 09:26 AM, kbuild test robot wrote:

tree:   https://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git vhost
head:   96bcd04462b99e2c80e09f6537770a0ca6b288d0
commit: 96bcd04462b99e2c80e09f6537770a0ca6b288d0 [20/20] virtio-balloon: 
VIRTIO_BALLOON_F_FREE_PAGE_HINT
config: ia64-allmodconfig (attached as .config)
compiler: ia64-linux-gcc (GCC) 7.2.0
reproduce:
 wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
 chmod +x ~/bin/make.cross
 git checkout 96bcd04462b99e2c80e09f6537770a0ca6b288d0
 # save the attached .config to linux build tree
 make.cross ARCH=ia64

All errors (new ones prefixed by >>):

WARNING: modpost: missing MODULE_LICENSE() in 
drivers/auxdisplay/img-ascii-lcd.o
see include/linux/module.h for more information
WARNING: modpost: missing MODULE_LICENSE() in drivers/gpio/gpio-ath79.o
see include/linux/module.h for more information
WARNING: modpost: missing MODULE_LICENSE() in drivers/gpio/gpio-iop.o
see include/linux/module.h for more information
WARNING: modpost: missing MODULE_LICENSE() in drivers/iio/accel/kxsd9-i2c.o
see include/linux/module.h for more information
WARNING: modpost: missing MODULE_LICENSE() in 
drivers/iio/adc/qcom-vadc-common.o
see include/linux/module.h for more information
WARNING: modpost: missing MODULE_LICENSE() in 
drivers/media/platform/mtk-vcodec/mtk-vcodec-common.o
see include/linux/module.h for more information
WARNING: modpost: missing MODULE_LICENSE() in 
drivers/media/platform/tegra-cec/tegra_cec.o
see include/linux/module.h for more information
WARNING: modpost: missing MODULE_LICENSE() in drivers/mtd/nand/denali_pci.o
see include/linux/module.h for more information
WARNING: modpost: missing MODULE_LICENSE() in 
drivers/pinctrl/pxa/pinctrl-pxa2xx.o
see include/linux/module.h for more information
WARNING: modpost: missing MODULE_LICENSE() in 
drivers/power/reset/zx-reboot.o
see include/linux/module.h for more information

ERROR: "page_poisoning_enabled" [drivers/virtio/virtio_balloon.ko] undefined!


page_poisoning_enabled needs to be exposed. I'll send a small patch to 
add EXPORT_SYMBOL_GPL(page_poisoning_enabled).



Best,
Wei


Re: [PATCH iproute2-next 4/6] iptunnel/ip6tunnel: Code cleanups

2018-02-06 Thread David Ahern
On 2/6/18 7:16 PM, David Ahern wrote:
> On 2/2/18 6:10 AM, Serhey Popovych wrote:
>> @@ -414,15 +428,18 @@ static int do_tunnels_list(struct ip_tunnel_parm *p)
>>  fprintf(stderr, "Failed to get type of \"%s\"\n", name);
>>  continue;
>>  }
>> -if (type != ARPHRD_TUNNEL && type != ARPHRD_IPGRE && type != 
>> ARPHRD_SIT)
>> +switch (type) {
>> +case ARPHRD_TUNNEL:
>> +case ARPHRD_IPGRE:
>> +case ARPHRD_SIT:
>> +break;
>> +default:
>>  continue;
>> +}
>> +memset(p1, 0, sizeof(p1));
> 
> Shouldn't that be &p1 for the first arg? I get a compile failure:
> 
> ip
> CC   iptunnel.o
> CC   ip6tunnel.o
> iptunnel.c: In function ‘do_tunnels_list’:
> iptunnel.c:439:10: error: incompatible type for argument 1 of ‘memset’
>memset(p1, 0, sizeof(p1));
>   ^~
> In file included from iptunnel.c:15:0:
> /usr/include/string.h:62:14: note: expected ‘void *’ but argument is of
> type ‘struct ip_tunnel_parm’
>  extern void *memset (void *__s, int __c, size_t __n) __THROW __nonnull
> ((1));
>   ^~
> ../config.mk:48: recipe for target 'iptunnel.o' failed
> 

Fixed by patch 5 which deletes do_tunnels_list. So why have a cleanup
patch that changes code you then delete?



[Android][Kernel][TCP/IP] report of packet discarding during tcp handshaking

2018-02-06 Thread 배석진
Hello, 
this is bae working on samsung elec. 

we have a problem that packet discarded during 3-way handshaking on TCP. 
already looks like that Mr Dumazet try to fix the similar issue on this patch, 
https://android.googlesource.com/kernel/common/+/5e0724d027f0548511a2165a209572d48fe7a4c8
 
but we are still facing the another corner case.

it needs preconditions for this problem.
(1) last ack packet of 3-way handshaking and next packet have been arrived at 
almost same time 
(2) next packet, the first data packet was fragmented 
(3) enable rps


[tcp dump]
No. A-Time Source Destination  Len   Seq  Info 
 1  08:35:18.115259  193.81.6.70  10.217.0.47  84 0   [SYN] Seq=0 Win=21504 
Len=0 MSS=1460 
 2  08:35:18.115888  10.217.0.47  193.81.6.70  84 0   6100 → 5063 [SYN, 
ACK] Seq=0 Ack=1 Win=29200 Len=0 MSS=1460 
 3  08:35:18.142385  193.81.6.70  10.217.0.47  80 1   5063 → 6100 [ACK] 
Seq=1 Ack=1 Win=21504 Len=0 
 4  08:35:18.142425  193.81.6.70  10.217.0.47  1516   Fragmented IP 
protocol (proto=Encap Security Payload 50, off=0, ID=6e24) [Reassembled in #5] 
 5  08:35:18.142449  193.81.6.70  10.217.0.47  60 1   5063 → 6100 [ACK] 
Seq=1 Ack=1 Win=21504 Len=1460 [TCP segment of a reassembled PDU] 
 6  08:35:21.227070  193.81.6.70  10.217.0.47  1516   Fragmented IP 
protocol (proto=Encap Security Payload 50, off=0, ID=71e9) [Reassembled in #7] 
 7  08:35:21.227191  193.81.6.70  10.217.0.47  60 1   [TCP Retransmission] 
5063 → 6100 [ACK] Seq=1 Ack=1 Win=21504 Len=1460 
 8  08:35:21.228822  10.217.0.47  193.81.6.70  80 1   6100 → 5063 [ACK] 
Seq=1 Ack=1461 Win=32120 Len=0

- last ack packet of handshaking(No.3) and next data packet(No4,5) were arrived 
with just 40us time gap.


[kernel log]
- stage 1 
<3>[ 1037.669229] I[0:  system_server: 3778] get_rps_cpu: skb(64), check hash 
value:3412396090 
<3>[ 1037.669261] I[0:  system_server: 3778] get_rps_cpu: skb(1500), check hash 
value:158575680 
<3>[ 1037.669285] I[0:  system_server: 3778] get_rps_cpu: skb(44), check hash 
value:158575680 
- stage 2 
<3>[ 1037.669541] I[1: Binder:3778_13: 8391] tcp_v4_rcv: Enter! 
skb(seq:A93E087B, len:1480) 
<3>[ 1037.669552] I[2:Jit thread pool:12990] tcp_v4_rcv: Enter! 
skb(seq:A93E087B, len:20) 
<3>[ 1037.669564] I[2:Jit thread pool:12990] tcp_v4_rcv: check sk_state:12 
skb(seq:A93E087B, len:20) 
<3>[ 1037.669585] I[2:Jit thread pool:12990] tcp_check_req, Enter!: 
skb(seq:A93E087B, len:20) 
<3>[ 1037.669612] I[1: Binder:3778_13: 8391] tcp_v4_rcv: check sk_state:12 
skb(seq:A93E087B, len:1480) 
<3>[ 1037.669625] I[1: Binder:3778_13: 8391] tcp_check_req, Enter!: 
skb(seq:A93E087B, len:1480) 
<3>[ 1037.669653] I[2:Jit thread pool:12990] tcp_check_req, skb(seq:A93E087B, 
len:20), own_req:1 
<3>[ 1037.669668] I[1: Binder:3778_13: 8391] tcp_check_req, skb(seq:A93E087B, 
len:1480), own_req:0 
<3>[ 1037.669708] I[2:Jit thread pool:12990] tcp_rcv_state_process, 
Established: skb(seq:A93E087B, len:20) 
<3>[ 1037.669724] I[1: Binder:3778_13: 8391] tcp_v4_rcv: discard_relse 
skb(seq:A93E087B, len:1480)

- stage 1 
because of the data packet has been fragmented(No.4 & 5), 
it was hashed to another core(cpu1) which was different from last ack 
packet(cpu2), by rps. 
so last ack and data packet handled in different core almost simultaneously, at 
NEW_SYN_RECV state.

- stage 2, cpu2 
one of them will be treated in tcp_check_req() function a little more earlier, 
then it got the true value for own_req from tcp_v4_syn_recv_sock(), and return 
valid nsk. 
finally going to ESTABLISHED state.

- stage 2, cpu1 
but another, later one is got the false value for own_req, 
and return null for nsk, because of own_req value is false in 
inet_csk_complete_hashdance(). 
so earlier packet was handled successfully but later one has gone to discard.

at this time, one of the ack or data packet could be discarded, by schedule 
timing. (we saw both of them) 
if the ack was discarded, that's ok. 
tcp state goes to ESTABLISHED by piggyback on data packet, and payload will be 
delivered to upper layer. 
but if the data packet was discarded, client can't receive the payload it have 
to. 
this is the problem we faced.


although server retransmitted the dropped packet(No6,7), but it takes few 
seconds delay. 
since this problem occurred in IMS-Call setup, it appears as a call 
connection delay. 
this situation is a serious problem in call service.

do you have any report about this or plan to fix it?


best regards,
bae.



 
  배 석 진 (Bae Souk-Jin) 
   System R Group 2
   Mobile Device Division Telecommunication Business
   SAMSUNG ELECTRONICS CO. LTD

   Mobile : 82-10-2888-2200
   E-mail : soukjin@samsung.com




Re: [PATCH iproute2-next 4/6] iptunnel/ip6tunnel: Code cleanups

2018-02-06 Thread David Ahern
On 2/2/18 6:10 AM, Serhey Popovych wrote:
> @@ -414,15 +428,18 @@ static int do_tunnels_list(struct ip_tunnel_parm *p)
>   fprintf(stderr, "Failed to get type of \"%s\"\n", name);
>   continue;
>   }
> - if (type != ARPHRD_TUNNEL && type != ARPHRD_IPGRE && type != 
> ARPHRD_SIT)
> + switch (type) {
> + case ARPHRD_TUNNEL:
> + case ARPHRD_IPGRE:
> + case ARPHRD_SIT:
> + break;
> + default:
>   continue;
> + }
> + memset(p1, 0, sizeof(p1));

Shouldn't that be &p1 for the first arg? I get a compile failure:

ip
CC   iptunnel.o
CC   ip6tunnel.o
iptunnel.c: In function ‘do_tunnels_list’:
iptunnel.c:439:10: error: incompatible type for argument 1 of ‘memset’
   memset(p1, 0, sizeof(p1));
  ^~
In file included from iptunnel.c:15:0:
/usr/include/string.h:62:14: note: expected ‘void *’ but argument is of
type ‘struct ip_tunnel_parm’
 extern void *memset (void *__s, int __c, size_t __n) __THROW __nonnull
((1));
  ^~
../config.mk:48: recipe for target 'iptunnel.o' failed

>   if (tnl_get_ioctl(name, ))
>   continue;
> - if ((p->link && p1.link != p->link) ||
> - (p->name[0] && strcmp(p1.name, p->name)) ||
> - (p->iph.daddr && p1.iph.daddr != p->iph.daddr) ||
> - (p->iph.saddr && p1.iph.saddr != p->iph.saddr) ||
> - (p->i_key && p1.i_key != p->i_key))
> + if (!ip_tunnel_parm_match(p, ))
>   continue;
>   print_tunnel();
>   if (show_stats) {


[PATCH RFC 3/4] netfilter: nfnetlink: add support for netlink descriptions

2018-02-06 Thread Pablo Neira Ayuso
NETLINK_NETFILTER is shared by several netfilter subsystems, add new
infrastructure to allow subsystems to register their own descriptions.
Hence, nfnetlink routes description requests to the corresponding
subsystem backend.

Signed-off-by: Pablo Neira Ayuso 
---
 include/linux/netfilter/nfnetlink.h  |   9 +++
 include/net/nldesc.h |   3 +
 include/uapi/linux/netfilter/nfnetlink.h |   7 ++
 net/netfilter/nfnetlink.c| 108 +++
 4 files changed, 127 insertions(+)

diff --git a/include/linux/netfilter/nfnetlink.h 
b/include/linux/netfilter/nfnetlink.h
index 495ba4dd9da5..87b3d9860444 100644
--- a/include/linux/netfilter/nfnetlink.h
+++ b/include/linux/netfilter/nfnetlink.h
@@ -37,6 +37,15 @@ struct nfnetlink_subsystem {
 int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n);
 int nfnetlink_subsys_unregister(const struct nfnetlink_subsystem *n);
 
+struct nfnl_desc_subsys {
+   u16 id;
+   const struct nl_desc_cmds   *cmds;
+   const struct nl_desc_objs   *objs;
+};
+
+int nfnl_desc_register_subsys(const struct nfnl_desc_subsys *subsys);
+void nfnl_desc_unregister_subsys(const struct nfnl_desc_subsys *subsys);
+
 int nfnetlink_has_listeners(struct net *net, unsigned int group);
 int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid,
   unsigned int group, int echo, gfp_t flags);
diff --git a/include/net/nldesc.h b/include/net/nldesc.h
index 19306a648f10..0d232846005a 100644
--- a/include/net/nldesc.h
+++ b/include/net/nldesc.h
@@ -19,6 +19,9 @@ struct nl_desc_objs {
 
 struct nl_desc_req {
u32 bus;
+   union {
+   u32 nf_subsys_id;
+   };
 };
 
 struct net;
diff --git a/include/uapi/linux/netfilter/nfnetlink.h 
b/include/uapi/linux/netfilter/nfnetlink.h
index 5bc960f220b3..7dacf264e0b5 100644
--- a/include/uapi/linux/netfilter/nfnetlink.h
+++ b/include/uapi/linux/netfilter/nfnetlink.h
@@ -62,6 +62,13 @@ struct nfgenmsg {
 #define NFNL_SUBSYS_NFT_COMPAT 11
 #define NFNL_SUBSYS_COUNT  12
 
+enum nfnl_desc_attr {
+   NFNL_DESC_REQ_UNSPEC,
+   NFNL_DESC_REQ_SUBSYS,
+   __NFNL_DESC_REQ_MAX
+};
+#define NFNL_DESC_REQ_MAX  (__NFNL_DESC_REQ_MAX - 1)
+
 /* Reserved control nfnetlink messages */
 #define NFNL_MSG_BATCH_BEGIN   NLMSG_MIN_TYPE
 #define NFNL_MSG_BATCH_END NLMSG_MIN_TYPE+1
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 03ead8a9e90c..df5792534935 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -27,6 +27,7 @@
 #include 
 
 #include 
+#include 
 #include 
 
 MODULE_LICENSE("GPL");
@@ -40,6 +41,7 @@ MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER);
 static struct {
struct mutexmutex;
const struct nfnetlink_subsystem __rcu  *subsys;
+   const struct nfnl_desc_subsys __rcu *desc;
 } table[NFNL_SUBSYS_COUNT];
 
 static const int nfnl_group2type[NFNLGRP_MAX+1] = {
@@ -513,6 +515,107 @@ static void nfnetlink_rcv(struct sk_buff *skb)
netlink_rcv_skb(skb, nfnetlink_rcv_msg);
 }
 
+int nfnl_desc_register_subsys(const struct nfnl_desc_subsys *subsys)
+{
+   if (subsys->id >= NFNL_SUBSYS_COUNT)
+   return -ENOENT;
+
+   nfnl_lock(subsys->id);
+   rcu_assign_pointer(table[subsys->id].desc, subsys);
+   nfnl_unlock(subsys->id);
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(nfnl_desc_register_subsys);
+
+void nfnl_desc_unregister_subsys(const struct nfnl_desc_subsys *subsys)
+{
+   nfnl_lock(subsys->id);
+   rcu_assign_pointer(table[subsys->id].desc, NULL);
+   nfnl_unlock(subsys->id);
+
+   synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(nfnl_desc_unregister_subsys);
+
+static const struct nfnl_desc_subsys *nfnl_desc_get(struct sk_buff *skb,
+   struct nlmsghdr *nlh,
+   struct nl_desc_req *req)
+{
+   const struct nfnl_desc_subsys *desc;
+
+   if (req->nf_subsys_id >= NFNL_SUBSYS_COUNT)
+   return ERR_PTR(-ENOENT);
+
+   desc = rcu_dereference(table[req->nf_subsys_id].desc);
+   if (!desc) {
+   rcu_read_unlock();
+   request_module("nfnetlink-subsys-%d", req->nf_subsys_id);
+   rcu_read_lock();
+   desc = rcu_dereference(table[req->nf_subsys_id].desc);
+   if (desc)
+   return ERR_PTR(-EAGAIN);
+   }
+   return desc;
+}
+
+static const struct nl_desc_cmds *nfnl_desc_getcmds(struct sk_buff *skb,
+   struct nlmsghdr *nlh,
+   struct nl_desc_req *req)
+{
+   const struct nfnl_desc_subsys *desc;
+
+   desc = nfnl_desc_get(skb, nlh, req);
+   if 

[PATCH RFC 1/4] netlink: add NLA_PAD definition

2018-02-06 Thread Pablo Neira Ayuso
The new generic netlink description infrastructure needs this new type
to describe padding attributes.

Signed-off-by: Pablo Neira Ayuso 
---
 include/net/netlink.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/net/netlink.h b/include/net/netlink.h
index 0c154f98e987..76e850ead593 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -180,6 +180,7 @@ enum {
NLA_S32,
NLA_S64,
NLA_BITFIELD32,
+   NLA_PAD,
__NLA_TYPE_MAX,
 };
 
@@ -209,6 +210,7 @@ enum {
  * given type fits, using it verifies minimum length
  * just like "All other"
  *NLA_BITFIELD32  A 32-bit bitmap/bitselector attribute
+ *NLA_PAD Empty attribute to align next attribute to 64-bits
  *All otherMinimum length of attribute payload
  *
  * Example:
-- 
2.11.0



[PATCH RFC 4/4] netfilter: nf_tables: add netlink description

2018-02-06 Thread Pablo Neira Ayuso
This patch adds the netlink description for nf_tables.

Signed-off-by: Pablo Neira Ayuso 
---
 include/net/netfilter/nf_tables.h |   2 +
 include/uapi/linux/netfilter/nf_tables_desc.h |  57 
 net/netfilter/Makefile|   7 +-
 net/netfilter/nf_tables_api.c |   2 +
 net/netfilter/nf_tables_desc.c| 471 ++
 5 files changed, 536 insertions(+), 3 deletions(-)
 create mode 100644 include/uapi/linux/netfilter/nf_tables_desc.h
 create mode 100644 net/netfilter/nf_tables_desc.c

diff --git a/include/net/netfilter/nf_tables.h 
b/include/net/netfilter/nf_tables.h
index 663b015dace5..91b52b365f7e 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1345,4 +1345,6 @@ struct nft_trans_flowtable {
 #define nft_trans_flowtable(trans) \
(((struct nft_trans_flowtable *)trans->data)->flowtable)
 
+extern const struct nfnl_desc_subsys nft_nldesc;
+
 #endif /* _NET_NF_TABLES_H */
diff --git a/include/uapi/linux/netfilter/nf_tables_desc.h 
b/include/uapi/linux/netfilter/nf_tables_desc.h
new file mode 100644
index ..e596ad9f78c3
--- /dev/null
+++ b/include/uapi/linux/netfilter/nf_tables_desc.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _LINUX_NF_TABLES_DESC_H
+#define _LINUX_NF_TABLES_DESC_H
+
+enum nft_nldesc_obj {
+   NFT_UNSPEC,
+   NFT_TABLE,
+   NFT_CHAIN,
+   NFT_CHAIN_COUNTER,
+   NFT_CHAIN_HOOK,
+   NFT_CHAIN_DEV,
+   NFT_RULE,
+   NFT_RULE_COMPAT,
+   NFT_SET,
+   NFT_SET_DESC,
+   NFT_SET_ELEM,
+   NFT_OBJ,
+   NFT_OBJ_COUNTER,
+   NFT_OBJ_QUOTA,
+   NFT_OBJ_LIMIT,
+   NFT_FLOWTABLE,
+   NFT_DATA,
+   NFT_EXPR,
+   NFT_EXPR_COUNTER,
+   NFT_EXPR_IMMEDIATE,
+   NFT_EXPR_BITWISE,
+   NFT_EXPR_BYTEORDER,
+   NFT_EXPR_CMP,
+   NFT_EXPR_RANGE,
+   NFT_EXPR_LOOKUP,
+   NFT_EXPR_DYNSET,
+   NFT_EXPR_PAYLOAD,
+   NFT_EXPR_EXTHDR,
+   NFT_EXPR_META,
+   NFT_EXPR_HASH,
+   NFT_EXPR_RT,
+   NFT_EXPR_CT,
+   NFT_EXPR_FLOW,
+   NFT_EXPR_LIMIT,
+   NFT_EXPR_LOG,
+   NFT_EXPR_QUEUE,
+   NFT_EXPR_QUOTA,
+   NFT_EXPR_REJECT,
+   NFT_EXPR_NAT,
+   NFT_EXPR_MASQ,
+   NFT_EXPR_REDIR,
+   NFT_EXPR_DUP,
+   NFT_EXPR_FWD,
+   NFT_EXPR_OBJREF,
+   NFT_EXPR_FIB,
+   NFT_EXPR_CT_HELPER,
+   NFT_EXPR_NUMGEN,
+   __NFT_MAX,
+};
+#define NFT_MAX(__NFT_MAX - 1)
+
+#endif
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 5d9b8b959e58..38e048ea7e42 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -73,9 +73,10 @@ obj-$(CONFIG_NETFILTER_CONNCOUNT) += nf_conncount.o
 obj-$(CONFIG_NF_DUP_NETDEV)+= nf_dup_netdev.o
 
 # nf_tables
-nf_tables-objs := nf_tables_core.o nf_tables_api.o nf_tables_trace.o \
- nft_immediate.o nft_cmp.o nft_range.o nft_bitwise.o \
- nft_byteorder.o nft_payload.o nft_lookup.o nft_dynset.o
+nf_tables-objs := nf_tables_core.o nf_tables_api.o nf_tables_desc.o \
+ nf_tables_trace.o nft_immediate.o nft_cmp.o nft_range.o \
+ nft_bitwise.o nft_byteorder.o nft_payload.o nft_lookup.o \
+ nft_dynset.o
 
 obj-$(CONFIG_NF_TABLES)+= nf_tables.o
 obj-$(CONFIG_NF_TABLES_INET)   += nf_tables_inet.o
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 0791813a1e7d..cb500aeaa729 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -6601,6 +6601,7 @@ static int __init nf_tables_module_init(void)
if (err < 0)
goto err3;
 
+   nfnl_desc_register_subsys(_nldesc);
register_netdevice_notifier(_tables_flowtable_notifier);
 
return register_pernet_subsys(_tables_net_ops);
@@ -6617,6 +6618,7 @@ static void __exit nf_tables_module_exit(void)
unregister_pernet_subsys(_tables_net_ops);
nfnetlink_subsys_unregister(_tables_subsys);
unregister_netdevice_notifier(_tables_flowtable_notifier);
+   nfnl_desc_unregister_subsys(_nldesc);
rcu_barrier();
nf_tables_core_module_exit();
kfree(info);
diff --git a/net/netfilter/nf_tables_desc.c b/net/netfilter/nf_tables_desc.c
new file mode 100644
index ..2acaff69edb0
--- /dev/null
+++ b/net/netfilter/nf_tables_desc.c
@@ -0,0 +1,471 @@
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+static const struct nl_desc_attr nft_nldesc_table_attrs[NFTA_TABLE_MAX + 1] = {
+   NLDESC_ATTR_STRING(NFTA_TABLE_NAME, NFT_NAME_MAXLEN - 1),
+   NLDESC_ATTR_U32_MAX(NFTA_TABLE_FLAGS, NFT_TABLE_F_DORMANT),
+   NLDESC_ATTR_U32(NFTA_TABLE_USE),
+   NLDESC_ATTR_U64(NFTA_TABLE_HANDLE),
+   NLDESC_ATTR_PAD(NFTA_TABLE_PAD),
+};
+
+static const struct 

[PATCH RFC 2/4] netlink: add generic object description infrastructure

2018-02-06 Thread Pablo Neira Ayuso
This patch allows netlink busses to provide object descriptions to
userspace, in terms of supported attributes and its corresponding
datatypes.

Userspace sends a request that looks like:

netlink header
NLA_DESC_REQ_BUS
NLA_DESC_REQ_DATA

Where NLA_DESC_REQ_BUS is the netlink bus/protocol number, eg.
NETLINK_NETFILTER, and NLA_DESC_REQ_DATA is an attribute whose layout is
specific to the bus that you are inspecting. This is useful for both
nfnetlink and genetlink since they need to know what subsystem in the bus
you're specifically targeting.

Then, the netlink description subsystem response via netlink dump looks
like this:

netlink header
NLA_DESC_NUM_OBJS
NLA_DESC_OBJS (nest)
NLA_DESC_LIST_ITEM (nest)
NLA_DESC_OBJ_ID
NLA_DESC_OBJ_ATTRS_MAX
NLA_DESC_OBJ_ATTRS (nest)
NLA_DESC_LIST_ITEM (nest)
NLA_DESC_ATTR_NUM
NLA_DESC_ATTR_TYPE
NLA_DESC_ATTR_LEN
NLA_DESC_ATTR_MAXVAL
NLA_DESC_ATTR_NEST_ID
NLA_DESC_LIST_ITEM (nest)
...

Each object definition is composed of a unique ID, the number of
attributes and the list of attribute definitions.

The NETLINK_DESC bus provides a generic interface to retrieve the list
of existing objects and their attributes via netlink dump. This new
description family autoloads module dependencies based on what userspace
requests.

Each bus needs to register a struct nl_desc_subsys definition, that
provides the lookup and parse callbacks. These route the description
requests to the corresponding backend subsystem for genetlink and
nfnetlink. The lookup callback returns struct nl_desc_objs that provides
the array of object descriptions.

Signed-off-by: Pablo Neira Ayuso 
---
 include/net/net_namespace.h  |   1 +
 include/net/nldesc.h | 160 ++
 include/uapi/linux/netlink.h |  67 ++
 net/netlink/Makefile |   2 +-
 net/netlink/desc.c   | 499 +++
 5 files changed, 728 insertions(+), 1 deletion(-)
 create mode 100644 include/net/nldesc.h
 create mode 100644 net/netlink/desc.c

diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index f8a84a2c2341..0921b1d7acfe 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -78,6 +78,7 @@ struct net {
 
struct sock *rtnl;  /* rtnetlink socket */
struct sock *genl_sock;
+   struct sock *nl_desc_sock;
 
struct list_headdev_base_head;
struct hlist_head   *dev_name_head;
diff --git a/include/net/nldesc.h b/include/net/nldesc.h
new file mode 100644
index ..19306a648f10
--- /dev/null
+++ b/include/net/nldesc.h
@@ -0,0 +1,160 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __NET_NLDESC_H
+#define __NET_NLDESC_H
+
+#include 
+
+struct nl_desc_cmd;
+struct nl_desc_obj;
+
+struct nl_desc_cmds {
+   int max;
+   const struct nl_desc_cmd*table;
+};
+
+struct nl_desc_objs {
+   int max;
+   const struct nl_desc_obj**table;
+};
+
+struct nl_desc_req {
+   u32 bus;
+};
+
+struct net;
+struct sk_buff;
+struct nlmsghdr;
+struct nlattr;
+
+struct nl_desc_subsys {
+   struct list_headlist;
+   u32 bus;
+   const struct nl_desc_cmds * (*getcmds)(struct sk_buff *skb,
+  struct nlmsghdr *nlh,
+  struct nl_desc_req *req);
+   const struct nl_desc_objs * (*getobjs)(struct sk_buff *skb,
+  struct nlmsghdr *nlh,
+  struct nl_desc_req *req);
+   int (*parse)(struct net *net,
+struct sk_buff *skb,
+struct nlmsghdr *nlh,
+const struct nlattr *data,
+struct nl_desc_req *req);
+};
+
+/**
+ * struct nl_desc_attr - netlink attribute description
+ * @nest: netlink description for nested attribute
+ * @attr: attribute number
+ * @type: attribute datatype (see NLA_* enumeration)
+ * @len: attribute payload length
+ * @max: attribute maximum value (upper limit if any, zero means unset)
+ */
+struct nl_desc_attr {
+   const struct nl_desc_obj*nest;
+   u16 attr;
+   u16 

[PATCH RFC 0/4] Netlink bus descriptions

2018-02-06 Thread Pablo Neira Ayuso
Hi,

Modern messaging systems usually provide facilities that allow you to
inquire about supported commands and message layouts. Netlink has no
such facility so far, hence people end up probing for features, which is
a bit sloppy. Sometimes there are also magic version numbers in place
that give a hint to userspace on what this kernel supports, so
userspace makes assumptions based on this version number.

This patchset aims to improve this situation by adding a new
NETLINK_DESC bus with two commands, one to fetch the list of existing
commands and another one to describe supported objects. Command
descriptions also indicate what netlink objects the interface expects to
be used in the netlink message payload, thus, we can relate commands and
objects. Objects are represented as compounds of attributes, some of
these attributes are nesting other attributes. The netlink description
exports the attribute tree and, for simplicity, a linear list of netlink
objects identified via unique ID.

Patch 1 adds NLA_PAD, this datatype is needed by netlink description.
Patch 2 adds the new generic netlink description bus NETLINK_DESC.
Patch 3 adds support for netlink descriptions to nfnetlink.
Patch 4 adds a netlink description for nf_tables, so you have an initial
client for this infrastructure to look at as reference.

Not covered by this patchset, but I think it should be possible to
(fully?) generate the C file containing the description and the netlink
headers based on some generic notation that describes the netlink
interface, something like this:

netlink description  ---> nldesc_compiler ---> .c and .h files
 notation

This would probably make the work of describing the netlink interface
less error prone, given people may add a new attribute to the header
file and forget about updating the description. I also think this
description could be useful to fuzz-test netlink interfaces, by
interpreting the description that the kernel provides.

I've been looking into this since NetDev 2.1 in Canada - already one
year ago - I think we need this in nftables so we can evolve more
freely.

Comments welcome.

Pablo Neira Ayuso (4):
  netlink: add NLA_PAD definition
  netlink: add generic object description infrastructure
  netfilter: nfnetlink: add support for netlink descriptions
  netfilter: nf_tables: add netlink description

 include/linux/netfilter/nfnetlink.h   |   9 +
 include/net/net_namespace.h   |   1 +
 include/net/netfilter/nf_tables.h |   2 +
 include/net/netlink.h |   2 +
 include/net/nldesc.h  | 163 +
 include/uapi/linux/netfilter/nf_tables_desc.h |  57 +++
 include/uapi/linux/netfilter/nfnetlink.h  |   7 +
 include/uapi/linux/netlink.h  |  67 
 net/netfilter/Makefile|   7 +-
 net/netfilter/nf_tables_api.c |   2 +
 net/netfilter/nf_tables_desc.c| 471 
 net/netfilter/nfnetlink.c | 108 ++
 net/netlink/Makefile  |   2 +-
 net/netlink/desc.c| 499 ++
 14 files changed, 1393 insertions(+), 4 deletions(-)
 create mode 100644 include/net/nldesc.h
 create mode 100644 include/uapi/linux/netfilter/nf_tables_desc.h
 create mode 100644 net/netfilter/nf_tables_desc.c
 create mode 100644 net/netlink/desc.c

-- 
2.11.0



[vhost:vhost 20/20] ERROR: "page_poisoning_enabled" [drivers/virtio/virtio_balloon.ko] undefined!

2018-02-06 Thread kbuild test robot
tree:   https://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git vhost
head:   96bcd04462b99e2c80e09f6537770a0ca6b288d0
commit: 96bcd04462b99e2c80e09f6537770a0ca6b288d0 [20/20] virtio-balloon: 
VIRTIO_BALLOON_F_FREE_PAGE_HINT
config: ia64-allmodconfig (attached as .config)
compiler: ia64-linux-gcc (GCC) 7.2.0
reproduce:
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
git checkout 96bcd04462b99e2c80e09f6537770a0ca6b288d0
# save the attached .config to linux build tree
make.cross ARCH=ia64 

All errors (new ones prefixed by >>):

   WARNING: modpost: missing MODULE_LICENSE() in 
drivers/auxdisplay/img-ascii-lcd.o
   see include/linux/module.h for more information
   WARNING: modpost: missing MODULE_LICENSE() in drivers/gpio/gpio-ath79.o
   see include/linux/module.h for more information
   WARNING: modpost: missing MODULE_LICENSE() in drivers/gpio/gpio-iop.o
   see include/linux/module.h for more information
   WARNING: modpost: missing MODULE_LICENSE() in drivers/iio/accel/kxsd9-i2c.o
   see include/linux/module.h for more information
   WARNING: modpost: missing MODULE_LICENSE() in 
drivers/iio/adc/qcom-vadc-common.o
   see include/linux/module.h for more information
   WARNING: modpost: missing MODULE_LICENSE() in 
drivers/media/platform/mtk-vcodec/mtk-vcodec-common.o
   see include/linux/module.h for more information
   WARNING: modpost: missing MODULE_LICENSE() in 
drivers/media/platform/tegra-cec/tegra_cec.o
   see include/linux/module.h for more information
   WARNING: modpost: missing MODULE_LICENSE() in drivers/mtd/nand/denali_pci.o
   see include/linux/module.h for more information
   WARNING: modpost: missing MODULE_LICENSE() in 
drivers/pinctrl/pxa/pinctrl-pxa2xx.o
   see include/linux/module.h for more information
   WARNING: modpost: missing MODULE_LICENSE() in drivers/power/reset/zx-reboot.o
   see include/linux/module.h for more information
>> ERROR: "page_poisoning_enabled" [drivers/virtio/virtio_balloon.ko] undefined!
   ERROR: "ia64_delay_loop" [drivers/spi/spi-thunderx.ko] undefined!
   ERROR: "ia64_delay_loop" [drivers/net/phy/mdio-cavium.ko] undefined!

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: application/gzip


[PATCH] net: ethernet: ti: cpsw: fix net watchdog timeout

2018-02-06 Thread Grygorii Strashko
It was discovered that simple program which indefinitely sends 200b UDP
packets and runs on TI AM574x SoC (SMP) under RT Kernel triggers network
watchdog timeout in TI CPSW driver (<6 hours run). The network watchdog
timeout is triggered due to race between cpsw_ndo_start_xmit() and
cpsw_tx_handler() [NAPI]

cpsw_ndo_start_xmit()
if (unlikely(!cpdma_check_free_tx_desc(txch))) {
txq = netdev_get_tx_queue(ndev, q_idx);
netif_tx_stop_queue(txq);

^^ as per [1] a barrier has to be used after set_bit(), otherwise the new value
might not be visible to other cpus
}

cpsw_tx_handler()
if (unlikely(netif_tx_queue_stopped(txq)))
netif_tx_wake_queue(txq);

and when it happens the ndev TX queue becomes disabled forever while the
driver's HW TX queue is empty.

Fix this, by adding smp_mb__after_atomic() after netif_tx_stop_queue()
calls and double check for free TX descriptors after stopping ndev TX queue
- if there are free TX descriptors wake up ndev TX queue.

[1] https://www.kernel.org/doc/html/latest/core-api/atomic_ops.html
Signed-off-by: Grygorii Strashko 
---
 drivers/net/ethernet/ti/cpsw.c | 16 ++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index 10d7cbe..3805b13 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -1638,6 +1638,7 @@ static netdev_tx_t cpsw_ndo_start_xmit(struct sk_buff 
*skb,
q_idx = q_idx % cpsw->tx_ch_num;
 
txch = cpsw->txv[q_idx].ch;
+   txq = netdev_get_tx_queue(ndev, q_idx);
ret = cpsw_tx_packet_submit(priv, skb, txch);
if (unlikely(ret != 0)) {
cpsw_err(priv, tx_err, "desc submit failed\n");
@@ -1648,15 +1649,26 @@ static netdev_tx_t cpsw_ndo_start_xmit(struct sk_buff 
*skb,
 * tell the kernel to stop sending us tx frames.
 */
if (unlikely(!cpdma_check_free_tx_desc(txch))) {
-   txq = netdev_get_tx_queue(ndev, q_idx);
netif_tx_stop_queue(txq);
+
+   /* Barrier, so that stop_queue visible to other cpus */
+   smp_mb__after_atomic();
+
+   if (cpdma_check_free_tx_desc(txch))
+   netif_tx_wake_queue(txq);
}
 
return NETDEV_TX_OK;
 fail:
ndev->stats.tx_dropped++;
-   txq = netdev_get_tx_queue(ndev, skb_get_queue_mapping(skb));
netif_tx_stop_queue(txq);
+
+   /* Barrier, so that stop_queue visible to other cpus */
+   smp_mb__after_atomic();
+
+   if (cpdma_check_free_tx_desc(txch))
+   netif_tx_wake_queue(txq);
+
return NETDEV_TX_BUSY;
 }
 
-- 
2.10.5



qdisc_pkt_len_init: SCTP/GSO_BY_FRAGS and robustness questions

2018-02-06 Thread Daniel Axtens
Hi Marcelo and Eric,

I'm working on checking code that might be impacted by GSO_BY_FRAGS -
after finding that the token bucket filter qdisc code doesn't handle it
properly, DaveM said I should look for other places where this might be
an issue [0].

I'm currently looking at qdisc_pkt_len_init in net/core/dev.c. This is
called by __dev_queue_xmit, before validate_xmit_skb, so before an SCTP
skb would be segmented if the hardware doesn't support SCTP offload.

There are two things I was hoping you two could offer some advice on:

1) Eric, in 7c68d1a6b4db ("net: qdisc_pkt_len_init() should be more
   robust") you replaced a chunk of code that is similar to the code
   found in skb_gso_transport_seglen() and replaced it with more robust
   code. Do we need to change skb_gso_transport_seglen() in a similar way?

2) Marcelo, unlike skb_gso_transport_seglen(), where you added a case
   for SCTP in 90017accff61 ("sctp: Add GSO support"), there doesn't
   seem to be a GSO_BY_FRAGS or SCTP check in qdisc_pkt_len_init, so I
   think the accounting is probably wrong for SCTP. I'm not 100% sure
   how to fix this as it's now quite different from the calculations in
   skb_gso_transport_seglen() - so I was hoping that you might have an
   idea.

Thanks in advance!

[0]: https://patchwork.ozlabs.org/patch/869145/#1852414

Regards,
Daniel




Re: [PATCH v1] r8169: switch to device-managed functions in probe (part 2)

2018-02-06 Thread kbuild test robot
Hi Andy,

I love your patch! Yet something to improve:

[auto build test ERROR on net-next/master]
[cannot apply to v4.15]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improve the system]

url:
https://github.com/0day-ci/linux/commits/Andy-Shevchenko/r8169-switch-to-device-managed-functions-in-probe-part-2/20180207-053113
config: i386-randconfig-a0-02070503 (attached as .config)
compiler: gcc-4.9 (Debian 4.9.4-2) 4.9.4
reproduce:
# save the attached .config to linux build tree
make ARCH=i386 

All errors (new ones prefixed by >>):

   drivers/net/ethernet/realtek/r8169.c: In function 'rtl_init_one':
>> drivers/net/ethernet/realtek/r8169.c:102:43: error: 'ioaddr' undeclared 
>> (first use in this function)
#define RTL_W8(reg, val8) writeb ((val8), ioaddr + (reg))
  ^
   drivers/net/ethernet/realtek/r8169.c:8513:2: note: in expansion of macro 
'RTL_W8'
 RTL_W8(Cfg9346, Cfg9346_Unlock);
 ^
   drivers/net/ethernet/realtek/r8169.c:102:43: note: each undeclared 
identifier is reported only once for each function it appears in
#define RTL_W8(reg, val8) writeb ((val8), ioaddr + (reg))
  ^
   drivers/net/ethernet/realtek/r8169.c:8513:2: note: in expansion of macro 
'RTL_W8'
 RTL_W8(Cfg9346, Cfg9346_Unlock);
 ^

vim +/ioaddr +102 drivers/net/ethernet/realtek/r8169.c

^1da177e drivers/net/r8169.c Linus Torvalds 2005-04-16  100  
^1da177e drivers/net/r8169.c Linus Torvalds 2005-04-16  101  /* write/read MMIO 
register */
^1da177e drivers/net/r8169.c Linus Torvalds 2005-04-16 @102  #define 
RTL_W8(reg, val8)  writeb ((val8), ioaddr + (reg))
^1da177e drivers/net/r8169.c Linus Torvalds 2005-04-16  103  #define 
RTL_W16(reg, val16)writew ((val16), ioaddr + (reg))
^1da177e drivers/net/r8169.c Linus Torvalds 2005-04-16  104  #define 
RTL_W32(reg, val32)writel ((val32), ioaddr + (reg))
^1da177e drivers/net/r8169.c Linus Torvalds 2005-04-16  105  #define 
RTL_R8(reg)readb (ioaddr + (reg))
^1da177e drivers/net/r8169.c Linus Torvalds 2005-04-16  106  #define 
RTL_R16(reg)   readw (ioaddr + (reg))
06f555f3 drivers/net/r8169.c Junchang Wang  2010-05-30  107  #define 
RTL_R32(reg)   readl (ioaddr + (reg))
^1da177e drivers/net/r8169.c Linus Torvalds 2005-04-16  108  

:: The code at line 102 was first introduced by commit
:: 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 Linux-2.6.12-rc2

:: TO: Linus Torvalds 
:: CC: Linus Torvalds 

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: application/gzip


[PATCH net] ibmvnic: Ensure that buffers are NULL after free

2018-02-06 Thread Thomas Falcon
This change will guard against a double free in the case that the
buffers were previously freed at some other time, such as during
a device reset. It resolves a kernel oops that occurred when changing
the VNIC device's MTU.

Signed-off-by: Thomas Falcon 
---
 drivers/net/ethernet/ibm/ibmvnic.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c 
b/drivers/net/ethernet/ibm/ibmvnic.c
index 8c3058d..e3e56a8 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -354,6 +354,8 @@ static void release_stats_buffers(struct ibmvnic_adapter 
*adapter)
 {
kfree(adapter->tx_stats_buffers);
kfree(adapter->rx_stats_buffers);
+   adapter->tx_stats_buffers = NULL;
+   adapter->rx_stats_buffers = NULL;
 }
 
 static int init_stats_buffers(struct ibmvnic_adapter *adapter)
@@ -599,6 +601,8 @@ static void release_vpd_data(struct ibmvnic_adapter 
*adapter)
 
kfree(adapter->vpd->buff);
kfree(adapter->vpd);
+
+   adapter->vpd = NULL;
 }
 
 static void release_tx_pools(struct ibmvnic_adapter *adapter)
@@ -909,6 +913,7 @@ static int ibmvnic_get_vpd(struct ibmvnic_adapter *adapter)
if (dma_mapping_error(dev, adapter->vpd->dma_addr)) {
dev_err(dev, "Could not map VPD buffer\n");
kfree(adapter->vpd->buff);
+   adapter->vpd->buff = NULL;
return -ENOMEM;
}
 
-- 
1.8.3.1



Re: [PATCH net 1/1 v2] rtnetlink: require unique netns identifier

2018-02-06 Thread Eric W. Biederman
Christian Brauner  writes:

> On Tue, Feb 06, 2018 at 12:47:46AM +0300, Kirill Tkhai wrote:
>> On 05.02.2018 18:55, Christian Brauner wrote:
>> > Since we've added support for IFLA_IF_NETNSID for RTM_{DEL,GET,SET,NEW}LINK
>> > it is possible for userspace to send us requests with three different
>> > properties to identify a target network namespace. This affects at least
>> > RTM_{NEW,SET}LINK. Each of them could potentially refer to a different
>> > network namespace which is confusing. For legacy reasons the kernel will
>> > pick the IFLA_NET_NS_PID property first and then look for the
>> > IFLA_NET_NS_FD property but there is no reason to extend this type of
>> > behavior to network namespace ids. The regression potential is quite
>> > minimal since the rtnetlink requests in question either won't allow
>> > IFLA_IF_NETNSID requests before 4.16 is out (RTM_{NEW,SET}LINK) or don't
>> > support IFLA_NET_NS_{PID,FD} (RTM_{DEL,GET}LINK) in the first place.
>> >> Signed-off-by: Christian Brauner 
>> > ---
>> > ChangeLog v1->v2:
>> > * return errno when the specified network namespace id is invalid
>> > * fill in struct netlink_ext_ack if the network namespace id is invalid
>> > * rename rtnl_ensure_unique_netns_attr() to rtnl_ensure_unique_netns() to
>> >   indicate that a request without any network namespace identifying 
>> > attributes
>> >   is also considered valid.
>> > 
>> > ChangeLog v0->v1:
>> > * report a descriptive error to userspace via struct netlink_ext_ack
>> > * do not fail when multiple properties specify the same network namespace
>> > ---
>> >  net/core/rtnetlink.c | 69 
>> > 
>> >  1 file changed, 69 insertions(+)
>> > 
>> > diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
>> > index 56af8e41abfc..c096c4ff9a00 100644
>> > --- a/net/core/rtnetlink.c
>> > +++ b/net/core/rtnetlink.c
>> > @@ -1951,6 +1951,59 @@ static struct net *rtnl_link_get_net_capable(const 
>> > struct sk_buff *skb,
>> >return net;
>> >  }
>> >  
>> > +/* Verify that rtnetlink requests supporting network namespace ids
>> > + * do not pass additional properties referring to different network
>> > + * namespaces.
>> > + */
>> > +static int rtnl_ensure_unique_netns(const struct sock *sk, struct nlattr 
>> > *tb[],
>> > +  struct netlink_ext_ack *extack)
>> > +{
>> > +  int ret = -EINVAL;
>> > +  struct net *net = NULL, *unique_net = NULL;
>> > +
>> > +  /* Requests without network namespace ids have been able to specify
>> > +   * multiple properties referring to different network namespaces so
>> > +   * don't regress them.
>> > +   */
>> > +  if (!tb[IFLA_IF_NETNSID])
>> > +  return 0;
>> > +
>> > +  /* Caller operates on the current network namespace. */
>> > +  if (!tb[IFLA_NET_NS_PID] && !tb[IFLA_NET_NS_FD])
>> > +  return 0;
>> > +
>> > +  unique_net = get_net_ns_by_id(sock_net(sk), 
>> > nla_get_s32(tb[IFLA_IF_NETNSID]));
>> > +  if (!unique_net) {
>> > +  NL_SET_ERR_MSG(extack, "invalid network namespace id");
>> > +  return ret;
>> > +  }
>> > +
>> > +  if (tb[IFLA_NET_NS_PID]) {
>> > +  net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID]));
>> > +  if (net != unique_net)
>> > +  goto on_error;
>> > +  }
>> > +
>> > +  if (tb[IFLA_NET_NS_FD]) {
>> > +  net = get_net_ns_by_fd(nla_get_u32(tb[IFLA_NET_NS_FD]));
>> > +  if (net != unique_net)
>> > +  goto on_error;
>> > +  }
>> > +
>> > +  ret = 0;
>> > +
>> > +on_error:
>> > +  put_net(unique_net);
>> > +
>> > +  if (net && !IS_ERR(net))
>> > +  put_net(net);
>> 
>> 1)When we have tb[IFLA_NET_NS_PID] and tb[IFLA_NET_NS_FD] both set and pointing
>> to the same net, this function increments net::count in get_net_ns_by_pid() and
>> in get_net_ns_by_fd(), i.e. twice. But only a single put_net(net) will be called.
>> So, after this function net::count will be incremented by 1, and it will never
>> die.
>
> Thanks for spotting this, Kirill.
>
>> 
>> 2)The whole approach does not seem good to me. The first reason is it's racy.
>> Even if rtnl_ensure_unique_netns() returns 0, this does not guarantee that
>> tb[IFLA_IF_NETNSID] and tb[IFLA_NET_NS_PID] will point to the same net later,
>> as the pid may die or do setns(). A racy check is worse than no check at all.
>> 
>> The second reason is after this patch get_net_ns_by_id/get_net_ns_by_pid()/
>> get_net_ns_by_fd() will be called twice: the first time is in your check
>> and the second time is where they are actually used. This is not good for
>> performance.
>
> If this is really a performance problem we can simply fix this by
> performing the check when the target network namespace is retrieved in
> each request. The intention for doing it in one function at the
> beginning of each request was to make it generic and easily
> understandable.
>
>> 

[PATCH net] ibmvnic: Fix rx queue cleanup for non-fatal resets

2018-02-06 Thread John Allen
At some point, a check was added to exit the polling routine during resets.
This makes sense for most reset conditions, but for a non-fatal error, we
expect the polling routine to continue running to properly clean up the rx
queues. This patch checks if we are performing a non-fatal reset and if we
are, continues normal polling operation.

Signed-off-by: John Allen 
---
diff --git a/drivers/net/ethernet/ibm/ibmvnic.c 
b/drivers/net/ethernet/ibm/ibmvnic.c
index 8c3058d5d191..2a26b2ece7fe 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -1831,7 +1831,8 @@ static int ibmvnic_poll(struct napi_struct *napi, int 
budget)
u16 offset;
u8 flags = 0;

-   if (unlikely(adapter->resetting)) {
+   if (unlikely(adapter->resetting &&
+adapter->reset_reason != VNIC_RESET_NON_FATAL)) {
enable_scrq_irq(adapter, adapter->rx_scrq[scrq_num]);
napi_complete_done(napi, frames_processed);
return frames_processed;



Re: [PATCH v1] r8169: switch to device-managed functions in probe (part 2)

2018-02-06 Thread kbuild test robot
Hi Andy,

I love your patch! Yet something to improve:

[auto build test ERROR on net-next/master]
[also build test ERROR on next-20180206]
[cannot apply to v4.15]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improve the system]

url:
https://github.com/0day-ci/linux/commits/Andy-Shevchenko/r8169-switch-to-device-managed-functions-in-probe-part-2/20180207-053113
config: x86_64-randconfig-x001-201805 (attached as .config)
compiler: gcc-7 (Debian 7.2.0-12) 7.2.1 20171025
reproduce:
# save the attached .config to linux build tree
make ARCH=x86_64 

All error/warnings (new ones prefixed by >>):

   drivers/net//ethernet/realtek/r8169.c: In function 'rtl_init_one':
>> drivers/net//ethernet/realtek/r8169.c:102:43: error: 'ioaddr' undeclared 
>> (first use in this function); did you mean 'in_addr'?
#define RTL_W8(reg, val8) writeb ((val8), ioaddr + (reg))
  ^
>> drivers/net//ethernet/realtek/r8169.c:8513:2: note: in expansion of macro 
>> 'RTL_W8'
 RTL_W8(Cfg9346, Cfg9346_Unlock);
 ^~
   drivers/net//ethernet/realtek/r8169.c:102:43: note: each undeclared 
identifier is reported only once for each function it appears in
#define RTL_W8(reg, val8) writeb ((val8), ioaddr + (reg))
  ^
>> drivers/net//ethernet/realtek/r8169.c:8513:2: note: in expansion of macro 
>> 'RTL_W8'
 RTL_W8(Cfg9346, Cfg9346_Unlock);
 ^~

vim +102 drivers/net//ethernet/realtek/r8169.c

^1da177e drivers/net/r8169.c Linus Torvalds 2005-04-16  100  
^1da177e drivers/net/r8169.c Linus Torvalds 2005-04-16  101  /* write/read MMIO 
register */
^1da177e drivers/net/r8169.c Linus Torvalds 2005-04-16 @102  #define 
RTL_W8(reg, val8)  writeb ((val8), ioaddr + (reg))
^1da177e drivers/net/r8169.c Linus Torvalds 2005-04-16  103  #define 
RTL_W16(reg, val16) writew ((val16), ioaddr + (reg))
^1da177e drivers/net/r8169.c Linus Torvalds 2005-04-16  104  #define 
RTL_W32(reg, val32) writel ((val32), ioaddr + (reg))
^1da177e drivers/net/r8169.c Linus Torvalds 2005-04-16  105  #define 
RTL_R8(reg) readb (ioaddr + (reg))
^1da177e drivers/net/r8169.c Linus Torvalds 2005-04-16  106  #define 
RTL_R16(reg)   readw (ioaddr + (reg))
06f555f3 drivers/net/r8169.c Junchang Wang  2010-05-30  107  #define 
RTL_R32(reg)   readl (ioaddr + (reg))
^1da177e drivers/net/r8169.c Linus Torvalds 2005-04-16  108  

:: The code at line 102 was first introduced by commit
:: 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 Linux-2.6.12-rc2

:: TO: Linus Torvalds <torva...@ppc970.osdl.org>
:: CC: Linus Torvalds <torva...@ppc970.osdl.org>

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation


.config.gz
Description: application/gzip


Re: [PATCH 1/1] tcp: Honor the eor bit in tcp_mtu_probe

2018-02-06 Thread kbuild test robot
Hi Ilya,

I love your patch! Perhaps something to improve:

[auto build test WARNING on net/master]
[also build test WARNING on v4.15 next-20180206]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improve the system]

url:
https://github.com/0day-ci/linux/commits/Ilya-Lesokhin/tcp-Honor-the-eor-bit-in-tcp_mtu_probe/20180207-045040
config: i386-randconfig-x007-201805 (attached as .config)
compiler: gcc-7 (Debian 7.2.0-12) 7.2.1 20171025
reproduce:
# save the attached .config to linux build tree
make ARCH=i386 

Note: it may well be a FALSE warning. FWIW you are at least aware of it now.
http://gcc.gnu.org/wiki/Better_Uninitialized_Warnings

All warnings (new ones prefixed by >>):

   In file included from include/linux/tcp.h:21:0,
from include/net/tcp.h:24,
from net//ipv4/tcp_output.c:39:
   net//ipv4/tcp_output.c: In function 'tcp_write_xmit':
>> include/linux/skbuff.h:3196:12: warning: 'skb' may be used uninitialized in 
>> this function [-Wmaybe-uninitialized]
  for (tmp = skb->next;  \
   ^
   net//ipv4/tcp_output.c:2043:18: note: 'skb' was declared here
 struct sk_buff *skb, *nskb, *next;
 ^~~
--
   In file included from include/linux/tcp.h:21:0,
from include/net/tcp.h:24,
from net/ipv4/tcp_output.c:39:
   net/ipv4/tcp_output.c: In function 'tcp_write_xmit':
>> include/linux/skbuff.h:3196:12: warning: 'skb' may be used uninitialized in 
>> this function [-Wmaybe-uninitialized]
  for (tmp = skb->next;  \
   ^
   net/ipv4/tcp_output.c:2043:18: note: 'skb' was declared here
 struct sk_buff *skb, *nskb, *next;
 ^~~

vim +/skb +3196 include/linux/skbuff.h

18a4c0eab Eric Dumazet2017-10-05  3168  
^1da177e4 Linus Torvalds  2005-04-16  3169  #define skb_queue_walk(queue, skb) \
^1da177e4 Linus Torvalds  2005-04-16  3170  for (skb = 
(queue)->next;   \
a1e4891fd Linus Torvalds  2011-05-22  3171   skb != (struct 
sk_buff *)(queue);  \
^1da177e4 Linus Torvalds  2005-04-16  3172   skb = skb->next)
^1da177e4 Linus Torvalds  2005-04-16  3173  
46f8914e5 James Chapman   2007-04-30  3174  #define skb_queue_walk_safe(queue, 
skb, tmp)\
46f8914e5 James Chapman   2007-04-30  3175  for (skb = 
(queue)->next, tmp = skb->next;  \
46f8914e5 James Chapman   2007-04-30  3176   skb != (struct 
sk_buff *)(queue);  \
46f8914e5 James Chapman   2007-04-30  3177   skb = tmp, tmp = 
skb->next)
46f8914e5 James Chapman   2007-04-30  3178  
1164f52a2 David S. Miller 2008-09-23  3179  #define skb_queue_walk_from(queue, 
skb) \
a1e4891fd Linus Torvalds  2011-05-22  3180  for (; skb != (struct 
sk_buff *)(queue);\
1164f52a2 David S. Miller 2008-09-23  3181   skb = skb->next)
1164f52a2 David S. Miller 2008-09-23  3182  
18a4c0eab Eric Dumazet2017-10-05  3183  #define skb_rbtree_walk(skb, root)  
\
18a4c0eab Eric Dumazet2017-10-05  3184  for (skb = 
skb_rb_first(root); skb != NULL; \
18a4c0eab Eric Dumazet2017-10-05  3185   skb = 
skb_rb_next(skb))
18a4c0eab Eric Dumazet2017-10-05  3186  
18a4c0eab Eric Dumazet2017-10-05  3187  #define skb_rbtree_walk_from(skb)   
\
18a4c0eab Eric Dumazet2017-10-05  3188  for (; skb != NULL; 
\
18a4c0eab Eric Dumazet2017-10-05  3189   skb = 
skb_rb_next(skb))
18a4c0eab Eric Dumazet2017-10-05  3190  
18a4c0eab Eric Dumazet2017-10-05  3191  #define 
skb_rbtree_walk_from_safe(skb, tmp) \
18a4c0eab Eric Dumazet2017-10-05  3192  for (; tmp = skb ? 
skb_rb_next(skb) : NULL, (skb != NULL);  \
18a4c0eab Eric Dumazet2017-10-05  3193   skb = tmp)
18a4c0eab Eric Dumazet2017-10-05  3194  
1164f52a2 David S. Miller 2008-09-23  3195  #define 
skb_queue_walk_from_safe(queue, skb, tmp)   \
1164f52a2 David S. Miller 2008-09-23 @3196  for (tmp = skb->next;   
\
1164f52a2 David S. Miller 2008-09-23  3197   skb != (struct 
sk_buff *)(queue);  \
1164f52a2 David S. Miller 2008-09-23  3198   skb = tmp, tmp = 
skb->next)
1164f52a2 David S. Miller 2008-09-23  3199  

:: The code at line 3196 was first introduced by commit
:: 1164f52a244204830c7625b3c22812781996d7b4 net: Add skb_queue_wa

[PATCH net] net/ipv6: onlink nexthop checks should default to main table

2018-02-06 Thread David Ahern
Because of differences in how ipv4 and ipv6 handle fib lookups,
verification of nexthops with onlink flag needs to default to the main
table rather than the local table used by IPv4. As it stands an
address within a connected route on device 1 can be used with
onlink on device 2. Updating the table properly rejects the route
due to the egress device mismatch.

Update the extack message as well to show it could be a device
mismatch for the nexthop spec.

Fixes: fc1e64e1092f ("net/ipv6: Add support for onlink flag")
Signed-off-by: David Ahern 
---
 net/ipv6/route.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 69c43d289c69..9dcfa800 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2479,7 +2479,7 @@ static int ip6_route_check_nh_onlink(struct net *net,
 struct net_device *dev,
 struct netlink_ext_ack *extack)
 {
-   u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL;
+   u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
const struct in6_addr *gw_addr = &cfg->fc_gateway;
u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
struct rt6_info *grt;
@@ -2490,7 +2490,8 @@ static int ip6_route_check_nh_onlink(struct net *net,
if (grt) {
if (!grt->dst.error &&
(grt->rt6i_flags & flags || dev != grt->dst.dev)) {
-   NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway");
+   NL_SET_ERR_MSG(extack,
+  "Nexthop has invalid gateway or device 
mismatch");
err = -EINVAL;
}
 
-- 
2.11.0



[net] i40e: Fix the number of queues available to be mapped for use

2018-02-06 Thread Jeff Kirsher
From: Amritha Nambiar 

Fix the number of queues per enabled TC and report available queues
to the kernel without having to limit them to the max RSS limit so
they are available to be mapped for XPS. This allows a queue per
processing thread available for handling traffic for the given
traffic class.

Signed-off-by: Amritha Nambiar 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 27 ++-
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index f95ce9b5e4fb..e31adbc75f9c 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -1785,7 +1785,7 @@ static void i40e_vsi_setup_queue_map(struct i40e_vsi *vsi,
struct i40e_pf *pf = vsi->back;
u16 sections = 0;
u8 netdev_tc = 0;
-   u16 numtc = 0;
+   u16 numtc = 1;
u16 qcount;
u8 offset;
u16 qmap;
@@ -1795,9 +1795,11 @@ static void i40e_vsi_setup_queue_map(struct i40e_vsi 
*vsi,
sections = I40E_AQ_VSI_PROP_QUEUE_MAP_VALID;
offset = 0;
 
+   /* Number of queues per enabled TC */
+   num_tc_qps = vsi->alloc_queue_pairs;
if (enabled_tc && (vsi->back->flags & I40E_FLAG_DCB_ENABLED)) {
/* Find numtc from enabled TC bitmap */
-   for (i = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) {
+   for (i = 0, numtc = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) {
if (enabled_tc & BIT(i)) /* TC is enabled */
numtc++;
}
@@ -1805,18 +1807,13 @@ static void i40e_vsi_setup_queue_map(struct i40e_vsi 
*vsi,
dev_warn(&pf->pdev->dev, "DCB is enabled but no TC 
enabled, forcing TC0\n");
numtc = 1;
}
-   } else {
-   /* At least TC0 is enabled in non-DCB, non-MQPRIO case */
-   numtc = 1;
+   num_tc_qps = num_tc_qps / numtc;
+   num_tc_qps = min_t(int, num_tc_qps,
+  i40e_pf_get_max_q_per_tc(pf));
}
 
vsi->tc_config.numtc = numtc;
vsi->tc_config.enabled_tc = enabled_tc ? enabled_tc : 1;
-   /* Number of queues per enabled TC */
-   qcount = vsi->alloc_queue_pairs;
-
-   num_tc_qps = qcount / numtc;
-   num_tc_qps = min_t(int, num_tc_qps, i40e_pf_get_max_q_per_tc(pf));
 
/* Do not allow use more TC queue pairs than MSI-X vectors exist */
if (pf->flags & I40E_FLAG_MSIX_ENABLED)
@@ -1831,9 +1828,13 @@ static void i40e_vsi_setup_queue_map(struct i40e_vsi 
*vsi,
 
switch (vsi->type) {
case I40E_VSI_MAIN:
-   qcount = min_t(int, pf->alloc_rss_size,
-  num_tc_qps);
-   break;
+   if (!(pf->flags & (I40E_FLAG_FD_SB_ENABLED |
+   I40E_FLAG_FD_ATR_ENABLED)) ||
+   vsi->tc_config.enabled_tc != 1) {
+   qcount = min_t(int, pf->alloc_rss_size,
+  num_tc_qps);
+   break;
+   }
case I40E_VSI_FDIR:
case I40E_VSI_SRIOV:
case I40E_VSI_VMDQ2:
-- 
2.14.3



[PATCH] selftests: bpf: test_kmod.sh: check the module path before insmod

2018-02-06 Thread Naresh Kamboju
test_kmod.sh reports a false failure when the test_bpf module is not present.
Check that test_bpf.ko is available before loading it.

Stop using "insmod $SRC_TREE/lib/test_bpf.ko"; use
"modprobe test_bpf" instead.

Signed-off-by: Naresh Kamboju 
---
 tools/testing/selftests/bpf/test_kmod.sh | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_kmod.sh 
b/tools/testing/selftests/bpf/test_kmod.sh
index ed4774d..54177b1 100755
--- a/tools/testing/selftests/bpf/test_kmod.sh
+++ b/tools/testing/selftests/bpf/test_kmod.sh
@@ -1,8 +1,6 @@
 #!/bin/sh
 # SPDX-License-Identifier: GPL-2.0
 
-SRC_TREE=../../../../
-
 test_run()
 {
sysctl -w net.core.bpf_jit_enable=$1 2>&1 > /dev/null
@@ -10,8 +8,13 @@ test_run()
 
echo "[ JIT enabled:$1 hardened:$2 ]"
dmesg -C
-   insmod $SRC_TREE/lib/test_bpf.ko 2> /dev/null
-   if [ $? -ne 0 ]; then
+   # Use modprobe dry run to check for missing test_bpf module
+   if ! /sbin/modprobe -q -n test_bpf; then
+   echo "test_bpf: [SKIP]"
+   elif /sbin/modprobe -q test_bpf; then
+   echo "test_bpf: ok"
+   else
+   echo "test_bpf: [FAIL]"
rc=1
fi
rmmod  test_bpf 2> /dev/null
-- 
2.7.4



Re: [PATCH] igb: Fix a test with HWTSTAMP_TX_ON

2018-02-06 Thread Richard Cochran
On Tue, Feb 06, 2018 at 08:47:59PM +0100, Christophe JAILLET wrote:
> 'HWTSTAMP_TX_ON' should be handled as a value, not as a bit mask.
> The modified code should behave the same, because HWTSTAMP_TX_ON is 1
> and no other possible values of 'tx_type' would match the test.
> However, this is more future-proof, should other values be allowed one day.

Nice catch, and I am going to introduce more tx_type values soon.

Thanks,
Richard


Re: Two net_sched fixes for stable

2018-02-06 Thread David Miller
From: Cong Wang 
Date: Tue, 6 Feb 2018 11:25:10 -0800

> Can you queue the following commits for stable? They fix a
> sleep-in-atomic warning reported by Roland.
> 
> commit efbf78973978b0d25af59bc26c8013a942af6e64
> Author: Cong Wang 
> Date:   Mon Dec 4 10:48:18 2017 -0800
> 
> net_sched: get rid of rcu_barrier() in tcf_block_put_ext()
> 
> commit df45bf84e4f5a48f23d4b1a07d21d566e8b587b2
> Author: Jiri Pirko 
> Date:   Fri Dec 8 19:27:27 2017 +0100
> 
> net: sched: fix use-after-free in tcf_block_put_ext
> 
> This problem was introduced by:
> 
> commit e2ef75445340ca7ec2c4558f84ae6c8c5d650fc8
> Author: Cong Wang 
> Date:   Mon Sep 11 16:33:31 2017 -0700
> 
> net_sched: fix reference counting of tc filter chain
> 
> so should apply to 4.14 too.
> 
> For 4.15, they can just apply nearly cleanly. However, it is not
> straight-forward to cherry pick them to 4.14 due to some big changes
> between 4.14 and 4.15.
> 
> Please let me know how you want to handle this for 4.14.

Ok, I sent this off for 4.15 -stable but I need you to do the
4.14 backport.

Thank you.


[PATCH] cxgb4: Fix error handling path in 'init_one()'

2018-02-06 Thread Christophe JAILLET
Commit baf5086840ab1 ("cxgb4: restructure VF mgmt code") has reordered
some code but an error handling label has not been updated accordingly.
So fix it and free 'adapter' if 't4_wait_dev_ready()' fails.

Fixes: baf5086840ab1 ("cxgb4: restructure VF mgmt code")
Signed-off-by: Christophe JAILLET 
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c 
b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 725643b4a8ab..79bffaa3c1af 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -5125,7 +5125,7 @@ static int init_one(struct pci_dev *pdev, const struct 
pci_device_id *ent)
adapter->regs = regs;
err = t4_wait_dev_ready(regs);
if (err < 0)
-   goto out_unmap_bar0;
+   goto out_free_adapter;
 
/* We control everything through one PF */
whoami = readl(regs + PL_WHOAMI_A);
-- 
2.14.1


---
L'absence de virus dans ce courrier électronique a été vérifiée par le logiciel 
antivirus Avast.
https://www.avast.com/antivirus



[PATCH net] net/ipv6: Handle reject routes with onlink flag

2018-02-06 Thread David Ahern
Verification of nexthops with onlink flag needs to handle unreachable
routes. The lookup is only intended to validate the gateway address
is not a local address and if the gateway resolves the egress device
must match the given device. Hence, hitting any default reject route
is ok.

Fixes: fc1e64e1092f ("net/ipv6: Add support for onlink flag")
Signed-off-by: David Ahern 
---
 net/ipv6/route.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index fb2d251c0500..69c43d289c69 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2488,7 +2488,8 @@ static int ip6_route_check_nh_onlink(struct net *net,
err = 0;
grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
if (grt) {
-   if (grt->rt6i_flags & flags || dev != grt->dst.dev) {
+   if (!grt->dst.error &&
+   (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway");
err = -EINVAL;
}
-- 
2.11.0



[GIT] Networking

2018-02-06 Thread David Miller

1) Fix error path in netdevsim, from Jakub Kicinski.

2) Default values listed in tcp_wmem and tcp_rmem documentation
   were inaccurate, from Tonghao Zhang.

3) Fix route leaks in SCTP, both for ipv4 and ipv6.  From Alexey
   Kodanev and Tommi Rantala.

4) Fix "MASK < Y" meant to be "MASK << Y" in xgbe driver, from
   Wolfram Sang.

5) Use after free in u32_destroy_key(), from Paolo Abeni.

6) Fix two TX issues in be2net driver, from Suresh Reddy.

Please pull, thanks a lot!

The following changes since commit 35277995e17919ab838beae765f440674e8576eb:

  Merge branch 'x86-pti-for-linus' of 
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip (2018-02-04 11:45:55 
-0800)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git 

for you to fetch changes up to 176bfb406d735655f9a69d868a7af0c3da959d51:

  Merge branch 'be2net-patch-set' (2018-02-06 11:48:40 -0500)


Alexei Starovoitov (2):
  bpf: fix bpf_prog_array_copy_to_user() issues
  Merge branch 'libbpf-xdp-support'

Alexey Kodanev (1):
  sctp: fix dst refcnt leak in sctp_v6_get_dst()

Andrew Lunn (1):
  net: phy: Handle not having GPIO enabled in the kernel

Daniel Borkmann (1):
  bpf: fix null pointer deref in bpf_prog_test_run_xdp

David S. Miller (3):
  Merge git://git.kernel.org/.../bpf/bpf
  Merge branch 'net-erspan-fixes'
  Merge branch 'be2net-patch-set'

Desnes Augusto Nunes do Rosario (1):
  ibmvnic: fix empty firmware version and errors cleanup

Eric Leblond (5):
  tools: add netlink.h and if_link.h in tools uapi
  libbpf: add function to setup XDP
  libbpf: add error reporting in XDP
  libbpf: add missing SPDX-License-Identifier
  samples/bpf: use bpf_set_link_xdp_fd

Guanglei Li (1):
  RDS: IB: Fix null pointer issue

Jakub Kicinski (2):
  netdevsim: fix overflow on the error path
  nfp: fix kdoc warnings on nested structures

Jie Deng (1):
  dwc-xlgmac: remove Jie Deng as co-maintainer

Paolo Abeni (1):
  cls_u32: fix use after free in u32_destroy_key()

Suresh Reddy (2):
  be2net: Fix HW stall issue in Lancer
  be2net: Handle transmit completion errors in Lancer

Tommi Rantala (1):
  sctp: fix dst refcnt leak in sctp_v4_get_dst

Tonghao Zhang (1):
  doc: Change the min default value of tcp_wmem/tcp_rmem.

William Tu (3):
  net: erspan: fix metadata extraction
  net: erspan: fix erspan config overwrite
  sample/bpf: fix erspan metadata

Wolfram Sang (1):
  net: amd-xgbe: fix comparison to bitshift when dealing with a mask

Yonghong Song (2):
  tools/bpf: permit selftests/bpf to be built in a different directory
  bpf: add documentation to compare clang "-target bpf" and default target

 Documentation/bpf/bpf_devel_QA.txt|  31 +++
 Documentation/networking/ip-sysctl.txt|   4 +-
 MAINTAINERS   |   1 -
 drivers/net/ethernet/amd/xgbe/xgbe-drv.c  |   2 +-
 drivers/net/ethernet/emulex/benet/be.h|   7 +-
 drivers/net/ethernet/emulex/benet/be_ethtool.c|   1 +
 drivers/net/ethernet/emulex/benet/be_hw.h |   1 +
 drivers/net/ethernet/emulex/benet/be_main.c   | 113 +-
 drivers/net/ethernet/ibm/ibmvnic.c|  14 +-
 drivers/net/ethernet/netronome/nfp/bpf/main.h |  24 +--
 drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c   |  10 +-
 drivers/net/ethernet/netronome/nfp/nfp_net.h  |   6 +-
 drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.h  |  43 ++--
 drivers/net/ethernet/netronome/nfp/nfpcore/nfp_resource.c |  21 +-
 drivers/net/netdevsim/bpf.c   |   5 +-
 drivers/net/phy/mdio_bus.c|   3 +-
 include/linux/netdevice.h |   6 +
 include/net/erspan.h  |  26 +--
 kernel/bpf/core.c |  32 ++-
 net/bpf/test_run.c|   4 +
 net/ipv4/ip_gre.c |  14 +-
 net/ipv6/ip6_gre.c|  15 +-
 net/rds/ib.c  |   3 +-
 net/sched/cls_u32.c   |  21 +-
 net/sctp/ipv6.c   |  10 +-
 net/sctp/protocol.c   |  10 +-
 samples/bpf/Makefile  |   2 +-
 samples/bpf/bpf_load.c| 102 -
 samples/bpf/bpf_load.h|   2 +-
 samples/bpf/tcbpf2_kern.c |  41 ++--
 samples/bpf/test_tunnel_bpf.sh|   4 +-
 samples/bpf/xdp1_user.c   

Re: Potential deadlock BUG in drivers/net/wireless/st/cw1200/sta.c (Linux 4.9)

2018-02-06 Thread Iago Abal
Hi,

This still looks like a deadlock bug to me, could someone take a look
as well and confirm? I will help preparing a patch if needed.

Thanks,

-- iago


On Fri, Nov 18, 2016 at 10:58 PM, Iago Abal  wrote:
> Hi,
>
> With the help of a static bug finder (EBA -
> https://github.com/models-team/eba) I have found a potential deadlock
> in drivers/net/wireless/st/cw1200/
> sta.c. This happens due to a recursive mutex_lock on `priv->conf_mutex'.
>
> If this is indeed a bug, I will be happy to help with a patch.
>
> A quick (not elegant) fix could be to unlock before the call to
> `cw1200_do_unjoin' in line 1174, and lock again afterwards. It seems
> that `cw1200_join_complete' is always called with `priv->conf_mutex'
> held. Another option could be to add a Boolean parameter to
> `cw1200_do_unjoin' to choose whether this function should take the
> lock itself. Yet another option would be to have a
> `__cw1200_do_unjoin' that does not lock, and make `cw1200_do_unjoin' a
> wrapper over this that adds the locking; `cw1200_join_complete' would
> call `__cw1200_do_unjoin' instead.
>
> Someone who is actually familiar with this code may have a better
> proposal though.
>
> The trace is as follows:
>
> 1. Function `cw1200_join_complete_work' takes the first lock in line 1189:
>
> // see 
> https://github.com/torvalds/linux/blob/v4.9-rc5/drivers/net/wireless/st/cw1200/sta.c#L1189
> mutex_lock(& priv->conf_mutex);
>
> 2. and subsequently calls `cw1200_join_complete';
> 3. which calls `cw1200_do_unjoin' in line 1174;
> 4. and this latter function takes the lock for the second time in line 1387:
>
> // see 
> https://github.com/torvalds/linux/blob/v4.9-rc5/drivers/net/wireless/st/cw1200/sta.c#L1387
> mutex_lock(& priv->conf_mutex);
>
> Hope it helps!
>
> --
> iago


[PATCH] igb: Fix a test with HWTSTAMP_TX_ON

2018-02-06 Thread Christophe JAILLET
'HWTSTAMP_TX_ON' should be handled as a value, not as a bit mask.
The modified code should behave the same, because HWTSTAMP_TX_ON is 1
and no other possible values of 'tx_type' would match the test.
However, this is more future-proof, should other values be allowed one day.

See 'struct hwtstamp_config' in 'include/uapi/linux/net_tstamp.h'

This fixes a warning reported by smatch:
   igb_xmit_frame_ring() warn: bit shifter 'HWTSTAMP_TX_ON' used for logical '&'

Fixes: 26bd4e2db06be ("igb: protect TX timestamping from API misuse")
Signed-off-by: Christophe JAILLET 
---
 drivers/net/ethernet/intel/igb/igb_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c 
b/drivers/net/ethernet/intel/igb/igb_main.c
index c208753ff5b7..e945d1f7c7fe 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -5727,7 +5727,7 @@ netdev_tx_t igb_xmit_frame_ring(struct sk_buff *skb,
if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) {
struct igb_adapter *adapter = netdev_priv(tx_ring->netdev);
 
-   if (adapter->tstamp_config.tx_type & HWTSTAMP_TX_ON &&
+   if (adapter->tstamp_config.tx_type == HWTSTAMP_TX_ON &&
!test_and_set_bit_lock(__IGB_PTP_TX_IN_PROGRESS,
   &adapter->state)) {
skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS;
-- 
2.14.1


---
L'absence de virus dans ce courrier électronique a été vérifiée par le logiciel 
antivirus Avast.
https://www.avast.com/antivirus



Re: [RFC PATCH 00/10] PTP support for DSA and mv88e6xxx driver.

2018-02-06 Thread Brandon Streiff
On 2/3/2018 3:40 PM, Andrew Lunn wrote:
> This patchset adds support for using the PTP hardware in switches
> supported by the mv88e6xxx driver. The code was produces in
> collaboration with Brandon Streiff doing the initial implementation,
> and then Richard Cochran and Andrew Lunn making further changes and
> cleanups.
> 
> ...

With regard to the updates that you and Richard have made to my
initial patchset:

Acked-by: Brandon Streiff 

Thank you for getting these into a more polished form.

-- brandon


[PATCH net-next] sun: Add SPDX license tags to Sun network drivers

2018-02-06 Thread Shannon Nelson
Add the appropriate SPDX license tags to the Sun network drivers
as outlined in Documentation/process/license-rules.rst.

Signed-off-by: Shannon Nelson 
---
 drivers/net/ethernet/sun/Kconfig  | 1 +
 drivers/net/ethernet/sun/cassini.c| 1 +
 drivers/net/ethernet/sun/cassini.h| 1 +
 drivers/net/ethernet/sun/ldmvsw.c | 1 +
 drivers/net/ethernet/sun/niu.c| 1 +
 drivers/net/ethernet/sun/sunbmac.c| 1 +
 drivers/net/ethernet/sun/sungem.c | 1 +
 drivers/net/ethernet/sun/sunhme.c | 1 +
 drivers/net/ethernet/sun/sunqe.c  | 1 +
 drivers/net/ethernet/sun/sunvnet.c| 1 +
 drivers/net/ethernet/sun/sunvnet_common.c | 1 +
 11 files changed, 11 insertions(+)

diff --git a/drivers/net/ethernet/sun/Kconfig b/drivers/net/ethernet/sun/Kconfig
index b2caf51..7b982e0 100644
--- a/drivers/net/ethernet/sun/Kconfig
+++ b/drivers/net/ethernet/sun/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
 #
 # Sun network device configuration
 #
diff --git a/drivers/net/ethernet/sun/cassini.c 
b/drivers/net/ethernet/sun/cassini.c
index 113bd57..9020b08 100644
--- a/drivers/net/ethernet/sun/cassini.c
+++ b/drivers/net/ethernet/sun/cassini.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* cassini.c: Sun Microsystems Cassini(+) ethernet driver.
  *
  * Copyright (C) 2004 Sun Microsystems Inc.
diff --git a/drivers/net/ethernet/sun/cassini.h 
b/drivers/net/ethernet/sun/cassini.h
index 882ce16..13f3860 100644
--- a/drivers/net/ethernet/sun/cassini.h
+++ b/drivers/net/ethernet/sun/cassini.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* $Id: cassini.h,v 1.16 2004/08/17 21:15:16 zaumen Exp $
  * cassini.h: Definitions for Sun Microsystems Cassini(+) ethernet driver.
  *
diff --git a/drivers/net/ethernet/sun/ldmvsw.c 
b/drivers/net/ethernet/sun/ldmvsw.c
index 5ea0376..a5dd627 100644
--- a/drivers/net/ethernet/sun/ldmvsw.c
+++ b/drivers/net/ethernet/sun/ldmvsw.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* ldmvsw.c: Sun4v LDOM Virtual Switch Driver.
  *
  * Copyright (C) 2016-2017 Oracle. All rights reserved.
diff --git a/drivers/net/ethernet/sun/niu.c b/drivers/net/ethernet/sun/niu.c
index 06001ba..8dd545f 100644
--- a/drivers/net/ethernet/sun/niu.c
+++ b/drivers/net/ethernet/sun/niu.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* niu.c: Neptune ethernet driver.
  *
  * Copyright (C) 2007, 2008 David S. Miller (da...@davemloft.net)
diff --git a/drivers/net/ethernet/sun/sunbmac.c 
b/drivers/net/ethernet/sun/sunbmac.c
index 0b1f41f..f047b27 100644
--- a/drivers/net/ethernet/sun/sunbmac.c
+++ b/drivers/net/ethernet/sun/sunbmac.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* sunbmac.c: Driver for Sparc BigMAC 100baseT ethernet adapters.
  *
  * Copyright (C) 1997, 1998, 1999, 2003, 2008 David S. Miller 
(da...@davemloft.net)
diff --git a/drivers/net/ethernet/sun/sungem.c 
b/drivers/net/ethernet/sun/sungem.c
index a7afcee..7a16d40 100644
--- a/drivers/net/ethernet/sun/sungem.c
+++ b/drivers/net/ethernet/sun/sungem.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* $Id: sungem.c,v 1.44.2.22 2002/03/13 01:18:12 davem Exp $
  * sungem.c: Sun GEM ethernet driver.
  *
diff --git a/drivers/net/ethernet/sun/sunhme.c 
b/drivers/net/ethernet/sun/sunhme.c
index 0431f1e..06da2f5 100644
--- a/drivers/net/ethernet/sun/sunhme.c
+++ b/drivers/net/ethernet/sun/sunhme.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* sunhme.c: Sparc HME/BigMac 10/100baseT half/full duplex auto switching,
  *   auto carrier detecting ethernet driver.  Also known as the
  *   "Happy Meal Ethernet" found on SunSwift SBUS cards.
diff --git a/drivers/net/ethernet/sun/sunqe.c b/drivers/net/ethernet/sun/sunqe.c
index a6bcdcd..7fe0d5e 100644
--- a/drivers/net/ethernet/sun/sunqe.c
+++ b/drivers/net/ethernet/sun/sunqe.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* sunqe.c: Sparc QuadEthernet 10baseT SBUS card driver.
  *  Once again I am out to prove that every ethernet
  *  controller out there can be most efficiently programmed
diff --git a/drivers/net/ethernet/sun/sunvnet.c 
b/drivers/net/ethernet/sun/sunvnet.c
index 27fb226..63d3d6b 100644
--- a/drivers/net/ethernet/sun/sunvnet.c
+++ b/drivers/net/ethernet/sun/sunvnet.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* sunvnet.c: Sun LDOM Virtual Network Driver.
  *
  * Copyright (C) 2007, 2008 David S. Miller 
diff --git a/drivers/net/ethernet/sun/sunvnet_common.c 
b/drivers/net/ethernet/sun/sunvnet_common.c
index 8aa3ce4..d8f4c3f 100644
--- a/drivers/net/ethernet/sun/sunvnet_common.c
+++ b/drivers/net/ethernet/sun/sunvnet_common.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* sunvnet.c: Sun LDOM Virtual Network Driver.
  *
  * Copyright (C) 2007, 2008 David S. Miller 
-- 
2.7.4



Two net_sched fixes for stable

2018-02-06 Thread Cong Wang
Hi, David

Can you queue the following commits for stable? They fix a
sleep-in-atomic warning reported by Roland.


commit efbf78973978b0d25af59bc26c8013a942af6e64
Author: Cong Wang 
Date:   Mon Dec 4 10:48:18 2017 -0800

net_sched: get rid of rcu_barrier() in tcf_block_put_ext()

commit df45bf84e4f5a48f23d4b1a07d21d566e8b587b2
Author: Jiri Pirko 
Date:   Fri Dec 8 19:27:27 2017 +0100

net: sched: fix use-after-free in tcf_block_put_ext


This problem was introduced by:

commit e2ef75445340ca7ec2c4558f84ae6c8c5d650fc8
Author: Cong Wang 
Date:   Mon Sep 11 16:33:31 2017 -0700

net_sched: fix reference counting of tc filter chain

so should apply to 4.14 too.

For 4.15, they can just apply nearly cleanly. However, it is not
straight-forward to cherry pick them to 4.14 due to some big changes
between 4.14 and 4.15.

Please let me know how you want to handle this for 4.14.

Thanks.


Re: xfrm, ip tunnel: non released device reference upon device unregistration

2018-02-06 Thread Eyal Birger
On Tue, 6 Feb 2018 14:15:09 +0100
Florian Westphal  wrote:

> Steffen Klassert  wrote:
> > I gave the patch a quick try, but still I get this:
> > 
> > unregister_netdevice: waiting for dummy1 to become free. Usage
> > count = 2  
> 
> Was that with Eyals setup or the bridge one I posted?
> 
> If it was Eyals setup, its possible the patch missed hookup
> to whatever tunnel infra is used (the setup I used has ipip tunnel,
> everything is ipv4).
> 

Thanks!

Indeed the setup I'm testing uses ip6_tunnel.
I have tested a fix in the spirit of the patch and it looks valid 
for ip6_tunnel as well.

It looks though that this change would need to be added to any tunnel
device using dst_cache (vxlan, geneve, gre, ...).

> Also, perhaps it would be best to not bother with checking the
> device in question at all and unconditionally put device reference
> of all the dst_caches.  With setups that have e.g. 1k devices going
> down per second (ppp dialin and the like) doing the full search for
> every notify event would be rather expensive.
> 

I'm wondering - non-xfrm dsts are already correctly invalidated,
so do you think it makes sense to invalidate caches for devices that
have no xfrm dsts? or maybe I didn't understand the suggestion?

Eyal.


Re: [bpf-next V2 PATCH 5/5] tools/libbpf: handle issues with bpf ELF objects containing .eh_frames

2018-02-06 Thread Daniel Borkmann
On 02/06/2018 06:03 PM, Jesper Dangaard Brouer wrote:
> On Tue, 6 Feb 2018 08:00:59 -0800 Alexei Starovoitov 
>  wrote:
>> On Tue, Feb 06, 2018 at 03:54:28PM +0100, Jesper Dangaard Brouer wrote:
>>> If clang >= 4.0.1 is missing the option '-target bpf', it will cause
>>> llc/llvm to create two ELF sections for "Exception Frames", with
>>> section names '.eh_frame' and '.rel.eh_frame'.
>>>
>>> The BPF ELF loader library libbpf fails when loading files with these
>>> sections.  The other in-kernel BPF ELF loader in samples/bpf/bpf_load.c,
>>> handle this gracefully. And iproute2 loader also seems to work with these
>>> "eh" sections.
>>>
>>> The issue in libbpf is caused by bpf_object__elf_collect() skip the
>>> '.eh_frame' and thus doesn't create an internal data structure
>>> pointing to this ELF section index.  Later when the relocation section
>>> '.rel.eh_frame' is processed, it tries to find the '.eh_frame' via the
>>> ELF section idx, which is that fails (in bpf_object__collect_reloc).
>>>
>>> I couldn't find a way to see that the '.rel.eh_frame' was irrelevant
>>> (that is only determined by looking at the section it reference, which
>>> we no longer have info available on).  
>>
>> but does this approach work for all extra sections and relocations emitted
>> when source is compiled with -g ?
> 
> No, but I plan to follow up and do a more complete solution later. This
> is a workaround to get the Suricata use-case working and also that
> samples/bpf/ can be loaded.

Aside from a needed fix in any case, is there a specific reason why Suricata
cannot rely on 'clang -target bpf'? Is it asm inline headers in your case?

>> To address this case bpf_load.c does:
>>   if (shdr.sh_type == SHT_REL) {
>>   struct bpf_insn *insns;
>>
>>   /* locate prog sec that need map fixup (relocations) */
>>   if (get_sec(elf, shdr.sh_info, , _prog,
>>   _prog, _prog))
>>   continue;
>>
>>   if (shdr_prog.sh_type != SHT_PROGBITS ||
>>   !(shdr_prog.sh_flags & SHF_EXECINSTR))
>>   continue;
>>
>> why the same approach is not applicable here?
> 
> As described above bpf_object__elf_collect() skip the "real" section
> that the relo-section want to lookup (based on the same kind of
> check), but libbpf is now missing the section idx in its internal
> structures... and thus the relo lookup of the idx fails. (bpf_load.c
> does the lookup in the ELF obj directly, thus it does not have this
> problem).

Out of curiosity, I just double checked iproute2 loader (examples/bpf/):

$ clang -O2 -g -emit-llvm -c bpf_cyclic.c -o - | llc -march=bpf -mcpu=probe 
-filetype=obj -o bpf_cyclic.o
$ readelf -a bpf_cyclic.o | grep "\["
  [Nr] Name  Type Address   Offset
  [ 0]   NULL   
  [ 1] .strtab   STRTAB     16b0
  [ 2] .text PROGBITS   0040
  [ 3] 0xabccba/0PROGBITS   0040
  [ 4] .rel0xabccba/0REL    1120
  [ 5] classifierPROGBITS   00e8
  [ 6] .relclassifierREL    1130
  [ 7] maps  PROGBITS   0118
  [ 8] license   PROGBITS   013c
  [ 9] .debug_strPROGBITS   0140
  [10] .debug_locPROGBITS   03d5
  [11] .rel.debug_locREL    1140
  [12] .debug_abbrev PROGBITS   045a
  [13] .debug_info   PROGBITS   055c
  [14] .rel.debug_info   REL    11c0
  [15] .debug_ranges PROGBITS   088c
  [16] .rel.debug_ranges REL    15d0
  [17] .debug_macinfoPROGBITS   08ec
  [18] .debug_pubnames   PROGBITS   08ed
  [19] .rel.debug_pubnam REL    1650
  [20] .debug_pubtypes   PROGBITS   0954
  [21] .rel.debug_pubtyp REL    1660
  [22] .eh_frame PROGBITS   09c0
  [23] .rel.eh_frame REL    1670
  [24] .debug_line   PROGBITS   0a10
  [25] .rel.debug_line   REL    1690
  [26] .symtab   SYMTAB     0b08
# tc qdisc add dev lo clsact
# tc filter add dev lo ingress bpf da obj bpf_cyclic.o
# tc filter show dev lo ingress
filter protocol all pref 49152 bpf chain 0
filter protocol all pref 49152 bpf chain 0 handle 0x1 bpf_cyclic.o:[classifier] 
direct-action not_in_hw id 6 tag 736a8a004dead229

So no 

Re: [PATCH v1 0/1] spi_ks8995: use regmap to access chip registers.

2018-02-06 Thread Andrew Lunn
On Tue, Feb 06, 2018 at 11:58:17AM -0500, Sven Van Asbroeck wrote:
> On Tue, Feb 6, 2018 at 11:50 AM, Andrew Lunn  wrote:
> > And a DSA driver does not need to be complex. You can start simple,
> > and add more features later.
> 
> I see. Would it be possible/practical to start with just phy_read/write,
> port_enable/disable in dsa_switch_ops ? And just add a sysfs bin file
> for userspace register access ?

I would NACK sysfs bin file. Do it right, or don't do it at all.

I think i would first want to know what Tristam/Microchip plans
are. Does he intend to keep on working on the patches from last year?

   Andrew


Re: [PATCH v2] socket: Provide put_cmsg_whitelist() for constant size copies

2018-02-06 Thread David Miller
From: Kees Cook 
Date: Wed, 7 Feb 2018 05:36:02 +1100

> Making put_cmsg() inline would help quite a bit with tracking the
> builtin_const-ness, and that could speed things up a little bit too.
> Would you be opposed to inlining?

Nope.


[PATCH iproute2 v1] ip netns: allow negative nsid

2018-02-06 Thread Christian Brauner
If the kernel receives a negative nsid it will automatically assign the
next available nsid. In this case alloc_netid() will set min and max to
0 for idr_alloc(). And when max == 0 idr_alloc() will interpret this as
the maximum range, i.e. specific to nsids it will try to find an id in
the range [0,INT_MAX). This is intentionally supported in the kernel for
nsids. Commit acbe9118ce8086f765ffb0da15f80c7c01a8903a regressed ip
netns in that respect although previously the use-case was either
accidentally supported or opaquely supported such that it triggered the
original commit. From what I can gather it went as follows before:
atoi() was called with a string indicating a negative value which caused
it to return -1 which was passed to the kernel. Let's make it less
opaque by introducing the keyword "auto":

ip netns set  auto

will cause nsid to be set to -1 and the kernel will select an available
nsid.

Signed-off-by: Christian Brauner 
---
ChangeLog v0->v1:
* introduce "auto" keyword for ip netns to automatically allocate an
  available nsid
---
 ip/ipnetns.c| 5 -
 man/man8/ip-netns.8 | 1 +
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/ip/ipnetns.c b/ip/ipnetns.c
index 059a4220..631794b8 100644
--- a/ip/ipnetns.c
+++ b/ip/ipnetns.c
@@ -718,7 +718,10 @@ static int netns_set(int argc, char **argv)
return -1;
}
name = argv[0];
-   if (get_unsigned(, argv[1], 0))
+   /* If a negative nsid is specified the kernel will select the nsid. */
+   if (strcmp(argv[1], "auto") == 0)
+   nsid = -1;
+   else if (get_unsigned(, argv[1], 0))
invarg("Invalid \"netnsid\" value\n", argv[1]);
 
snprintf(netns_path, sizeof(netns_path), "%s/%s", NETNS_RUN_DIR, name);
diff --git a/man/man8/ip-netns.8 b/man/man8/ip-netns.8
index c5310e24..d539f18b 100644
--- a/man/man8/ip-netns.8
+++ b/man/man8/ip-netns.8
@@ -137,6 +137,7 @@ $ ip netns del net0
 .sp
 This command assigns a id to a peer network namespace. This id is valid
 only in the current network namespace.
+If the keyword "auto" is specified an available nsid will be chosen.
 This id will be used by the kernel in some netlink messages. If no id is
 assigned when the kernel needs it, it will be automatically assigned by
 the kernel.
-- 
2.14.1



Re: [PATCH v2] socket: Provide put_cmsg_whitelist() for constant size copies

2018-02-06 Thread Kees Cook
On Wed, Feb 7, 2018 at 3:19 AM, David Miller  wrote:
> From: Kees Cook 
> Date: Tue, 6 Feb 2018 04:31:50 +1100
>
>> On Tue, Feb 6, 2018 at 2:03 AM, David Miller  wrote:
>>> From: Kees Cook 
>>> Date: Fri, 2 Feb 2018 02:27:49 -0800
>>>
 @@ -343,6 +343,14 @@ struct ucred {

  extern int move_addr_to_kernel(void __user *uaddr, int ulen, struct 
 sockaddr_storage *kaddr);
  extern int put_cmsg(struct msghdr*, int level, int type, int len, void 
 *data);
 +/*
 + * Provide a bounce buffer for copying cmsg data to userspace when the
 + * target memory isn't already whitelisted for hardened usercopy.
 + */
 +#define put_cmsg_whitelist(_msg, _level, _type, _ptr) ({ \
 + typeof(*(_ptr)) _val = *(_ptr); \
 + put_cmsg(_msg, _level, _type, sizeof(_val), &_val); \
 + })
>>>
>>> I understand what you are trying to achieve, but it's at a real cost
>>> here.  Some of these objects are structures, for example the struct
>>> sock_extended_err is 16 bytes.
>>
>> It didn't look like put_cmsg() was on a fast path, so it seemed like a
>> bounce buffer was the best solution here (and it's not without
>> precedent).
>
> For some things like timestamps it can be important.

Making put_cmsg() inline would help quite a bit with tracking the
builtin_const-ness, and that could speed things up a little bit too.
Would you be opposed to inlining?

-Kees

-- 
Kees Cook
Pixel Security


Re: [PATCH v1 0/1] spi_ks8995: use regmap to access chip registers.

2018-02-06 Thread Sven Van Asbroeck
Andrew and Florian, thanks for your input.

On Tue, Feb 6, 2018 at 12:05 PM, Andrew Lunn  wrote:
> I would NACK sysfs bin file. Do it right, or don't do it at all.

On Tue, Feb 6, 2018 at 12:47 PM, Florian Fainelli  wrote:
> Sven, there is a standard ethtool register dump interface that can be
> used to provide register dumps, if necessary, that might fit your use case.

I'm not keen on the sysfs bin file either, the existing driver has this,
and we use it because it's there.

Our use case is as follows: all we need is a reset on boot, followed by
a few static register bitfiddles which reflect the way we've integrated
the IC into our product. There is no further Linux interaction.
I guess the devicetree would be a natural place to store the required
register changes. No need for sysfs.

As I said before, not sure how others use this chip.

Would the above be attainable by a (trivial) DSA driver?


Re: possible deadlock in rtnl_lock (3)

2018-02-06 Thread Dmitry Vyukov
On Tue, Feb 6, 2018 at 6:58 PM, syzbot
 wrote:
> Hello,
>
> syzbot hit the following crash on net-next commit
> 617aebe6a97efa539cc4b8a52adccd89596e6be0 (Sun Feb 4 00:25:42 2018 +)
> Merge tag 'usercopy-v4.16-rc1' of
> git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux
>
> So far this crash happened 2510 times on net-next, upstream.
> C reproducer is attached.
> syzkaller reproducer is attached.
> Raw console output is attached.
> compiler: gcc (GCC) 7.1.1 20170620
> .config is attached.
>
> IMPORTANT: if you fix the bug, please add the following tag to the commit:
> Reported-by: syzbot+63682ce11532e0da2...@syzkaller.appspotmail.com
> It will help syzbot understand when the bug is fixed. See footer for
> details.
> If you forward the report, please keep this part and the footer.


Paolo, was this also fixed by "netfilter: on sockopt() acquire sock
lock only in the required scope"?


> ==
> WARNING: possible circular locking dependency detected
> 4.15.0+ #221 Not tainted
> --
> syzkaller414214/4173 is trying to acquire lock:
>  (rtnl_mutex){+.+.}, at: [<3cc93f9b>] rtnl_lock+0x17/0x20
> net/core/rtnetlink.c:74
>
> but task is already holding lock:
>  ([i].mutex){+.+.}, at: [<59cfac75>]
> xt_find_table_lock+0x3e/0x3e0 net/netfilter/x_tables.c:1041
>
> which lock already depends on the new lock.
>
>
> the existing dependency chain (in reverse order) is:
>
> -> #2 ([i].mutex){+.+.}:
>__mutex_lock_common kernel/locking/mutex.c:756 [inline]
>__mutex_lock+0x16f/0x1a80 kernel/locking/mutex.c:893
>mutex_lock_nested+0x16/0x20 kernel/locking/mutex.c:908
>xt_find_table_lock+0x3e/0x3e0 net/netfilter/x_tables.c:1041
>xt_request_find_table_lock+0x28/0xc0 net/netfilter/x_tables.c:1088
>get_info+0x154/0x690 net/ipv6/netfilter/ip6_tables.c:989
>do_ipt_get_ctl+0x159/0xac0 net/ipv4/netfilter/ip_tables.c:1699
>nf_sockopt net/netfilter/nf_sockopt.c:104 [inline]
>nf_getsockopt+0x6a/0xc0 net/netfilter/nf_sockopt.c:122
>ip_getsockopt+0x15c/0x220 net/ipv4/ip_sockglue.c:1571
>tcp_getsockopt+0x82/0xd0 net/ipv4/tcp.c:3359
>sock_common_getsockopt+0x95/0xd0 net/core/sock.c:2934
>SYSC_getsockopt net/socket.c:1880 [inline]
>SyS_getsockopt+0x178/0x340 net/socket.c:1862
>entry_SYSCALL_64_fastpath+0x29/0xa0
>
> -> #1 (sk_lock-AF_INET){+.+.}:
>lock_sock_nested+0xc2/0x110 net/core/sock.c:2777
>lock_sock include/net/sock.h:1463 [inline]
>do_ip_setsockopt.isra.12+0x1d9/0x3210 net/ipv4/ip_sockglue.c:646
>ip_setsockopt+0x3a/0xa0 net/ipv4/ip_sockglue.c:1252
>udp_setsockopt+0x45/0x80 net/ipv4/udp.c:2401
>sock_common_setsockopt+0x95/0xd0 net/core/sock.c:2975
>SYSC_setsockopt net/socket.c:1849 [inline]
>SyS_setsockopt+0x189/0x360 net/socket.c:1828
>entry_SYSCALL_64_fastpath+0x29/0xa0
>
> -> #0 (rtnl_mutex){+.+.}:
>lock_acquire+0x1d5/0x580 kernel/locking/lockdep.c:3920
>__mutex_lock_common kernel/locking/mutex.c:756 [inline]
>__mutex_lock+0x16f/0x1a80 kernel/locking/mutex.c:893
>mutex_lock_nested+0x16/0x20 kernel/locking/mutex.c:908
>rtnl_lock+0x17/0x20 net/core/rtnetlink.c:74
>unregister_netdevice_notifier+0x91/0x4e0 net/core/dev.c:1673
>clusterip_config_entry_put net/ipv4/netfilter/ipt_CLUSTERIP.c:114
> [inline]
>clusterip_tg_destroy+0x389/0x6e0
> net/ipv4/netfilter/ipt_CLUSTERIP.c:518
>cleanup_entry+0x218/0x350 net/ipv4/netfilter/ip_tables.c:654
>__do_replace+0x79d/0xa50 net/ipv4/netfilter/ip_tables.c:1089
>do_replace net/ipv4/netfilter/ip_tables.c:1145 [inline]
>do_ipt_set_ctl+0x40f/0x5f0 net/ipv4/netfilter/ip_tables.c:1675
>nf_sockopt net/netfilter/nf_sockopt.c:106 [inline]
>nf_setsockopt+0x67/0xc0 net/netfilter/nf_sockopt.c:115
>ip_setsockopt+0x97/0xa0 net/ipv4/ip_sockglue.c:1259
>tcp_setsockopt+0x82/0xd0 net/ipv4/tcp.c:2905
>sock_common_setsockopt+0x95/0xd0 net/core/sock.c:2975
>SYSC_setsockopt net/socket.c:1849 [inline]
>SyS_setsockopt+0x189/0x360 net/socket.c:1828
>entry_SYSCALL_64_fastpath+0x29/0xa0
>
> other info that might help us debug this:
>
> Chain exists of:
>   rtnl_mutex --> sk_lock-AF_INET --> [i].mutex
>
>  Possible unsafe locking scenario:
>
>CPU0CPU1
>
>   lock([i].mutex);
>lock(sk_lock-AF_INET);
>lock([i].mutex);
>   lock(rtnl_mutex);
>
>  *** DEADLOCK ***
>
> 1 lock held by syzkaller414214/4173:
>  #0:  ([i].mutex){+.+.}, at: [<59cfac75>]
> xt_find_table_lock+0x3e/0x3e0 net/netfilter/x_tables.c:1041
>
> stack backtrace:
> CPU: 1 PID: 4173 Comm: syzkaller414214 Not 

Re: Regression in 39461af74125 "bitmap: replace bitmap_{from,to}_u32array"

2018-02-06 Thread David Miller
From: Yury Norov 
Date: Tue, 6 Feb 2018 20:03:13 +0300

> On Tue, Feb 06, 2018 at 11:42:18AM -0500, David Miller wrote:
>> From: Yury Norov 
>> Date: Tue, 6 Feb 2018 19:26:23 +0300
>> 
>> > On Tue, Feb 06, 2018 at 11:17:36AM -0500, David Miller wrote:
>> >> From: Heiner Kallweit 
>> >> Date: Mon, 5 Feb 2018 07:21:32 +0100
>> >> 
>> >> > Recently ethtool started to give incomplete values for supported and
>> >> > advertised modes. There seems to be a regression in this commit:
>> >> > The bit number parameter in the calls to bitmap_to_arr32() in
>> >> > store_link_ksettings_for_user() should be __ETHTOOL_LINK_MODE_MASK_NBITS
>> >> > instead of __ETHTOOL_LINK_MODE_MASK_NU32.
>> >> > 
>> >> > After having changed this ethtool behaves normally again.
>> >> 
>> >> I do not see the commit with the SHA1 ID from the Subject line in
>> >> either Linus's nor my networking tree.
>> >> 
>> >> Where is that change?
>> > 
>> > It's in next-20180202.
>> 
>> Hmmm, then the fix should probably go into whatever tree that commit
>> ultimately came from.
> 
> It's already in Andrew's tree. The original patch is not about networking,
> and Andrew's tree is probably better place for fix. My apologies for noise
> in your tree.
> 
> http://ozlabs.org/~akpm/mmots/broken-out/bitmap-replace-bitmap_fromto_u32array-fix-2.patch

Thanks for the clarification.


Re: [PATCH v1 0/1] spi_ks8995: use regmap to access chip registers.

2018-02-06 Thread Florian Fainelli
On 02/06/2018 09:05 AM, Andrew Lunn wrote:
> On Tue, Feb 06, 2018 at 11:58:17AM -0500, Sven Van Asbroeck wrote:
>> On Tue, Feb 6, 2018 at 11:50 AM, Andrew Lunn  wrote:
>>> And a DSA driver does not need to be complex. You can start simple,
>>> and add more features later.
>>
>> I see. Would it be possible/practical to start with just phy_read/write,
>> port_enable/disable in dsa_switch_ops ? And just add a sysfs bin file
>> for userspace register access ?
> 
> I would NACK sysfs bin file. Do it right, or don't do it at all.

Sven, there is a standard ethtool register dump interface that can be
used to provide register dumps, if necessary, that might fit your use case.

Aside from the phy_read/write, port_enable/disable, you will likely need
to have a get_tag_protocol() implementation returning
DSA_TAG_PROTO_NONE, and a setup() operation, other should be entirely
optional for now.

Out of curiosity, are there any break out boards with a KS8995 switch
available that we could e.g: plug to a Raspberry Pi or any similar board?

> 
> I think i would first want to know what Tristam/Microchip plans
> are. Does he intend to keep on working on the patches from last year?

Yes, this would be good to keep alive, the patches look good, they just
need to get in now.
-- 
Florian


Re: [PATCH iproute2] ip netns: allow negative nsid

2018-02-06 Thread Stephen Hemminger
On Tue,  6 Feb 2018 16:16:15 +0100
Christian Brauner  wrote:

> If the kernel receives a negative nsid it will automatically assign the
> next available nsid. In this case alloc_netid() will set min and max to
> 0 for ird_alloc(). And when max == 0 idr_alloc() will interpret this as
> the maxium range, i.e. specific to nsids it will try to find an id in
> the range [0,INT_MAX). This is intentionally supported in the kernel for
> nsids. Commit acbe9118ce8086f765ffb0da15f80c7c01a8903a regressed ip
> netns in that respect although previously the use-case was either
> accidentally supported or opaquely supported such that it triggered the
> original commit. From what I can gather it went as follows before:
> atoi() was called with a string indicating a negative value which caused
> it to return -1 which was passed to the kernel. Let's make it less
> opaque and use get_integer() and set to -1 when a negative nsid was
> requested. This restores the old behavior.
> 
> Signed-off-by: Christian Brauner 

Rather than negative value, it would be better from user interface
point of view to use a user readable value like "auto" which encapsulates
the kernel behavior.




Re: [PATCH v1 0/1] spi_ks8995: use regmap to access chip registers.

2018-02-06 Thread Andrew Lunn
On Tue, Feb 06, 2018 at 11:41:14AM -0500, Sven Van Asbroeck wrote:
> On Tue, Feb 6, 2018 at 11:14 AM, Andrew Lunn  wrote:
> > Rather than invest time in this driver, it would be better to look
> > into writing a DSA driver.
> 
> Thank you Andrew. I know little of DSA, but at first sight it appears to
> be a _very_ complicated beast for a switch PHY which only
> needs a few static register settings applied to it on startup, and has
> no further Linux interaction?

Hi Sven

If you want to treat it as a dumb switch, then a simple driver is
sufficient. But it can do a lot more. Do you need spanning tree, or
are you happy for your network to collapse if there is a loop? Do you
want access to statistics? Know if links are up/down? VLAN support?
Save some power by enabling EEE? DSA will give you these features.

And a DSA driver does not need to be complex. You can start simple,
and add more features later.

 Andrew


Re: [PATCH] Carrier detect ok, don't turn off negotiation

2018-02-06 Thread Denis Du


Ok, I submit it  again.


In drivers/net/wan/hdlc_ppp.c, noise on the physical line can make the
protocol fail even though carrier detect is still ok. So if carrier detect
is ok, don't turn off protocol negotiation.

This patch is against the kernel version Linux 4.15-rc8





On Tuesday, February 6, 2018, 10:29:53 AM EST, David Miller 
 wrote: 





From: Denis Du 

Date: Tue, 6 Feb 2018 15:15:28 + (UTC)

> How  do you think my patch?
> 
> As you see, Krzysztof  think my patch is ok to be accepted.
> But if you have a better idea to fix it,I am glad to see it. Anyway, this 
> issue have to be fixed.


Please resubmit it and I'll think about it again, thank you.

From b5902a4dfc709b62b704997ab64f31c9ef69a6db Mon Sep 17 00:00:00 2001
From: Denis Du 
Date: Mon, 15 Jan 2018 17:26:06 -0500
Subject: [PATCH] netdev: carrier detect ok, don't turn off negotiation

Sometimes noise on a physical line is just enough to make the protocol
handshaking fail while the carrier detect is still good. After the noise
is removed, nothing triggers the protocol to start again, so the link
never comes back. The fix is to not terminate the protocol handshaking
while the carrier is still on.

Signed-off-by: Denis Du 
---
 drivers/net/wan/hdlc_ppp.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wan/hdlc_ppp.c b/drivers/net/wan/hdlc_ppp.c
index afeca6b..ab8b3cb 100644
--- a/drivers/net/wan/hdlc_ppp.c
+++ b/drivers/net/wan/hdlc_ppp.c
@@ -574,7 +574,10 @@ static void ppp_timer(struct timer_list *t)
 			ppp_cp_event(proto->dev, proto->pid, TO_GOOD, 0, 0,
  0, NULL);
 			proto->restart_counter--;
-		} else
+		} else if (netif_carrier_ok(proto->dev))
+			ppp_cp_event(proto->dev, proto->pid, TO_GOOD, 0, 0,
+ 0, NULL);
+		else
 			ppp_cp_event(proto->dev, proto->pid, TO_BAD, 0, 0,
  0, NULL);
 		break;
-- 
2.1.4



Re: [PATCH net-next 0/7] tcp: implement rb-tree based retransmit queue

2018-02-06 Thread Eric Dumazet
On Tue, Feb 6, 2018 at 8:27 AM, Tal Gilboa  wrote:
> On 2/6/2018 5:52 PM, Eric Dumazet wrote:
>>
>> On Tue, 2018-02-06 at 15:22 +, David Laight wrote:
>>>
>>> From: Eric Dumazet

 Sent: 06 February 2018 14:20
>>>
>>>
>>> ...

 Please give exact details.
 Sending 64, 128, 256 or 512 bytes at a time on TCP_STREAM makes little
 sense.
 We are not optimizing stack for pathological cases, sorry.
>>>
>>>
>>> There are plenty of workloads which are not bulk data and where multiple
>>> small buffers get sent at unknown intervals (which may be back to back).
>>> Such connections have to have Nagle disabled because the Nagle delays
>>> are 'horrid'.
>>> Clearly lost packets can cause delays, but they are rare on local
>>> networks.
>>
>>
>> Auto corking makes sure aggregation happens, even for when Nagle is in
>> the picture.
>
>
>>
>> netperf -- -m 256will still cook 64KB TSO packets
>
>
> This is what we would have liked to see, but auto corking isn't forcing 64KB
> TSO packets. Under certain conditions, specifically when TX queue is empty,
> it would send the SKB to transmit even if it isn't full:

Yes.

Auto corking does not predict the future, nor arm a high resolution
timer when application
does a send(small_size)

This packet is sent immediately, as instructed by application and TCP
normal behavior.

But second or third packet would detect the condition.

Unless a driver does skb_orphan() too early, breaking back pressure.

> static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
> int size_goal)
> {
> return skb->len < size_goal &&
>sock_net(sk)->ipv4.sysctl_tcp_autocorking &&
>skb != tcp_write_queue_head(sk) &&
>refcount_read(>sk_wmem_alloc) > skb->truesize;
> }
> When skb == tcp_write_queue_head(sk) corking is done. This is part of the
> optimization for mlx5 driver I've mentioned. If we can better utilize auto
> corking we shouldn't have an issue.

Or not issue expensive system calls for small payloads. stdio was
invented a while back :)

Meltdown/Spectre mitigation put high price to system calls nowadays.


Re: [bpf-next V2 PATCH 5/5] tools/libbpf: handle issues with bpf ELF objects containing .eh_frames

2018-02-06 Thread Jesper Dangaard Brouer

On Tue, 6 Feb 2018 08:00:59 -0800 Alexei Starovoitov 
 wrote:

> On Tue, Feb 06, 2018 at 03:54:28PM +0100, Jesper Dangaard Brouer wrote:
> > If clang >= 4.0.1 is missing the option '-target bpf', it will cause
> > llc/llvm to create two ELF sections for "Exception Frames", with
> > section names '.eh_frame' and '.rel.eh_frame'.
> > 
> > The BPF ELF loader library libbpf fails when loading files with these
> > sections.  The other in-kernel BPF ELF loader in samples/bpf/bpf_load.c,
> > handle this gracefully. And iproute2 loader also seems to work with these
> > "eh" sections.
> > 
> > The issue in libbpf is caused by bpf_object__elf_collect() skip the
> > '.eh_frame' and thus doesn't create an internal data structure
> > pointing to this ELF section index.  Later when the relocation section
> > '.rel.eh_frame' is processed, it tries to find the '.eh_frame' via the
> > ELF section idx, which is that fails (in bpf_object__collect_reloc).
> > 
> > I couldn't find a way to see that the '.rel.eh_frame' was irrelevant
> > (that is only determined by looking at the section it reference, which
> > we no longer have info available on).  
> 
> but does this approach work for all extra sections and relocations emitted
> when source is compiled with -g ?

No, but I plan to follow up and do a more complete solution later. This
is a workaround to get the Suricata use-case working and also that
samples/bpf/ can be loaded.

> To address this case bpf_load.c does:
>   if (shdr.sh_type == SHT_REL) {
>   struct bpf_insn *insns;
> 
>   /* locate prog sec that need map fixup (relocations) */
>   if (get_sec(elf, shdr.sh_info, , _prog,
>   _prog, _prog))
>   continue;
> 
>   if (shdr_prog.sh_type != SHT_PROGBITS ||
>   !(shdr_prog.sh_flags & SHF_EXECINSTR))
>   continue;
> 
> why the same approach is not applicable here?

As described above bpf_object__elf_collect() skip the "real" section
that the relo-section want to lookup (based on the same kind of
check), but libbpf is now missing the section idx in its internal
structures... and thus the relo lookup of the idx fails. (bpf_load.c
does the lookup in the ELF obj directly, thus it does not have this
problem).


> I guess we can apply this workaround as-is but it looks incomplete.

Yes, it is a workaround to move forward... it requires a larger change
to libbpf, so it stores idx'es of skipped sections.

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer


Re: Regression in 39461af74125 "bitmap: replace bitmap_{from,to}_u32array"

2018-02-06 Thread Yury Norov
On Tue, Feb 06, 2018 at 11:42:18AM -0500, David Miller wrote:
> From: Yury Norov 
> Date: Tue, 6 Feb 2018 19:26:23 +0300
> 
> > On Tue, Feb 06, 2018 at 11:17:36AM -0500, David Miller wrote:
> >> From: Heiner Kallweit 
> >> Date: Mon, 5 Feb 2018 07:21:32 +0100
> >> 
> >> > Recently ethtool started to give incomplete values for supported and
> >> > advertised modes. There seems to be a regression in this commit:
> >> > The bit number parameter in the calls to bitmap_to_arr32() in
> >> > store_link_ksettings_for_user() should be __ETHTOOL_LINK_MODE_MASK_NBITS
> >> > instead of __ETHTOOL_LINK_MODE_MASK_NU32.
> >> > 
> >> > After having changed this ethtool behaves normally again.
> >> 
> >> I do not see the commit with the SHA1 ID from the Subject line in
> >> either Linus's nor my networking tree.
> >> 
> >> Where is that change?
> > 
> > It's in next-20180202.
> 
> Hmmm, then the fix should probably go into whatever tree that commit
> ultimately came from.

It's already in Andrew's tree. The original patch is not about networking,
and Andrew's tree is probably better place for fix. My apologies for noise
in your tree.

http://ozlabs.org/~akpm/mmots/broken-out/bitmap-replace-bitmap_fromto_u32array-fix-2.patch


Re: [PATCH v1 0/1] spi_ks8995: use regmap to access chip registers.

2018-02-06 Thread Sven Van Asbroeck
On Tue, Feb 6, 2018 at 11:50 AM, Andrew Lunn  wrote:
> And a DSA driver does not need to be complex. You can start simple,
> and add more features later.

I see. Would it be possible/practical to start with just phy_read/write,
port_enable/disable in dsa_switch_ops ? And just add a sysfs bin file
for userspace register access ?


Re: [PATCH net 0/2] be2net: patch-set

2018-02-06 Thread David Miller
From: Suresh Reddy 
Date: Tue,  6 Feb 2018 08:52:40 -0500

> Hi Dave, Please consider applying these two patches to net

Series applied, thank you.


Re: [PATCH v3] RDS: IB: Fix null pointer issue

2018-02-06 Thread David Miller
From: Guanglei Li 
Date: Tue,  6 Feb 2018 10:43:21 +0800

> Scenario:
> 1. Port down and do fail over
> 2. Ap do rds_bind syscall
 ...
> PID: 45659  PID: 47039
> rds_ib_laddr_check
>   /* create id_priv with a null event_handler */
>   rdma_create_id
>   rdma_bind_addr
> cma_acquire_dev
>   /* add id_priv to cma_dev->id_list */
>   cma_attach_to_dev
> cma_ndev_work_handler
>   /* event_hanlder is null */
>   id_priv->id.event_handler
> 
> Signed-off-by: Guanglei Li 
> Signed-off-by: Honglei Wang 
> Reviewed-by: Junxiao Bi 
> Reviewed-by: Yanjun Zhu 
> Reviewed-by: Leon Romanovsky 
> Acked-by: Santosh Shilimkar 
> Acked-by: Doug Ledford 

Applied, thanks.


[PATCH net] rxrpc: Fix received abort handling

2018-02-06 Thread David Howells
AF_RXRPC is incorrectly sending back to the server any abort it receives
for a client connection.  This is due to the final-ACK offload to the
connection event processor patch.  The abort code is copied into the
last-call information on the connection channel and then the event
processor is set.

Instead, the following should be done:

 (1) In the case of a final-ACK for a successful call, the ACK should be
 scheduled as before.

 (2) In the case of a locally generated ABORT, the ABORT details should be
 cached for sending in response to further packets related to that
 call and no further action scheduled at call disconnect time.

 (3) In the case of an ACK received from the peer, the call should be
 considered dead, no ABORT should be transmitted at this time.  In
 response to further non-ABORT packets from the peer relating to this
 call, an RX_USER_ABORT ABORT should be transmitted.

 (4) In the case of a call killed due to network error, an RX_USER_ABORT
 ABORT should be cached for transmission in response to further
 packets, but no ABORT should be sent at this time.

Fixes: 3136ef49a14c ("rxrpc: Delay terminal ACK transmission on a client call")
Signed-off-by: David Howells 
---

 net/rxrpc/conn_client.c |3 ++-
 net/rxrpc/conn_object.c |   16 
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c
index 7f74ca3059f8..064175068059 100644
--- a/net/rxrpc/conn_client.c
+++ b/net/rxrpc/conn_client.c
@@ -834,7 +834,8 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call)
 * can be skipped if we find a follow-on call.  The first DATA packet
 * of the follow on call will implicitly ACK this call.
 */
-   if (test_bit(RXRPC_CALL_EXPOSED, &call->flags)) {
+   if (call->completion == RXRPC_CALL_SUCCEEDED &&
+   test_bit(RXRPC_CALL_EXPOSED, &call->flags)) {
unsigned long final_ack_at = jiffies + 2;
 
WRITE_ONCE(chan->final_ack_at, final_ack_at);
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index c628351eb900..ccbac190add1 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -177,13 +177,21 @@ void __rxrpc_disconnect_call(struct rxrpc_connection 
*conn,
 * through the channel, whilst disposing of the actual call 
record.
 */
trace_rxrpc_disconnect_call(call);
-   if (call->abort_code) {
-   chan->last_abort = call->abort_code;
-   chan->last_type = RXRPC_PACKET_TYPE_ABORT;
-   } else {
+   switch (call->completion) {
+   case RXRPC_CALL_SUCCEEDED:
chan->last_seq = call->rx_hard_ack;
chan->last_type = RXRPC_PACKET_TYPE_ACK;
+   break;
+   case RXRPC_CALL_LOCALLY_ABORTED:
+   chan->last_abort = call->abort_code;
+   chan->last_type = RXRPC_PACKET_TYPE_ABORT;
+   break;
+   default:
+   chan->last_abort = RX_USER_ABORT;
+   chan->last_type = RXRPC_PACKET_TYPE_ABORT;
+   break;
}
+
/* Sync with rxrpc_conn_retransmit(). */
smp_wmb();
chan->last_call = chan->call_id;



Re: [PATCH net] nfp: fix kdoc warnings on nested structures

2018-02-06 Thread David Miller
From: Jakub Kicinski 
Date: Mon,  5 Feb 2018 15:29:27 -0800

> Commit 84ce5b987783 ("scripts: kernel-doc: improve nested logic to
> handle multiple identifiers") improved the handling of nested structure
> definitions in scripts/kernel-doc, and changed the expected format of
> documentation.  This causes new warnings to appear on W=1 builds.
> 
> Only comment changes.
> 
> Signed-off-by: Jakub Kicinski 
> Reviewed-by: Simon Horman 

Applied, thanks Jakub.


Re: Regression in 39461af74125 "bitmap: replace bitmap_{from,to}_u32array"

2018-02-06 Thread David Miller
From: Yury Norov 
Date: Tue, 6 Feb 2018 19:26:23 +0300

> On Tue, Feb 06, 2018 at 11:17:36AM -0500, David Miller wrote:
>> From: Heiner Kallweit 
>> Date: Mon, 5 Feb 2018 07:21:32 +0100
>> 
>> > Recently ethtool started to give incomplete values for supported and
>> > advertised modes. There seems to be a regression in this commit:
>> > The bit number parameter in the calls to bitmap_to_arr32() in
>> > store_link_ksettings_for_user() should be __ETHTOOL_LINK_MODE_MASK_NBITS
>> > instead of __ETHTOOL_LINK_MODE_MASK_NU32.
>> > 
>> > After having changed this ethtool behaves normally again.
>> 
>> I do not see the commit with the SHA1 ID from the Subject line in
>> either Linus's nor my networking tree.
>> 
>> Where is that change?
> 
> It's in next-20180202.

Hmmm, then the fix should probably go into whatever tree that commit
ultimately came from.


Re: [PATCH v1 0/1] spi_ks8995: use regmap to access chip registers.

2018-02-06 Thread Sven Van Asbroeck
On Tue, Feb 6, 2018 at 11:14 AM, Andrew Lunn  wrote:
> Rather than invest time in this driver, it would be better to look
> into writing a DSA driver.

Thank you Andrew. I know little of DSA, but at first sight it appears to
be a _very_ complicated beast for a switch PHY which only
needs a few static register settings applied to it on startup, and has
no further Linux interaction? That's how we use the part, anyway.
Not sure what its typical use case would be.

Woojung and Tristam, what do you think ?
ksz8795 datasheet:
http://ww1.microchip.com/downloads/en/DeviceDoc/2112B.pdf


Re: [NetDev-info] Distributed Switch Architecture for 88E6390

2018-02-06 Thread Andrew Lunn
> Let me know if you're interested in any testing.  Unfortunately I'm going
> to be limited to kernel 4.4.38 though as the tegra BSP would need to be
> ported in its entirety (a code-base that I don't control).

Hi Dave

You might be interested in:

https://lkml.org/lkml/2018/2/6/24

Andrew


Re: [PATCH net 0/3] net: erspan fixes

2018-02-06 Thread David Miller
From: William Tu 
Date: Mon,  5 Feb 2018 13:35:33 -0800

> The first patch fixes erspan metadata extraction issue from packet
> header due to commit d350a823020e ("net: erspan: create erspan metadata
> uapi header").  The commit moves the erspan 'version' in
> 'struct erspan_metadata' in front of 'struct erspan_md2' for later
> extensibility, but breaks the existing metadata extraction code due
> to extra 4-byte size 'version'.  The second patch fixes the case where
> tunnel device receives an erspan packet with different tunnel metadata
> (ex: version, index, hwid, direction), existing code overwrites the
> tunnel device's erspan configuration.  The third patch fixes the bpf
> tests due to the above patches.

Applied, thanks William.


Re: [PATCH net v4] cls_u32: fix use after free in u32_destroy_key()

2018-02-06 Thread David Miller
From: Paolo Abeni 
Date: Mon,  5 Feb 2018 22:23:01 +0100

> Li Shuang reported an Oops with cls_u32 due to an use-after-free
> in u32_destroy_key(). The use-after-free can be triggered with:
> 
> dev=lo
> tc qdisc add dev $dev root handle 1: htb default 10
> tc filter add dev $dev parent 1: prio 5 handle 1: protocol ip u32 divisor 256
> tc filter add dev $dev protocol ip parent 1: prio 5 u32 ht 800:: match ip dst\
>  10.0.0.0/8 hashkey mask 0xff00 at 16 link 1:
> tc qdisc del dev $dev root
> 
> Which causes the following kasan splat:
 ...
> The problem is that the htnode is freed before the linked knodes and the
> latter will try to access the first at u32_destroy_key() time.
> This change addresses the issue using the htnode refcnt to guarantee
> the correct free order. While at it also add a RCU annotation,
> to keep sparse happy.
> 
> v1 -> v2: use rtnl_dereference() instead of RCU read locks
> v2 -> v3:
>   - don't check refcnt in u32_destroy_hnode()
>   - cleaned-up u32_destroy() implementation
>   - cleaned-up code comment
> v3 -> v4:
>   - dropped unneeded comment
> 
> Reported-by: Li Shuang 
> Fixes: c0d378ef1266 ("net_sched: use tcf_queue_work() in u32 filter")
> Signed-off-by: Paolo Abeni 

Applied and queued up for -stable, thanks!


Re: [PATCH net-next 0/7] tcp: implement rb-tree based retransmit queue

2018-02-06 Thread Tal Gilboa

On 2/6/2018 5:52 PM, Eric Dumazet wrote:

On Tue, 2018-02-06 at 15:22 +0000, David Laight wrote:

From: Eric Dumazet

Sent: 06 February 2018 14:20


...

Please give exact details.
Sending 64, 128, 256 or 512 bytes at a time on TCP_STREAM makes little sense.
We are not optimizing stack for pathological cases, sorry.


There are plenty of workloads which are not bulk data and where multiple
small buffers get sent at unknown intervals (which may be back to back).
Such connections have to have Nagle disabled because the Nagle delays
are 'horrid'.
Clearly lost packets can cause delays, but they are rare on local networks.


Auto corking makes sure aggregation happens, even for when Nagle is in
the picture.




netperf -- -m 256 will still cook 64KB TSO packets


This is what we would have liked to see, but auto corking isn't forcing 
64KB TSO packets. Under certain conditions, specifically when TX queue 
is empty, it would send the SKB to transmit even if it isn't full:

static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
int size_goal)
{
return skb->len < size_goal &&
   sock_net(sk)->ipv4.sysctl_tcp_autocorking &&
   skb != tcp_write_queue_head(sk) &&
   refcount_read(&sk->sk_wmem_alloc) > skb->truesize;
}
When skb == tcp_write_queue_head(sk) corking is done. This is part of 
the optimization for mlx5 driver I've mentioned. If we can better 
utilize auto corking we shouldn't have an issue.




netperf is not adding delays between each send(), unless it has been
modified.




I ran this command:
./super_netperf 2000 -H  -l 30 -f g -- -m $size
didn't change netperf in any way.


Re: Regression in 39461af74125 "bitmap: replace bitmap_{from,to}_u32array"

2018-02-06 Thread Yury Norov
On Tue, Feb 06, 2018 at 11:17:36AM -0500, David Miller wrote:
> From: Heiner Kallweit 
> Date: Mon, 5 Feb 2018 07:21:32 +0100
> 
> > Recently ethtool started to give incomplete values for supported and
> > advertised modes. There seems to be a regression in this commit:
> > The bit number parameter in the calls to bitmap_to_arr32() in
> > store_link_ksettings_for_user() should be __ETHTOOL_LINK_MODE_MASK_NBITS
> > instead of __ETHTOOL_LINK_MODE_MASK_NU32.
> > 
> > After having changed this ethtool behaves normally again.
> 
> I do not see the commit with the SHA1 ID from the Subject line in
> either Linus's nor my networking tree.
> 
> Where is that change?

It's in next-20180202.

Yury


Re: [PATCH 4/4] net: amd-xgbe: fix comparison to bitshift when dealing with a mask

2018-02-06 Thread David Miller
From: Wolfram Sang 
Date: Mon,  5 Feb 2018 21:10:01 +0100

> Due to a typo, the mask was destroyed by a comparison instead of a bit
> shift.
> 
> Signed-off-by: Wolfram Sang 

Applied and queued up for -stable, thanks.


Re: [PATCH v2 1/1] tcp: Honor the eor bit in tcp_mtu_probe

2018-02-06 Thread David Miller
From: Ilya Lesokhin 
Date: Mon,  5 Feb 2018 20:18:32 +0200

> Avoid SKB coalescing if eor bit is set in one of the relevant
> SKBs.
> 
> Fixes: c134ecb87817 ("tcp: Make use of MSG_EOR in tcp_sendmsg")
> Signed-off-by: Ilya Lesokhin 

This adds a build warning, please fix:

  CC  net/ipv4/tcp_output.o
In file included from ./include/linux/tcp.h:21:0,
 from ./include/net/tcp.h:24,
 from net/ipv4/tcp_output.c:39:
net/ipv4/tcp_output.c: In function ‘tcp_write_xmit’:
./include/linux/skbuff.h:3196:12: warning: ‘skb’ may be used uninitialized in 
this function [-Wmaybe-uninitialized]
   for (tmp = skb->next;  \
^
net/ipv4/tcp_output.c:2043:18: note: ‘skb’ was declared here
  struct sk_buff *skb, *nskb, *next;
  ^~~

This is with:

[davem@dhcp-10-15-49-227 net]$ gcc --version
gcc (GCC) 7.3.1 20180130 (Red Hat 7.3.1-2)
Copyright (C) 2017 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

[davem@dhcp-10-15-49-227 net]$


Re: [PATCH net] net: phy: Handle not having GPIO enabled in the kernel

2018-02-06 Thread David Miller
From: Andrew Lunn 
Date: Mon,  5 Feb 2018 19:17:23 +0100

> If CONFIG_GPIOLIB is disabled, fwnode_get_named_gpiod() becomes a stub
> function, which return -ENOSYS. Handle this in the same way as
> -ENOENT, i.e. assume there is no GPIO used to reset the PHYs.
> 
> Reported-by: Christian Zigotzky 
> Tested-by: Christian Zigotzky 
> Signed-off-by: Andrew Lunn 
> Reviewed-by: Florian Fainelli 
> Fixes: bafbdd527d56 ("phylib: Add device reset GPIO support")

Applied, thanks Andrew.


Re: [PATCH v2] socket: Provide put_cmsg_whitelist() for constant size copies

2018-02-06 Thread David Miller
From: Kees Cook 
Date: Tue, 6 Feb 2018 04:31:50 +1100

> On Tue, Feb 6, 2018 at 2:03 AM, David Miller  wrote:
>> From: Kees Cook 
>> Date: Fri, 2 Feb 2018 02:27:49 -0800
>>
>>> @@ -343,6 +343,14 @@ struct ucred {
>>>
>>>  extern int move_addr_to_kernel(void __user *uaddr, int ulen, struct 
>>> sockaddr_storage *kaddr);
>>>  extern int put_cmsg(struct msghdr*, int level, int type, int len, void 
>>> *data);
>>> +/*
>>> + * Provide a bounce buffer for copying cmsg data to userspace when the
>>> + * target memory isn't already whitelisted for hardened usercopy.
>>> + */
>>> +#define put_cmsg_whitelist(_msg, _level, _type, _ptr) ({ \
>>> + typeof(*(_ptr)) _val = *(_ptr); \
>>> + put_cmsg(_msg, _level, _type, sizeof(_val), &_val); \
>>> + })
>>
>> I understand what you are trying to achieve, but it's at a real cost
>> here.  Some of these objects are structures, for example the struct
>> sock_extended_err is 16 bytes.
> 
> It didn't look like put_cmsg() was on a fast path, so it seemed like a
> bounce buffer was the best solution here (and it's not without
> precedent).

For some things like timestamps it can be important.


Re: Regression in 39461af74125 "bitmap: replace bitmap_{from,to}_u32array"

2018-02-06 Thread David Miller
From: Heiner Kallweit 
Date: Mon, 5 Feb 2018 07:21:32 +0100

> Recently ethtool started to give incomplete values for supported and
> advertised modes. There seems to be a regression in this commit:
> The bit number parameter in the calls to bitmap_to_arr32() in
> store_link_ksettings_for_user() should be __ETHTOOL_LINK_MODE_MASK_NBITS
> instead of __ETHTOOL_LINK_MODE_MASK_NU32.
> 
> After having changed this ethtool behaves normally again.

I do not see the commit with the SHA1 ID from the Subject line in
either Linus's nor my networking tree.

Where is that change?

[davem@dhcp-10-15-49-227 linux]$ git show 39461af74125 >x.diff
fatal: ambiguous argument '39461af74125': unknown revision or path not in the 
working tree.
Use '--' to separate paths from revisions, like this:
'git <command> [<revision>...] -- [<file>...]'
[davem@dhcp-10-15-49-227 linux]$ cd ../net
[davem@dhcp-10-15-49-227 net]$ git show 39461af74125 >x.diff
fatal: ambiguous argument '39461af74125': unknown revision or path not in the 
working tree.
Use '--' to separate paths from revisions, like this:
'git <command> [<revision>...] -- [<file>...]'
[davem@dhcp-10-15-49-227 net]$


Re: [PATCH v1 0/1] spi_ks8995: use regmap to access chip registers.

2018-02-06 Thread Andrew Lunn
On Tue, Feb 06, 2018 at 10:13:55AM -0500, Sven Van Asbroeck wrote:
> v1:
>   starting point.
>   is there a way to test-run this on supported devices that I don't
>   have physical access to - (ks8995, ksz8864) ?
> 
> Sven Van Asbroeck (1):
>   spi_ks8995: use regmap to access chip registers.

Hi Sven

Rather than invest time in this driver, it would be better to look
into writing a DSA driver. There was some effort last year to make the
current microchip DSA driver more generic, but that has gone quiet
recently. Maybe you can talk to tristram...@microchip.com and
woojung@microchip.com

Andrew


Re: [PATCH v2] net: sched: tbf: handle GSO_BY_FRAGS case in enqueue

2018-02-06 Thread David Miller
From: Daniel Axtens 
Date: Mon,  5 Feb 2018 15:02:06 +1100

> tbf_enqueue() checks the size of a packet before enqueuing it.
> However, the GSO size check does not consider the GSO_BY_FRAGS
> case, and so will drop GSO SCTP packets, causing a massive drop
> in throughput.
> 
> Use skb_gso_validate_mac_len() instead, as it does consider that
> case.
> 
> Signed-off-by: Daniel Axtens 
> 
> ---
> 
> skb_gso_validate_mac_len() is an out-of-line call, but so is
> skb_gso_mac_seglen(), so this is slower but not much slower. I
> will send a patch to make the skb_gso_validate_* functions
> inline-able shortly.
> 
> Also, GSO_BY_FRAGS considered harmful - I'm pretty sure this is
> not the only place it causes issues.
> 
> v2: put S-o-b in the right spot, thanks Andrew Donnellan

It's not good that our GSO helpers are not universal, and do
not properly handle all kinds of GSO encodings the kernel can
produce.

Like you said this problem probably exists elsewhere.

Therefore, I would much rather you fix the helpers to handle
GSO_BY_FRAGS properly.


Re: [bpf-next V2 PATCH 5/5] tools/libbpf: handle issues with bpf ELF objects containing .eh_frames

2018-02-06 Thread Alexei Starovoitov
On Tue, Feb 06, 2018 at 03:54:28PM +0100, Jesper Dangaard Brouer wrote:
> If clang >= 4.0.1 is missing the option '-target bpf', it will cause
> llc/llvm to create two ELF sections for "Exception Frames", with
> section names '.eh_frame' and '.rel.eh_frame'.
> 
> The BPF ELF loader library libbpf fails when loading files with these
> sections.  The other in-kernel BPF ELF loader in samples/bpf/bpf_load.c,
> handle this gracefully. And iproute2 loader also seems to work with these
> "eh" sections.
> 
> The issue in libbpf is caused by bpf_object__elf_collect() skip the
> '.eh_frame' and thus doesn't create an internal data structure
> pointing to this ELF section index.  Later when the relocation section
> '.rel.eh_frame' is processed, it tries to find the '.eh_frame' via the
> ELF section idx, which then fails (in bpf_object__collect_reloc).
> 
> I couldn't find a way to see that the '.rel.eh_frame' was irrelevant
> (that is only determined by looking at the section it reference, which
> we no longer have info available on).

but does this approach work for all extra sections and relocations emitted
when source is compiled with -g ?
To address this case bpf_load.c does:
  if (shdr.sh_type == SHT_REL) {
  struct bpf_insn *insns;

  /* locate prog sec that need map fixup (relocations) */
  if (get_sec(elf, shdr.sh_info, &ehdr, &shname_prog,
  &shdr_prog, &data_prog))
  continue;

  if (shdr_prog.sh_type != SHT_PROGBITS ||
  !(shdr_prog.sh_flags & SHF_EXECINSTR))
  continue;

why the same approach is not applicable here?

I guess we can apply this workaround as-is but it looks incomplete.

> Thus, my solution is simply to match on the name of the relocation
> section, to skip that too.
> 
> Note, for samples/bpf/ the '-target bpf' parameter to clang cannot be used
> due to incompatibility with asm embedded headers, that some of the samples
> include. This is explained in more details by Yonghong Song in bpf_devel_QA.
> 
> Signed-off-by: Jesper Dangaard Brouer 
> ---
>  tools/lib/bpf/libbpf.c |7 +++
>  1 file changed, 7 insertions(+)
> 
> diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
> index b4eeaa3ebff5..84e8bbe07347 100644
> --- a/tools/lib/bpf/libbpf.c
> +++ b/tools/lib/bpf/libbpf.c
> @@ -822,6 +822,13 @@ static int bpf_object__elf_collect(struct bpf_object 
> *obj)
>   void *reloc = obj->efile.reloc;
>   int nr_reloc = obj->efile.nr_reloc + 1;
>  
> + /* Skip decoding of "eh" exception frames */
> + if (strcmp(name, ".rel.eh_frame") == 0) {
> + pr_debug("skip relo section %s(%d) for 
> section(%d)\n",
> +  name, idx, sh.sh_info);
> + continue;
> + }
> +
>   reloc = realloc(reloc,
>   sizeof(*obj->efile.reloc) * nr_reloc);
>   if (!reloc) {
> 


Re: [PATCH net-next 0/7] tcp: implement rb-tree based retransmit queue

2018-02-06 Thread Eric Dumazet
On Tue, 2018-02-06 at 15:22 +0000, David Laight wrote:
> From: Eric Dumazet
> > Sent: 06 February 2018 14:20
> 
> ...
> > Please give exact details.
> > Sending 64, 128, 256 or 512 bytes at a time on TCP_STREAM makes little 
> > sense.
> > We are not optimizing stack for pathological cases, sorry.
> 
> There are plenty of workloads which are not bulk data and where multiple
> small buffers get sent at unknown intervals (which may be back to back).
> Such connections have to have Nagle disabled because the Nagle delays
> are 'horrid'.
> Clearly lost packets can cause delays, but they are rare on local networks.

Auto corking makes sure aggregation happens, even for when Nagle is in
the picture.

netperf -- -m 256 will still cook 64KB TSO packets

netperf is not adding delays between each send(), unless it has been
modified.




Re: [PATCH] Carrier detect ok, don't turn off negotiation

2018-02-06 Thread David Miller
From: Denis Du 
Date: Tue, 6 Feb 2018 15:15:28 +0000 (UTC)

> How  do you think my patch?
> 
> As you see, Krzysztof  think my patch is ok to be accepted.
> But if you have a better idea to fix it,I am glad to see it. Anyway, this 
> issue have to be fixed.

Please resubmit it and I'll think about it again, thank you.


[PATCH v1 0/1] spi_ks8995: use regmap to access chip registers.

2018-02-06 Thread Sven Van Asbroeck
v1:
starting point.
is there a way to test-run this on supported devices that I don't
have physical access to - (ks8995, ksz8864) ?

Sven Van Asbroeck (1):
  spi_ks8995: use regmap to access chip registers.

 drivers/net/phy/spi_ks8995.c | 163 +--
 1 file changed, 50 insertions(+), 113 deletions(-)

-- 
1.9.1



[PATCH v1 1/1] spi_ks8995: use regmap to access chip registers.

2018-02-06 Thread Sven Van Asbroeck
The register map layouts used in this driver are well suited to
being accessed through a regmap. This makes the driver simpler
and shorter, by eliminating some spi boilerplate code.

Testing:
- tested on a ksz8785.
- not tested on the other supported chips (ks8995, ksz8864)
  because I don't have access to them.
  However, I instrumented the spi layer to verify that the
  correct spi transactions are generated to read the ID
  registers on those chips.

Signed-off-by: Sven Van Asbroeck 
---
 drivers/net/phy/spi_ks8995.c | 163 +--
 1 file changed, 50 insertions(+), 113 deletions(-)

diff --git a/drivers/net/phy/spi_ks8995.c b/drivers/net/phy/spi_ks8995.c
index 1e2d4f1..34223ff 100644
--- a/drivers/net/phy/spi_ks8995.c
+++ b/drivers/net/phy/spi_ks8995.c
@@ -21,6 +21,7 @@
 #include 
 #include 
 #include 
+#include <linux/regmap.h>
 
 #include 
 
@@ -106,11 +107,27 @@ enum ks8995_chip_variant {
 
 struct ks8995_chip_params {
char *name;
+   const struct regmap_config *regmap_cfg;
int family_id;
int chip_id;
int regs_size;
-   int addr_width;
-   int addr_shift;
+};
+
+static const struct regmap_config ksz8795_regmap_cfg = {
+   .reg_bits = 15,
+   .pad_bits = 1,
+   .val_bits = 8,
+   .write_flag_mask = KS8995_CMD_WRITE << 5,
+   .read_flag_mask = KS8995_CMD_READ << 5,
+   /* max_register filled in at runtime */
+};
+
+static const struct regmap_config ks8995_regmap_cfg = {
+   .reg_bits = 16,
+   .val_bits = 8,
+   .write_flag_mask = KS8995_CMD_WRITE,
+   .read_flag_mask = KS8995_CMD_READ,
+   /* max_register filled in at runtime */
 };
 
 static const struct ks8995_chip_params ks8995_chip[] = {
@@ -119,24 +136,21 @@ struct ks8995_chip_params {
.family_id = FAMILY_KS8995,
.chip_id = KS8995_CHIP_ID,
.regs_size = KS8995_REGS_SIZE,
-   .addr_width = 8,
-   .addr_shift = 0,
+   .regmap_cfg = &ks8995_regmap_cfg,
},
[ksz8864] = {
.name = "KSZ8864RMN",
.family_id = FAMILY_KS8995,
.chip_id = KSZ8864_CHIP_ID,
.regs_size = KSZ8864_REGS_SIZE,
-   .addr_width = 8,
-   .addr_shift = 0,
+   .regmap_cfg = &ks8995_regmap_cfg,
},
[ksz8795] = {
.name = "KSZ8795CLX",
.family_id = FAMILY_KSZ8795,
.chip_id = KSZ8795_CHIP_ID,
.regs_size = KSZ8795_REGS_SIZE,
-   .addr_width = 12,
-   .addr_shift = 1,
+   .regmap_cfg = &ksz8795_regmap_cfg,
},
 };
 
@@ -152,6 +166,7 @@ struct ks8995_switch {
struct bin_attributeregs_attr;
const struct ks8995_chip_params *chip;
int revision_id;
+   struct regmap *regmap;
 };
 
 static const struct spi_device_id ks8995_id[] = {
@@ -162,118 +177,24 @@ struct ks8995_switch {
 };
 MODULE_DEVICE_TABLE(spi, ks8995_id);
 
-static inline u8 get_chip_id(u8 val)
+static inline u8 get_chip_id(u32 val)
 {
return (val >> ID1_CHIPID_S) & ID1_CHIPID_M;
 }
 
-static inline u8 get_chip_rev(u8 val)
+static inline u8 get_chip_rev(u32 val)
 {
return (val >> ID1_REVISION_S) & ID1_REVISION_M;
 }
 
-/* create_spi_cmd - create a chip specific SPI command header
- * @ks: pointer to switch instance
- * @cmd: SPI command for switch
- * @address: register address for command
- *
- * Different chip families use different bit pattern to address the switches
- * registers:
- *
- * KS8995: 8bit command + 8bit address
- * KSZ8795: 3bit command + 12bit address + 1bit TR (?)
- */
-static inline __be16 create_spi_cmd(struct ks8995_switch *ks, int cmd,
-   unsigned address)
-{
-   u16 result = cmd;
-
-   /* make room for address (incl. address shift) */
-   result <<= ks->chip->addr_width + ks->chip->addr_shift;
-   /* add address */
-   result |= address << ks->chip->addr_shift;
-   /* SPI protocol needs big endian */
-   return cpu_to_be16(result);
-}
-/*  */
-static int ks8995_read(struct ks8995_switch *ks, char *buf,
-unsigned offset, size_t count)
-{
-   __be16 cmd;
-   struct spi_transfer t[2];
-   struct spi_message m;
-   int err;
-
-   cmd = create_spi_cmd(ks, KS8995_CMD_READ, offset);
-   spi_message_init(&m);
-
-   memset(&t, 0, sizeof(t));
-
-   t[0].tx_buf = &cmd;
-   t[0].len = sizeof(cmd);
-   spi_message_add_tail(&t[0], &m);
-
-   t[1].rx_buf = buf;
-   t[1].len = count;
-   spi_message_add_tail(&t[1], &m);
-
-   mutex_lock(&ks->lock);
-   err = spi_sync(ks->spi, &m);
-   mutex_unlock(&ks->lock);
-
-   return err ? err : count;
-}
-
-static int ks8995_write(struct ks8995_switch *ks, char *buf,
-unsigned offset, size_t count)
-{
- 

  1   2   >