[PATCH 5/5] esp6: fix memleak on error path in esp6_input
From: Zhen Lei This ought to be an omission in e6194923237 ("esp: Fix memleaks on error paths."). The memleak on error path in esp6_input is similar to esp_input of esp4. Fixes: e6194923237 ("esp: Fix memleaks on error paths.") Fixes: 3f29770723f ("ipsec: check return value of skb_to_sgvec always") Signed-off-by: Zhen Lei Signed-off-by: Steffen Klassert --- net/ipv6/esp6.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 97513f35bcc5..88a7579c23bd 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -669,8 +669,10 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb) sg_init_table(sg, nfrags); ret = skb_to_sgvec(skb, sg, 0, skb->len); - if (unlikely(ret < 0)) + if (unlikely(ret < 0)) { + kfree(tmp); goto out; + } skb->ip_summed = CHECKSUM_NONE; -- 2.14.1
[PATCH 2/5] xfrm_user: prevent leaking 2 bytes of kernel memory
From: Eric Dumazet struct xfrm_userpolicy_type has two holes, so we should not use C99 style initializer. KMSAN report: BUG: KMSAN: kernel-infoleak in copyout lib/iov_iter.c:140 [inline] BUG: KMSAN: kernel-infoleak in _copy_to_iter+0x1b14/0x2800 lib/iov_iter.c:571 CPU: 1 PID: 4520 Comm: syz-executor841 Not tainted 4.17.0+ #5 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x185/0x1d0 lib/dump_stack.c:113 kmsan_report+0x188/0x2a0 mm/kmsan/kmsan.c:1117 kmsan_internal_check_memory+0x138/0x1f0 mm/kmsan/kmsan.c:1211 kmsan_copy_to_user+0x7a/0x160 mm/kmsan/kmsan.c:1253 copyout lib/iov_iter.c:140 [inline] _copy_to_iter+0x1b14/0x2800 lib/iov_iter.c:571 copy_to_iter include/linux/uio.h:106 [inline] skb_copy_datagram_iter+0x422/0xfa0 net/core/datagram.c:431 skb_copy_datagram_msg include/linux/skbuff.h:3268 [inline] netlink_recvmsg+0x6f1/0x1900 net/netlink/af_netlink.c:1959 sock_recvmsg_nosec net/socket.c:802 [inline] sock_recvmsg+0x1d6/0x230 net/socket.c:809 ___sys_recvmsg+0x3fe/0x810 net/socket.c:2279 __sys_recvmmsg+0x58e/0xe30 net/socket.c:2391 do_sys_recvmmsg+0x2a6/0x3e0 net/socket.c:2472 __do_sys_recvmmsg net/socket.c:2485 [inline] __se_sys_recvmmsg net/socket.c:2481 [inline] __x64_sys_recvmmsg+0x15d/0x1c0 net/socket.c:2481 do_syscall_64+0x15b/0x230 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x446ce9 RSP: 002b:7fc307918db8 EFLAGS: 0293 ORIG_RAX: 012b RAX: ffda RBX: 006dbc24 RCX: 00446ce9 RDX: 000a RSI: 20005040 RDI: 0003 RBP: 006dbc20 R08: 20004e40 R09: R10: 4000 R11: 0293 R12: R13: 7ffc8d2df32f R14: 7fc3079199c0 R15: 0001 Uninit was stored to memory at: kmsan_save_stack_with_flags mm/kmsan/kmsan.c:279 [inline] kmsan_save_stack mm/kmsan/kmsan.c:294 [inline] kmsan_internal_chain_origin+0x12b/0x210 mm/kmsan/kmsan.c:685 kmsan_memcpy_origins+0x11d/0x170 mm/kmsan/kmsan.c:527 __msan_memcpy+0x109/0x160 mm/kmsan/kmsan_instr.c:413 __nla_put 
lib/nlattr.c:569 [inline] nla_put+0x276/0x340 lib/nlattr.c:627 copy_to_user_policy_type net/xfrm/xfrm_user.c:1678 [inline] dump_one_policy+0xbe1/0x1090 net/xfrm/xfrm_user.c:1708 xfrm_policy_walk+0x45a/0xd00 net/xfrm/xfrm_policy.c:1013 xfrm_dump_policy+0x1c0/0x2a0 net/xfrm/xfrm_user.c:1749 netlink_dump+0x9b5/0x1550 net/netlink/af_netlink.c:2226 __netlink_dump_start+0x1131/0x1270 net/netlink/af_netlink.c:2323 netlink_dump_start include/linux/netlink.h:214 [inline] xfrm_user_rcv_msg+0x8a3/0x9b0 net/xfrm/xfrm_user.c:2577 netlink_rcv_skb+0x37e/0x600 net/netlink/af_netlink.c:2448 xfrm_netlink_rcv+0xb2/0xf0 net/xfrm/xfrm_user.c:2598 netlink_unicast_kernel net/netlink/af_netlink.c:1310 [inline] netlink_unicast+0x1680/0x1750 net/netlink/af_netlink.c:1336 netlink_sendmsg+0x104f/0x1350 net/netlink/af_netlink.c:1901 sock_sendmsg_nosec net/socket.c:629 [inline] sock_sendmsg net/socket.c:639 [inline] ___sys_sendmsg+0xec8/0x1320 net/socket.c:2117 __sys_sendmsg net/socket.c:2155 [inline] __do_sys_sendmsg net/socket.c:2164 [inline] __se_sys_sendmsg net/socket.c:2162 [inline] __x64_sys_sendmsg+0x331/0x460 net/socket.c:2162 do_syscall_64+0x15b/0x230 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Local variable description: upt.i@dump_one_policy Variable was created at: dump_one_policy+0x78/0x1090 net/xfrm/xfrm_user.c:1689 xfrm_policy_walk+0x45a/0xd00 net/xfrm/xfrm_policy.c:1013 Byte 130 of 137 is uninitialized Memory access starts at 88019550407f Fixes: c0144beaeca42 ("[XFRM] netlink: Use nla_put()/NLA_PUT() variantes") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Steffen Klassert Cc: Herbert Xu Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 8 +--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 080035f056d9..1e50b70ad668 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1671,9 +1671,11 @@ static inline unsigned int userpolicy_type_attrsize(void) #ifdef 
CONFIG_XFRM_SUB_POLICY static int copy_to_user_policy_type(u8 type, struct sk_buff *skb) { - struct xfrm_userpolicy_type upt = { - .type = type, - }; + struct xfrm_userpolicy_type upt; + + /* Sadly there are two holes in struct xfrm_userpolicy_type */ + memset(&upt, 0, sizeof(upt)); + upt.type = type; return nla_put(skb, XFRMA_POLICY_TYPE, sizeof(upt), &upt); } -- 2.14.1
pull request (net): ipsec 2018-07-27
1) Fix PMTU handling of vti6. We update the PMTU on the xfrm dst_entry which is not cached anymore after the flowchache removal. So update the PMTU of the original dst_entry instead. From Eyal Birger. 2) Fix a leak of kernel memory to userspace. From Eric Dumazet. 3) Fix a possible dst_entry memleak in xfrm_lookup_route. From Tommi Rantala. 4) Fix a skb leak in case we can't call nlmsg_multicast from xfrm_nlmsg_multicast. From Florian Westphal. 5) Fix a leak of a temporary buffer in the error path of esp6_input. From Zhen Lei. Please pull or let me know if there are problems. Thanks! The following changes since commit 1c8c5a9d38f607c0b6fd12c91cbe1a4418762a21: Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next (2018-06-06 18:39:49 -0700) are available in the git repository at: git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec.git master for you to fetch changes up to 7284fdf39a912322ce97de2d30def3c6068a418c: esp6: fix memleak on error path in esp6_input (2018-06-27 17:32:11 +0200) Eric Dumazet (1): xfrm_user: prevent leaking 2 bytes of kernel memory Eyal Birger (1): vti6: fix PMTU caching and reporting on xmit Florian Westphal (1): xfrm: free skb if nlsk pointer is NULL Tommi Rantala (1): xfrm: fix missing dst_release() after policy blocking lbcast and multicast Zhen Lei (1): esp6: fix memleak on error path in esp6_input net/ipv6/esp6.c| 4 +++- net/ipv6/ip6_vti.c | 11 ++- net/xfrm/xfrm_policy.c | 3 +++ net/xfrm/xfrm_user.c | 18 +++--- 4 files changed, 23 insertions(+), 13 deletions(-)
[PATCH 4/5] xfrm: free skb if nlsk pointer is NULL
From: Florian Westphal nlmsg_multicast() always frees the skb, so in case we cannot call it we must do that ourselves. Fixes: 21ee543edc0dea ("xfrm: fix race between netns cleanup and state expire notification") Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 10 ++ 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 1e50b70ad668..33878e6e0d0a 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1025,10 +1025,12 @@ static inline int xfrm_nlmsg_multicast(struct net *net, struct sk_buff *skb, { struct sock *nlsk = rcu_dereference(net->xfrm.nlsk); - if (nlsk) - return nlmsg_multicast(nlsk, skb, pid, group, GFP_ATOMIC); - else - return -1; + if (!nlsk) { + kfree_skb(skb); + return -EPIPE; + } + + return nlmsg_multicast(nlsk, skb, pid, group, GFP_ATOMIC); } static inline unsigned int xfrm_spdinfo_msgsize(void) -- 2.14.1
[PATCH 3/5] xfrm: fix missing dst_release() after policy blocking lbcast and multicast
From: Tommi Rantala Fix missing dst_release() when local broadcast or multicast traffic is xfrm policy blocked. For IPv4 this results in a dst leak: ip_route_output_flow() allocates dst_entry via __ip_route_output_key() and passes it to xfrm_lookup_route(). xfrm_lookup returns ERR_PTR(-EPERM) that is propagated. The dst that was allocated is never released. IPv4 local broadcast testcase: ping -b 192.168.1.255 & sleep 1 ip xfrm policy add src 0.0.0.0/0 dst 192.168.1.255/32 dir out action block IPv4 multicast testcase: ping 224.0.0.1 & sleep 1 ip xfrm policy add src 0.0.0.0/0 dst 224.0.0.1/32 dir out action block For IPv6 the missing dst_release() causes trouble e.g. when used in netns: ip netns add TEST ip netns exec TEST ip link set lo up ip link add dummy0 type dummy ip link set dev dummy0 netns TEST ip netns exec TEST ip addr add fd00:: dev dummy0 ip netns exec TEST ip link set dummy0 up ip netns exec TEST ping -6 -c 5 ff02::1%dummy0 & sleep 1 ip netns exec TEST ip xfrm policy add src ::/0 dst ff02::1 dir out action block wait ip netns del TEST After netns deletion we see: [ 258.239097] unregister_netdevice: waiting for lo to become free. Usage count = 2 [ 268.279061] unregister_netdevice: waiting for lo to become free. Usage count = 2 [ 278.367018] unregister_netdevice: waiting for lo to become free. Usage count = 2 [ 288.375259] unregister_netdevice: waiting for lo to become free. 
Usage count = 2 Fixes: ac37e2515c1a ("xfrm: release dst_orig in case of error in xfrm_lookup()") Signed-off-by: Tommi Rantala Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 5f48251c1319..7c5e8978aeaa 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2286,6 +2286,9 @@ struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig, if (IS_ERR(dst) && PTR_ERR(dst) == -EREMOTE) return make_blackhole(net, dst_orig->ops->family, dst_orig); + if (IS_ERR(dst)) + dst_release(dst_orig); + return dst; } EXPORT_SYMBOL(xfrm_lookup_route); -- 2.14.1
[PATCH 1/5] vti6: fix PMTU caching and reporting on xmit
From: Eyal Birger When setting the skb->dst before doing the MTU check, the route PMTU caching and reporting is done on the new dst which is about to be released. Instead, PMTU handling should be done using the original dst. This is aligned with IPv4 VTI. Fixes: ccd740cbc6 ("vti6: Add pmtu handling to vti6_xmit.") Signed-off-by: Eyal Birger Signed-off-by: Steffen Klassert --- net/ipv6/ip6_vti.c | 11 ++- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index b7f28deddaea..c72ae3a4fe09 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -480,10 +480,6 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) goto tx_err_dst_release; } - skb_scrub_packet(skb, !net_eq(t->net, dev_net(dev))); - skb_dst_set(skb, dst); - skb->dev = skb_dst(skb)->dev; - mtu = dst_mtu(dst); if (!skb->ignore_df && skb->len > mtu) { skb_dst_update_pmtu(skb, mtu); @@ -498,9 +494,14 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) htonl(mtu)); } - return -EMSGSIZE; + err = -EMSGSIZE; + goto tx_err_dst_release; } + skb_scrub_packet(skb, !net_eq(t->net, dev_net(dev))); + skb_dst_set(skb, dst); + skb->dev = skb_dst(skb)->dev; + err = dst_output(t->net, skb->sk, skb); if (net_xmit_eval(err) == 0) { struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats); -- 2.14.1
Re: [patch net-next RFC] net: sched: don't dump chains only held by actions
Fri, Jul 27, 2018 at 06:18:14AM CEST, xiyou.wangc...@gmail.com wrote: >On Thu, Jul 26, 2018 at 9:33 AM Jiri Pirko wrote: >> >> From: Jiri Pirko >> >> In case a chain is empty and not explicitly created by a user, >> such chain should not exist. The only exception is if there is >> an action "goto chain" pointing to it. In that case, don't show the >> chain in the dump. Track the chain references held by actions and >> use them to find out if a chain should or should not be shown >> in chain dump. > >Hiding it makes sense. But you still need to make sure >user can't find it by ID, hiding it merely means user can't >see it. > >Also, I don't understand why you increase the refcnt >for a hiding chain either, like Jakub mentioned. > >If user can't see it, it must not be found by ID either. Ack. Will do that.
Re: [patch net-next RFC] net: sched: don't dump chains only held by actions
Thu, Jul 26, 2018 at 09:59:30PM CEST, jakub.kicin...@netronome.com wrote: >On Thu, 26 Jul 2018 18:31:01 +0200, Jiri Pirko wrote: >> From: Jiri Pirko >> >> In case a chain is empty and not explicitly created by a user, >> such chain should not exist. The only exception is if there is >> an action "goto chain" pointing to it. In that case, don't show the >> chain in the dump. Track the chain references held by actions and >> use them to find out if a chain should or should not be shown >> in chain dump. >> >> Signed-off-by: Jiri Pirko > >I don't have any better ideas :) > >One question below. > >> diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c >> index 75cce2819de9..76035cd6e3bf 100644 >> --- a/net/sched/cls_api.c >> +++ b/net/sched/cls_api.c >> @@ -262,6 +262,25 @@ static void tcf_chain_hold(struct tcf_chain *chain) >> ++chain->refcnt; >> } >> >> +static void tcf_chain_hold_by_act(struct tcf_chain *chain) >> +{ >> +++chain->action_refcnt; >> +} >> + >> +static void tcf_chain_release_by_act(struct tcf_chain *chain) >> +{ >> +--chain->action_refcnt; >> +} >> + >> +static bool tcf_chain_is_zombie(struct tcf_chain *chain) >> +{ >> +/* In case all the references are action references, this >> + * chain is a zombie and should not be listed in the chain >> + * dump list. 
>> + */ >> +return chain->refcnt == chain->action_refcnt; >> +} >> + >> static struct tcf_chain *tcf_chain_lookup(struct tcf_block *block, >>u32 chain_index) >> { >> @@ -298,6 +317,15 @@ struct tcf_chain *tcf_chain_get(struct tcf_block >> *block, u32 chain_index, >> } >> EXPORT_SYMBOL(tcf_chain_get); >> >> +struct tcf_chain *tcf_chain_get_by_act(struct tcf_block *block, u32 >> chain_index) >> +{ >> +struct tcf_chain *chain = tcf_chain_get(block, chain_index, true); >> + >> +tcf_chain_hold_by_act(chain); >> +return chain; >> +} >> +EXPORT_SYMBOL(tcf_chain_get_by_act); >> + >> static void tc_chain_tmplt_del(struct tcf_chain *chain); >> >> void tcf_chain_put(struct tcf_chain *chain) >> @@ -310,6 +338,13 @@ void tcf_chain_put(struct tcf_chain *chain) >> } >> EXPORT_SYMBOL(tcf_chain_put); >> >> +void tcf_chain_put_by_act(struct tcf_chain *chain) >> +{ >> +tcf_chain_release_by_act(chain); >> +tcf_chain_put(chain); >> +} >> +EXPORT_SYMBOL(tcf_chain_put_by_act); >> + >> static void tcf_chain_put_explicitly_created(struct tcf_chain *chain) >> { >> if (chain->explicitly_created) >> @@ -1803,17 +1838,26 @@ static int tc_ctl_chain(struct sk_buff *skb, struct >> nlmsghdr *n, >> chain = tcf_chain_lookup(block, chain_index); >> if (n->nlmsg_type == RTM_NEWCHAIN) { >> if (chain) { >> -NL_SET_ERR_MSG(extack, "Filter chain already exists"); >> -return -EEXIST; >> -} >> -if (!(n->nlmsg_flags & NLM_F_CREATE)) { >> -NL_SET_ERR_MSG(extack, "Need both RTM_NEWCHAIN and >> NLM_F_CREATE to create a new chain"); >> -return -ENOENT; >> -} >> -chain = tcf_chain_create(block, chain_index); >> -if (!chain) { >> -NL_SET_ERR_MSG(extack, "Failed to create filter chain"); >> -return -ENOMEM; >> +if (tcf_chain_is_zombie(chain)) { >> +/* The chain exists only because there is >> + * some action referencing it, meaning it >> + * is a zombie. >> + */ >> +tcf_chain_hold(chain); > >I'm not 100% sure why this is needed? 
In my tree below I see: > > switch (n->nlmsg_type) { > case RTM_NEWCHAIN: > err = tc_chain_tmplt_add(chain, net, tca, extack); > if (err) > goto errout; > /* In case the chain was successfully added, take a reference >* to the chain. This ensures that an empty chain >* does not disappear at the end of this function. >*/ > tcf_chain_hold(chain); > chain->explicitly_created = true; > >so one reference will be taken.. do we need two? There is a put at the end of this function. > >> +} else { >> +NL_SET_ERR_MSG(extack, "Filter chain already >> exists"); >> +return -EEXIST; >> +} >> +} else { >> +if (!(n->nlmsg_flags & NLM_F_CREATE)) { >> +NL_SET_ERR_MSG(extack, "Need both RTM_NEWCHAIN >> and NLM_F_CREATE to create a new chain"); >> +return -ENOENT; >> +} >> +chain = tcf_chain_create(block, chain_index); >> +if (!chain) { >> +NL_SET_ERR_MSG(extack, "Failed to create fi
RE: [PATCH net-next] net/tls: Corrected enabling of zero-copy mode
> -Original Message- > From: netdev-ow...@vger.kernel.org [mailto:netdev- > ow...@vger.kernel.org] On Behalf Of David Miller > Sent: Thursday, July 26, 2018 1:59 AM > To: Vakul Garg > Cc: netdev@vger.kernel.org; bor...@mellanox.com; > avia...@mellanox.com; davejwat...@fb.com > Subject: Re: [PATCH net-next] net/tls: Corrected enabling of zero-copy mode > > From: Vakul Garg > Date: Mon, 23 Jul 2018 21:00:06 +0530 > > > @@ -787,7 +787,7 @@ int tls_sw_recvmsg(struct sock *sk, > > target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); > > timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); > > do { > > - bool zc = false; > > + bool zc; > > int chunk = 0; > > > > skb = tls_wait_data(sk, flags, timeo, &err); > ... > > @@ -836,6 +835,7 @@ int tls_sw_recvmsg(struct sock *sk, > > if (err < 0) > > goto fallback_to_reg_recv; > > > > + zc = true; > > err = decrypt_skb_update(sk, skb, sgin, &zc); > > for (; pages > 0; pages--) > > put_page(sg_page(&sgin[pages])); > @@ -845,6 +845,7 @@ int > > tls_sw_recvmsg(struct sock *sk, > > } > > } else { > > fallback_to_reg_recv: > > + zc = false; > > err = decrypt_skb_update(sk, skb, NULL, > &zc); > > if (err < 0) { > > tls_err_abort(sk, EBADMSG); > > -- > > 2.13.6 > > > > This will leave a code path where 'zc' is evaluated but not initialized to > any value. > > And that's the path taken when ctx->decrypted is true. The code after > your changes looks like: > > bool zc; > ... > if (!ctx->decrypted) { > > ... assignments to 'zc' happen in this code block > > ctx->decrypted = true; > } > > if (!zc) { > > So when ctx->decrypted it true, the if(!zc) condition runs on an > uninitialized value. > > I have to say that your TLS changes are becomming quite a time sink > for two reasons. > > First, you are making a lot of changes that seem not so needed, and > whose value is purely determined by taste. I'd put the > msg_data_left() multiple evaluation patch into this category. 
> > The rest require deep review and understanding of the complicated > details of the TLS code, and many of them turn out to be incorrect. My apologies for sending incorrect patches. I would be more careful next time. > > As I find more errors in your submissions, I begin to scrutinize your > patches even more. Thus, review of your changes takes even more time. > > And it isn't helping that there are not a lot of other developers > helping actively to review your changes. > > I would like to just make a small request to you, that you concentrate > on fixing clear bugs and clear issues that need to be resolved. > > Thank you.
Re: [PATCH] xfrm: fix ptr_ret.cocci warnings
On Thu, Jul 26, 2018 at 03:09:52PM +0800, kbuild test robot wrote: > From: kbuild test robot > > net/xfrm/xfrm_interface.c:692:1-3: WARNING: PTR_ERR_OR_ZERO can be used > > > Use PTR_ERR_OR_ZERO rather than if(IS_ERR(...)) + PTR_ERR > > Generated by: scripts/coccinelle/api/ptr_ret.cocci > > Fixes: 44e2b838c24d ("xfrm: Return detailed errors from xfrmi_newlink") > CC: Benedict Wong > Signed-off-by: kbuild test robot Applied, thanks!
Re: [PATCH ipsec-next] xfrm: Return detailed errors from xfrmi_newlink
On Wed, Jul 25, 2018 at 01:45:29PM -0700, Benedict Wong wrote: > Currently all failure modes of xfrm interface creation return EEXIST. > This change improves the granularity of errnos provided by also > returning ENODEV or EINVAL if failures happen in looking up the > underlying interface, or a required parameter is not provided. > > This change has been tested against the Android Kernel Networking Tests, > with additional xfrmi_newlink tests here: > > https://android-review.googlesource.com/c/kernel/tests/+/715755 > > Signed-off-by: Benedict Wong Applied to ipsec-next, thanks Benedict!
Re: [PATCH bpf-next 0/4] samples: bpf: convert two more samples to libbpf
On 07/26/2018 11:32 PM, Jakub Kicinski wrote: > Hi! > > This set converts xdpsock_user.c and xdp_fwd_user.c to use libbpf instead > of bpf_load.o. First two patches are minor improvements to libbpf to make > the conversion (and use of libbpf in general) nicer. > > Jakub Kicinski (4): > tools: libbpf: handle NULL program gracefully in bpf_program__nth_fd() > tools: libbpf: add bpf_object__find_program_by_title() > samples: bpf: convert xdp_fwd_user.c to libbpf > samples: bpf: convert xdpsock_user.c to libbpf > > samples/bpf/Makefile | 4 ++-- > samples/bpf/xdp_fwd_user.c | 34 +++--- > samples/bpf/xdpsock_user.c | 38 +- > tools/lib/bpf/libbpf.c | 15 +++ > tools/lib/bpf/libbpf.h | 3 +++ > 5 files changed, 72 insertions(+), 22 deletions(-) > Applied to bpf-next, thanks Jakub!
Re: [PATCH bpf-next 0/6] nfp: bpf: improve efficiency of offloaded perf events
On 07/26/2018 04:53 AM, Jakub Kicinski wrote: > Hi! > > This set is focused on improving the performance of perf events > reported from BPF offload. Perf events can now be received on > packet data queues, which significantly improves the performance > (from total of 0.5 Msps to 5Msps per core). To get to this > performance we need a fast path for control messages which will > operate on raw buffers and recycle them immediately. > > Patch 5 replaces the map pointers for perf maps with map IDs. > We look the pointers up in a hashtable, anyway, to validate they > are correct, so there is no performance difference. Map IDs > have the advantage of being easier to understand for users in > case of errors (we no longer print raw pointers to the logs). > > Last patch improves info messages about map offload. > > Jakub Kicinski (6): > nfp: move repr handling on RX path > nfp: allow control message reception on data queues > nfp: bpf: pass raw data buffer to nfp_bpf_event_output() > nfp: bpf: allow receiving perf events on data queues > nfp: bpf: remember maps by ID > nfp: bpf: improve map offload info messages > > drivers/net/ethernet/netronome/nfp/bpf/cmsg.c | 25 +++- > drivers/net/ethernet/netronome/nfp/bpf/jit.c | 12 ++-- > drivers/net/ethernet/netronome/nfp/bpf/main.c | 5 +- > drivers/net/ethernet/netronome/nfp/bpf/main.h | 9 ++- > .../net/ethernet/netronome/nfp/bpf/offload.c | 63 +++ > drivers/net/ethernet/netronome/nfp/nfp_app.c | 2 + > drivers/net/ethernet/netronome/nfp/nfp_app.h | 17 + > .../ethernet/netronome/nfp/nfp_net_common.c | 40 +++- > .../net/ethernet/netronome/nfp/nfp_net_ctrl.h | 1 + > 9 files changed, 125 insertions(+), 49 deletions(-) > Applied to bpf-next, thanks Jakub!
Re: [PATCH v5 bpf-next 2/9] veth: Add driver XDP
Hi John, On 2018/07/27 12:02, John Fastabend wrote: > On 07/26/2018 07:40 AM, Toshiaki Makita wrote: >> From: Toshiaki Makita >> >> This is the basic implementation of veth driver XDP. >> >> Incoming packets are sent from the peer veth device in the form of skb, >> so this is generally doing the same thing as generic XDP. >> >> This itself is not so useful, but a starting point to implement other >> useful veth XDP features like TX and REDIRECT. >> >> This introduces NAPI when XDP is enabled, because XDP is now heavily >> relies on NAPI context. Use ptr_ring to emulate NIC ring. Tx function >> enqueues packets to the ring and peer NAPI handler drains the ring. >> >> Currently only one ring is allocated for each veth device, so it does >> not scale on multiqueue env. This can be resolved by allocating rings >> on the per-queue basis later. >> >> Note that NAPI is not used but netif_rx is used when XDP is not loaded, >> so this does not change the default behaviour. >> >> v3: >> - Fix race on closing the device. >> - Add extack messages in ndo_bpf. >> >> v2: >> - Squashed with the patch adding NAPI. >> - Implement adjust_tail. >> - Don't acquire consumer lock because it is guarded by NAPI. >> - Make poll_controller noop since it is unnecessary. >> - Register rxq_info on enabling XDP rather than on opening the device. >> >> Signed-off-by: Toshiaki Makita >> --- > > > [...] > > One nit and one question. 
> >> + >> +static struct sk_buff *veth_xdp_rcv_skb(struct veth_priv *priv, >> +struct sk_buff *skb) >> +{ >> +u32 pktlen, headroom, act, metalen; >> +void *orig_data, *orig_data_end; >> +int size, mac_len, delta, off; >> +struct bpf_prog *xdp_prog; >> +struct xdp_buff xdp; >> + >> +rcu_read_lock(); >> +xdp_prog = rcu_dereference(priv->xdp_prog); >> +if (unlikely(!xdp_prog)) { >> +rcu_read_unlock(); >> +goto out; >> +} >> + >> +mac_len = skb->data - skb_mac_header(skb); >> +pktlen = skb->len + mac_len; >> +size = SKB_DATA_ALIGN(VETH_XDP_HEADROOM + pktlen) + >> + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); >> +if (size > PAGE_SIZE) >> +goto drop; > > I'm not sure why it matters if size > PAGE_SIZE here. Why not > just consume it and use the correct page order in alloc_page if > its not linear. Indeed. We can allow such skbs here at least if we don't need reallocation (which is highly unlikely though). But I'm not sure we should allocate multiple pages in atomic context. It tends to cause random allocation failure which is IMO more frustrating. We are now prohibiting such a situation by max_mtu and dropping features, which looks more robust to me. >> + >> +headroom = skb_headroom(skb) - mac_len; >> +if (skb_shared(skb) || skb_head_is_locked(skb) || >> +skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) { >> +struct sk_buff *nskb; >> +void *head, *start; >> +struct page *page; >> +int head_off; >> + >> +page = alloc_page(GFP_ATOMIC); > > Should also have __NO_WARN here as well this can be triggered by > external events so we don't want DDOS here to flood system logs. Sure, thanks! -- Toshiaki Makita
Re: [pull request][net-next 00/13] Mellanox, mlx5e updates 2018-07-26 (XDP redirect)
From: Saeed Mahameed Date: Thu, 26 Jul 2018 15:56:34 -0700 > This series from Tariq adds the support for device-out XDP redirect. > > For more information please see tag log below. > > Please pull and let me know if there's any problem. Looks good, pulled, thanks a lot.
Re: [PATCH net] net: ena: Fix use of uninitialized DMA address bits field
From: Gal Pressman Date: Thu, 26 Jul 2018 23:40:33 +0300 > UBSAN triggers the following undefined behaviour warnings: > [...] > [ 13.236124] UBSAN: Undefined behaviour in > drivers/net/ethernet/amazon/ena/ena_eth_com.c:468:22 > [ 13.240043] shift exponent 64 is too large for 64-bit type 'long long > unsigned int' > [...] > [ 13.744769] UBSAN: Undefined behaviour in > drivers/net/ethernet/amazon/ena/ena_eth_com.c:373:4 > [ 13.748694] shift exponent 64 is too large for 64-bit type 'long long > unsigned int' > [...] > > When splitting the address to high and low, GENMASK_ULL is used to generate > a bitmask with dma_addr_bits field from io_sq (in ena_com_prepare_tx and > ena_com_add_single_rx_desc). > The problem is that dma_addr_bits is not initialized with a proper value > (besides being cleared in ena_com_create_io_queue). > Assign dma_addr_bits the correct value that is stored in ena_dev when > initializing the SQ. > > Fixes: 1738cd3ed342 ("net: ena: Add a driver for Amazon Elastic Network > Adapters (ENA)") > Signed-off-by: Gal Pressman Applied and queued up for -stable.
Re: [PATCH net-next] netdevsim: make debug dirs' dentries static
From: Jakub Kicinski Date: Thu, 26 Jul 2018 14:25:26 -0700 > The root directories of netdevsim should only be used by the core > to create per-device subdirectories, so limit their visibility to > the core file. > > Signed-off-by: Jakub Kicinski > Reviewed-by: Quentin Monnet Applied.
Re: [patch net-next RFC] net: sched: don't dump chains only held by actions
On Thu, Jul 26, 2018 at 9:33 AM Jiri Pirko wrote: > > From: Jiri Pirko > > In case a chain is empty and not explicitly created by a user, > such chain should not exist. The only exception is if there is > an action "goto chain" pointing to it. In that case, don't show the > chain in the dump. Track the chain references held by actions and > use them to find out if a chain should or should not be shown > in chain dump. Hiding it makes sense. But you still need to make sure user can't find it by ID, hiding it merely means user can't see it. Also, I don't understand why you increase the refcnt for a hiding chain either, like Jakub mentioned. If user can't see it, it must not be found by ID either.
Re: [PATCH v5 bpf-next 5/9] veth: Add ndo_xdp_xmit
On 07/26/2018 07:40 AM, Toshiaki Makita wrote: > From: Toshiaki Makita > > This allows NIC's XDP to redirect packets to veth. The destination veth > device enqueues redirected packets to the napi ring of its peer, then > they are processed by XDP on its peer veth device. > This can be thought of as calling another XDP program by XDP program using > REDIRECT, when the peer enables driver XDP. > > Note that when the peer veth device does not set driver xdp, redirected > packets will be dropped because the peer is not ready for NAPI. > > v4: > - Don't use xdp_ok_fwd_dev() because checking IFF_UP is not necessary. > Add comments about it and check only MTU. > > v2: > - Drop the part converting xdp_frame into skb when XDP is not enabled. > - Implement bulk interface of ndo_xdp_xmit. > - Implement XDP_XMIT_FLUSH bit and drop ndo_xdp_flush. > > Signed-off-by: Toshiaki Makita > --- > drivers/net/veth.c | 51 +++ > 1 file changed, 51 insertions(+) > Acked-by: John Fastabend
Re: [PATCH v5 bpf-next 4/9] veth: Handle xdp_frames in xdp napi ring
On 07/26/2018 07:40 AM, Toshiaki Makita wrote: > From: Toshiaki Makita > > This is preparation for XDP TX and ndo_xdp_xmit. > This allows napi handler to handle xdp_frames through xdp ring as well > as sk_buff. > > v3: > - Revert v2 change around rings and use a flag to differentiate skb and > xdp_frame, since bulk skb xmit makes little performance difference > for now. > > v2: > - Use another ring instead of using flag to differentiate skb and > xdp_frame. This approach makes bulk skb transmit possible in > veth_xmit later. > - Clear xdp_frame fields in skb->head. > - Implement adjust_tail. > > Signed-off-by: Toshiaki Makita > --- > drivers/net/veth.c | 87 > ++ > 1 file changed, 82 insertions(+), 5 deletions(-) > Acked-by: John Fastabend
Re: [PATCH v5 bpf-next 2/9] veth: Add driver XDP
On 07/26/2018 07:40 AM, Toshiaki Makita wrote: > From: Toshiaki Makita > > This is the basic implementation of veth driver XDP. > > Incoming packets are sent from the peer veth device in the form of skb, > so this is generally doing the same thing as generic XDP. > > This itself is not so useful, but a starting point to implement other > useful veth XDP features like TX and REDIRECT. > > This introduces NAPI when XDP is enabled, because XDP now heavily > relies on NAPI context. Use ptr_ring to emulate NIC ring. Tx function > enqueues packets to the ring and peer NAPI handler drains the ring. > > Currently only one ring is allocated for each veth device, so it does > not scale on multiqueue env. This can be resolved by allocating rings > on the per-queue basis later. > > Note that NAPI is not used but netif_rx is used when XDP is not loaded, > so this does not change the default behaviour. > > v3: > - Fix race on closing the device. > - Add extack messages in ndo_bpf. > > v2: > - Squashed with the patch adding NAPI. > - Implement adjust_tail. > - Don't acquire consumer lock because it is guarded by NAPI. > - Make poll_controller noop since it is unnecessary. > - Register rxq_info on enabling XDP rather than on opening the device. > > Signed-off-by: Toshiaki Makita > --- [...] One nit and one question. 
> + > +static struct sk_buff *veth_xdp_rcv_skb(struct veth_priv *priv, > + struct sk_buff *skb) > +{ > + u32 pktlen, headroom, act, metalen; > + void *orig_data, *orig_data_end; > + int size, mac_len, delta, off; > + struct bpf_prog *xdp_prog; > + struct xdp_buff xdp; > + > + rcu_read_lock(); > + xdp_prog = rcu_dereference(priv->xdp_prog); > + if (unlikely(!xdp_prog)) { > + rcu_read_unlock(); > + goto out; > + } > + > + mac_len = skb->data - skb_mac_header(skb); > + pktlen = skb->len + mac_len; > + size = SKB_DATA_ALIGN(VETH_XDP_HEADROOM + pktlen) + > +SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); > + if (size > PAGE_SIZE) > + goto drop; I'm not sure why it matters if size > PAGE_SIZE here. Why not just consume it and use the correct page order in alloc_page if its not linear. > + > + headroom = skb_headroom(skb) - mac_len; > + if (skb_shared(skb) || skb_head_is_locked(skb) || > + skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) { > + struct sk_buff *nskb; > + void *head, *start; > + struct page *page; > + int head_off; > + > + page = alloc_page(GFP_ATOMIC); Should also have __NO_WARN here as well this can be triggered by external events so we don't want DDOS here to flood system logs. 
> + if (!page) > + goto drop; > + > + head = page_address(page); > + start = head + VETH_XDP_HEADROOM; > + if (skb_copy_bits(skb, -mac_len, start, pktlen)) { > + page_frag_free(head); > + goto drop; > + } > + > + nskb = veth_build_skb(head, > + VETH_XDP_HEADROOM + mac_len, skb->len, > + PAGE_SIZE); > + if (!nskb) { > + page_frag_free(head); > + goto drop; > + } > + > + skb_copy_header(nskb, skb); > + head_off = skb_headroom(nskb) - skb_headroom(skb); > + skb_headers_offset_update(nskb, head_off); > + if (skb->sk) > + skb_set_owner_w(nskb, skb->sk); > + consume_skb(skb); > + skb = nskb; > + } > + > + xdp.data_hard_start = skb->head; > + xdp.data = skb_mac_header(skb); > + xdp.data_end = xdp.data + pktlen; > + xdp.data_meta = xdp.data; > + xdp.rxq = &priv->xdp_rxq; > + orig_data = xdp.data; > + orig_data_end = xdp.data_end; > + > + act = bpf_prog_run_xdp(xdp_prog, &xdp); > + > + switch (act) { > + case XDP_PASS: > + break; > + default: > + bpf_warn_invalid_xdp_action(act); > + case XDP_ABORTED: > + trace_xdp_exception(priv->dev, xdp_prog, act); > + case XDP_DROP: > + goto drop; > + } > + rcu_read_unlock(); > + > + delta = orig_data - xdp.data; > + off = mac_len + delta; > + if (off > 0) > + __skb_push(skb, off); > + else if (off < 0) > + __skb_pull(skb, -off); > + skb->mac_header -= delta; > + off = xdp.data_end - orig_data_end; > + if (off != 0) > + __skb_put(skb, off); > + skb->protocol = eth_type_trans(skb, priv->dev); > + > + metalen = xdp.data - xdp.data_meta; > + if (metalen) > + skb_metadata_set(skb, metalen); > +out: > + return skb; > +drop: > + rcu_read_unlock(); > + kfree_skb(skb); > + return NULL; > +} > + Thanks, John
Re: [PATCH net-next v3 1/5] tc/act: user space can't use TC_ACT_REDIRECT directly
On 07/26/2018 09:43 AM, Jiri Pirko wrote: > Wed, Jul 25, 2018 at 06:29:54PM CEST, dan...@iogearbox.net wrote: >> On 07/25/2018 05:48 PM, Paolo Abeni wrote: >>> On Wed, 2018-07-25 at 15:03 +0200, Jiri Pirko wrote: Wed, Jul 25, 2018 at 02:54:04PM CEST, pab...@redhat.com wrote: > On Wed, 2018-07-25 at 13:56 +0200, Jiri Pirko wrote: >> Tue, Jul 24, 2018 at 10:06:39PM CEST, pab...@redhat.com wrote: >>> Only cls_bpf and act_bpf can safely use such value. If a generic >>> action is configured by user space to return TC_ACT_REDIRECT, >>> the usually visible behavior is passing the skb up the stack - as >>> for unknown action, but, with complex configuration, more random >>> results can be obtained. >>> >>> This patch forcefully converts TC_ACT_REDIRECT to TC_ACT_UNSPEC >>> at action init time, making the kernel behavior more consistent. >>> >>> v1 -> v3: use TC_ACT_UNSPEC instead of a newly definied act value >>> >>> Signed-off-by: Paolo Abeni >>> --- >>> net/sched/act_api.c | 5 + >>> 1 file changed, 5 insertions(+) >>> >>> diff --git a/net/sched/act_api.c b/net/sched/act_api.c >>> index 148a89ab789b..24b5534967fe 100644 >>> --- a/net/sched/act_api.c >>> +++ b/net/sched/act_api.c >>> @@ -895,6 +895,11 @@ struct tc_action *tcf_action_init_1(struct net >>> *net, struct tcf_proto *tp, >>> } >>> } >>> >>> + if (a->tcfa_action == TC_ACT_REDIRECT) { >>> + net_warn_ratelimited("TC_ACT_REDIRECT can't be used >>> directly"); >> >> Can't you push this warning through extack? >> >> But, wouldn't it be more appropriate to fail here? User is passing >> invalid configuration > > Jiri, Jamal, thank you for the feedback. > > Please allow me to answer both of you here, since you raised similar > concers. > > I thought about rejecting the action, but that change of behavior could > break some users, as currently most kind of invalid tcfa_action values > are simply accepted. > > If there is consensus about it, I can simply fail. 
Well it was obviously wrong to expose TC_ACT_REDIRECT to uapi and it really has no meaning for anyone to use it throughout its whole history. >> >> That claim is completely wrong. > > Why? Does addition of TC_ACT_REDIRECT to uapi have any meaning? BPF programs return TC_ACT_* as a verdict which is then further processed, including TC_ACT_REDIRECT. Hence, it's a contract for them similarly as e.g. helper enums (BPF_FUNC_*) and other things they make use of. Cheers, Daniel
Re: [PATCH bpf] bpf: btf: Use exact btf value_size match in map_check_btf()
On 07/26/2018 06:57 PM, Martin KaFai Lau wrote: > The current map_check_btf() in BPF_MAP_TYPE_ARRAY rejects > '> map->value_size' to ensure map_seq_show_elem() will not > access things beyond an array element. > > Yonghong suggested that using '!=' is a more correct > check. The 8 bytes round_up on value_size is stored > in array->elem_size. Hence, using '!=' on map->value_size > is a proper check. > > This patch also adds new tests to check the btf array > key type and value type. Two of these new tests verify > the btf's value_size (the change in this patch). > > It also fixes two existing tests that wrongly encoded > a btf's type size (pprint_test) and the value_type_id (in one > of the raw_tests[]). However, that does not affect these two > BTF verification tests before or after this test changes. > These two tests mainly failed at array creation time after > this patch. > > Fixes: a26ca7c982cb ("bpf: btf: Add pretty print support to the basic > arraymap") > Suggested-by: Yonghong Song > Acked-by: Yonghong Song > Signed-off-by: Martin KaFai Lau Applied to bpf, thanks Martin!
Re: [PATCH V3 bpf] xdp: add NULL pointer check in __xdp_return()
On 07/26/2018 04:17 PM, Taehee Yoo wrote: > rhashtable_lookup() can return NULL, so a NULL pointer > check should be added. > > Fixes: 02b55e5657c3 ("xdp: add MEM_TYPE_ZERO_COPY") > Acked-by: Martin KaFai Lau > Signed-off-by: Taehee Yoo Applied to bpf, thanks Taehee!
Re: [PATCH v5 bpf-next 3/9] veth: Avoid drops by oversized packets when XDP is enabled
On Fri, 27 Jul 2018 10:06:41 +0900, Toshiaki Makita wrote: > On 2018/07/27 9:51, Jakub Kicinski wrote: > > On Thu, 26 Jul 2018 23:40:26 +0900, Toshiaki Makita wrote: > >> + max_mtu = PAGE_SIZE - VETH_XDP_HEADROOM - > >> +peer->hard_header_len - > >> +SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); > >> + if (peer->mtu > max_mtu) { > >> + NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to > >> set XDP"); > >> + err = -ERANGE; > >> + goto err; > >> + } > > > > You need to add .ndo_change_mtu and check this condition there too. > > I'm setting peer->max_mtu so no need to add .ndo_change_mtu. > Inappropriate MTU will be refused in dev_set_mtu(). missed that, sorry
Re: [PATCH v5 bpf-next 3/9] veth: Avoid drops by oversized packets when XDP is enabled
On 2018/07/27 9:51, Jakub Kicinski wrote: > On Thu, 26 Jul 2018 23:40:26 +0900, Toshiaki Makita wrote: >> +max_mtu = PAGE_SIZE - VETH_XDP_HEADROOM - >> + peer->hard_header_len - >> + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); >> +if (peer->mtu > max_mtu) { >> +NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to >> set XDP"); >> +err = -ERANGE; >> +goto err; >> +} > > You need to add .ndo_change_mtu and check this condition there too. I'm setting peer->max_mtu so no need to add .ndo_change_mtu. Inappropriate MTU will be refused in dev_set_mtu(). -- Toshiaki Makita
Re: [PATCH v5 bpf-next 3/9] veth: Avoid drops by oversized packets when XDP is enabled
On Thu, 26 Jul 2018 23:40:26 +0900, Toshiaki Makita wrote: > + max_mtu = PAGE_SIZE - VETH_XDP_HEADROOM - > + peer->hard_header_len - > + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); > + if (peer->mtu > max_mtu) { > + NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to > set XDP"); > + err = -ERANGE; > + goto err; > + } You need to add .ndo_change_mtu and check this condition there too.
Re: [PATCH net-next v4 1/4] net/sched: user-space can't set unknown tcfa_action values
Hi, On Thu, Jul 26, 2018 at 04:34:57PM +0200, Paolo Abeni wrote: ... > @@ -895,6 +904,14 @@ struct tc_action *tcf_action_init_1(struct net *net, > struct tcf_proto *tp, > } > } > > + if (!tcf_action_valid(a->tcfa_action)) { > + net_warn_ratelimited("invalid %d action value, using " > + "TC_ACT_UNSPEC instead", a->tcfa_action); Now that it is reporting the error via extack, do we really need this net_warn? extack will be shown as a warning by iproute2 even if the command succeeds. > + NL_SET_ERR_MSG(extack, "invalid action value, using " > +"TC_ACT_UNSPEC instead"); Quoted strings shouldn't be broken down into multiple lines. > + a->tcfa_action = TC_ACT_UNSPEC; > + } > + > return a; > > err_mod: > -- > 2.17.1 >
Re: [PATCH bpf-next 0/4] samples: bpf: convert two more samples to libbpf
On Thu, Jul 26, 2018 at 2:32 PM, Jakub Kicinski wrote: > Hi! > > This set converts xdpsock_user.c and xdp_fwd_user.c to use libbpf instead > of bpf_load.o. First two patches are minor improvements to libbpf to make > the conversion (and use of libbpf in general) nicer. > > Jakub Kicinski (4): > tools: libbpf: handle NULL program gracefully in bpf_program__nth_fd() > tools: libbpf: add bpf_object__find_program_by_title() > samples: bpf: convert xdp_fwd_user.c to libbpf > samples: bpf: convert xdpsock_user.c to libbpf LGTM. Ack for the whole series. Acked-by: Yonghong Song > > samples/bpf/Makefile | 4 ++-- > samples/bpf/xdp_fwd_user.c | 34 +++--- > samples/bpf/xdpsock_user.c | 38 +- > tools/lib/bpf/libbpf.c | 15 +++ > tools/lib/bpf/libbpf.h | 3 +++ > 5 files changed, 72 insertions(+), 22 deletions(-) > > -- > 2.17.1 >
Re: [PATCH net-next v4 3/4] net/tc: introduce TC_ACT_REINSERT.
On Thu, Jul 26, 2018 at 7:35 AM Paolo Abeni wrote: > > This is similar TC_ACT_REDIRECT, but with a slightly different > semantic: > - on ingress the mirred skbs are passed to the target device > network stack without any additional check not scrubbing. > - the rcu-protected stats provided via the tcf_result struct > are updated on error conditions. > > This new tcfa_action value is not exposed to the user-space > and can be used only internally by clsact. > > v1 -> v2: do not touch TC_ACT_REDIRECT code path, introduce > a new action type instead > > v2 -> v3: > - rename the new action value TC_ACT_REINJECT, update the >helper accordingly > - take care of uncloned reinjected packets in XDP generic >hook > > v3 -> v4: > - renamed again the new action value (JiriP) > > Signed-off-by: Paolo Abeni Acked-by: Cong Wang > --- > Note: this patch still touch only overlimits, even there is some > agreement to touch (also) drops on reinsert/mirred failure, but > such change is independent to this series Totally agree. Thanks!
Re: [PATCH net-next v4 4/4] act_mirred: use TC_ACT_REINSERT when possible
On Thu, Jul 26, 2018 at 7:35 AM Paolo Abeni wrote: > > When mirred is invoked from the ingress path, and it wants to redirect > the processed packet, it can now use the TC_ACT_REINSERT action, > filling the tcf_result accordingly, and avoiding a per packet > skb_clone(). > > Overall this gives a ~10% improvement in forwarding performance for the > TC S/W data path and TC S/W performances are now comparable to the > kernel openvswitch datapath. > > v1 -> v2: use ACT_MIRRED instead of ACT_REDIRECT > v2 -> v3: updated after action rename, fixed typo into the commit > message > v3 -> v4: updated again after action rename, added more comments to > the code (JiriP), skip the optimization if the control action > need to touch the tcf_result (Paolo) > > Signed-off-by: Paolo Abeni Overall it looks good to me now. Acked-by: Cong Wang Thanks!
Re: [PATCH net-next v3 4/5] net/tc: introduce TC_ACT_REINJECT.
On Thu, Jul 26, 2018 at 5:52 AM Jamal Hadi Salim wrote: > > On 25/07/18 01:09 PM, Marcelo Ricardo Leitner wrote: > > On Wed, Jul 25, 2018 at 09:48:16AM -0700, Cong Wang wrote: > >> On Wed, Jul 25, 2018 at 5:27 AM Jamal Hadi Salim wrote: > >>> > >>> Those changes were there from the beginning (above patch did > >>> not introduce them). > >>> IIRC, the reason was to distinguish between policy intended > >>> drops and drops because of errors. > >> > >> There must be a limit for "overlimit" to make sense. There is > >> no limit in mirred action's context, probably there is only > >> such a limit in act_police. So, all rest should not touch overlimit. > > > > +1 > > > > I agree we should at least record drop count(unrelated patch though). > we should keep overlimit (for no other reason other than this > has been around for at least 15 years). > > On why "overlimit"? It is just a name for a counter that is useless > for most actions (but was still being transfered to user space). > It is the closest counter to counting "this failed because of > runtime errors" as opposed to "user asked us to drop this". > > Probably a good alternative is to make a very small stats v3 structure > (we have migrated stats structures before) and extend for > each action/classifier/qdisc to add its extra counters using XSTATS. Agreed. > > Note: > If you are _mirroring_ packets - incrementing the drop counter is > misleading because the packet is not dropped by the system. > i.e the qdisc will not record it as dropped; it should for > redirect policy. It is useful to be able to tell > them apart when you are collecting analytics just for actions. Sounds like we just need another counter rather than re-using overlimit or drops. > (if youve worked on a massive amount of machines you'll appreciate > being able to debug by looking at counters that reduce ambiguity). 
> Yes, this is how I found out the overlimit of htb qdisc is inaccurate or misleading, instead the one in htb class is accurate, see: commit 3c75f6ee139d464351f8ab77a042dd3635769d98 Author: Eric Dumazet Date: Mon Sep 18 12:36:22 2017 -0700 net_sched: sch_htb: add per class overlimits counter
Re: [PATCH] bpf: verifier: BPF_MOV don't mark dst reg if src == dst
On Thu, Jul 26, 2018 at 9:52 AM, Arthur Fabre wrote: > Oops, gmail seems to have mangled everything. Will resend using git > send-email. > > I've added the test cases for mov64, but I'm not sure of the expected mov32 > behavior. The interpreter has below: ALU_MOV_X: DST = (u32) SRC; CONT; ... ALU64_MOV_X: DST = SRC; CONT; The later verifier code does seem to mark dst_reg properly for both ALU and ALU64. > Currently coerce_reg_to_size() is called after mark_reg_unknown(), > which sets the bounds to 64bits. coerce_reg_to_size() resets the bounds > again, > as they're too "wide" to fit the new size. It sets SMIN = UMIN = 0, > which seems weird. Shouldn't SMIN be 1 << (size * 8 - 1)? Same applies for > SMAX. The SMIN/UMIN still should be 0 since there is no negative here due to smaller width? > Should mov32 always mark the dst as unbounded? We can do better than unbounded for dst register of mov32, which is the code already doing? > > > On Thu, Jul 26, 2018 at 1:42 AM, Daniel Borkmann > wrote: >> >> On 07/26/2018 12:08 AM, Arthur Fabre wrote: >> > When check_alu_op() handles a BPF_MOV between two registers, >> > it calls check_reg_arg() on the dst register, marking it as unbounded. >> > If the src and dst register are the same, this marks the src as >> > unbounded, which can lead to unexpected errors for further checks that >> > rely on bounds info. Could you explain (and add to the commit messages eventually) what are these unexpected errors? >> > >> > check_alu_op() now only marks the dst register as unbounded if it >> > different from the src register. 
>> > >> > Signed-off-by: Arthur Fabre >> > --- >> > kernel/bpf/verifier.c | 5 +++-- >> > 1 file changed, 3 insertions(+), 2 deletions(-) >> > >> > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c >> > index 63aaac52a265..ddfe3c544a80 100644 >> > --- a/kernel/bpf/verifier.c >> > +++ b/kernel/bpf/verifier.c >> > @@ -3238,8 +3238,9 @@ static int check_alu_op(struct bpf_verifier_env >> > *env, struct bpf_insn *insn) >> > } >> > } >> > >> > - /* check dest operand */ >> > - err = check_reg_arg(env, insn->dst_reg, DST_OP); >> > + /* check dest operand, only mark if dest != src */ >> > + err = check_reg_arg(env, insn->dst_reg, >> > + insn->dst_reg == insn->src_reg ? >> > DST_OP_NO_MARK : DST_OP); >> > if (err) >> > return err; >> > >> >> Thanks a lot for the patch! Looks like it's corrupted wrt newline. >> >> Please also add test cases to tools/testing/selftests/bpf/test_verifier.c >> for the cases of mov64 and mov32 where in each src==dst and src!=dst; >> mov32 >> should mark it as unbounded but not former, so would be good to keep >> tracking >> that in selftests. > >
[net-next 03/13] net/mlx5e: Gather all XDP pre-requisite checks in a single function
From: Tariq Toukan Dedicate a function to all checks done when setting an XDP program. Take indications from priv instead of netdev features. Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/en_main.c | 33 --- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index c214b23e14ba..a1a54ea3aeb4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -4094,26 +4094,37 @@ static void mlx5e_tx_timeout(struct net_device *dev) queue_work(priv->wq, &priv->tx_timeout_work); } +static int mlx5e_xdp_allowed(struct mlx5e_priv *priv) +{ + struct net_device *netdev = priv->netdev; + + if (priv->channels.params.lro_en) { + netdev_warn(netdev, "can't set XDP while LRO is on, disable LRO first\n"); + return -EINVAL; + } + + if (MLX5_IPSEC_DEV(priv->mdev)) { + netdev_warn(netdev, "can't set XDP with IPSec offload\n"); + return -EINVAL; + } + + return 0; +} + static int mlx5e_xdp_set(struct net_device *netdev, struct bpf_prog *prog) { struct mlx5e_priv *priv = netdev_priv(netdev); struct bpf_prog *old_prog; - int err = 0; bool reset, was_opened; + int err; int i; mutex_lock(&priv->state_lock); - if ((netdev->features & NETIF_F_LRO) && prog) { - netdev_warn(netdev, "can't set XDP while LRO is on, disable LRO first\n"); - err = -EINVAL; - goto unlock; - } - - if ((netdev->features & NETIF_F_HW_ESP) && prog) { - netdev_warn(netdev, "can't set XDP with IPSec offload\n"); - err = -EINVAL; - goto unlock; + if (prog) { + err = mlx5e_xdp_allowed(priv); + if (err) + goto unlock; } was_opened = test_bit(MLX5E_STATE_OPENED, &priv->state); -- 2.17.0
[net-next 06/13] net/mlx5e: Add counter for XDP redirect in RX
From: Tariq Toukan Add per-ring and total stats for received packets that goes into XDP redirection. Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c | 1 + drivers/net/ethernet/mellanox/mlx5/core/en_stats.c | 3 +++ drivers/net/ethernet/mellanox/mlx5/core/en_stats.h | 2 ++ 3 files changed, 6 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index 649675c1af61..34accf3f4cee 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -69,6 +69,7 @@ bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct mlx5e_dma_info *di, rq->xdpsq.db.redirect_flush = true; mlx5e_page_dma_unmap(rq, di); } + rq->stats->xdp_redirect++; return true; default: bpf_warn_invalid_xdp_action(act); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c index c0507fada0be..b88db9d923ad 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c @@ -59,6 +59,7 @@ static const struct counter_desc sw_stats_desc[] = { { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_csum_complete) }, { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_csum_unnecessary_inner) }, { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_drop) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_redirect) }, { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_tx) }, { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_tx_cqe) }, { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_tx_full) }, @@ -142,6 +143,7 @@ void mlx5e_grp_sw_update_stats(struct mlx5e_priv *priv) s->rx_csum_unnecessary += rq_stats->csum_unnecessary; s->rx_csum_unnecessary_inner += rq_stats->csum_unnecessary_inner; s->rx_xdp_drop += rq_stats->xdp_drop; + s->rx_xdp_redirect += rq_stats->xdp_redirect; s->rx_xdp_tx += rq_stats->xdp_tx; s->rx_xdp_tx_cqe += 
rq_stats->xdp_tx_cqe; s->rx_xdp_tx_full += rq_stats->xdp_tx_full; @@ -1126,6 +1128,7 @@ static const struct counter_desc rq_stats_desc[] = { { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, csum_unnecessary_inner) }, { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, csum_none) }, { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, xdp_drop) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, xdp_redirect) }, { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, xdp_tx) }, { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, xdp_tx_cqe) }, { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, xdp_tx_full) }, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h index fc3f66003edd..07529cc58471 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h @@ -70,6 +70,7 @@ struct mlx5e_sw_stats { u64 rx_csum_complete; u64 rx_csum_unnecessary_inner; u64 rx_xdp_drop; + u64 rx_xdp_redirect; u64 rx_xdp_tx; u64 rx_xdp_tx_cqe; u64 rx_xdp_tx_full; @@ -178,6 +179,7 @@ struct mlx5e_rq_stats { u64 lro_bytes; u64 removed_vlan_packets; u64 xdp_drop; + u64 xdp_redirect; u64 xdp_tx; u64 xdp_tx_cqe; u64 xdp_tx_full; -- 2.17.0
[net-next 10/13] net/mlx5e: Add support for XDP_REDIRECT in device-out side
From: Tariq Toukan Add implementation for the ndo_xdp_xmit callback. Dedicate a new set of XDP-SQ instances to satisfy the XDP_REDIRECT requests. These instances are totally separated from the existing XDP-SQ objects that satisfy local XDP_TX actions. Performance tests: xdp_redirect_map from ConnectX-5 to ConnectX-5. CPU: Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz Packet-rate of 64B packets. Single queue: 7 Mpps. Multi queue: 55 Mpps. Signed-off-by: Tariq Toukan Signed-off-by: Eugenia Emantayev Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 5 + .../net/ethernet/mellanox/mlx5/core/en/xdp.c | 92 --- .../net/ethernet/mellanox/mlx5/core/en/xdp.h | 2 + .../net/ethernet/mellanox/mlx5/core/en_main.c | 36 ++-- .../ethernet/mellanox/mlx5/core/en_stats.c| 32 ++- .../ethernet/mellanox/mlx5/core/en_stats.h| 5 + .../net/ethernet/mellanox/mlx5/core/en_txrx.c | 3 + 7 files changed, 154 insertions(+), 21 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 16e219a8f240..3f21cafe6be3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -344,6 +344,7 @@ enum { MLX5E_SQ_STATE_IPSEC, MLX5E_SQ_STATE_AM, MLX5E_SQ_STATE_TLS, + MLX5E_SQ_STATE_REDIRECT, }; struct mlx5e_sq_wqe_info { @@ -598,6 +599,9 @@ struct mlx5e_channel { __be32 mkey_be; u8 num_tc; + /* XDP_REDIRECT */ + struct mlx5e_xdpsq xdpsq; + /* data path - accessed per napi poll */ struct irq_desc *irq_desc; struct mlx5e_ch_stats *stats; @@ -621,6 +625,7 @@ struct mlx5e_channel_stats { struct mlx5e_sq_stats sq[MLX5E_MAX_NUM_TC]; struct mlx5e_rq_stats rq; struct mlx5e_xdpsq_stats rq_xdpsq; + struct mlx5e_xdpsq_stats xdpsq; } cacheline_aligned_in_smp; enum mlx5e_traffic_types { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index eabd5537927d..bab8cd44d1c5 100644 --- 
a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -167,6 +167,7 @@ bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq) struct mlx5e_xdpsq *sq; struct mlx5_cqe64 *cqe; struct mlx5e_rq *rq; + bool is_redirect; u16 sqcc; int i; @@ -179,6 +180,7 @@ bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq) if (!cqe) return false; + is_redirect = test_bit(MLX5E_SQ_STATE_REDIRECT, &sq->state); rq = container_of(sq, struct mlx5e_rq, xdpsq); /* sq->cc must be updated only after mlx5_cqwq_update_db_record(), @@ -196,17 +198,20 @@ bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq) wqe_counter = be16_to_cpu(cqe->wqe_counter); do { - struct mlx5e_xdp_info *xdpi; - u16 ci; + u16 ci = mlx5_wq_cyc_ctr2ix(&sq->wq, sqcc); + struct mlx5e_xdp_info *xdpi = &sq->db.xdpi[ci]; last_wqe = (sqcc == wqe_counter); - - ci = mlx5_wq_cyc_ctr2ix(&sq->wq, sqcc); - xdpi = &sq->db.xdpi[ci]; - sqcc++; - /* Recycle RX page */ - mlx5e_page_release(rq, &xdpi->di, true); + + if (is_redirect) { + xdp_return_frame(xdpi->xdpf); + dma_unmap_single(sq->pdev, xdpi->dma_addr, +xdpi->xdpf->len, DMA_TO_DEVICE); + } else { + /* Recycle RX page */ + mlx5e_page_release(rq, &xdpi->di, true); + } } while (!last_wqe); } while ((++i < MLX5E_TX_CQ_POLL_BUDGET) && (cqe = mlx5_cqwq_get_cqe(&cq->wq))); @@ -223,16 +228,75 @@ bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq) void mlx5e_free_xdpsq_descs(struct mlx5e_xdpsq *sq) { - struct mlx5e_rq *rq = container_of(sq, struct mlx5e_rq, xdpsq); - struct mlx5e_xdp_info *xdpi; - u16 ci; + struct mlx5e_rq *rq; + bool is_redirect; + + is_redirect = test_bit(MLX5E_SQ_STATE_REDIRECT, &sq->state); + rq = is_redirect ? NULL : container_of(sq, struct mlx5e_rq, xdpsq); while (sq->cc != sq->pc) { - ci = mlx5_wq_cyc_ctr2ix(&sq->wq, sq->cc); - xdpi = &sq->db.xdpi[ci]; + u16 ci = mlx5_wq_cyc_ctr2ix(&sq->wq, sq->cc); + struct mlx5e_xdp_info *xdpi = &sq->db.xdpi[ci]; + sq->cc++; - mlx5e_page_release(rq, &xdpi->di, false); + if (is_redirect) { + xdp_return_fram
[net-next 12/13] net/mlx5e: TX, Move DB fields in TXQ-SQ struct
From: Tariq Toukan Pointers in DB are static, move them to read-only area so they do not share a cacheline with fields modified in datapath. Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 10 -- 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 3f21cafe6be3..c41cfc2a4b70 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -365,16 +365,14 @@ struct mlx5e_txqsq { struct mlx5e_cqcq; - /* write@xmit, read@completion */ - struct { - struct mlx5e_sq_dma *dma_fifo; - struct mlx5e_tx_wqe_info *wqe_info; - } db; - /* read only */ struct mlx5_wq_cyc wq; u32dma_fifo_mask; struct mlx5e_sq_stats *stats; + struct { + struct mlx5e_sq_dma *dma_fifo; + struct mlx5e_tx_wqe_info *wqe_info; + } db; void __iomem *uar_map; struct netdev_queue *txq; u32sqn; -- 2.17.0
[net-next 04/13] net/mlx5e: Restrict the combination of large MTU and XDP
From: Tariq Toukan Add checks in control path upon an MTU change or an XDP program set, to prevent reaching cases where large MTU and XDP are set simultaneously. This is to make sure we allow XDP only with the linear RX memory scheme, i.e. a received packet is not scattered to different pages. Change mlx5e_rx_get_linear_frag_sz() accordingly, so that we make sure the XDP configuration can really be set, instead of assuming that it is. Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 2 + .../net/ethernet/mellanox/mlx5/core/en_main.c | 39 +++ 2 files changed, 33 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index dc9aa070eafa..7840f6f44379 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -72,6 +72,8 @@ struct page_pool; #define MLX5_RX_HEADROOM NET_SKB_PAD #define MLX5_SKB_FRAG_SZ(len) (SKB_DATA_ALIGN(len) + \ SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) +#define MLX5E_XDP_MAX_MTU ((int)(PAGE_SIZE - \ +MLX5_SKB_FRAG_SZ(XDP_PACKET_HEADROOM))) #define MLX5_MPWRQ_MIN_LOG_STRIDE_SZ(mdev) \ (6 + MLX5_CAP_GEN(mdev, cache_line_128byte)) /* HW restriction */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index a1a54ea3aeb4..56ae6e428fdf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -96,14 +96,19 @@ bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev) static u32 mlx5e_rx_get_linear_frag_sz(struct mlx5e_params *params) { - if (!params->xdp_prog) { - u16 hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu); - u16 rq_headroom = MLX5_RX_HEADROOM + NET_IP_ALIGN; + u16 hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu); + u16 linear_rq_headroom = params->xdp_prog ? 
+ XDP_PACKET_HEADROOM : MLX5_RX_HEADROOM; + u32 frag_sz; - return MLX5_SKB_FRAG_SZ(rq_headroom + hw_mtu); - } + linear_rq_headroom += NET_IP_ALIGN; - return PAGE_SIZE; + frag_sz = MLX5_SKB_FRAG_SZ(linear_rq_headroom + hw_mtu); + + if (params->xdp_prog && frag_sz < PAGE_SIZE) + frag_sz = PAGE_SIZE; + + return frag_sz; } static u8 mlx5e_mpwqe_log_pkts_per_wqe(struct mlx5e_params *params) @@ -3707,6 +3712,14 @@ int mlx5e_change_mtu(struct net_device *netdev, int new_mtu, new_channels.params = *params; new_channels.params.sw_mtu = new_mtu; + if (params->xdp_prog && + !mlx5e_rx_is_linear_skb(priv->mdev, &new_channels.params)) { + netdev_err(netdev, "MTU(%d) > %d is not allowed while XDP enabled\n", + new_mtu, MLX5E_XDP_MAX_MTU); + err = -EINVAL; + goto out; + } + if (params->rq_wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) { u8 ppw_old = mlx5e_mpwqe_log_pkts_per_wqe(params); u8 ppw_new = mlx5e_mpwqe_log_pkts_per_wqe(&new_channels.params); @@ -4094,9 +4107,10 @@ static void mlx5e_tx_timeout(struct net_device *dev) queue_work(priv->wq, &priv->tx_timeout_work); } -static int mlx5e_xdp_allowed(struct mlx5e_priv *priv) +static int mlx5e_xdp_allowed(struct mlx5e_priv *priv, struct bpf_prog *prog) { struct net_device *netdev = priv->netdev; + struct mlx5e_channels new_channels = {}; if (priv->channels.params.lro_en) { netdev_warn(netdev, "can't set XDP while LRO is on, disable LRO first\n"); @@ -4108,6 +4122,15 @@ static int mlx5e_xdp_allowed(struct mlx5e_priv *priv) return -EINVAL; } + new_channels.params = priv->channels.params; + new_channels.params.xdp_prog = prog; + + if (!mlx5e_rx_is_linear_skb(priv->mdev, &new_channels.params)) { + netdev_warn(netdev, "XDP is not allowed with MTU(%d) > %d\n", + new_channels.params.sw_mtu, MLX5E_XDP_MAX_MTU); + return -EINVAL; + } + return 0; } @@ -4122,7 +4145,7 @@ static int mlx5e_xdp_set(struct net_device *netdev, struct bpf_prog *prog) mutex_lock(&priv->state_lock); if (prog) { - err = mlx5e_xdp_allowed(priv); + err = 
mlx5e_xdp_allowed(priv, prog); if (err) goto unlock; } -- 2.17.0
[net-next 13/13] net/mlx5e: TX, Use function to access sq_dma object in fifo
From: Tariq Toukan Use designated function mlx5e_dma_get() to get the mlx5e_sq_dma object to be pushed into fifo. Signed-off-by: Tariq Toukan Reviewed-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/en_tx.c | 19 +-- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index 9106ea45e3cb..ae73ea992845 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -66,22 +66,21 @@ static inline void mlx5e_tx_dma_unmap(struct device *pdev, } } +static inline struct mlx5e_sq_dma *mlx5e_dma_get(struct mlx5e_txqsq *sq, u32 i) +{ + return &sq->db.dma_fifo[i & sq->dma_fifo_mask]; +} + static inline void mlx5e_dma_push(struct mlx5e_txqsq *sq, dma_addr_t addr, u32 size, enum mlx5e_dma_map_type map_type) { - u32 i = sq->dma_fifo_pc & sq->dma_fifo_mask; + struct mlx5e_sq_dma *dma = mlx5e_dma_get(sq, sq->dma_fifo_pc++); - sq->db.dma_fifo[i].addr = addr; - sq->db.dma_fifo[i].size = size; - sq->db.dma_fifo[i].type = map_type; - sq->dma_fifo_pc++; -} - -static inline struct mlx5e_sq_dma *mlx5e_dma_get(struct mlx5e_txqsq *sq, u32 i) -{ - return &sq->db.dma_fifo[i & sq->dma_fifo_mask]; + dma->addr = addr; + dma->size = size; + dma->type = map_type; } static void mlx5e_dma_unmap_wqe_err(struct mlx5e_txqsq *sq, u8 num_dma) -- 2.17.0
[net-next 11/13] net/mlx5e: RX, Prefetch the xdp_frame data area
From: Tariq Toukan A loaded XDP program might write to the xdp_frame data area, prefetchw() it to avoid a potential cache miss. Performance tests: ConnectX-5, XDP_TX packet rate, single ring. CPU: Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz Before: 13,172,976 pps After: 13,456,248 pps 2% gain. Fixes: 22f453988194 ("net/mlx5e: Support XDP over Striding RQ") Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index e33ca03b2100..15d8ae28c040 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -1099,6 +1099,7 @@ mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, dma_sync_single_range_for_cpu(rq->pdev, di->addr, head_offset, frag_size, DMA_FROM_DEVICE); + prefetchw(va); /* xdp_frame data area */ prefetch(data); rcu_read_lock(); -- 2.17.0
[net-next 01/13] net/mlx5e: Replace call to MPWQE free with dealloc in interface down flow
From: Tariq Toukan No need to expose the MPWQE free function to control path. The dealloc function already exposed, use it. Signed-off-by: Tariq Toukan Reviewed-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 1 - drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index e1b237ccdf56..dc9aa070eafa 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -892,7 +892,6 @@ bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq); bool mlx5e_post_rx_mpwqes(struct mlx5e_rq *rq); void mlx5e_dealloc_rx_wqe(struct mlx5e_rq *rq, u16 ix); void mlx5e_dealloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix); -void mlx5e_free_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi); struct sk_buff * mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, u16 cqe_bcnt, u32 head_offset, u32 page_idx); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index dccde18f6170..c214b23e14ba 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -877,7 +877,7 @@ static void mlx5e_free_rx_descs(struct mlx5e_rq *rq) /* UMR WQE (if in progress) is always at wq->head */ if (rq->mpwqe.umr_in_progress) - mlx5e_free_rx_mpwqe(rq, &rq->mpwqe.info[wq->head]); + rq->dealloc_wqe(rq, wq->head); while (!mlx5_wq_ll_is_empty(wq)) { struct mlx5e_rx_wqe_ll *wqe; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 1d5295ee863c..e6b3d178c45f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -396,7 +396,7 @@ 
mlx5e_copy_skb_header_mpwqe(struct device *pdev, } } -void mlx5e_free_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi) +static void mlx5e_free_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi) { const bool no_xdp_xmit = bitmap_empty(wi->xdp_xmit_bitmap, MLX5_MPWRQ_PAGES_PER_WQE); -- 2.17.0
[net-next 08/13] net/mlx5e: Refactor XDP counters
From: Tariq Toukan Separate the XDP counters into two sets: (1) One set reside in the RQ stats, and they monitor XDP stats in the RQ side. (2) Another set is per XDP-SQ, and they monitor XDP stats that are related to XDP transmit flow. Signed-off-by: Tariq Toukan Signed-off-by: Eugenia Emantayev Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 2 + .../net/ethernet/mellanox/mlx5/core/en/xdp.c | 12 ++--- .../net/ethernet/mellanox/mlx5/core/en_main.c | 1 + .../ethernet/mellanox/mlx5/core/en_stats.c| 47 +-- .../ethernet/mellanox/mlx5/core/en_stats.h| 16 +-- 5 files changed, 52 insertions(+), 26 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 118d66207079..d9e24fbe6a28 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -424,6 +424,7 @@ struct mlx5e_xdpsq { /* read only */ struct mlx5_wq_cyc wq; + struct mlx5e_xdpsq_stats *stats; void __iomem *uar_map; u32sqn; struct device *pdev; @@ -619,6 +620,7 @@ struct mlx5e_channel_stats { struct mlx5e_ch_stats ch; struct mlx5e_sq_stats sq[MLX5E_MAX_NUM_TC]; struct mlx5e_rq_stats rq; + struct mlx5e_xdpsq_stats rq_xdpsq; } cacheline_aligned_in_smp; enum mlx5e_traffic_types { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index 53d011eb71ab..26e24823504b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -106,8 +106,6 @@ bool mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq, struct mlx5e_xdp_info *xdpi) u16 pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc); struct mlx5e_tx_wqe *wqe = mlx5_wq_cyc_get_wqe(wq, pi); - struct mlx5e_rq *rq = container_of(sq, struct mlx5e_rq, xdpsq); - struct mlx5_wqe_ctrl_seg *cseg = &wqe->ctrl; struct mlx5_wqe_eth_seg *eseg = &wqe->eth; struct mlx5_wqe_data_seg *dseg = wqe->data; @@ -116,12 +114,12 @@ bool 
mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq, struct mlx5e_xdp_info *xdpi) dma_addr_t dma_addr = xdpi->dma_addr; unsigned int dma_len = xdpf->len; - struct mlx5e_rq_stats *stats = rq->stats; + struct mlx5e_xdpsq_stats *stats = sq->stats; prefetchw(wqe); if (unlikely(dma_len < MLX5E_XDP_MIN_INLINE || sq->hw_mtu < dma_len)) { - stats->xdp_drop++; + stats->err++; return false; } @@ -131,7 +129,7 @@ bool mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq, struct mlx5e_xdp_info *xdpi) mlx5e_xmit_xdp_doorbell(sq); sq->db.doorbell = false; } - stats->xdp_tx_full++; + stats->full++; return false; } @@ -160,7 +158,7 @@ bool mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq, struct mlx5e_xdp_info *xdpi) sq->db.doorbell = true; - stats->xdp_tx++; + stats->xmit++; return true; } @@ -212,7 +210,7 @@ bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq) } while (!last_wqe); } while ((++i < MLX5E_TX_CQ_POLL_BUDGET) && (cqe = mlx5_cqwq_get_cqe(&cq->wq))); - rq->stats->xdp_tx_cqe += i; + sq->stats->cqes += i; mlx5_cqwq_update_db_record(&cq->wq); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 7ed71db9b32f..5a6c4403a3e2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -1001,6 +1001,7 @@ static int mlx5e_alloc_xdpsq(struct mlx5e_channel *c, sq->uar_map = mdev->mlx5e_res.bfreg.map; sq->min_inline_mode = params->tx_min_inline_mode; sq->hw_mtu= MLX5E_SW2HW_MTU(params, params->sw_mtu); + sq->stats = &c->priv->channel_stats[c->ix].rq_xdpsq; param->wq.db_numa_node = cpu_to_node(c->cpu); err = mlx5_wq_cyc_create(mdev, ¶m->wq, sqc_wq, wq, &sq->wq_ctrl); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c index b88db9d923ad..85b1827b76c6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c @@ -60,9 +60,10 @@ static const struct counter_desc 
sw_stats_desc[] = { { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_csum_unnecessary_inner) }, { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_drop) }, { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_redirect) }, - { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_tx) }, - { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_tx_cqe)
[net-next 07/13] net/mlx5e: Make XDP xmit functions more generic
From: Tariq Toukan Convert the XDP xmit functions to use the generic xdp_frame API in XDP_TX flow. Same functions will be used later in this series to transmit the XDP redirect-out packets as well. Signed-off-by: Tariq Toukan Signed-off-by: Eugenia Emantayev Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 20 -- .../net/ethernet/mellanox/mlx5/core/en/xdp.c | 70 +++ .../net/ethernet/mellanox/mlx5/core/en/xdp.h | 3 +- .../net/ethernet/mellanox/mlx5/core/en_main.c | 10 +-- 4 files changed, 61 insertions(+), 42 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 2f1058da0907..118d66207079 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -395,6 +395,17 @@ struct mlx5e_txqsq { } recover; } cacheline_aligned_in_smp; +struct mlx5e_dma_info { + struct page *page; + dma_addr_t addr; +}; + +struct mlx5e_xdp_info { + struct xdp_frame *xdpf; + dma_addr_tdma_addr; + struct mlx5e_dma_info di; +}; + struct mlx5e_xdpsq { /* data path */ @@ -406,7 +417,7 @@ struct mlx5e_xdpsq { /* write@xmit, read@completion */ struct { - struct mlx5e_dma_info *di; + struct mlx5e_xdp_info *xdpi; bool doorbell; bool redirect_flush; } db; @@ -419,6 +430,7 @@ struct mlx5e_xdpsq { __be32 mkey_be; u8 min_inline_mode; unsigned long state; + unsigned int hw_mtu; /* control path */ struct mlx5_wq_ctrlwq_ctrl; @@ -455,11 +467,6 @@ mlx5e_wqc_has_room_for(struct mlx5_wq_cyc *wq, u16 cc, u16 pc, u16 n) return (mlx5_wq_cyc_ctr2ix(wq, cc - pc) >= n) || (cc == pc); } -struct mlx5e_dma_info { - struct page *page; - dma_addr_t addr; -}; - struct mlx5e_wqe_frag_info { struct mlx5e_dma_info *di; u32 offset; @@ -562,7 +569,6 @@ struct mlx5e_rq { /* XDP */ struct bpf_prog *xdp_prog; - unsigned int hw_mtu; struct mlx5e_xdpsq xdpsq; DECLARE_BITMAP(flags, 8); struct page_pool *page_pool; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index 34accf3f4cee..53d011eb71ab 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -33,6 +33,23 @@ #include #include "en/xdp.h" +static inline bool +mlx5e_xmit_xdp_buff(struct mlx5e_xdpsq *sq, struct mlx5e_dma_info *di, + struct xdp_buff *xdp) +{ + struct mlx5e_xdp_info xdpi; + + xdpi.xdpf = convert_to_xdp_frame(xdp); + if (unlikely(!xdpi.xdpf)) + return false; + xdpi.dma_addr = di->addr + (xdpi.xdpf->data - (void *)xdpi.xdpf); + dma_sync_single_for_device(sq->pdev, xdpi.dma_addr, + xdpi.xdpf->len, PCI_DMA_TODEVICE); + xdpi.di = *di; + + return mlx5e_xmit_xdp_frame(sq, &xdpi); +} + /* returns true if packet was consumed by xdp */ bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct mlx5e_dma_info *di, void *va, u16 *rx_headroom, u32 *len) @@ -58,22 +75,24 @@ bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct mlx5e_dma_info *di, *len = xdp.data_end - xdp.data; return false; case XDP_TX: - if (unlikely(!mlx5e_xmit_xdp_frame(rq, di, &xdp))) - trace_xdp_exception(rq->netdev, prog, act); + if (unlikely(!mlx5e_xmit_xdp_buff(&rq->xdpsq, di, &xdp))) + goto xdp_abort; + __set_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags); /* non-atomic */ return true; case XDP_REDIRECT: /* When XDP enabled then page-refcnt==1 here */ err = xdp_do_redirect(rq->netdev, &xdp, prog); - if (!err) { - __set_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags); - rq->xdpsq.db.redirect_flush = true; - mlx5e_page_dma_unmap(rq, di); - } + if (unlikely(err)) + goto xdp_abort; + __set_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags); + rq->xdpsq.db.redirect_flush = true; + mlx5e_page_dma_unmap(rq, di); rq->stats->xdp_redirect++; return true; default: bpf_warn_invalid_xdp_action(act); case XDP_ABORTED: +xdp_abort: trace_xdp_exception(rq->netdev, prog, act); case XDP_DROP: rq->stats->xdp_drop++; @@ -81,27 +100,27 @@ bool mlx5e_xd
[net-next 09/13] net/mlx5e: Re-order fields of struct mlx5e_xdpsq
From: Tariq Toukan In the downstream patch that adds support to XDP_REDIRECT-out, the XDP xmit frame function doesn't share the same run context as the NAPI that polls the XDP-SQ completion queue. Hence, need to re-order the XDP-SQ fields to avoid cacheline false-sharing. Take redirect_flush and doorbell out of DB, into separated cachelines. Add a cacheline breaker within the stats struct. Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 18 +- .../net/ethernet/mellanox/mlx5/core/en/xdp.c | 8 .../net/ethernet/mellanox/mlx5/core/en_rx.c| 8 .../net/ethernet/mellanox/mlx5/core/en_stats.h | 3 ++- 4 files changed, 19 insertions(+), 18 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index d9e24fbe6a28..16e219a8f240 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -409,22 +409,22 @@ struct mlx5e_xdp_info { struct mlx5e_xdpsq { /* data path */ - /* dirtied @rx completion */ + /* dirtied @completion */ u16cc; - u16pc; + bool redirect_flush; - struct mlx5e_cqcq; + /* dirtied @xmit */ + u16pc cacheline_aligned_in_smp; + bool doorbell; - /* write@xmit, read@completion */ - struct { - struct mlx5e_xdp_info *xdpi; - bool doorbell; - bool redirect_flush; - } db; + struct mlx5e_cqcq; /* read only */ struct mlx5_wq_cyc wq; struct mlx5e_xdpsq_stats *stats; + struct { + struct mlx5e_xdp_info *xdpi; + } db; void __iomem *uar_map; u32sqn; struct device *pdev; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index 26e24823504b..eabd5537927d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -85,7 +85,7 @@ bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct mlx5e_dma_info *di, if (unlikely(err)) goto xdp_abort; __set_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags); - 
rq->xdpsq.db.redirect_flush = true; + rq->xdpsq.redirect_flush = true; mlx5e_page_dma_unmap(rq, di); rq->stats->xdp_redirect++; return true; @@ -124,10 +124,10 @@ bool mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq, struct mlx5e_xdp_info *xdpi) } if (unlikely(!mlx5e_wqc_has_room_for(wq, sq->cc, sq->pc, 1))) { - if (sq->db.doorbell) { + if (sq->doorbell) { /* SQ is full, ring doorbell */ mlx5e_xmit_xdp_doorbell(sq); - sq->db.doorbell = false; + sq->doorbell = false; } stats->full++; return false; @@ -156,7 +156,7 @@ bool mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq, struct mlx5e_xdp_info *xdpi) sq->db.xdpi[pi] = *xdpi; sq->pc++; - sq->db.doorbell = true; + sq->doorbell = true; stats->xmit++; return true; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 70b984c4e8a4..e33ca03b2100 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -1201,14 +1201,14 @@ int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget) rq->handle_rx_cqe(rq, cqe); } while ((++work_done < budget) && (cqe = mlx5_cqwq_get_cqe(&cq->wq))); - if (xdpsq->db.doorbell) { + if (xdpsq->doorbell) { mlx5e_xmit_xdp_doorbell(xdpsq); - xdpsq->db.doorbell = false; + xdpsq->doorbell = false; } - if (xdpsq->db.redirect_flush) { + if (xdpsq->redirect_flush) { xdp_do_flush_map(); - xdpsq->db.redirect_flush = false; + xdpsq->redirect_flush = false; } mlx5_cqwq_update_db_record(&cq->wq); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h index 95e1f32c67d9..7aa8ff389cdd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h @@ -230,7 +230,8 @@ struct mlx5e_xdpsq_stats { u64 xmit; u64 full; u64 err; - u64 cqes; + /* dirtied @completion */ + u64 cqes cacheline_aligned_in
[net-next 05/13] net/mlx5e: Move XDP related code into new XDP files
From: Tariq Toukan Take XDP code out of the general EN header and RX file into new XDP files. Currently, XDP-SQ resides only within an RQ and used from a single flow (XDP_TX) triggered upon RX completions. In a downstream patch, additional type of XDP-SQ instances will be presented and used for the XDP_REDIRECT flow, totally unrelated to the RX context. Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/Makefile | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en.h | 9 +- .../net/ethernet/mellanox/mlx5/core/en/xdp.c | 225 ++ .../net/ethernet/mellanox/mlx5/core/en/xdp.h | 62 + .../net/ethernet/mellanox/mlx5/core/en_main.c | 1 + .../net/ethernet/mellanox/mlx5/core/en_rx.c | 208 +--- .../net/ethernet/mellanox/mlx5/core/en_txrx.c | 1 + 7 files changed, 293 insertions(+), 215 deletions(-) create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index fa7fcca5dc78..ae2bdcb1647c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -14,7 +14,7 @@ mlx5_core-$(CONFIG_MLX5_FPGA) += fpga/cmd.o fpga/core.o fpga/conn.o fpga/sdk.o \ fpga/ipsec.o fpga/tls.o mlx5_core-$(CONFIG_MLX5_CORE_EN) += en_main.o en_common.o en_fs.o en_ethtool.o \ - en_tx.o en_rx.o en_dim.o en_txrx.o en_stats.o vxlan.o \ + en_tx.o en_rx.o en_dim.o en_txrx.o en/xdp.o en_stats.o vxlan.o \ en_arfs.o en_fs_ethtool.o en_selftest.o en/port.o mlx5_core-$(CONFIG_MLX5_MPFS) += lib/mpfs.o diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 7840f6f44379..2f1058da0907 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -72,8 +72,6 @@ struct page_pool; #define MLX5_RX_HEADROOM NET_SKB_PAD #define 
MLX5_SKB_FRAG_SZ(len) (SKB_DATA_ALIGN(len) + \ SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) -#define MLX5E_XDP_MAX_MTU ((int)(PAGE_SIZE - \ -MLX5_SKB_FRAG_SZ(XDP_PACKET_HEADROOM))) #define MLX5_MPWRQ_MIN_LOG_STRIDE_SZ(mdev) \ (6 + MLX5_CAP_GEN(mdev, cache_line_128byte)) /* HW restriction */ @@ -149,10 +147,6 @@ struct page_pool; (DIV_ROUND_UP(MLX5E_UMR_WQE_INLINE_SZ, MLX5_SEND_WQE_BB)) #define MLX5E_ICOSQ_MAX_WQEBBS MLX5E_UMR_WQEBBS -#define MLX5E_XDP_MIN_INLINE (ETH_HLEN + VLAN_HLEN) -#define MLX5E_XDP_TX_DS_COUNT \ - ((sizeof(struct mlx5e_tx_wqe) / MLX5_SEND_WQE_DS) + 1 /* SG DS */) - #define MLX5E_NUM_MAIN_GROUPS 9 #define MLX5E_MSG_LEVELNETIF_MSG_LINK @@ -878,14 +872,13 @@ void mlx5e_cq_error_event(struct mlx5_core_cq *mcq, enum mlx5_event event); int mlx5e_napi_poll(struct napi_struct *napi, int budget); bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget); int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget); -bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq); void mlx5e_free_txqsq_descs(struct mlx5e_txqsq *sq); -void mlx5e_free_xdpsq_descs(struct mlx5e_xdpsq *sq); bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev); bool mlx5e_striding_rq_possible(struct mlx5_core_dev *mdev, struct mlx5e_params *params); +void mlx5e_page_dma_unmap(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info); void mlx5e_page_release(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info, bool recycle); void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c new file mode 100644 index ..649675c1af61 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2018, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + *copyright notice, this list of conditions and the following + *disclaimer. + * + * - Redistributions in binary form must reproduce the above + *copyright notice, this list of conditions and the following + *disclaimer in the documentation and/or other ma
[net-next 02/13] net/mlx5e: Do not recycle RX pages in interface down flow
From: Tariq Toukan Keep all page-pool recycle calls within NAPI context. Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/en_rx.c | 37 ++- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index e6b3d178c45f..97db5eeca9f3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -277,10 +277,11 @@ static inline int mlx5e_get_rx_frag(struct mlx5e_rq *rq, } static inline void mlx5e_put_rx_frag(struct mlx5e_rq *rq, -struct mlx5e_wqe_frag_info *frag) +struct mlx5e_wqe_frag_info *frag, +bool recycle) { if (frag->last_in_page) - mlx5e_page_release(rq, frag->di, true); + mlx5e_page_release(rq, frag->di, recycle); } static inline struct mlx5e_wqe_frag_info *get_frag(struct mlx5e_rq *rq, u16 ix) @@ -308,25 +309,26 @@ static int mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe_cyc *wqe, free_frags: while (--i >= 0) - mlx5e_put_rx_frag(rq, --frag); + mlx5e_put_rx_frag(rq, --frag, true); return err; } static inline void mlx5e_free_rx_wqe(struct mlx5e_rq *rq, -struct mlx5e_wqe_frag_info *wi) +struct mlx5e_wqe_frag_info *wi, +bool recycle) { int i; for (i = 0; i < rq->wqe.info.num_frags; i++, wi++) - mlx5e_put_rx_frag(rq, wi); + mlx5e_put_rx_frag(rq, wi, recycle); } void mlx5e_dealloc_rx_wqe(struct mlx5e_rq *rq, u16 ix) { struct mlx5e_wqe_frag_info *wi = get_frag(rq, ix); - mlx5e_free_rx_wqe(rq, wi); + mlx5e_free_rx_wqe(rq, wi, false); } static int mlx5e_alloc_rx_wqes(struct mlx5e_rq *rq, u16 ix, u8 wqe_bulk) @@ -396,7 +398,8 @@ mlx5e_copy_skb_header_mpwqe(struct device *pdev, } } -static void mlx5e_free_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi) +static void +mlx5e_free_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, bool recycle) { const bool no_xdp_xmit = bitmap_empty(wi->xdp_xmit_bitmap, MLX5_MPWRQ_PAGES_PER_WQE); @@ -405,7 
+408,7 @@ static void mlx5e_free_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi) for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) if (no_xdp_xmit || !test_bit(i, wi->xdp_xmit_bitmap)) - mlx5e_page_release(rq, &dma_info[i], true); + mlx5e_page_release(rq, &dma_info[i], recycle); } static void mlx5e_post_rx_mpwqe(struct mlx5e_rq *rq) @@ -505,8 +508,8 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) void mlx5e_dealloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) { struct mlx5e_mpw_info *wi = &rq->mpwqe.info[ix]; - - mlx5e_free_rx_mpwqe(rq, wi); + /* Don't recycle, this function is called on rq/netdev close */ + mlx5e_free_rx_mpwqe(rq, wi, false); } bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq) @@ -1113,7 +1116,7 @@ void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) napi_gro_receive(rq->cq.napi, skb); free_wqe: - mlx5e_free_rx_wqe(rq, wi); + mlx5e_free_rx_wqe(rq, wi, true); wq_cyc_pop: mlx5_wq_cyc_pop(wq); } @@ -1155,7 +1158,7 @@ void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) napi_gro_receive(rq->cq.napi, skb); free_wqe: - mlx5e_free_rx_wqe(rq, wi); + mlx5e_free_rx_wqe(rq, wi, true); wq_cyc_pop: mlx5_wq_cyc_pop(wq); } @@ -1292,7 +1295,7 @@ void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) wq = &rq->mpwqe.wq; wqe = mlx5_wq_ll_get_wqe(wq, wqe_id); - mlx5e_free_rx_mpwqe(rq, wi); + mlx5e_free_rx_mpwqe(rq, wi, true); mlx5_wq_ll_pop(wq, cqe->wqe_id, &wqe->next.next_wqe_index); } @@ -1521,7 +1524,7 @@ void mlx5i_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) napi_gro_receive(rq->cq.napi, skb); wq_free_wqe: - mlx5e_free_rx_wqe(rq, wi); + mlx5e_free_rx_wqe(rq, wi, true); mlx5_wq_cyc_pop(wq); } @@ -1544,19 +1547,19 @@ void mlx5e_ipsec_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) skb = rq->wqe.skb_from_cqe(rq, cqe, wi, cqe_bcnt); if (unlikely(!skb)) { /* a DROP, save the page-reuse checks */ - mlx5e_free_rx_wqe(rq, wi); + mlx5e_free_rx_wqe(rq, wi, true); goto 
wq_cyc_pop; } skb = mlx5e_ipsec_handle_rx_skb(rq->netdev, skb, &cqe_bcnt); if (unlikely(!skb)) { - mlx5e_free_rx_wqe(rq, wi); + mlx5e_free_rx_wqe(rq, wi, true);
[pull request][net-next 00/13] Mellanox, mlx5e updates 2018-07-26 (XDP redirect)
Hi Dave, This series from Tariq adds the support for device-out XDP redirect. For more information please see tag log below. Please pull and let me know if there's any problem. Thanks, Saeed. --- The following changes since commit 6a8fab17940d4934293d4145abce00e178393bec: Merge branch '10GbE' of git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-queue (2018-07-26 14:14:01 -0700) are available in the Git repository at: git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux.git tags/mlx5e-updates-2018-07-26 for you to fetch changes up to 8ee48233566624826d185bf63735cc01d7113fce: net/mlx5e: TX, Use function to access sq_dma object in fifo (2018-07-26 15:23:59 -0700) mlx5e-updates-2018-07-26 (XDP redirect) This series from Tariq adds the support for device-out XDP redirect. Start with a simple RX and XDP cleanups: - Replace call to MPWQE free with dealloc in interface down flow - Do not recycle RX pages in interface down flow - Gather all XDP pre-requisite checks in a single function - Restrict the combination of large MTU and XDP Since now XDP logic is going to be called from TX side as well, generic XDP TX logic is not RX only anymore, for that Tariq creates a new xdp.c file and moves XDP related code into it, and generalizes the code to support XDP TX for XDP redirect, such as the xdp tx sq structures and xdp counters. XDP redirect support: Add implementation for the ndo_xdp_xmit callback. Dedicate a new set of XDP-SQ instances to satisfy the XDP_REDIRECT requests. These instances are totally separated from the existing XDP-SQ objects that satisfy local XDP_TX actions. Performance tests: xdp_redirect_map from ConnectX-5 to ConnectX-5. CPU: Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz Packet-rate of 64B packets. Single queue: 7 Mpps. Multi queue: 55 Mpps. -Saeed. 
Tariq Toukan (13): net/mlx5e: Replace call to MPWQE free with dealloc in interface down flow net/mlx5e: Do not recycle RX pages in interface down flow net/mlx5e: Gather all XDP pre-requisite checks in a single function net/mlx5e: Restrict the combination of large MTU and XDP net/mlx5e: Move XDP related code into new XDP files net/mlx5e: Add counter for XDP redirect in RX net/mlx5e: Make XDP xmit functions more generic net/mlx5e: Refactor XDP counters net/mlx5e: Re-order fields of struct mlx5e_xdpsq net/mlx5e: Add support for XDP_REDIRECT in device-out side net/mlx5e: RX, Prefetch the xdp_frame data area net/mlx5e: TX, Move DB fields in TXQ-SQ struct net/mlx5e: TX, Use function to access sq_dma object in fifo drivers/net/ethernet/mellanox/mlx5/core/Makefile | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en.h | 61 +++-- drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c | 302 + drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h | 63 + drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 116 ++-- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c| 254 ++--- drivers/net/ethernet/mellanox/mlx5/core/en_stats.c | 80 +- drivers/net/ethernet/mellanox/mlx5/core/en_stats.h | 24 +- drivers/net/ethernet/mellanox/mlx5/core/en_tx.c| 19 +- drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c | 4 + 10 files changed, 612 insertions(+), 313 deletions(-) create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h
RE: [net-next 1/6] ixgbe: Do not allow LRO or MTU change with XDP
> From: Jakub Kicinski [mailto:jakub.kicin...@netronome.com] > Sent: Thursday, July 26, 2018 2:47 PM > On Thu, 26 Jul 2018 14:21:08 -0700, Jakub Kicinski wrote: > > On Thu, 26 Jul 2018 10:40:45 -0700, Jeff Kirsher wrote: > > > From: Tony Nguyen > > > > > > XDP does not support jumbo frames or LRO. These checks are being > > > made outside the driver when an XDP program is loaded, however, > > > there is nothing preventing these from changing after an XDP program is > loaded. > > > Add the checks so that while an XDP program is loaded, do not allow > > > MTU to be changed or LRO to be enabled. > > > > > > Signed-off-by: Tony Nguyen > > > Tested-by: Andrew Bowers > > > Signed-off-by: Jeff Kirsher > > > --- > > > drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 10 ++ > > > 1 file changed, 10 insertions(+) > > > > > > diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c > > > b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c > > > index 5a6600f7b382..c42256e91997 100644 > > > --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c > > > +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c > > > @@ -6469,6 +6469,11 @@ static int ixgbe_change_mtu(struct net_device > > > *netdev, int new_mtu) { > > > struct ixgbe_adapter *adapter = netdev_priv(netdev); > > > > > > + if (adapter->xdp_prog) { > > > + e_warn(probe, "MTU cannot be changed while XDP program is > loaded\n"); > > > + return -EPERM; > > > > EPERM looks wrong, EINVAL is common. Also most drivers will just > > check the bounds like you do in ixgbe_xdp_setup(), allowing the change > > if new MTU still fits constraints. > > > > FWIW are the IXGBE_FLAG_SRIOV_ENABLED and IXGBE_FLAG_DCB_ENABLED > flag > > changes also covered while xdp is enabled? Quick grep doesn't reveal > > them being checked against xdp_prog other than in ixgbe_xdp_setup(). > > Ah, I didn't make the review in time :) Could you follow up? Yes, I'll make a follow-on patch to address your comments.
Re: [net-next 1/6] ixgbe: Do not allow LRO or MTU change with XDP
On Thu, 26 Jul 2018 14:21:08 -0700, Jakub Kicinski wrote: > On Thu, 26 Jul 2018 10:40:45 -0700, Jeff Kirsher wrote: > > From: Tony Nguyen > > > > XDP does not support jumbo frames or LRO. These checks are being made > > outside the driver when an XDP program is loaded, however, there is > > nothing preventing these from changing after an XDP program is loaded. > > Add the checks so that while an XDP program is loaded, do not allow MTU > > to be changed or LRO to be enabled. > > > > Signed-off-by: Tony Nguyen > > Tested-by: Andrew Bowers > > Signed-off-by: Jeff Kirsher > > --- > > drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 10 ++ > > 1 file changed, 10 insertions(+) > > > > diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c > > b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c > > index 5a6600f7b382..c42256e91997 100644 > > --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c > > +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c > > @@ -6469,6 +6469,11 @@ static int ixgbe_change_mtu(struct net_device > > *netdev, int new_mtu) > > { > > struct ixgbe_adapter *adapter = netdev_priv(netdev); > > > > + if (adapter->xdp_prog) { > > + e_warn(probe, "MTU cannot be changed while XDP program is > > loaded\n"); > > + return -EPERM; > > EPERM looks wrong, EINVAL is common. Also most drivers will just check > the bounds like you do in ixgbe_xdp_setup(), allowing the change if new > MTU still fits constraints. > > FWIW are the IXGBE_FLAG_SRIOV_ENABLED and IXGBE_FLAG_DCB_ENABLED flag > changes also covered while xdp is enabled? Quick grep doesn't reveal > them being checked against xdp_prog other than in ixgbe_xdp_setup(). Ah, I didn't make the review in time :) Could you follow up?
[RFC bpf-next 0/6] net: xsk: minor improvements around queue handling
Hi! This set tries to make the core take care of error checking for the drivers. In particular making sure that the AF_XDP UMEM is not installed on queues which don't exist (or are disabled) and that changing queue (AKA ethtool channel) count cannot disable queues with active AF_XDF zero-copy sockets. I'm sending as an RFC because I'm not entirely sure what the desired behaviour is here. Is it Okay to install AF_XDP on queues which don't exist? I presume not? Are the AF_XDP queue_ids referring to TX queues as well as RX queues in case of the driver? I presume not? Should we try to prevent disabling queues which have non zero-copy sockets installed as well? :S Anyway, if any of those patches seem useful and reasonable, please let me know I will repost as non-RFC. Jakub Kicinski (6): net: update real_num_rx_queues even when !CONFIG_SYSFS xsk: refactor xdp_umem_assign_dev() xsk: don't allow umem replace at stack level xsk: don't allow installing UMEM beyond the number of queues ethtool: rename local variable max -> curr ethtool: don't allow disabling queues with umem installed include/linux/netdevice.h | 16 +++-- net/core/ethtool.c| 19 ++ net/xdp/xdp_umem.c| 73 --- 3 files changed, 71 insertions(+), 37 deletions(-) -- 2.17.1
[RFC bpf-next 2/6] xsk: refactor xdp_umem_assign_dev()
Return early and only take the ref on dev once there is no possibility of failing. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet --- net/xdp/xdp_umem.c | 49 -- 1 file changed, 21 insertions(+), 28 deletions(-) diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index f47abb46c587..c199d66b5f3f 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -56,41 +56,34 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, if (force_copy) return 0; - dev_hold(dev); + if (!dev->netdev_ops->ndo_bpf || !dev->netdev_ops->ndo_xsk_async_xmit) + return force_zc ? -ENOTSUPP : 0; /* fail or fallback */ - if (dev->netdev_ops->ndo_bpf && dev->netdev_ops->ndo_xsk_async_xmit) { - bpf.command = XDP_QUERY_XSK_UMEM; + bpf.command = XDP_QUERY_XSK_UMEM; - rtnl_lock(); - err = dev->netdev_ops->ndo_bpf(dev, &bpf); - rtnl_unlock(); + rtnl_lock(); + err = dev->netdev_ops->ndo_bpf(dev, &bpf); + rtnl_unlock(); - if (err) { - dev_put(dev); - return force_zc ? -ENOTSUPP : 0; - } + if (err) + return force_zc ? -ENOTSUPP : 0; - bpf.command = XDP_SETUP_XSK_UMEM; - bpf.xsk.umem = umem; - bpf.xsk.queue_id = queue_id; + bpf.command = XDP_SETUP_XSK_UMEM; + bpf.xsk.umem = umem; + bpf.xsk.queue_id = queue_id; - rtnl_lock(); - err = dev->netdev_ops->ndo_bpf(dev, &bpf); - rtnl_unlock(); + rtnl_lock(); + err = dev->netdev_ops->ndo_bpf(dev, &bpf); + rtnl_unlock(); - if (err) { - dev_put(dev); - return force_zc ? err : 0; /* fail or fallback */ - } - - umem->dev = dev; - umem->queue_id = queue_id; - umem->zc = true; - return 0; - } + if (err) + return force_zc ? err : 0; /* fail or fallback */ - dev_put(dev); - return force_zc ? -ENOTSUPP : 0; /* fail or fallback */ + dev_hold(dev); + umem->dev = dev; + umem->queue_id = queue_id; + umem->zc = true; + return 0; } static void xdp_umem_clear_dev(struct xdp_umem *umem) -- 2.17.1
[RFC bpf-next 4/6] xsk: don't allow installing UMEM beyond the number of queues
Don't allow installing UMEM on queue IDs higher than real_num_rx_queues. Note that the check in xsk_bind() is advisory at most, since it's done without rtnl. Besides from driver's perspective the UMEM queue ID really only relates to RX queues. TX real_num_tx_queues queues the driver exposes are for SKBs. AF_XDP zero-copy TX will most likely use separate rings much like XDP_TX does. So the AF_XDP's concept of queue indexes only relates to RX queues at driver level. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet --- net/xdp/xdp_umem.c | 5 + 1 file changed, 5 insertions(+) diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 911ca6d3cb5a..8139a2c5c5ed 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -79,6 +79,11 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, bpf.command = XDP_QUERY_XSK_UMEM; rtnl_lock(); + if (queue_id >= dev->real_num_rx_queues) { + err = -EINVAL; + goto err_rtnl_unlock; + } + err = xdp_umem_query(dev, queue_id); if (err) { err = err < 0 ? -ENOTSUPP : -EBUSY; -- 2.17.1
[RFC bpf-next 6/6] ethtool: don't allow disabling queues with umem installed
We already check the RSS indirection table does not use queues which would be disabled by channel reconfiguration. Make sure user does not try to disable queues which have a UMEM and zero-copy AF_XDP socket installed. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet --- include/linux/netdevice.h | 8 net/core/ethtool.c| 7 +++ 2 files changed, 15 insertions(+) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a5a34f0fb485..c0df40deec54 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3566,7 +3566,15 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, u32 flags); u32 __dev_xdp_query(struct net_device *dev, bpf_op_t xdp_op, enum bpf_netdev_command cmd); + +#if defined(CONFIG_XDP_SOCKETS) int xdp_umem_query(struct net_device *dev, u16 queue_id); +#else +static inline int xdp_umem_query(struct net_device *dev, u16 queue_id) +{ + return 0; +} +#endif int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); int dev_forward_skb(struct net_device *dev, struct sk_buff *skb); diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 4d5093577fe6..a8e693de5b11 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1745,6 +1745,7 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev, { struct ethtool_channels channels, curr = { .cmd = ETHTOOL_GCHANNELS }; u32 max_rx_in_use = 0; + unsigned int i; if (!dev->ethtool_ops->set_channels || !dev->ethtool_ops->get_channels) return -EOPNOTSUPP; @@ -1768,6 +1769,12 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev, (channels.combined_count + channels.rx_count) <= max_rx_in_use) return -EINVAL; + /* Disabling channels, query zero-copy AF_XDP sockets */ + for (i = channels.combined_count + channels.rx_count; +i < curr.combined_count + curr.rx_count; i++) + if (xdp_umem_query(dev, i) > 0) + return -EINVAL; + return dev->ethtool_ops->set_channels(dev, &channels); } -- 2.17.1
[RFC bpf-next 5/6] ethtool: rename local variable max -> curr
ethtool_set_channels() validates the config against driver's max settings. It retrieves the current config and stores it in a variable called max. This was okay when only max settings were accessed but we will soon want to access current settings as well, so calling the entire structure max makes the code less readable. While at it drop unnecessary parenthesis. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet --- net/core/ethtool.c | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/net/core/ethtool.c b/net/core/ethtool.c index c9993c6c2fd4..4d5093577fe6 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1743,7 +1743,7 @@ static noinline_for_stack int ethtool_get_channels(struct net_device *dev, static noinline_for_stack int ethtool_set_channels(struct net_device *dev, void __user *useraddr) { - struct ethtool_channels channels, max = { .cmd = ETHTOOL_GCHANNELS }; + struct ethtool_channels channels, curr = { .cmd = ETHTOOL_GCHANNELS }; u32 max_rx_in_use = 0; if (!dev->ethtool_ops->set_channels || !dev->ethtool_ops->get_channels) @@ -1752,13 +1752,13 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev, if (copy_from_user(&channels, useraddr, sizeof(channels))) return -EFAULT; - dev->ethtool_ops->get_channels(dev, &max); + dev->ethtool_ops->get_channels(dev, &curr); /* ensure new counts are within the maximums */ - if ((channels.rx_count > max.max_rx) || - (channels.tx_count > max.max_tx) || - (channels.combined_count > max.max_combined) || - (channels.other_count > max.max_other)) + if (channels.rx_count > curr.max_rx || + channels.tx_count > curr.max_tx || + channels.combined_count > curr.max_combined || + channels.other_count > curr.max_other) return -EINVAL; /* ensure the new Rx count fits within the configured Rx flow -- 2.17.1
[RFC bpf-next 1/6] net: update real_num_rx_queues even when !CONFIG_SYSFS
We used to depend on real_num_rx_queues as an upper bound for sanity checks. For AF_XDP it's useful if drivers can trust the stack never to try to install UMEM for queues which are not configured. Update dev->real_num_rx_queues even without sysfs compiled, otherwise it would always stay equal to dev->num_rx_queues. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c1295c7a452e..6717dc7e8fbf 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3433,6 +3433,7 @@ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq); static inline int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) { + dev->real_num_rx_queues = rxqs; return 0; } #endif -- 2.17.1
[RFC bpf-next 3/6] xsk: don't allow umem replace at stack level
Currently drivers have to check if they already have a umem installed for a given queue and return an error if so. Make better use of XDP_QUERY_XSK_UMEM and move this functionality to the core. We need to keep rtnl across the calls now. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet --- include/linux/netdevice.h | 7 --- net/xdp/xdp_umem.c| 37 - 2 files changed, 32 insertions(+), 12 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6717dc7e8fbf..a5a34f0fb485 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -872,10 +872,10 @@ struct netdev_bpf { struct { struct bpf_offloaded_map *offmap; }; - /* XDP_SETUP_XSK_UMEM */ + /* XDP_QUERY_XSK_UMEM, XDP_SETUP_XSK_UMEM */ struct { - struct xdp_umem *umem; - u16 queue_id; + struct xdp_umem *umem; /* out for query*/ + u16 queue_id; /* in for query */ } xsk; }; }; @@ -3566,6 +3566,7 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, u32 flags); u32 __dev_xdp_query(struct net_device *dev, bpf_op_t xdp_op, enum bpf_netdev_command cmd); +int xdp_umem_query(struct net_device *dev, u16 queue_id); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); int dev_forward_skb(struct net_device *dev, struct sk_buff *skb); diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index c199d66b5f3f..911ca6d3cb5a 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -11,6 +11,8 @@ #include #include #include +#include +#include #include "xdp_umem.h" #include "xsk_queue.h" @@ -40,6 +42,21 @@ void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs) } } +int xdp_umem_query(struct net_device *dev, u16 queue_id) +{ + struct netdev_bpf bpf; + + ASSERT_RTNL(); + + memset(&bpf, 0, sizeof(bpf)); + bpf.command = XDP_QUERY_XSK_UMEM; + bpf.xsk.queue_id = queue_id; + + if (!dev->netdev_ops->ndo_bpf) + return 0; + return dev->netdev_ops->ndo_bpf(dev, &bpf) ?: !!bpf.xsk.umem; +} + int xdp_umem_assign_dev(struct xdp_umem *umem, 
struct net_device *dev, u32 queue_id, u16 flags) { @@ -62,28 +79,30 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, bpf.command = XDP_QUERY_XSK_UMEM; rtnl_lock(); - err = dev->netdev_ops->ndo_bpf(dev, &bpf); - rtnl_unlock(); - - if (err) - return force_zc ? -ENOTSUPP : 0; + err = xdp_umem_query(dev, queue_id); + if (err) { + err = err < 0 ? -ENOTSUPP : -EBUSY; + goto err_rtnl_unlock; + } bpf.command = XDP_SETUP_XSK_UMEM; bpf.xsk.umem = umem; bpf.xsk.queue_id = queue_id; - rtnl_lock(); err = dev->netdev_ops->ndo_bpf(dev, &bpf); - rtnl_unlock(); - if (err) - return force_zc ? err : 0; /* fail or fallback */ + goto err_rtnl_unlock; + rtnl_unlock(); dev_hold(dev); umem->dev = dev; umem->queue_id = queue_id; umem->zc = true; return 0; + +err_rtnl_unlock: + rtnl_unlock(); + return force_zc ? err : 0; /* fail or fallback */ } static void xdp_umem_clear_dev(struct xdp_umem *umem) -- 2.17.1
[PATCH bpf-next 2/4] tools: libbpf: add bpf_object__find_program_by_title()
Allow users to find programs by section names. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet --- tools/lib/bpf/libbpf.c | 12 tools/lib/bpf/libbpf.h | 3 +++ 2 files changed, 15 insertions(+) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index afa9860db755..857d3d16968e 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -873,6 +873,18 @@ bpf_object__find_prog_by_idx(struct bpf_object *obj, int idx) return NULL; } +struct bpf_program * +bpf_object__find_program_by_title(struct bpf_object *obj, const char *title) +{ + struct bpf_program *pos; + + bpf_object__for_each_program(pos, obj) { + if (pos->section_name && !strcmp(pos->section_name, title)) + return pos; + } + return NULL; +} + static int bpf_program__collect_reloc(struct bpf_program *prog, GElf_Shdr *shdr, Elf_Data *data, struct bpf_object *obj) diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 1f8fc2060460..a295fe2f822b 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -86,6 +86,9 @@ const char *bpf_object__name(struct bpf_object *obj); unsigned int bpf_object__kversion(struct bpf_object *obj); int bpf_object__btf_fd(const struct bpf_object *obj); +struct bpf_program * +bpf_object__find_program_by_title(struct bpf_object *obj, const char *title); + struct bpf_object *bpf_object__next(struct bpf_object *prev); #define bpf_object__for_each_safe(pos, tmp)\ for ((pos) = bpf_object__next(NULL),\ -- 2.17.1
[PATCH bpf-next 1/4] tools: libbpf: handle NULL program gracefully in bpf_program__nth_fd()
bpf_map__fd() handles NULL map gracefully and returns -EINVAL. bpf_program__fd() and bpf_program__nth_fd() crash in this case. Make the behaviour more consistent by validating prog pointer as well. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet --- tools/lib/bpf/libbpf.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 955f8eafbf41..afa9860db755 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1991,6 +1991,9 @@ int bpf_program__nth_fd(struct bpf_program *prog, int n) { int fd; + if (!prog) + return -EINVAL; + if (n >= prog->instances.nr || n < 0) { pr_warning("Can't get the %dth fd from program %s: only %d instances\n", n, prog->section_name, prog->instances.nr); -- 2.17.1
[PATCH bpf-next 0/4] samples: bpf: convert two more samples to libbpf
Hi! This set converts xdpsock_user.c and xdp_fwd_user.c to use libbpf instead of bpf_load.o. First two patches are minor improvements to libbpf to make the conversion (and use of libbpf in general) nicer. Jakub Kicinski (4): tools: libbpf: handle NULL program gracefully in bpf_program__nth_fd() tools: libbpf: add bpf_object__find_program_by_title() samples: bpf: convert xdp_fwd_user.c to libbpf samples: bpf: convert xdpsock_user.c to libbpf samples/bpf/Makefile | 4 ++-- samples/bpf/xdp_fwd_user.c | 34 +++--- samples/bpf/xdpsock_user.c | 38 +- tools/lib/bpf/libbpf.c | 15 +++ tools/lib/bpf/libbpf.h | 3 +++ 5 files changed, 72 insertions(+), 22 deletions(-) -- 2.17.1
[PATCH bpf-next 4/4] samples: bpf: convert xdpsock_user.c to libbpf
Convert xdpsock_user.c to use libbpf instead of bpf_load.o. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet --- CC: Magnus Karlsson CC: Björn Töpel samples/bpf/Makefile | 2 +- samples/bpf/xdpsock_user.c | 38 +- 2 files changed, 30 insertions(+), 10 deletions(-) diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 405fa6c880bb..6bfa250fc76e 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -105,7 +105,7 @@ xdp_rxq_info-objs := xdp_rxq_info_user.o syscall_tp-objs := bpf_load.o syscall_tp_user.o cpustat-objs := bpf_load.o cpustat_user.o xdp_adjust_tail-objs := xdp_adjust_tail_user.o -xdpsock-objs := bpf_load.o xdpsock_user.o +xdpsock-objs := xdpsock_user.o xdp_fwd-objs := xdp_fwd_user.o task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS) xdp_sample_pkts-objs := xdp_sample_pkts_user.o $(TRACE_HELPERS) diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index 5904b1543831..5c1fcb1a84f1 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -26,7 +26,7 @@ #include #include -#include "bpf_load.h" +#include "bpf/libbpf.h" #include "bpf_util.h" #include @@ -886,7 +886,13 @@ static void l2fwd(struct xdpsock *xsk) int main(int argc, char **argv) { struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + struct bpf_prog_load_attr prog_load_attr = { + .prog_type = BPF_PROG_TYPE_XDP, + }; + int prog_fd, qidconf_map, xsks_map; + struct bpf_object *obj; char xdp_filename[256]; + struct bpf_map *map; int i, ret, key = 0; pthread_t pt; @@ -899,24 +905,38 @@ int main(int argc, char **argv) } snprintf(xdp_filename, sizeof(xdp_filename), "%s_kern.o", argv[0]); + prog_load_attr.file = xdp_filename; - if (load_bpf_file(xdp_filename)) { - fprintf(stderr, "ERROR: load_bpf_file %s\n", bpf_log_buf); + if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) + exit(EXIT_FAILURE); + if (prog_fd < 0) { + fprintf(stderr, "ERROR: no program found: %s\n", + strerror(prog_fd)); exit(EXIT_FAILURE); } - if 
(!prog_fd[0]) { - fprintf(stderr, "ERROR: load_bpf_file: \"%s\"\n", - strerror(errno)); + map = bpf_object__find_map_by_name(obj, "qidconf_map"); + qidconf_map = bpf_map__fd(map); + if (qidconf_map < 0) { + fprintf(stderr, "ERROR: no qidconf map found: %s\n", + strerror(qidconf_map)); + exit(EXIT_FAILURE); + } + + map = bpf_object__find_map_by_name(obj, "xsks_map"); + xsks_map = bpf_map__fd(map); + if (xsks_map < 0) { + fprintf(stderr, "ERROR: no xsks map found: %s\n", + strerror(xsks_map)); exit(EXIT_FAILURE); } - if (bpf_set_link_xdp_fd(opt_ifindex, prog_fd[0], opt_xdp_flags) < 0) { + if (bpf_set_link_xdp_fd(opt_ifindex, prog_fd, opt_xdp_flags) < 0) { fprintf(stderr, "ERROR: link set xdp fd failed\n"); exit(EXIT_FAILURE); } - ret = bpf_map_update_elem(map_fd[0], &key, &opt_queue, 0); + ret = bpf_map_update_elem(qidconf_map, &key, &opt_queue, 0); if (ret) { fprintf(stderr, "ERROR: bpf_map_update_elem qidconf\n"); exit(EXIT_FAILURE); @@ -933,7 +953,7 @@ int main(int argc, char **argv) /* ...and insert them into the map. */ for (i = 0; i < num_socks; i++) { key = i; - ret = bpf_map_update_elem(map_fd[1], &key, &xsks[i]->sfd, 0); + ret = bpf_map_update_elem(xsks_map, &key, &xsks[i]->sfd, 0); if (ret) { fprintf(stderr, "ERROR: bpf_map_update_elem %d\n", i); exit(EXIT_FAILURE); -- 2.17.1
[PATCH bpf-next 3/4] samples: bpf: convert xdp_fwd_user.c to libbpf
Convert xdp_fwd_user.c to use libbpf instead of bpf_load.o. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet --- CC: dsah...@gmail.com samples/bpf/Makefile | 2 +- samples/bpf/xdp_fwd_user.c | 34 +++--- 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 9ea2f7b64869..405fa6c880bb 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -106,7 +106,7 @@ syscall_tp-objs := bpf_load.o syscall_tp_user.o cpustat-objs := bpf_load.o cpustat_user.o xdp_adjust_tail-objs := xdp_adjust_tail_user.o xdpsock-objs := bpf_load.o xdpsock_user.o -xdp_fwd-objs := bpf_load.o xdp_fwd_user.o +xdp_fwd-objs := xdp_fwd_user.o task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS) xdp_sample_pkts-objs := xdp_sample_pkts_user.o $(TRACE_HELPERS) diff --git a/samples/bpf/xdp_fwd_user.c b/samples/bpf/xdp_fwd_user.c index a87a2048ed32..f88e1d7093d6 100644 --- a/samples/bpf/xdp_fwd_user.c +++ b/samples/bpf/xdp_fwd_user.c @@ -24,8 +24,7 @@ #include #include -#include "bpf_load.h" -#include "bpf_util.h" +#include "bpf/libbpf.h" #include @@ -63,9 +62,15 @@ static void usage(const char *prog) int main(int argc, char **argv) { + struct bpf_prog_load_attr prog_load_attr = { + .prog_type = BPF_PROG_TYPE_XDP, + }; + const char *prog_name = "xdp_fwd"; + struct bpf_program *prog; char filename[PATH_MAX]; + struct bpf_object *obj; int opt, i, idx, err; - int prog_id = 0; + int prog_fd, map_fd; int attach = 1; int ret = 0; @@ -75,7 +80,7 @@ int main(int argc, char **argv) attach = 0; break; case 'D': - prog_id = 1; + prog_name = "xdp_fwd_direct"; break; default: usage(basename(argv[0])); @@ -90,6 +95,7 @@ int main(int argc, char **argv) if (attach) { snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + prog_load_attr.file = filename; if (access(filename, O_RDONLY) < 0) { printf("error accessing file %s: %s\n", @@ -97,19 +103,25 @@ int main(int argc, char **argv) return 1; } - if (load_bpf_file(filename)) { 
- printf("%s", bpf_log_buf); + if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) return 1; - } - if (!prog_fd[prog_id]) { - printf("load_bpf_file: %s\n", strerror(errno)); + prog = bpf_object__find_program_by_title(obj, prog_name); + prog_fd = bpf_program__fd(prog); + if (prog_fd < 0) { + printf("program not found: %s\n", strerror(prog_fd)); + return 1; + } + map_fd = bpf_map__fd(bpf_object__find_map_by_name(obj, + "tx_port")); + if (map_fd < 0) { + printf("map not found: %s\n", strerror(map_fd)); return 1; } } if (attach) { for (i = 1; i < 64; ++i) - bpf_map_update_elem(map_fd[0], &i, &i, 0); + bpf_map_update_elem(map_fd, &i, &i, 0); } for (i = optind; i < argc; ++i) { @@ -126,7 +138,7 @@ int main(int argc, char **argv) if (err) ret = err; } else { - err = do_attach(idx, prog_fd[prog_id], argv[i]); + err = do_attach(idx, prog_fd, argv[i]); if (err) ret = err; } -- 2.17.1
[PATCH net-next] netdevsim: make debug dirs' dentries static
The root directories of netdevsim should only be used by the core to create per-device subdirectories, so limit their visibility to the core file. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet --- drivers/net/netdevsim/netdev.c| 6 +++--- drivers/net/netdevsim/netdevsim.h | 3 --- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c index 2d244551298b..8d8e2b3f263e 100644 --- a/drivers/net/netdevsim/netdev.c +++ b/drivers/net/netdevsim/netdev.c @@ -41,6 +41,9 @@ struct nsim_vf_config { static u32 nsim_dev_id; +static struct dentry *nsim_ddir; +static struct dentry *nsim_sdev_ddir; + static int nsim_num_vf(struct device *dev) { struct netdevsim *ns = to_nsim(dev); @@ -566,9 +569,6 @@ static struct rtnl_link_ops nsim_link_ops __read_mostly = { .dellink= nsim_dellink, }; -struct dentry *nsim_ddir; -struct dentry *nsim_sdev_ddir; - static int __init nsim_module_init(void) { int err; diff --git a/drivers/net/netdevsim/netdevsim.h b/drivers/net/netdevsim/netdevsim.h index 02be199eb005..384c254fafc5 100644 --- a/drivers/net/netdevsim/netdevsim.h +++ b/drivers/net/netdevsim/netdevsim.h @@ -103,9 +103,6 @@ struct netdevsim { struct nsim_ipsec ipsec; }; -extern struct dentry *nsim_ddir; -extern struct dentry *nsim_sdev_ddir; - #ifdef CONFIG_BPF_SYSCALL int nsim_bpf_init(struct netdevsim *ns); void nsim_bpf_uninit(struct netdevsim *ns); -- 2.17.1
Re: [net-next 1/6] ixgbe: Do not allow LRO or MTU change with XDP
On Thu, 26 Jul 2018 10:40:45 -0700, Jeff Kirsher wrote: > From: Tony Nguyen > > XDP does not support jumbo frames or LRO. These checks are being made > outside the driver when an XDP program is loaded, however, there is > nothing preventing these from changing after an XDP program is loaded. > Add the checks so that while an XDP program is loaded, do not allow MTU > to be changed or LRO to be enabled. > > Signed-off-by: Tony Nguyen > Tested-by: Andrew Bowers > Signed-off-by: Jeff Kirsher > --- > drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 10 ++ > 1 file changed, 10 insertions(+) > > diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c > b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c > index 5a6600f7b382..c42256e91997 100644 > --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c > +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c > @@ -6469,6 +6469,11 @@ static int ixgbe_change_mtu(struct net_device *netdev, > int new_mtu) > { > struct ixgbe_adapter *adapter = netdev_priv(netdev); > > + if (adapter->xdp_prog) { > + e_warn(probe, "MTU cannot be changed while XDP program is > loaded\n"); > + return -EPERM; EPERM looks wrong, EINVAL is common. Also most drivers will just check the bounds like you do in ixgbe_xdp_setup(), allowing the change if new MTU still fits constraints. FWIW are the IXGBE_FLAG_SRIOV_ENABLED and IXGBE_FLAG_DCB_ENABLED flag changes also covered while xdp is enabled? Quick grep doesn't reveal them being checked against xdp_prog other than in ixgbe_xdp_setup(). > + } > + > /* >* For 82599EB we cannot allow legacy VFs to enable their receive >* paths when MTU greater than 1500 is configured. 
So display a > @@ -9407,6 +9412,11 @@ static netdev_features_t ixgbe_fix_features(struct > net_device *netdev, > if (!(adapter->flags2 & IXGBE_FLAG2_RSC_CAPABLE)) > features &= ~NETIF_F_LRO; > > + if (adapter->xdp_prog && (features & NETIF_F_LRO)) { > + e_dev_err("LRO is not supported with XDP\n"); > + features &= ~NETIF_F_LRO; > + } > + > return features; > } >
Re: [net-next 0/6][pull request] 10GbE Intel Wired LAN Driver Updates 2018-07-26
From: Jeff Kirsher Date: Thu, 26 Jul 2018 10:40:44 -0700 > This series contains updates to ixgbe and igb. Pulled, thanks Jeff.
Re: [patch net-next] net: sched: unmark chain as explicitly created on delete
From: Jiri Pirko Date: Thu, 26 Jul 2018 18:27:58 +0200 > From: Jiri Pirko > > Once user manually deletes the chain using "chain del", the chain cannot > be marked as explicitly created anymore. > > Signed-off-by: Jiri Pirko > Fixes: 32a4f5ecd738 ("net: sched: introduce chain object to uapi") Applied.
Re: [PATCH net-next] l2tp: remove ->recv_payload_hook
From: Guillaume Nault Date: Wed, 25 Jul 2018 14:53:33 +0200 > The tunnel reception hook is only used by l2tp_ppp for skipping PPP > framing bytes. This is a session specific operation, but once a PPP > session sets ->recv_payload_hook on its tunnel, all frames received by > the tunnel will enter pppol2tp_recv_payload_hook(), including those > targeted at Ethernet sessions (an L2TPv3 tunnel can multiplex PPP and > Ethernet sessions). > > So this mechanism is wrong, and uselessly complex. Let's just move this > functionality to the pppol2tp rx handler and drop ->recv_payload_hook. > > Signed-off-by: Guillaume Nault Applied, thanks Guillaume.
Re: [PATCH net-next] net/tls: Removed redundant checks for non-NULL
From: Vakul Garg Date: Tue, 24 Jul 2018 16:54:27 +0530 > Removed checks against non-NULL before calling kfree_skb() and > crypto_free_aead(). These functions are safe to be called with NULL > as an argument. > > Signed-off-by: Vakul Garg Applied.
Re: [PATCH net v2] net: rollback orig value on failure of dev_qdisc_change_tx_queue_len
From: Tariq Toukan Date: Tue, 24 Jul 2018 14:12:20 +0300 > Fix dev_change_tx_queue_len so it rolls back original value > upon a failure in dev_qdisc_change_tx_queue_len. > This is already done for notifirers' failures, share the code. > > In case of failure in dev_qdisc_change_tx_queue_len, some tx queues > would still be of the new length, while they should be reverted. > Currently, the revert is not done, and is marked with a TODO label > in dev_qdisc_change_tx_queue_len, and should find some nice solution > to do it. > Yet it is still better to not apply the newly requested value. > > Fixes: 48bfd55e7e41 ("net_sched: plug in qdisc ops change_tx_queue_len") > Signed-off-by: Tariq Toukan > Reviewed-by: Eran Ben Elisha > Reported-by: Ran Rozenstein Applied and queued up for -stable.
Re: [PATCH net-next] cbs: Add support for the graft function
From: Vinicius Costa Gomes Date: Mon, 23 Jul 2018 17:08:00 -0700 > This will allow to install a child qdisc under cbs. The main use case > is to install ETF (Earliest TxTime First) qdisc under cbs, so there's > another level of control for time-sensitive traffic. > > Signed-off-by: Vinicius Costa Gomes Applied, thank you.
[PATCH net] net: ena: Fix use of uninitialized DMA address bits field
UBSAN triggers the following undefined behaviour warnings: [...] [ 13.236124] UBSAN: Undefined behaviour in drivers/net/ethernet/amazon/ena/ena_eth_com.c:468:22 [ 13.240043] shift exponent 64 is too large for 64-bit type 'long long unsigned int' [...] [ 13.744769] UBSAN: Undefined behaviour in drivers/net/ethernet/amazon/ena/ena_eth_com.c:373:4 [ 13.748694] shift exponent 64 is too large for 64-bit type 'long long unsigned int' [...] When splitting the address to high and low, GENMASK_ULL is used to generate a bitmask with dma_addr_bits field from io_sq (in ena_com_prepare_tx and ena_com_add_single_rx_desc). The problem is that dma_addr_bits is not initialized with a proper value (besides being cleared in ena_com_create_io_queue). Assign dma_addr_bits the correct value that is stored in ena_dev when initializing the SQ. Fixes: 1738cd3ed342 ("net: ena: Add a driver for Amazon Elastic Network Adapters (ENA)") Signed-off-by: Gal Pressman --- drivers/net/ethernet/amazon/ena/ena_com.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/amazon/ena/ena_com.c b/drivers/net/ethernet/amazon/ena/ena_com.c index 1b9d3130af4d..17f12c18d225 100644 --- a/drivers/net/ethernet/amazon/ena/ena_com.c +++ b/drivers/net/ethernet/amazon/ena/ena_com.c @@ -333,6 +333,7 @@ static int ena_com_init_io_sq(struct ena_com_dev *ena_dev, memset(&io_sq->desc_addr, 0x0, sizeof(io_sq->desc_addr)); + io_sq->dma_addr_bits = ena_dev->dma_addr_bits; io_sq->desc_entry_size = (io_sq->direction == ENA_COM_IO_QUEUE_DIRECTION_TX) ? sizeof(struct ena_eth_io_tx_desc) : -- 2.14.4
Re: [patch net-next RFC] net: sched: don't dump chains only held by actions
On Thu, 26 Jul 2018 18:31:01 +0200, Jiri Pirko wrote: > From: Jiri Pirko > > In case a chain is empty and not explicitly created by a user, > such chain should not exist. The only exception is if there is > an action "goto chain" pointing to it. In that case, don't show the > chain in the dump. Track the chain references held by actions and > use them to find out if a chain should or should not be shown > in chain dump. > > Signed-off-by: Jiri Pirko I don't have any better ideas :) One question below. > diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c > index 75cce2819de9..76035cd6e3bf 100644 > --- a/net/sched/cls_api.c > +++ b/net/sched/cls_api.c > @@ -262,6 +262,25 @@ static void tcf_chain_hold(struct tcf_chain *chain) > ++chain->refcnt; > } > > +static void tcf_chain_hold_by_act(struct tcf_chain *chain) > +{ > + ++chain->action_refcnt; > +} > + > +static void tcf_chain_release_by_act(struct tcf_chain *chain) > +{ > + --chain->action_refcnt; > +} > + > +static bool tcf_chain_is_zombie(struct tcf_chain *chain) > +{ > + /* In case all the references are action references, this > + * chain is a zombie and should not be listed in the chain > + * dump list. 
> + */ > + return chain->refcnt == chain->action_refcnt; > +} > + > static struct tcf_chain *tcf_chain_lookup(struct tcf_block *block, > u32 chain_index) > { > @@ -298,6 +317,15 @@ struct tcf_chain *tcf_chain_get(struct tcf_block *block, > u32 chain_index, > } > EXPORT_SYMBOL(tcf_chain_get); > > +struct tcf_chain *tcf_chain_get_by_act(struct tcf_block *block, u32 > chain_index) > +{ > + struct tcf_chain *chain = tcf_chain_get(block, chain_index, true); > + > + tcf_chain_hold_by_act(chain); > + return chain; > +} > +EXPORT_SYMBOL(tcf_chain_get_by_act); > + > static void tc_chain_tmplt_del(struct tcf_chain *chain); > > void tcf_chain_put(struct tcf_chain *chain) > @@ -310,6 +338,13 @@ void tcf_chain_put(struct tcf_chain *chain) > } > EXPORT_SYMBOL(tcf_chain_put); > > +void tcf_chain_put_by_act(struct tcf_chain *chain) > +{ > + tcf_chain_release_by_act(chain); > + tcf_chain_put(chain); > +} > +EXPORT_SYMBOL(tcf_chain_put_by_act); > + > static void tcf_chain_put_explicitly_created(struct tcf_chain *chain) > { > if (chain->explicitly_created) > @@ -1803,17 +1838,26 @@ static int tc_ctl_chain(struct sk_buff *skb, struct > nlmsghdr *n, > chain = tcf_chain_lookup(block, chain_index); > if (n->nlmsg_type == RTM_NEWCHAIN) { > if (chain) { > - NL_SET_ERR_MSG(extack, "Filter chain already exists"); > - return -EEXIST; > - } > - if (!(n->nlmsg_flags & NLM_F_CREATE)) { > - NL_SET_ERR_MSG(extack, "Need both RTM_NEWCHAIN and > NLM_F_CREATE to create a new chain"); > - return -ENOENT; > - } > - chain = tcf_chain_create(block, chain_index); > - if (!chain) { > - NL_SET_ERR_MSG(extack, "Failed to create filter chain"); > - return -ENOMEM; > + if (tcf_chain_is_zombie(chain)) { > + /* The chain exists only because there is > + * some action referencing it, meaning it > + * is a zombie. > + */ > + tcf_chain_hold(chain); I'm not 100% sure why this is needed? 
In my tree below I see: switch (n->nlmsg_type) { case RTM_NEWCHAIN: err = tc_chain_tmplt_add(chain, net, tca, extack); if (err) goto errout; /* In case the chain was successfully added, take a reference * to the chain. This ensures that an empty chain * does not disappear at the end of this function. */ tcf_chain_hold(chain); chain->explicitly_created = true; so one reference will be taken.. do we need two? > + } else { > + NL_SET_ERR_MSG(extack, "Filter chain already > exists"); > + return -EEXIST; > + } > + } else { > + if (!(n->nlmsg_flags & NLM_F_CREATE)) { > + NL_SET_ERR_MSG(extack, "Need both RTM_NEWCHAIN > and NLM_F_CREATE to create a new chain"); > + return -ENOENT; > + } > + chain = tcf_chain_create(block, chain_index); > + if (!chain) { > + NL_SET_ERR_MSG(extack, "Failed to create filter > chain"); > + return -ENOMEM; > + } > } > } else { > if (!chain) { > @@ -1944,6 +1988,8 @@ static int tc_
Re: [PATCH net] net/mlx5e: Move mlx5e_priv_flags into en_ethtool.c
On Sun, Jul 15, 2018 at 12:06 PM, Kamal Heib wrote: > Move the definition of mlx5e_priv_flags into en_ethtool.c because it's > only used there. > > Fixes: 4e59e2888139 ("net/mlx5e: Introduce net device priv flags > infrastructure") > Signed-off-by: Kamal Heib > --- > drivers/net/ethernet/mellanox/mlx5/core/en.h | 7 --- > drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c | 7 +++ > 2 files changed, 7 insertions(+), 7 deletions(-) > > diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h > b/drivers/net/ethernet/mellanox/mlx5/core/en.h > index eb9eb7aa953a..84e6a5b42286 100644 > --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h > +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h > @@ -208,13 +208,6 @@ struct mlx5e_umr_wqe { > > extern const char mlx5e_self_tests[][ETH_GSTRING_LEN]; > > -static const char mlx5e_priv_flags[][ETH_GSTRING_LEN] = { > - "rx_cqe_moder", > - "tx_cqe_moder", > - "rx_cqe_compress", > - "rx_striding_rq", > -}; > - Hi Kamal, on a second thought, i would like to drop this change and keep mlx5e_priv_flags close/local to the below mlx5e_priv_flag. Please let me know. > enum mlx5e_priv_flag { > MLX5E_PFLAG_RX_CQE_BASED_MODER = (1 << 0), > MLX5E_PFLAG_TX_CQE_BASED_MODER = (1 << 1), > diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c > b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c > index fffe514ba855..2a1c35d82c2e 100644 > --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c > +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c > @@ -33,6 +33,13 @@ > #include "en.h" > #include "en/port.h" > > +static const char mlx5e_priv_flags[][ETH_GSTRING_LEN] = { > + "rx_cqe_moder", > + "tx_cqe_moder", > + "rx_cqe_compress", > + "rx_striding_rq", > +}; > + > void mlx5e_ethtool_get_drvinfo(struct mlx5e_priv *priv, >struct ethtool_drvinfo *drvinfo) > { > -- > 2.14.4 >
Re: [PATCH bpf] xdp: add NULL pointer check in __xdp_return()
On Thu, 26 Jul 2018 14:13:20 +0200, Björn Töpel wrote: > Den mån 23 juli 2018 kl 21:58 skrev Jakub Kicinski: > > On Mon, 23 Jul 2018 11:39:36 +0200, Björn Töpel wrote: > > > Den fre 20 juli 2018 kl 22:08 skrev Jakub Kicinski: > > > > On Fri, 20 Jul 2018 10:18:21 -0700, Martin KaFai Lau wrote: > > > > > On Sat, Jul 21, 2018 at 01:04:45AM +0900, Taehee Yoo wrote: > > > > > > rhashtable_lookup() can return NULL. so that NULL pointer > > > > > > check routine should be added. > > > > > > > > > > > > Fixes: 02b55e5657c3 ("xdp: add MEM_TYPE_ZERO_COPY") > > > > > > Signed-off-by: Taehee Yoo > > > > > > --- > > > > > > net/core/xdp.c | 3 ++- > > > > > > 1 file changed, 2 insertions(+), 1 deletion(-) > > > > > > > > > > > > diff --git a/net/core/xdp.c b/net/core/xdp.c > > > > > > index 9d1f220..1c12bc7 100644 > > > > > > --- a/net/core/xdp.c > > > > > > +++ b/net/core/xdp.c > > > > > > @@ -345,7 +345,8 @@ static void __xdp_return(void *data, struct > > > > > > xdp_mem_info *mem, bool napi_direct, > > > > > > rcu_read_lock(); > > > > > > /* mem->id is valid, checked in > > > > > > xdp_rxq_info_reg_mem_model() */ > > > > > > xa = rhashtable_lookup(mem_id_ht, &mem->id, > > > > > > mem_id_rht_params); > > > > > > - xa->zc_alloc->free(xa->zc_alloc, handle); > > > > > > + if (xa) > > > > > > + xa->zc_alloc->free(xa->zc_alloc, handle); > > > > > hmm...It is not clear to me the "!xa" case don't have to be handled? > > > > > > > > Actually I have a more fundamental question about this interface I've > > > > been meaning to ask. > > > > > > > > IIUC free() can happen on any CPU at any time, when whatever device, > > > > socket or CPU this got redirected to completed the TX. IOW there may > > > > be multiple producers. Drivers would need to create spin lock a'la the > > > > a9744f7ca200 ("xsk: fix potential race in SKB TX completion code") fix? > > > > > > > > > > Jakub, apologies for the slow response. I'm still in > > > "holiday/hammock&beer mode", but will be back in a week. 
:-P > > > > Ah, sorry to interrupt! :) > > > > Don't make it a habit! ;-P :-D > > > The idea with the xdp_return_* functions are that an xdp_buff and > > > xdp_frame can have custom allocations schemes. The difference beween > > > struct xdp_buff and struct xdp_frame is lifetime. The xdp_buff > > > lifetime is within the napi context, whereas xdp_frame can have a > > > lifetime longer/outside the napi context. E.g. for a XDP_REDIRECT > > > scenario an xdp_buff is converted to a xdp_frame. The conversion is > > > done in include/net/xdp.h:convert_to_xdp_frame. > > > > > > Currently, the zero-copy MEM_TYPE_ZERO_COPY memtype can *only* be used > > > for xdp_buff, meaning that the lifetime is constrained to a napi > > > context. Further, given an xdp_buff with memtype MEM_TYPE_ZERO_COPY, > > > doing XDP_REDIRECT to a target that is *not* an AF_XDP socket would > > > mean converting the xdp_buff to an xdp_frame. The xdp_frame can then > > > be free'd on any CPU. > > > > > > Note that the xsk_rcv* functions is always called from an napi > > > context, and therefore is using the xdp_return_buff calls. > > > > > > To answer your question -- no, this fix is *not* needed, because the > > > xdp_buff napi constrained, and the xdp_buff will only be free'd on one > > > CPU. > > > > Oh, thanks, I missed the check in convert_to_xdp_frame(), so the only > > frames which can come back via the free path are out of the error path > > in __xsk_rcv_zc()? > > > > That path looks a little surprising too, isn't the expectation that if > > xdp_do_redirect() returns an error the driver retains the ownership of > > the buffer? > > > > static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) > > { > > int err = xskq_produce_batch_desc(xs->rx, (u64)xdp->handle, len); > > > > if (err) { > > xdp_return_buff(xdp); > > xs->rx_dropped++; > > } > > > > return err; > > } > > > > This seems to call xdp_return_buff() *and* return an error. > > > > Ugh, this is indeed an error. 
The xdp_return buff should be removed. > Thanks for spotting this! > > So, yes, the way to get the buffer back (in ZC) to the driver is via > the error path (recycling) or via the UMEM fill queue. Okay, I'm gonna test and submit this later today for bpf tree: diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 72335c2e8108..4e937cd7c17d 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -84,10 +84,8 @@ static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { int err = xskq_produce_batch_desc(xs->rx, (u64)xdp->handle, len); - if (err) { - xdp_return_buff(xdp); + if (err) xs->rx_dropped++; - } return err; } Now the frame return/ZC allocator ->free path is all dead code :S
(no subject)
-- Do you need a loan at 2% interest rate for any reason ? Every interested applicant should contact us via the below contact details. E-mails: fritzmicloanf...@financier.com firtzmicloanf...@gmail.com Yours Sincerely Fritz Micheal.
[net-next 1/6] ixgbe: Do not allow LRO or MTU change with XDP
From: Tony Nguyen XDP does not support jumbo frames or LRO. These checks are being made outside the driver when an XDP program is loaded, however, there is nothing preventing these from changing after an XDP program is loaded. Add the checks so that while an XDP program is loaded, do not allow MTU to be changed or LRO to be enabled. Signed-off-by: Tony Nguyen Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 10 ++ 1 file changed, 10 insertions(+) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 5a6600f7b382..c42256e91997 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -6469,6 +6469,11 @@ static int ixgbe_change_mtu(struct net_device *netdev, int new_mtu) { struct ixgbe_adapter *adapter = netdev_priv(netdev); + if (adapter->xdp_prog) { + e_warn(probe, "MTU cannot be changed while XDP program is loaded\n"); + return -EPERM; + } + /* * For 82599EB we cannot allow legacy VFs to enable their receive * paths when MTU greater than 1500 is configured. So display a @@ -9407,6 +9412,11 @@ static netdev_features_t ixgbe_fix_features(struct net_device *netdev, if (!(adapter->flags2 & IXGBE_FLAG2_RSC_CAPABLE)) features &= ~NETIF_F_LRO; + if (adapter->xdp_prog && (features & NETIF_F_LRO)) { + e_dev_err("LRO is not supported with XDP\n"); + features &= ~NETIF_F_LRO; + } + return features; } -- 2.17.1
[net-next 0/6][pull request] 10GbE Intel Wired LAN Driver Updates 2018-07-26
This series contains updates to ixgbe and igb. Tony fixes ixgbe to add checks to ensure jumbo frames or LRO do not get enabled after an XDP program is loaded. Shannon Nelson adds the missing security configuration registers to the ixgbe register dump, which will help in debugging. Christian Grönke fixes an issue in igb that occurs on SGMII based SFP modules, by reverting changes from 2 previous patches. The issue was that initialization would fail on the aforementioned modules because the driver would try to reset the PHY before obtaining the PHY address of the SGMII attached PHY. Venkatesh Srinivas replaces wmb() with dma_wmb() for doorbell writes, which avoids SFENCEs before the doorbell writes. Alex cleans up and refactors ixgbe Tx/Rx shutdown to reduce time needed to stop the device. The code refactor allows us to take the completion time into account when disabling queues, so that on some platforms with higher completion times, this would not result in receive queues disabled messages. The following are changes since commit dc66fe43b7ebdb53628dcbc1f8f15de3e000aacf: rds: send: Fix dead code in rds_sendmsg and are available in the git repository at: git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-queue 10GbE Alexander Duyck (2): ixgbe: Reorder Tx/Rx shutdown to reduce time needed to stop device ixgbe: Refactor queue disable logic to take completion time into account Christian Grönke (1): igb: Remove superfluous reset to PHY and page 0 selection Shannon Nelson (1): ixgbe: add ipsec security registers into ethtool register dump Tony Nguyen (1): ixgbe: Do not allow LRO or MTU change with XDP Venkatesh Srinivas (1): igb: Use dma_wmb() instead of wmb() before doorbell writes drivers/net/ethernet/intel/igb/e1000_82575.c | 12 - drivers/net/ethernet/intel/igb/igb_main.c | 4 +- drivers/net/ethernet/intel/ixgbe/ixgbe.h | 3 +- .../net/ethernet/intel/ixgbe/ixgbe_ethtool.c | 42 +-- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 300 ++ 5 files changed, 250 insertions(+), 111 
deletions(-) -- 2.17.1
[net-next 5/6] ixgbe: Reorder Tx/Rx shutdown to reduce time needed to stop device
From: Alexander Duyck This change is meant to help reduce the time needed to shutdown the transmit and receive paths for the device. Specifically what we now do after this patch is disable the transmit path first at the netdev level, and then work on disabling the Rx. This way while we are waiting on the Rx queues to be disabled the Tx queues have an opportunity to drain out. In addition I have dropped the 10ms timeout that was left in the ixgbe_down function that seems to have been carried through from back in e1000 as far as I can tell. We shouldn't need it since we don't actually disable the Tx until much later and we have additional logic in place for verifying the Tx queues have been disabled. Signed-off-by: Alexander Duyck Tested-by: Don Buchholz Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 14 +++--- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index c42256e91997..aa4f05c36260 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -5814,6 +5814,13 @@ void ixgbe_down(struct ixgbe_adapter *adapter) if (test_and_set_bit(__IXGBE_DOWN, &adapter->state)) return; /* do nothing if already down */ + /* Shut off incoming Tx traffic */ + netif_tx_stop_all_queues(netdev); + + /* call carrier off first to avoid false dev_watchdog timeouts */ + netif_carrier_off(netdev); + netif_tx_disable(netdev); + /* disable receives */ hw->mac.ops.disable_rx(hw); @@ -5822,16 +5829,9 @@ void ixgbe_down(struct ixgbe_adapter *adapter) /* this call also flushes the previous write */ ixgbe_disable_rx_queue(adapter, adapter->rx_ring[i]); - usleep_range(1, 2); - /* synchronize_sched() needed for pending XDP buffers to drain */ if (adapter->xdp_ring[0]) synchronize_sched(); - netif_tx_stop_all_queues(netdev); - - /* call carrier off first to avoid false dev_watchdog timeouts */ - 
netif_carrier_off(netdev); - netif_tx_disable(netdev); ixgbe_irq_disable(adapter); -- 2.17.1
[net-next 3/6] igb: Remove superfluous reset to PHY and page 0 selection
From: Christian Grönke This patch reverts two previous applied patches to fix an issue that appeared when using SGMII based SFP modules. In the current state the driver will try to reset the PHY before obtaining the phy_addr of the SGMII attached PHY. That leads to an error in e1000_write_phy_reg_sgmii_82575. Causing the initialization to fail: igb: Intel(R) Gigabit Ethernet Network Driver - version 5.4.0-k igb: Copyright (c) 2007-2014 Intel Corporation. igb: probe of :??:??.? failed with error -3 The patches being reverted are: commit 182785335447957409282ca745aa5bc3968facee Author: Aaron Sierra Date: Tue Nov 29 10:03:56 2016 -0600 igb: reset the PHY before reading the PHY ID commit 440aeca4b9858248d8f16d724d9fa87a4f65fa33 Author: Matwey V Kornilov Date: Thu Nov 24 13:32:48 2016 +0300 igb: Explicitly select page 0 at initialization The first reverted patch directly causes the problem mentioned above. In case of SGMII the phy_addr is not known at this point and will only be obtained by 'igb_get_phy_id_82575' further down in the code. The second removed patch selects forces selection of page 0 in the PHY. Something that the reset tries to address as well. As pointed out by Alexander Duzck, the patch below fixes the same issue but in the proper location: commit 4e684f59d760a2c7c716bb60190783546e2d08a1 Author: Chris J Arges Date: Wed Nov 2 09:13:42 2016 -0500 igb: Workaround for igb i210 firmware issue Reverts: 440aeca4b9858248d8f16d724d9fa87a4f65fa33. Reverts: 182785335447957409282ca745aa5bc3968facee. 
Signed-off-by: Christian Grönke Reviewed-by: Alexander Duyck Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igb/e1000_82575.c | 12 1 file changed, 12 deletions(-) diff --git a/drivers/net/ethernet/intel/igb/e1000_82575.c b/drivers/net/ethernet/intel/igb/e1000_82575.c index b13b42e5a1d9..a795c07d0df7 100644 --- a/drivers/net/ethernet/intel/igb/e1000_82575.c +++ b/drivers/net/ethernet/intel/igb/e1000_82575.c @@ -225,19 +225,7 @@ static s32 igb_init_phy_params_82575(struct e1000_hw *hw) hw->bus.func = (rd32(E1000_STATUS) & E1000_STATUS_FUNC_MASK) >> E1000_STATUS_FUNC_SHIFT; - /* Make sure the PHY is in a good state. Several people have reported -* firmware leaving the PHY's page select register set to something -* other than the default of zero, which causes the PHY ID read to -* access something other than the intended register. -*/ - ret_val = hw->phy.ops.reset(hw); - if (ret_val) { - hw_dbg("Error resetting the PHY.\n"); - goto out; - } - /* Set phy->phy_addr and phy->id. */ - igb_write_phy_reg_82580(hw, I347AT4_PAGE_SELECT, 0); ret_val = igb_get_phy_id_82575(hw); if (ret_val) return ret_val; -- 2.17.1
[net-next 2/6] ixgbe: add ipsec security registers into ethtool register dump
From: Shannon Nelson Add the ixgbe's security configuration registers into the register dump. Signed-off-by: Shannon Nelson Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c | 10 +- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c index bd1ba88ec1d5..1d688840cd6c 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c @@ -511,7 +511,7 @@ static void ixgbe_set_msglevel(struct net_device *netdev, u32 data) static int ixgbe_get_regs_len(struct net_device *netdev) { -#define IXGBE_REGS_LEN 1139 +#define IXGBE_REGS_LEN 1145 return IXGBE_REGS_LEN * sizeof(u32); } @@ -874,6 +874,14 @@ static void ixgbe_get_regs(struct net_device *netdev, /* X540 specific DCB registers */ regs_buff[1137] = IXGBE_READ_REG(hw, IXGBE_RTTQCNCR); regs_buff[1138] = IXGBE_READ_REG(hw, IXGBE_RTTQCNTG); + + /* Security config registers */ + regs_buff[1139] = IXGBE_READ_REG(hw, IXGBE_SECTXCTRL); + regs_buff[1140] = IXGBE_READ_REG(hw, IXGBE_SECTXSTAT); + regs_buff[1141] = IXGBE_READ_REG(hw, IXGBE_SECTXBUFFAF); + regs_buff[1142] = IXGBE_READ_REG(hw, IXGBE_SECTXMINIFG); + regs_buff[1143] = IXGBE_READ_REG(hw, IXGBE_SECRXCTRL); + regs_buff[1144] = IXGBE_READ_REG(hw, IXGBE_SECRXSTAT); } static int ixgbe_get_eeprom_len(struct net_device *netdev) -- 2.17.1
[net-next 4/6] igb: Use dma_wmb() instead of wmb() before doorbell writes
From: Venkatesh Srinivas igb writes to doorbells to post transmit and receive descriptors; after writing descriptors to memory but before writing to doorbells, use dma_wmb() rather than wmb(). wmb() is more heavyweight than necessary before doorbell writes. On x86, this avoids SFENCEs before doorbell writes in both the tx and rx refill paths. Signed-off-by: Venkatesh Srinivas Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igb/igb_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index e3a0c02721c9..25720d95d4ea 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -6031,7 +6031,7 @@ static int igb_tx_map(struct igb_ring *tx_ring, * We also need this memory barrier to make certain all of the * status bits have been updated before next_to_watch is written. */ - wmb(); + dma_wmb(); /* set next_to_watch value indicating a packet is present */ first->next_to_watch = tx_desc; @@ -8531,7 +8531,7 @@ void igb_alloc_rx_buffers(struct igb_ring *rx_ring, u16 cleaned_count) * applicable for weak-ordered memory model archs, * such as IA-64). */ - wmb(); + dma_wmb(); writel(i, rx_ring->tail); } } -- 2.17.1
[PATCH v1 net-next] igb: Use an advanced ctx descriptor for launchtime
On i210, Launchtime (TxTime) requires the usage of an "Advanced Transmit Context Descriptor" for retrieving the timestamp of a packet. The igb driver correctly builds such descriptor on the segmentation flow (i.e. igb_tso()) or on the checksum one (i.e. igb_tx_csum()), but the feature is broken for AF_PACKET if the IGB_TX_FLAGS_VLAN is not set, which happens due to an early return on igb_tx_csum(). This flag is only set by the kernel when a VLAN interface is used, thus we can't just rely on it. Here we are fixing this issue by checking if launchtime is enabled for the current tx_ring before performing the early return. Signed-off-by: Jesus Sanchez-Palencia --- drivers/net/ethernet/intel/igb/igb_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index e3a0c02721c9..fa1089defcd5 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -5816,7 +5816,8 @@ static void igb_tx_csum(struct igb_ring *tx_ring, struct igb_tx_buffer *first) if (skb->ip_summed != CHECKSUM_PARTIAL) { csum_failed: - if (!(first->tx_flags & IGB_TX_FLAGS_VLAN)) + if (!(first->tx_flags & IGB_TX_FLAGS_VLAN) + && !tx_ring->launchtime_enable) return; goto no_csum; } -- 2.18.0
[PATCH bpf] bpf: btf: Use exact btf value_size match in map_check_btf()
The current map_check_btf() in BPF_MAP_TYPE_ARRAY rejects '> map->value_size' to ensure map_seq_show_elem() will not access things beyond an array element. Yonghong suggested that using '!=' is a more correct check. The 8 bytes round_up on value_size is stored in array->elem_size. Hence, using '!=' on map->value_size is a proper check. This patch also adds new tests to check the btf array key type and value type. Two of these new tests verify the btf's value_size (the change in this patch). It also fixes two existing tests that wrongly encoded a btf's type size (pprint_test) and the value_type_id (in one of the raw_tests[]). However, that do not affect these two BTF verification tests before or after this test changes. These two tests mainly failed at array creation time after this patch. Fixes: a26ca7c982cb ("bpf: btf: Add pretty print support to the basic arraymap") Suggested-by: Yonghong Song Acked-by: Yonghong Song Signed-off-by: Martin KaFai Lau --- kernel/bpf/arraymap.c | 2 +- tools/testing/selftests/bpf/test_btf.c | 86 +- 2 files changed, 85 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 544e58f5f642..2aa55d030c77 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -378,7 +378,7 @@ static int array_map_check_btf(const struct bpf_map *map, const struct btf *btf, return -EINVAL; value_type = btf_type_id_size(btf, &btf_value_id, &value_size); - if (!value_type || value_size > map->value_size) + if (!value_type || value_size != map->value_size) return -EINVAL; return 0; diff --git a/tools/testing/selftests/bpf/test_btf.c b/tools/testing/selftests/bpf/test_btf.c index 402c0f7cc418..ffdd27737c9e 100644 --- a/tools/testing/selftests/bpf/test_btf.c +++ b/tools/testing/selftests/bpf/test_btf.c @@ -507,7 +507,7 @@ static struct btf_raw_test raw_tests[] = { .key_size = sizeof(int), .value_size = sizeof(void *) * 4, .key_type_id = 1, - .value_type_id = 4, + .value_type_id = 5, .max_entries = 4, }, @@ -1292,6 
+1292,88 @@ static struct btf_raw_test raw_tests[] = { .err_str = "type != 0", }, +{ + .descr = "arraymap invalid btf key (a bit field)", + .raw_types = { + /* int */ /* [1] */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), + /* 32 bit int with 32 bit offset */ /* [2] */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 32, 32, 8), + BTF_END_RAW, + }, + .str_sec = "", + .str_sec_size = sizeof(""), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "array_map_check_btf", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 2, + .value_type_id = 1, + .max_entries = 4, + .map_create_err = true, +}, + +{ + .descr = "arraymap invalid btf key (!= 32 bits)", + .raw_types = { + /* int */ /* [1] */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), + /* 16 bit int with 0 bit offset */ /* [2] */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 16, 2), + BTF_END_RAW, + }, + .str_sec = "", + .str_sec_size = sizeof(""), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "array_map_check_btf", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 2, + .value_type_id = 1, + .max_entries = 4, + .map_create_err = true, +}, + +{ + .descr = "arraymap invalid btf value (too small)", + .raw_types = { + /* int */ /* [1] */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), + BTF_END_RAW, + }, + .str_sec = "", + .str_sec_size = sizeof(""), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "array_map_check_btf", + .key_size = sizeof(int), + /* btf_value_size < map->value_size */ + .value_size = sizeof(__u64), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, + .map_create_err = true, +}, + +{ + .descr = "arraymap invalid btf value (too big)", + .raw_types = { + /* int */ /* [1] */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), + BTF_END_RAW, + }, + .str_sec = "", + .str_sec_size = sizeof(""), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "array_map_check_btf", + .key_size = sizeof(int), + /* btf_value_size > map->value_size */ + .value_size = sizeof(__u16), + 
.key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, + .map_create_err = true, +}, + }; /* struct btf_raw_test raw_tests[] */ static con
Re: [PATCH] bpf: verifier: BPF_MOV don't mark dst reg if src == dst
Oops, gmail seems to have mangled everything. Will resend using git send-email. I've added the test cases for mov64, but I'm not sure of the expected mov32 behavior. Currently coerce_reg_to_size() is called after mark_reg_unknown(), which sets the bounds to 64bits. coerce_reg_to_size() resets the bounds again, as they're too "wide" to fit the new size. It sets SMIN = UMIN = 0, which seems weird. Shouldn't SMIN be 1 << (size * 8 - 1)? Same applies for SMAX. Should mov32 always mark the dst as unbounded? On Thu, Jul 26, 2018 at 1:42 AM, Daniel Borkmann wrote: > On 07/26/2018 12:08 AM, Arthur Fabre wrote: > > When check_alu_op() handles a BPF_MOV between two registers, > > it calls check_reg_arg() on the dst register, marking it as unbounded. > > If the src and dst register are the same, this marks the src as > > unbounded, which can lead to unexpected errors for further checks that > > rely on bounds info. > > > > check_alu_op() now only marks the dst register as unbounded if it > > different from the src register. > > > > Signed-off-by: Arthur Fabre > > --- > > kernel/bpf/verifier.c | 5 +++-- > > 1 file changed, 3 insertions(+), 2 deletions(-) > > > > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c > > index 63aaac52a265..ddfe3c544a80 100644 > > --- a/kernel/bpf/verifier.c > > +++ b/kernel/bpf/verifier.c > > @@ -3238,8 +3238,9 @@ static int check_alu_op(struct bpf_verifier_env > > *env, struct bpf_insn *insn) > > } > > } > > > > - /* check dest operand */ > > - err = check_reg_arg(env, insn->dst_reg, DST_OP); > > + /* check dest operand, only mark if dest != src */ > > + err = check_reg_arg(env, insn->dst_reg, > > + insn->dst_reg == insn->src_reg ? > > DST_OP_NO_MARK : DST_OP); > > if (err) > > return err; > > > > Thanks a lot for the patch! Looks like it's corrupted wrt newline. 
> > Please also add test cases to tools/testing/selftests/bpf/test_verifier.c > for the cases of mov64 and mov32 where in each src==dst and src!=dst; mov32 > should mark it as unbounded but not former, so would be good to keep > tracking > that in selftests. >
Re: [patch net-next] selftests: forwarding: add tests for TC chain get and dump operations
From: Jiri Pirko Date: Thu, 26 Jul 2018 11:38:34 +0200 > From: Jiri Pirko > > Signed-off-by: Jiri Pirko Applied, thanks Jiri.
[patch net-next RFC] net: sched: don't dump chains only held by actions
From: Jiri Pirko In case a chain is empty and not explicitly created by a user, such chain should not exist. The only exception is if there is an action "goto chain" pointing to it. In that case, don't show the chain in the dump. Track the chain references held by actions and use them to find out if a chain should or should not be shown in chain dump. Signed-off-by: Jiri Pirko --- include/net/pkt_cls.h | 3 +++ include/net/sch_generic.h | 1 + net/sched/act_api.c | 4 +-- net/sched/cls_api.c | 68 +++ 4 files changed, 63 insertions(+), 13 deletions(-) diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index a3101582f642..6d02f31abba8 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -39,7 +39,10 @@ bool tcf_queue_work(struct rcu_work *rwork, work_func_t func); #ifdef CONFIG_NET_CLS struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, bool create); +struct tcf_chain *tcf_chain_get_by_act(struct tcf_block *block, + u32 chain_index); void tcf_chain_put(struct tcf_chain *chain); +void tcf_chain_put_by_act(struct tcf_chain *chain); void tcf_block_netif_keep_dst(struct tcf_block *block); int tcf_block_get(struct tcf_block **p_block, struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 085c509c8674..c5432362dc26 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -314,6 +314,7 @@ struct tcf_chain { struct tcf_block *block; u32 index; /* chain index */ unsigned int refcnt; + unsigned int action_refcnt; bool explicitly_created; const struct tcf_proto_ops *tmplt_ops; void *tmplt_priv; diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 148a89ab789b..b43df1e25c6d 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -36,7 +36,7 @@ static int tcf_action_goto_chain_init(struct tc_action *a, struct tcf_proto *tp) if (!tp) return -EINVAL; - a->goto_chain = tcf_chain_get(tp->chain->block, chain_index, true); + a->goto_chain = 
tcf_chain_get_by_act(tp->chain->block, chain_index); if (!a->goto_chain) return -ENOMEM; return 0; @@ -44,7 +44,7 @@ static int tcf_action_goto_chain_init(struct tc_action *a, struct tcf_proto *tp) static void tcf_action_goto_chain_fini(struct tc_action *a) { - tcf_chain_put(a->goto_chain); + tcf_chain_put_by_act(a->goto_chain); } static void tcf_action_goto_chain_exec(const struct tc_action *a, diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 75cce2819de9..76035cd6e3bf 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -262,6 +262,25 @@ static void tcf_chain_hold(struct tcf_chain *chain) ++chain->refcnt; } +static void tcf_chain_hold_by_act(struct tcf_chain *chain) +{ + ++chain->action_refcnt; +} + +static void tcf_chain_release_by_act(struct tcf_chain *chain) +{ + --chain->action_refcnt; +} + +static bool tcf_chain_is_zombie(struct tcf_chain *chain) +{ + /* In case all the references are action references, this +* chain is a zombie and should not be listed in the chain +* dump list. 
+*/ + return chain->refcnt == chain->action_refcnt; +} + static struct tcf_chain *tcf_chain_lookup(struct tcf_block *block, u32 chain_index) { @@ -298,6 +317,15 @@ struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, } EXPORT_SYMBOL(tcf_chain_get); +struct tcf_chain *tcf_chain_get_by_act(struct tcf_block *block, u32 chain_index) +{ + struct tcf_chain *chain = tcf_chain_get(block, chain_index, true); + + tcf_chain_hold_by_act(chain); + return chain; +} +EXPORT_SYMBOL(tcf_chain_get_by_act); + static void tc_chain_tmplt_del(struct tcf_chain *chain); void tcf_chain_put(struct tcf_chain *chain) @@ -310,6 +338,13 @@ void tcf_chain_put(struct tcf_chain *chain) } EXPORT_SYMBOL(tcf_chain_put); +void tcf_chain_put_by_act(struct tcf_chain *chain) +{ + tcf_chain_release_by_act(chain); + tcf_chain_put(chain); +} +EXPORT_SYMBOL(tcf_chain_put_by_act); + static void tcf_chain_put_explicitly_created(struct tcf_chain *chain) { if (chain->explicitly_created) @@ -1803,17 +1838,26 @@ static int tc_ctl_chain(struct sk_buff *skb, struct nlmsghdr *n, chain = tcf_chain_lookup(block, chain_index); if (n->nlmsg_type == RTM_NEWCHAIN) { if (chain) { - NL_SET_ERR_MSG(extack, "Filter chain already exists"); - return -EEXIST; - } - if (!(n->nlmsg_flags & NLM_F_CREATE)) { - NL_SET_ERR_MSG(extack, "Need both RTM_NEWCHAIN and NLM_F_CREATE to create a new chain"); -
[patch net-next] net: sched: unmark chain as explicitly created on delete
From: Jiri Pirko Once user manually deletes the chain using "chain del", the chain cannot be marked as explicitly created anymore. Signed-off-by: Jiri Pirko Fixes: 32a4f5ecd738 ("net: sched: introduce chain object to uapi") --- net/sched/cls_api.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index f3d78c23338e..75cce2819de9 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1844,6 +1844,7 @@ static int tc_ctl_chain(struct sk_buff *skb, struct nlmsghdr *n, * to the chain previously taken during addition. */ tcf_chain_put_explicitly_created(chain); + chain->explicitly_created = false; break; case RTM_GETCHAIN: err = tc_chain_notify(chain, skb, n->nlmsg_seq, -- 2.14.4
Re: [PATCH V3 bpf] xdp: add NULL pointer check in __xdp_return()
Den tors 26 juli 2018 kl 16:18 skrev Taehee Yoo : > > rhashtable_lookup() can return NULL. so that NULL pointer > check routine should be added. > Thanks Taehee! Acked-by: Björn Töpel > Fixes: 02b55e5657c3 ("xdp: add MEM_TYPE_ZERO_COPY") > Acked-by: Martin KaFai Lau > Signed-off-by: Taehee Yoo > --- > V3 : reduce code line > V2 : add WARN_ON_ONCE when xa is NULL. > net/core/xdp.c | 3 ++- > 1 file changed, 2 insertions(+), 1 deletion(-) > > diff --git a/net/core/xdp.c b/net/core/xdp.c > index 9d1f220..6771f18 100644 > --- a/net/core/xdp.c > +++ b/net/core/xdp.c > @@ -345,7 +345,8 @@ static void __xdp_return(void *data, struct xdp_mem_info > *mem, bool napi_direct, > rcu_read_lock(); > /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() > */ > xa = rhashtable_lookup(mem_id_ht, &mem->id, > mem_id_rht_params); > - xa->zc_alloc->free(xa->zc_alloc, handle); > + if (!WARN_ON_ONCE(!xa)) > + xa->zc_alloc->free(xa->zc_alloc, handle); > rcu_read_unlock(); > default: > /* Not possible, checked in xdp_rxq_info_reg_mem_model() */ > -- > 2.9.3 >
Re: [Query]: DSA Understanding
> I am bit confused on how dsa needs to be actually working, > Q's > 1] should I be running a dhcp server on eth1 (where switch is connected) > so that devices connected on lan* devices get an ip ? Nope. You need eth1 up, but otherwise you do not use it. Use the lanX interfaces like normal Linux interfaces. Run your dhclient on lanX, etc. > > 2] From the device where switch is connected if the cpu port wants to send >any data to any other user ports lan* how do i do it (just open > socket on eth1 or lan*) ? Just treat the lanX interfaces as normal Linux interfaces. Andrew
Re: [Query]: DSA Understanding
On Thu, Jul 26, 2018 at 3:08 PM, Andrew Lunn wrote: >> Yes I am using fixed phy on slave1, following is my dts: > > Posting the original DTS file is better, not the decompiled version. > My bad will take care of it next time. >> >> ethernet@48484000 { >> compatible = "ti,dra7-cpsw", "ti,cpsw"; >> ti,hwmods = "gmac"; >> clocks = <0x124 0x125>; >> clock-names = "fck", "cpts"; >> cpdma_channels = <0x8>; >> ale_entries = <0x400>; >> bd_ram_size = <0x2000>; >> mac_control = <0x20>; >> slaves = <0x2>; >> active_slave = <0x0>; >> cpts_clock_mult = <0x784cfe14>; >> cpts_clock_shift = <0x1d>; >> reg = <0x48484000 0x1000 0x48485200 0x2e00>; >> #address-cells = <0x1>; >> #size-cells = <0x1>; >> ti,no-idle; >> interrupts = <0x0 0x14e 0x4 0x0 0x14f 0x4 0x0 0x150 0x4 >> 0x0 0x151 0x4>; >> ranges; >> syscon = <0x8>; >> status = "okay"; >> pinctrl-names = "default", "sleep"; >> pinctrl-0 = <0x126 0x127>; >> pinctrl-1 = <0x128 0x129>; >> dual_emac; >> linux,phandle = <0x500>; >> phandle = <0x500>; > > So here is 0x500 > >> >> mdio@48485000 { >> compatible = "ti,cpsw-mdio", "ti,davinci_mdio"; >> #address-cells = <0x1>; >> #size-cells = <0x0>; >> ti,hwmods = "davinci_mdio"; >> bus_freq = <0xf4240>; >> reg = <0x48485000 0x100>; >> status = "okay"; >> pinctrl-names = "default", "sleep"; >> pinctrl-0 = <0x12a>; >> pinctrl-1 = <0x12b>; >> >> ethernet-phy@1 { >> reg = <0x1>; >> linux,phandle = <0x12c>; >> phandle = <0x12c>; >> }; >> }; >> >> slave@48480200 { >> mac-address = [00 00 00 00 00 00]; >> status = "okay"; >> phy-handle = <0x12c>; >> phy-mode = "rgmii"; >> dual_emac_res_vlan = <0x1>; >> }; >> >> slave@48480300 { >> mac-address = [00 00 00 00 00 00]; >> status = "okay"; >> phy-mode = "rgmii"; >> dual_emac_res_vlan = <0x2>; >> linux,phandle = <0xf3>; >> phandle = <0xf3>; > > This is the actual interface you are using and it has a phandle of > 0xf3. > The dsa of_find_net_device_by_node() looks for ethernet device node, but the above node is for the cpsw slave. 
I tried adding 0xf3 and got probe defer, that is the reason I added 0x500 (mdio) reference to cpu switch and patched to pick up eth1 instead of eth0. >> >> fixed-link { >> speed = <0x3e8>; >> full-duplex; >> }; >> }; >> >> cpsw-phy-sel@4a002554 { >> compatible = "ti,dra7xx-cpsw-phy-sel"; >> reg = <0x4a002554 0x4>; >> reg-names = "gmii-sel"; >> }; >> }; >> >> spi@480ba000 { >> compatible = "ti,omap4-mcspi"; >> reg = <0x480ba000 0x200>; >> interrupts = <0x0 0x2b 0x4>; >> #address-cells = <0x1>; >> #size-cells = <0x0>; >> ti,hwmods = "mcspi4"; >> ti,spi-num-cs = <0x1>; >> dmas = <0xb2 0x46 0xb2 0x47>; >> dma-names = "tx0", "rx0"; >> status = "okay"; >> ti,pindir-d0-out-d1-in; >> >> ksz9477@0 { >> compatible = "microchip,ksz9477"; >> reg = <0x0>; >> spi-max-frequency = <0x2dc6c00>; >> spi-cpha; >> spi-cpol; >> >> ports { >> #address-cells = <0x1>; >> #size-cells = <0x0>; >> >> port@0 { >> reg = <0x0>; >> label = "lan0"; >> }; >> >> port@1 { >> reg = <0x1>; >> label = "lan1"; >> }; >> >> port@2 { >> reg = <0x2>; >> label = "lan2"; >> }; >> >> port@3 { >> reg = <0x3>; >> label = "lan3"; >> }; >> >> port@4 { >> reg = <0x4>; >> label = "lan4"; >> }; >> >> port@5 { >> reg = <0x5>; >> label = "cpu"; >> ethernet = <0x5
[PATCH net-next v2 1/2] tls: Remove dead code in tls_sw_sendmsg
tls_push_record either returns 0 on success or a negative value on failure. This patch removes code that would only be executed if tls_push_record were to return a positive value. Signed-off-by: Doron Roberts-Kedes --- net/tls/tls_sw.c | 6 ++ 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 03f1370f5db1..0279bc4a168b 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -423,12 +423,10 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) copied += try_to_copy; ret = tls_push_record(sk, msg->msg_flags, record_type); - if (!ret) - continue; - if (ret < 0) + if (ret) goto send_end; + continue; - copied -= try_to_copy; fallback_to_reg_send: iov_iter_revert(&msg->msg_iter, ctx->sg_plaintext_size - orig_size); -- 2.17.1
[PATCH net-next v2 2/2] tls: Fix improper revert in zerocopy_from_iter
The current code is problematic because the iov_iter is reverted and never advanced in the non-error case. This patch skips the revert in the non-error case. This patch also fixes the amount by which the iov_iter is reverted. Currently, iov_iter is reverted by size, which can be greater than the amount by which the iter was actually advanced. Instead, only revert by the amount that the iter was advanced. Fixes: 4718799817c5 ("tls: Fix zerocopy_from_iter iov handling") Signed-off-by: Doron Roberts-Kedes --- net/tls/tls_sw.c | 12 +--- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 0279bc4a168b..190b0aa79c85 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -263,7 +263,7 @@ static int zerocopy_from_iter(struct sock *sk, struct iov_iter *from, int length, int *pages_used, unsigned int *size_used, struct scatterlist *to, int to_max_pages, - bool charge, bool revert) + bool charge) { struct page *pages[MAX_SKB_FRAGS]; @@ -312,10 +312,10 @@ static int zerocopy_from_iter(struct sock *sk, struct iov_iter *from, } out: + if (rc) + iov_iter_revert(from, size - *size_used); *size_used = size; *pages_used = num_elem; - if (revert) - iov_iter_revert(from, size); return rc; } @@ -417,7 +417,7 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) &ctx->sg_plaintext_size, ctx->sg_plaintext_data, ARRAY_SIZE(ctx->sg_plaintext_data), - true, false); + true); if (ret) goto fallback_to_reg_send; @@ -428,8 +428,6 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) continue; fallback_to_reg_send: - iov_iter_revert(&msg->msg_iter, - ctx->sg_plaintext_size - orig_size); trim_sg(sk, ctx->sg_plaintext_data, &ctx->sg_plaintext_num_elem, &ctx->sg_plaintext_size, @@ -833,7 +831,7 @@ int tls_sw_recvmsg(struct sock *sk, err = zerocopy_from_iter(sk, &msg->msg_iter, to_copy, &pages, &chunk, &sgin[1], -MAX_SKB_FRAGS, false, true); +MAX_SKB_FRAGS, false); if (err < 0) goto fallback_to_reg_recv; -- 2.17.1
[PATCH net-next v2 0/2] tls: Fix improper revert in zerocopy_from_iter
This series fixes the improper iov_iter_revert introduced in "tls: Fix zerocopy_from_iter iov handling". Changes from v1: - call iov_iter_revert inside zerocopy_from_iter Doron Roberts-Kedes (2): tls: Remove dead code in tls_sw_sendmsg tls: Fix improper revert in zerocopy_from_iter net/tls/tls_sw.c | 18 +++--- 1 file changed, 7 insertions(+), 11 deletions(-) -- 2.17.1
Re: [PATCH rdma-next v2 0/8] Support mlx5 flow steering with RAW data
On Thu, Jul 26, 2018 at 07:35:49AM +0300, Leon Romanovsky wrote: > On Wed, Jul 25, 2018 at 08:35:17AM -0600, Jason Gunthorpe wrote: > > On Wed, Jul 25, 2018 at 08:37:03AM +0300, Leon Romanovsky wrote: > > > > > > Also, I would like to keep the specs consistently formatted according > > > > to clang-format with 'BinPackParameters: true', so I reflowed them as > > > > well. > > > > > > I'm using default VIM clang-format.py without anything in .clang-format. > > > Do you have an extra definitions there, except BinPackParameters? > > > > These days Linux includes a top level .clang-format that does a > > pretty good job. > > > > I have to manually switch BinPackParameters on when working with these > > specs to get the right indenting.. A pain, but maybe there is a better > > way someday.. > > I don't think that it is feasible to ask from people to change some > defaults only for patches that touch those specs. Any change in this > area will change formatting back. Eventually I think we might be able toadd a code comment to tell clang-format, but that would be down the road a bit.. > Jason, bottom line, I won't use BinPackParameters for my patches. Well, you can make sure the specs macro follows the required formatting code style by hand if you prefer.. But, I want to see them in this layout, so they are easier to maintain, not the haphazard layout we had before. Jason
[PATCH v5 bpf-next 9/9] veth: Support per queue XDP ring
From: Toshiaki Makita Move XDP and napi related fields in veth_priv to newly created veth_rq structure. When xdp_frames are enqueued from ndo_xdp_xmit and XDP_TX, rxq is selected by current cpu. When skbs are enqueued from the peer device, rxq is one to one mapping of its peer txq. This way we have a restriction that the number of rxqs must not less than the number of peer txqs, but leave the possibility to achieve bulk skb xmit in the future because txq lock would make it possible to remove rxq ptr_ring lock. v3: - Add extack messages. - Fix array overrun in veth_xmit. Signed-off-by: Toshiaki Makita --- drivers/net/veth.c | 278 - 1 file changed, 188 insertions(+), 90 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 60397a8ea2e9..3059b897ecea 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -42,20 +42,24 @@ struct pcpu_vstats { struct u64_stats_sync syncp; }; -struct veth_priv { +struct veth_rq { struct napi_struct xdp_napi; struct net_device *dev; struct bpf_prog __rcu *xdp_prog; - struct bpf_prog *_xdp_prog; - struct net_device __rcu *peer; - atomic64_t dropped; struct xdp_mem_info xdp_mem; - unsignedrequested_headroom; boolrx_notify_masked; struct ptr_ring xdp_ring; struct xdp_rxq_info xdp_rxq; }; +struct veth_priv { + struct net_device __rcu *peer; + atomic64_t dropped; + struct bpf_prog *_xdp_prog; + struct veth_rq *rq; + unsigned intrequested_headroom; +}; + /* * ethtool interface */ @@ -144,19 +148,19 @@ static void veth_ptr_free(void *ptr) kfree_skb(ptr); } -static void __veth_xdp_flush(struct veth_priv *priv) +static void __veth_xdp_flush(struct veth_rq *rq) { /* Write ptr_ring before reading rx_notify_masked */ smp_mb(); - if (!priv->rx_notify_masked) { - priv->rx_notify_masked = true; - napi_schedule(&priv->xdp_napi); + if (!rq->rx_notify_masked) { + rq->rx_notify_masked = true; + napi_schedule(&rq->xdp_napi); } } -static int veth_xdp_rx(struct veth_priv *priv, struct sk_buff *skb) +static int veth_xdp_rx(struct veth_rq 
*rq, struct sk_buff *skb) { - if (unlikely(ptr_ring_produce(&priv->xdp_ring, skb))) { + if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) { dev_kfree_skb_any(skb); return NET_RX_DROP; } @@ -164,21 +168,22 @@ static int veth_xdp_rx(struct veth_priv *priv, struct sk_buff *skb) return NET_RX_SUCCESS; } -static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb, bool xdp) +static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb, + struct veth_rq *rq, bool xdp) { - struct veth_priv *priv = netdev_priv(dev); - return __dev_forward_skb(dev, skb) ?: xdp ? - veth_xdp_rx(priv, skb) : + veth_xdp_rx(rq, skb) : netif_rx(skb); } static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) { struct veth_priv *rcv_priv, *priv = netdev_priv(dev); + struct veth_rq *rq = NULL; struct net_device *rcv; int length = skb->len; bool rcv_xdp = false; + int rxq; rcu_read_lock(); rcv = rcu_dereference(priv->peer); @@ -188,9 +193,15 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) } rcv_priv = netdev_priv(rcv); - rcv_xdp = rcu_access_pointer(rcv_priv->xdp_prog); + rxq = skb_get_queue_mapping(skb); + if (rxq < rcv->real_num_rx_queues) { + rq = &rcv_priv->rq[rxq]; + rcv_xdp = rcu_access_pointer(rq->xdp_prog); + if (rcv_xdp) + skb_record_rx_queue(skb, rxq); + } - if (likely(veth_forward_skb(rcv, skb, rcv_xdp) == NET_RX_SUCCESS)) { + if (likely(veth_forward_skb(rcv, skb, rq, rcv_xdp) == NET_RX_SUCCESS)) { struct pcpu_vstats *stats = this_cpu_ptr(dev->vstats); u64_stats_update_begin(&stats->syncp); @@ -203,7 +214,7 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) } if (rcv_xdp) - __veth_xdp_flush(rcv_priv); + __veth_xdp_flush(rq); rcu_read_unlock(); @@ -278,12 +289,18 @@ static struct sk_buff *veth_build_skb(void *head, int headroom, int len, return skb; } +static int veth_select_rxq(struct net_device *dev) +{ + return smp_processor_id() % dev->real_num_rx_queues; +} + static int 
veth_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, u32 flags) { struct veth_p
[PATCH v5 bpf-next 6/9] bpf: Make redirect_info accessible from modules
From: Toshiaki Makita We are going to add kern_flags field in redirect_info for kernel internal use. In order to avoid function call to access the flags, make redirect_info accessible from modules. Also as it is now non-static, add prefix bpf_ to redirect_info. Signed-off-by: Toshiaki Makita --- include/linux/filter.h | 10 ++ net/core/filter.c | 29 +++-- 2 files changed, 21 insertions(+), 18 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index c73dd7396886..4717af8b95e6 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -537,6 +537,16 @@ struct sk_msg_buff { struct list_head list; }; +struct bpf_redirect_info { + u32 ifindex; + u32 flags; + struct bpf_map *map; + struct bpf_map *map_to_flush; + unsigned long map_owner; +}; + +DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); + /* Compute the linear packet data range [data, data_end) which * will be accessed by various program types (cls_bpf, act_bpf, * lwt, ...). Subsystems allowing direct data access must (!) 
diff --git a/net/core/filter.c b/net/core/filter.c index 104d560946da..acf322296535 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2080,19 +2080,12 @@ static const struct bpf_func_proto bpf_clone_redirect_proto = { .arg3_type = ARG_ANYTHING, }; -struct redirect_info { - u32 ifindex; - u32 flags; - struct bpf_map *map; - struct bpf_map *map_to_flush; - unsigned long map_owner; -}; - -static DEFINE_PER_CPU(struct redirect_info, redirect_info); +DEFINE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); +EXPORT_SYMBOL_GPL(bpf_redirect_info); BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); if (unlikely(flags & ~(BPF_F_INGRESS))) return TC_ACT_SHOT; @@ -2105,7 +2098,7 @@ BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) int skb_do_redirect(struct sk_buff *skb) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); struct net_device *dev; dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->ifindex); @@ -3198,7 +3191,7 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, void xdp_do_flush_map(void) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); struct bpf_map *map = ri->map_to_flush; ri->map_to_flush = NULL; @@ -3243,7 +3236,7 @@ static inline bool xdp_map_invalid(const struct bpf_prog *xdp_prog, static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); unsigned long map_owner = ri->map_owner; struct bpf_map *map = ri->map; u32 index = ri->ifindex; @@ -3283,7 +3276,7 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, int xdp_do_redirect(struct net_device *dev, 
struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); struct net_device *fwd; u32 index = ri->ifindex; int err; @@ -3315,7 +3308,7 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); unsigned long map_owner = ri->map_owner; struct bpf_map *map = ri->map; u32 index = ri->ifindex; @@ -3366,7 +3359,7 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); u32 index = ri->ifindex; struct net_device *fwd; int err = 0; @@ -3397,7 +3390,7 @@ EXPORT_SYMBOL_GPL(xdp_do_generic_redirect); BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); if (unlikely(flags)) return XDP_ABORTED; @@ -3421,7 +3414,7 @@ static const struct bpf_func_proto bpf_xdp_redirect_proto = {
[PATCH v5 bpf-next 8/9] veth: Add XDP TX and REDIRECT
From: Toshiaki Makita This allows further redirection of xdp_frames like NIC -> veth--veth -> veth--veth (XDP) (XDP) (XDP) The intermediate XDP, redirecting packets from NIC to the other veth, reuses xdp_mem_info from NIC so that page recycling of the NIC works on the destination veth's XDP. In this way return_frame is not fully guarded by NAPI, since another NAPI handler on another cpu may use the same xdp_mem_info concurrently. Thus disable napi_direct by xdp_set_return_frame_no_direct() during the NAPI context. v4: - Use xdp_[set|clear]_return_frame_no_direct() instead of a flag in xdp_mem_info. v3: - Fix double free when veth_xdp_tx() returns a positive value. - Convert xdp_xmit and xdp_redir variables into flags. Signed-off-by: Toshiaki Makita --- drivers/net/veth.c | 119 + 1 file changed, 110 insertions(+), 9 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index acdb1c543f4b..60397a8ea2e9 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -32,6 +32,10 @@ #define VETH_RING_SIZE 256 #define VETH_XDP_HEADROOM (XDP_PACKET_HEADROOM + NET_IP_ALIGN) +/* Separating two types of XDP xmit */ +#define VETH_XDP_TXBIT(0) +#define VETH_XDP_REDIR BIT(1) + struct pcpu_vstats { u64 packets; u64 bytes; @@ -45,6 +49,7 @@ struct veth_priv { struct bpf_prog *_xdp_prog; struct net_device __rcu *peer; atomic64_t dropped; + struct xdp_mem_info xdp_mem; unsignedrequested_headroom; boolrx_notify_masked; struct ptr_ring xdp_ring; @@ -317,10 +322,42 @@ static int veth_xdp_xmit(struct net_device *dev, int n, return n - drops; } +static void veth_xdp_flush(struct net_device *dev) +{ + struct veth_priv *rcv_priv, *priv = netdev_priv(dev); + struct net_device *rcv; + + rcu_read_lock(); + rcv = rcu_dereference(priv->peer); + if (unlikely(!rcv)) + goto out; + + rcv_priv = netdev_priv(rcv); + /* xdp_ring is initialized on receive side? 
*/ + if (unlikely(!rcu_access_pointer(rcv_priv->xdp_prog))) + goto out; + + __veth_xdp_flush(rcv_priv); +out: + rcu_read_unlock(); +} + +static int veth_xdp_tx(struct net_device *dev, struct xdp_buff *xdp) +{ + struct xdp_frame *frame = convert_to_xdp_frame(xdp); + + if (unlikely(!frame)) + return -EOVERFLOW; + + return veth_xdp_xmit(dev, 1, &frame, 0); +} + static struct sk_buff *veth_xdp_rcv_one(struct veth_priv *priv, - struct xdp_frame *frame) + struct xdp_frame *frame, + unsigned int *xdp_xmit) { int len = frame->len, delta = 0; + struct xdp_frame orig_frame; struct bpf_prog *xdp_prog; unsigned int headroom; struct sk_buff *skb; @@ -344,6 +381,29 @@ static struct sk_buff *veth_xdp_rcv_one(struct veth_priv *priv, delta = frame->data - xdp.data; len = xdp.data_end - xdp.data; break; + case XDP_TX: + orig_frame = *frame; + xdp.data_hard_start = frame; + xdp.rxq->mem = frame->mem; + if (unlikely(veth_xdp_tx(priv->dev, &xdp) < 0)) { + trace_xdp_exception(priv->dev, xdp_prog, act); + frame = &orig_frame; + goto err_xdp; + } + *xdp_xmit |= VETH_XDP_TX; + rcu_read_unlock(); + goto xdp_xmit; + case XDP_REDIRECT: + orig_frame = *frame; + xdp.data_hard_start = frame; + xdp.rxq->mem = frame->mem; + if (xdp_do_redirect(priv->dev, &xdp, xdp_prog)) { + frame = &orig_frame; + goto err_xdp; + } + *xdp_xmit |= VETH_XDP_REDIR; + rcu_read_unlock(); + goto xdp_xmit; default: bpf_warn_invalid_xdp_action(act); case XDP_ABORTED: @@ -368,12 +428,13 @@ static struct sk_buff *veth_xdp_rcv_one(struct veth_priv *priv, err_xdp: rcu_read_unlock(); xdp_return_frame(frame); - +xdp_xmit: return NULL; } static struct sk_buff *veth_xdp_rcv_skb(struct veth_priv *priv, - struct sk_buff *skb) + struct sk_buff *skb, + unsigned int *xdp_xmit) {
[PATCH v5 bpf-next 5/9] veth: Add ndo_xdp_xmit
From: Toshiaki Makita This allows NIC's XDP to redirect packets to veth. The destination veth device enqueues redirected packets to the napi ring of its peer, then they are processed by XDP on its peer veth device. This can be thought as calling another XDP program by XDP program using REDIRECT, when the peer enables driver XDP. Note that when the peer veth device does not set driver xdp, redirected packets will be dropped because the peer is not ready for NAPI. v4: - Don't use xdp_ok_fwd_dev() because checking IFF_UP is not necessary. Add comments about it and check only MTU. v2: - Drop the part converting xdp_frame into skb when XDP is not enabled. - Implement bulk interface of ndo_xdp_xmit. - Implement XDP_XMIT_FLUSH bit and drop ndo_xdp_flush. Signed-off-by: Toshiaki Makita --- drivers/net/veth.c | 51 +++ 1 file changed, 51 insertions(+) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index ef22d991f678..acdb1c543f4b 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -125,6 +126,11 @@ static void *veth_ptr_to_xdp(void *ptr) return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG); } +static void *veth_xdp_to_ptr(void *ptr) +{ + return (void *)((unsigned long)ptr | VETH_XDP_FLAG); +} + static void veth_ptr_free(void *ptr) { if (veth_is_xdp_frame(ptr)) @@ -267,6 +273,50 @@ static struct sk_buff *veth_build_skb(void *head, int headroom, int len, return skb; } +static int veth_xdp_xmit(struct net_device *dev, int n, +struct xdp_frame **frames, u32 flags) +{ + struct veth_priv *rcv_priv, *priv = netdev_priv(dev); + struct net_device *rcv; + unsigned int max_len; + int i, drops = 0; + + if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) + return -EINVAL; + + rcv = rcu_dereference(priv->peer); + if (unlikely(!rcv)) + return -ENXIO; + + rcv_priv = netdev_priv(rcv); + /* Non-NULL xdp_prog ensures that xdp_ring is initialized on receive +* side. 
This means an XDP program is loaded on the peer and the peer +* device is up. +*/ + if (!rcu_access_pointer(rcv_priv->xdp_prog)) + return -ENXIO; + + max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN; + + spin_lock(&rcv_priv->xdp_ring.producer_lock); + for (i = 0; i < n; i++) { + struct xdp_frame *frame = frames[i]; + void *ptr = veth_xdp_to_ptr(frame); + + if (unlikely(frame->len > max_len || +__ptr_ring_produce(&rcv_priv->xdp_ring, ptr))) { + xdp_return_frame_rx_napi(frame); + drops++; + } + } + spin_unlock(&rcv_priv->xdp_ring.producer_lock); + + if (flags & XDP_XMIT_FLUSH) + __veth_xdp_flush(rcv_priv); + + return n - drops; +} + static struct sk_buff *veth_xdp_rcv_one(struct veth_priv *priv, struct xdp_frame *frame) { @@ -766,6 +816,7 @@ static const struct net_device_ops veth_netdev_ops = { .ndo_features_check = passthru_features_check, .ndo_set_rx_headroom= veth_set_rx_headroom, .ndo_bpf= veth_xdp, + .ndo_xdp_xmit = veth_xdp_xmit, }; #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \ -- 2.14.3
[PATCH v5 bpf-next 4/9] veth: Handle xdp_frames in xdp napi ring
From: Toshiaki Makita This is preparation for XDP TX and ndo_xdp_xmit. This allows napi handler to handle xdp_frames through xdp ring as well as sk_buff. v3: - Revert v2 change around rings and use a flag to differentiate skb and xdp_frame, since bulk skb xmit makes little performance difference for now. v2: - Use another ring instead of using flag to differentiate skb and xdp_frame. This approach makes bulk skb transmit possible in veth_xmit later. - Clear xdp_frame feilds in skb->head. - Implement adjust_tail. Signed-off-by: Toshiaki Makita --- drivers/net/veth.c | 87 ++ 1 file changed, 82 insertions(+), 5 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 1b4006d3df32..ef22d991f678 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -22,12 +22,12 @@ #include #include #include -#include #include #define DRV_NAME "veth" #define DRV_VERSION"1.0" +#define VETH_XDP_FLAG BIT(0) #define VETH_RING_SIZE 256 #define VETH_XDP_HEADROOM (XDP_PACKET_HEADROOM + NET_IP_ALIGN) @@ -115,6 +115,24 @@ static const struct ethtool_ops veth_ethtool_ops = { /* general routines */ +static bool veth_is_xdp_frame(void *ptr) +{ + return (unsigned long)ptr & VETH_XDP_FLAG; +} + +static void *veth_ptr_to_xdp(void *ptr) +{ + return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG); +} + +static void veth_ptr_free(void *ptr) +{ + if (veth_is_xdp_frame(ptr)) + xdp_return_frame(veth_ptr_to_xdp(ptr)); + else + kfree_skb(ptr); +} + static void __veth_xdp_flush(struct veth_priv *priv) { /* Write ptr_ring before reading rx_notify_masked */ @@ -249,6 +267,61 @@ static struct sk_buff *veth_build_skb(void *head, int headroom, int len, return skb; } +static struct sk_buff *veth_xdp_rcv_one(struct veth_priv *priv, + struct xdp_frame *frame) +{ + int len = frame->len, delta = 0; + struct bpf_prog *xdp_prog; + unsigned int headroom; + struct sk_buff *skb; + + rcu_read_lock(); + xdp_prog = rcu_dereference(priv->xdp_prog); + if (likely(xdp_prog)) { + struct xdp_buff xdp; + u32 act; + 
+ xdp.data_hard_start = frame->data - frame->headroom; + xdp.data = frame->data; + xdp.data_end = frame->data + frame->len; + xdp.data_meta = frame->data - frame->metasize; + xdp.rxq = &priv->xdp_rxq; + + act = bpf_prog_run_xdp(xdp_prog, &xdp); + + switch (act) { + case XDP_PASS: + delta = frame->data - xdp.data; + len = xdp.data_end - xdp.data; + break; + default: + bpf_warn_invalid_xdp_action(act); + case XDP_ABORTED: + trace_xdp_exception(priv->dev, xdp_prog, act); + case XDP_DROP: + goto err_xdp; + } + } + rcu_read_unlock(); + + headroom = frame->data - delta - (void *)frame; + skb = veth_build_skb(frame, headroom, len, 0); + if (!skb) { + xdp_return_frame(frame); + goto err; + } + + memset(frame, 0, sizeof(*frame)); + skb->protocol = eth_type_trans(skb, priv->dev); +err: + return skb; +err_xdp: + rcu_read_unlock(); + xdp_return_frame(frame); + + return NULL; +} + static struct sk_buff *veth_xdp_rcv_skb(struct veth_priv *priv, struct sk_buff *skb) { @@ -358,12 +431,16 @@ static int veth_xdp_rcv(struct veth_priv *priv, int budget) int i, done = 0; for (i = 0; i < budget; i++) { - struct sk_buff *skb = __ptr_ring_consume(&priv->xdp_ring); + void *ptr = __ptr_ring_consume(&priv->xdp_ring); + struct sk_buff *skb; - if (!skb) + if (!ptr) break; - skb = veth_xdp_rcv_skb(priv, skb); + if (veth_is_xdp_frame(ptr)) + skb = veth_xdp_rcv_one(priv, veth_ptr_to_xdp(ptr)); + else + skb = veth_xdp_rcv_skb(priv, ptr); if (skb) napi_gro_receive(&priv->xdp_napi, skb); @@ -416,7 +493,7 @@ static void veth_napi_del(struct net_device *dev) napi_disable(&priv->xdp_napi); netif_napi_del(&priv->xdp_napi); priv->rx_notify_masked = false; - ptr_ring_cleanup(&priv->xdp_ring, __skb_array_destroy_skb); + ptr_ring_cleanup(&priv->xdp_ring, veth_ptr_free); } static int veth_enable_xdp(struct net_device *dev) -- 2.14.3
[PATCH v5 bpf-next 7/9] xdp: Helpers for disabling napi_direct of xdp_return_frame
From: Toshiaki Makita We need some mechanism to disable napi_direct on calling xdp_return_frame_rx_napi() from some context. When veth gets support of XDP_REDIRECT, it will redirects packets which are redirected from other devices. On redirection veth will reuse xdp_mem_info of the redirection source device to make return_frame work. But in this case .ndo_xdp_xmit() called from veth redirection uses xdp_mem_info which is not guarded by NAPI, because the .ndo_xdp_xmit() is not called directly from the rxq which owns the xdp_mem_info. This approach introduces a flag in bpf_redirect_info to indicate that napi_direct should be disabled even when _rx_napi variant is used as well as helper functions to use it. A NAPI handler who wants to use this flag needs to call xdp_set_return_frame_no_direct() before processing packets, and call xdp_clear_return_frame_no_direct() after xdp_do_flush_map() before exiting NAPI. v4: - Use bpf_redirect_info for storing the flag instead of xdp_mem_info to avoid per-frame copy cost. Signed-off-by: Toshiaki Makita --- include/linux/filter.h | 25 + net/core/xdp.c | 6 -- 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 4717af8b95e6..2b072dab32c0 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -543,10 +543,14 @@ struct bpf_redirect_info { struct bpf_map *map; struct bpf_map *map_to_flush; unsigned long map_owner; + u32 kern_flags; }; DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); +/* flags for bpf_redirect_info kern_flags */ +#define BPF_RI_F_RF_NO_DIRECT BIT(0) /* no napi_direct on return_frame */ + /* Compute the linear packet data range [data, data_end) which * will be accessed by various program types (cls_bpf, act_bpf, * lwt, ...). Subsystems allowing direct data access must (!) 
@@ -775,6 +779,27 @@ static inline bool bpf_dump_raw_ok(void) struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); +static inline bool xdp_return_frame_no_direct(void) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + + return ri->kern_flags & BPF_RI_F_RF_NO_DIRECT; +} + +static inline void xdp_set_return_frame_no_direct(void) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + + ri->kern_flags |= BPF_RI_F_RF_NO_DIRECT; +} + +static inline void xdp_clear_return_frame_no_direct(void) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + + ri->kern_flags &= ~BPF_RI_F_RF_NO_DIRECT; +} + static inline int xdp_ok_fwd_dev(const struct net_device *fwd, unsigned int pktlen) { diff --git a/net/core/xdp.c b/net/core/xdp.c index 57285383ed00..3dd99e1c04f5 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -330,10 +330,12 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */ xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); page = virt_to_head_page(data); - if (xa) + if (xa) { + napi_direct &= !xdp_return_frame_no_direct(); page_pool_put_page(xa->page_pool, page, napi_direct); - else + } else { put_page(page); + } rcu_read_unlock(); break; case MEM_TYPE_PAGE_SHARED: -- 2.14.3
[PATCH v5 bpf-next 3/9] veth: Avoid drops by oversized packets when XDP is enabled
From: Toshiaki Makita All oversized packets including GSO packets are dropped if XDP is enabled on receiver side, so don't send such packets from peer. Drop TSO and SCTP fragmentation features so that veth devices themselves segment packets with XDP enabled. Also cap MTU accordingly. v4: - Don't auto-adjust MTU but cap max MTU. Signed-off-by: Toshiaki Makita --- drivers/net/veth.c | 47 +-- 1 file changed, 45 insertions(+), 2 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 78fa08cb6e24..1b4006d3df32 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -542,6 +542,23 @@ static int veth_get_iflink(const struct net_device *dev) return iflink; } +static netdev_features_t veth_fix_features(struct net_device *dev, + netdev_features_t features) +{ + struct veth_priv *priv = netdev_priv(dev); + struct net_device *peer; + + peer = rtnl_dereference(priv->peer); + if (peer) { + struct veth_priv *peer_priv = netdev_priv(peer); + + if (peer_priv->_xdp_prog) + features &= ~NETIF_F_GSO_SOFTWARE; + } + + return features; +} + static void veth_set_rx_headroom(struct net_device *dev, int new_hr) { struct veth_priv *peer_priv, *priv = netdev_priv(dev); @@ -571,6 +588,7 @@ static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog, struct veth_priv *priv = netdev_priv(dev); struct bpf_prog *old_prog; struct net_device *peer; + unsigned int max_mtu; int err; old_prog = priv->_xdp_prog; @@ -584,6 +602,15 @@ static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog, goto err; } + max_mtu = PAGE_SIZE - VETH_XDP_HEADROOM - + peer->hard_header_len - + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + if (peer->mtu > max_mtu) { + NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP"); + err = -ERANGE; + goto err; + } + if (dev->flags & IFF_UP) { err = veth_enable_xdp(dev); if (err) { @@ -591,14 +618,29 @@ static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog, goto err; } } + + if (!old_prog) { + peer->hw_features &= 
~NETIF_F_GSO_SOFTWARE; + peer->max_mtu = max_mtu; + } } if (old_prog) { - if (!prog && dev->flags & IFF_UP) - veth_disable_xdp(dev); + if (!prog) { + if (dev->flags & IFF_UP) + veth_disable_xdp(dev); + + if (peer) { + peer->hw_features |= NETIF_F_GSO_SOFTWARE; + peer->max_mtu = ETH_MAX_MTU; + } + } bpf_prog_put(old_prog); } + if ((!!old_prog ^ !!prog) && peer) + netdev_update_features(peer); + return 0; err: priv->_xdp_prog = old_prog; @@ -643,6 +685,7 @@ static const struct net_device_ops veth_netdev_ops = { .ndo_poll_controller= veth_poll_controller, #endif .ndo_get_iflink = veth_get_iflink, + .ndo_fix_features = veth_fix_features, .ndo_features_check = passthru_features_check, .ndo_set_rx_headroom= veth_set_rx_headroom, .ndo_bpf= veth_xdp, -- 2.14.3